486 files changed, 56575 insertions, 7414 deletions
diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt index c1e9545c59b..9f59fcbf5d8 100644 --- a/Documentation/crypto/async-tx-api.txt +++ b/Documentation/crypto/async-tx-api.txt @@ -13,9 +13,9 @@ 3.6 Constraints 3.7 Example -4 DRIVER DEVELOPER NOTES +4 DMAENGINE DRIVER DEVELOPER NOTES 4.1 Conformance points -4.2 "My application needs finer control of hardware channels" +4.2 "My application needs exclusive control of hardware channels" 5 SOURCE @@ -150,6 +150,7 @@ ops_run_* and ops_complete_* routines in drivers/md/raid5.c for more implementation examples. 4 DRIVER DEVELOPMENT NOTES + 4.1 Conformance points: There are a few conformance points required in dmaengine drivers to accommodate assumptions made by applications using the async_tx API: @@ -158,58 +159,49 @@ accommodate assumptions made by applications using the async_tx API: 3/ Use async_tx_run_dependencies() in the descriptor clean up path to handle submission of dependent operations -4.2 "My application needs finer control of hardware channels" -This requirement seems to arise from cases where a DMA engine driver is -trying to support device-to-memory DMA. The dmaengine and async_tx -implementations were designed for offloading memory-to-memory -operations; however, there are some capabilities of the dmaengine layer -that can be used for platform-specific channel management. -Platform-specific constraints can be handled by registering the -application as a 'dma_client' and implementing a 'dma_event_callback' to -apply a filter to the available channels in the system. Before showing -how to implement a custom dma_event callback some background of -dmaengine's client support is required. - -The following routines in dmaengine support multiple clients requesting -use of a channel: -- dma_async_client_register(struct dma_client *client) -- dma_async_client_chan_request(struct dma_client *client) - -dma_async_client_register takes a pointer to an initialized dma_client -structure. It expects that the 'event_callback' and 'cap_mask' fields -are already initialized. - -dma_async_client_chan_request triggers dmaengine to notify the client of -all channels that satisfy the capability mask. It is up to the client's -event_callback routine to track how many channels the client needs and -how many it is currently using. The dma_event_callback routine returns a -dma_state_client code to let dmaengine know the status of the -allocation. - -Below is the example of how to extend this functionality for -platform-specific filtering of the available channels beyond the -standard capability mask: - -static enum dma_state_client -my_dma_client_callback(struct dma_client *client, - struct dma_chan *chan, enum dma_state state) -{ - struct dma_device *dma_dev; - struct my_platform_specific_dma *plat_dma_dev; - - dma_dev = chan->device; - plat_dma_dev = container_of(dma_dev, - struct my_platform_specific_dma, - dma_dev); - - if (!plat_dma_dev->platform_specific_capability) - return DMA_DUP; - - . . . -} +4.2 "My application needs exclusive control of hardware channels" +Primarily this requirement arises from cases where a DMA engine driver +is being used to support device-to-memory operations. A channel that is +performing these operations cannot, for many platform specific reasons, +be shared. For these cases the dma_request_channel() interface is +provided. 
+ +The interface is: +struct dma_chan *dma_request_channel(dma_cap_mask_t mask, + dma_filter_fn filter_fn, + void *filter_param); + +Where dma_filter_fn is defined as: +typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); + +When the optional 'filter_fn' parameter is set to NULL +dma_request_channel simply returns the first channel that satisfies the +capability mask. Otherwise, when the mask parameter is insufficient for +specifying the necessary channel, the filter_fn routine can be used to +disposition the available channels in the system. The filter_fn routine +is called once for each free channel in the system. Upon seeing a +suitable channel filter_fn returns DMA_ACK which flags that channel to +be the return value from dma_request_channel. A channel allocated via +this interface is exclusive to the caller, until dma_release_channel() +is called. + +The DMA_PRIVATE capability flag is used to tag dma devices that should +not be used by the general-purpose allocator. It can be set at +initialization time if it is known that a channel will always be +private. Alternatively, it is set when dma_request_channel() finds an +unused "public" channel. + +A couple caveats to note when implementing a driver and consumer: +1/ Once a channel has been privately allocated it will no longer be + considered by the general-purpose allocator even after a call to + dma_release_channel(). +2/ Since capabilities are specified at the device level a dma_device + with multiple channels will either have all channels public, or all + channels private. 5 SOURCE -include/linux/dmaengine.h: core header file for DMA drivers and clients + +include/linux/dmaengine.h: core header file for DMA drivers and api users drivers/dma/dmaengine.c: offload engine channel management routines drivers/dma/: location for offload engine drivers include/linux/async_tx.h: core header file for the async_tx api diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt new file mode 100644 index 00000000000..0c1c2f63c0a --- /dev/null +++ b/Documentation/dmaengine.txt @@ -0,0 +1 @@ +See Documentation/crypto/async-tx-api.txt diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt new file mode 100644 index 00000000000..64087c34327 --- /dev/null +++ b/Documentation/filesystems/btrfs.txt @@ -0,0 +1,91 @@ + + BTRFS + ===== + +Btrfs is a new copy on write filesystem for Linux aimed at +implementing advanced features while focusing on fault tolerance, +repair and easy administration. Initially developed by Oracle, Btrfs +is licensed under the GPL and open for contribution from anyone. + +Linux has a wealth of filesystems to choose from, but we are facing a +number of challenges with scaling to the large storage subsystems that +are becoming common in today's data centers. Filesystems need to scale +in their ability to address and manage large storage, and also in +their ability to detect, repair and tolerate errors in the data stored +on disk. Btrfs is under heavy development, and is not suitable for +any uses other than benchmarking and review. The Btrfs disk format is +not yet finalized. 
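Tying together the dma_request_channel()/dma_filter_fn description in the async-tx-api.txt hunk above, a minimal consumer sketch might look as follows. The capability bit, the filter criterion (matching one controller's struct device) and the helper names are illustrative assumptions, not part of this patch:

    #include <linux/dmaengine.h>

    /* Illustrative filter: accept only channels owned by one controller. */
    static bool my_filter(struct dma_chan *chan, void *filter_param)
    {
            return chan->device->dev == filter_param;  /* assumed criterion */
    }

    static struct dma_chan *my_request_channel(struct device *dma_ctrl)
    {
            dma_cap_mask_t mask;

            dma_cap_zero(mask);
            dma_cap_set(DMA_SLAVE, mask);  /* capability chosen for illustration */

            /* Exclusive ownership on success; NULL if no free channel matched. */
            return dma_request_channel(mask, my_filter, dma_ctrl);
    }

    /* ... and when the consumer is done with it: dma_release_channel(chan); */

Because the channel stays private until dma_release_channel() is called, a consumer would typically request it once (e.g. at probe time) rather than per transfer.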
+ +The main Btrfs features include: + + * Extent based file storage (2^64 max file size) + * Space efficient packing of small files + * Space efficient indexed directories + * Dynamic inode allocation + * Writable snapshots + * Subvolumes (separate internal filesystem roots) + * Object level mirroring and striping + * Checksums on data and metadata (multiple algorithms available) + * Compression + * Integrated multiple device support, with several raid algorithms + * Online filesystem check (not yet implemented) + * Very fast offline filesystem check + * Efficient incremental backup and FS mirroring (not yet implemented) + * Online filesystem defragmentation + + + + MAILING LIST + ============ + +There is a Btrfs mailing list hosted on vger.kernel.org. You can +find details on how to subscribe here: + +http://vger.kernel.org/vger-lists.html#linux-btrfs + +Mailing list archives are available from gmane: + +http://dir.gmane.org/gmane.comp.file-systems.btrfs + + + + IRC + === + +Discussion of Btrfs also occurs on the #btrfs channel of the Freenode +IRC network. + + + + UTILITIES + ========= + +Userspace tools for creating and manipulating Btrfs file systems are +available from the git repository at the following location: + + http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs-unstable.git + git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs-unstable.git + +These include the following tools: + +mkfs.btrfs: create a filesystem + +btrfsctl: control program to create snapshots and subvolumes: + + mount /dev/sda2 /mnt + btrfsctl -s new_subvol_name /mnt + btrfsctl -s snapshot_of_default /mnt/default + btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name + btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol + ls /mnt + default snapshot_of_a_snapshot snapshot_of_new_subvol + new_subvol_name snapshot_of_default + + Snapshots and subvolumes cannot be deleted right now, but you can + rm -rf all the files and directories inside them. + +btrfsck: do a limited check of the FS extent trees. + +btrfs-debug-tree: print all of the FS metadata in text form. Example: + + btrfs-debug-tree /dev/sda2 >& big_output_file diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ed0a72442cf..8511d3532c2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -141,6 +141,7 @@ and is between 256 and 4096 characters. It is defined in the file ht -- run only enough ACPI to enable Hyper Threading strict -- Be less tolerant of platforms that are not strictly ACPI specification compliant. + rsdt -- prefer RSDT over (default) XSDT See also Documentation/power/pm.txt, pci=noacpi @@ -151,16 +152,20 @@ and is between 256 and 4096 characters. It is defined in the file default: 0 acpi_sleep= [HW,ACPI] Sleep options - Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, old_ordering } - See Documentation/power/video.txt for s3_bios and s3_mode. + Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, + old_ordering, s4_nonvs } + See Documentation/power/video.txt for information on + s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. s4_nohwsig prevents ACPI hardware signature from being used during resume from hibernation. old_ordering causes the ACPI 1.0 ordering of the _PTS - control method, wrt putting devices into low power - states, to be enforced (the ACPI 2.0 ordering of _PTS is - used by default). 
+ control method, with respect to putting devices into + low power states, to be enforced (the ACPI 2.0 ordering + of _PTS is used by default). + s4_nonvs prevents the kernel from saving/restoring the + ACPI NVS memory during hibernation. acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } @@ -195,7 +200,7 @@ and is between 256 and 4096 characters. It is defined in the file acpi_skip_timer_override [HW,ACPI] Recognize and ignore IRQ0/pin2 Interrupt Override. For broken nForce2 BIOS resulting in XT-PIC timer. - acpi_use_timer_override [HW,ACPI} + acpi_use_timer_override [HW,ACPI] Use timer override. For some broken Nvidia NF5 boards that require a timer override, but don't have HPET @@ -878,17 +883,19 @@ and is between 256 and 4096 characters. It is defined in the file See Documentation/ide/ide.txt. idle= [X86] - Format: idle=poll or idle=mwait, idle=halt, idle=nomwait - Poll forces a polling idle loop that can slightly improves the performance - of waking up a idle CPU, but will use a lot of power and make the system - run hot. Not recommended. - idle=mwait. On systems which support MONITOR/MWAIT but the kernel chose - to not use it because it doesn't save as much power as a normal idle - loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same - as idle=poll. - idle=halt. Halt is forced to be used for CPU idle. + Format: idle=poll, idle=mwait, idle=halt, idle=nomwait + Poll forces a polling idle loop that can slightly + improve the performance of waking up a idle CPU, but + will use a lot of power and make the system run hot. + Not recommended. + idle=mwait: On systems which support MONITOR/MWAIT but + the kernel chose to not use it because it doesn't save + as much power as a normal idle loop, use the + MONITOR/MWAIT idle loop anyways. Performance should be + the same as idle=poll. + idle=halt: Halt is forced to be used for CPU idle. In such case C2/C3 won't be used again. - idle=nomwait. Disable mwait for CPU C-states + idle=nomwait: Disable mwait for CPU C-states ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem Claim all unknown PCI IDE storage controllers. @@ -1074,8 +1081,8 @@ and is between 256 and 4096 characters. It is defined in the file lapic [X86-32,APIC] Enable the local APIC even if BIOS disabled it. - lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in - C2 power state. + lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer + in C2 power state. libata.dma= [LIBATA] DMA control libata.dma=0 Disable all PATA and SATA DMA @@ -2303,7 +2310,8 @@ and is between 256 and 4096 characters. It is defined in the file thermal.psv= [HW,ACPI] -1: disable all passive trip points - <degrees C>: override all passive trip points to this value + <degrees C>: override all passive trip points to this + value thermal.tzp= [HW,ACPI] Specify global default ACPI thermal zone polling rate diff --git a/Documentation/powerpc/dts-bindings/4xx/ndfc.txt b/Documentation/powerpc/dts-bindings/4xx/ndfc.txt new file mode 100644 index 00000000000..869f0b5f16e --- /dev/null +++ b/Documentation/powerpc/dts-bindings/4xx/ndfc.txt @@ -0,0 +1,39 @@ +AMCC NDFC (NanD Flash Controller) + +Required properties: +- compatible : "ibm,ndfc". +- reg : should specify chip select and size used for the chip (0x2000). + +Optional properties: +- ccr : NDFC config and control register value (default 0). +- bank-settings : NDFC bank configuration register value (default 0). 
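As a driver-side illustration of the two optional NDFC properties above (a hedged sketch; the node pointer, the function name and the direct dereference are assumptions, not part of this binding document):

    #include <linux/of.h>

    /* Read the optional NDFC registers, falling back to the binding's defaults. */
    static void ndfc_read_optional_props(struct device_node *np,
                                         u32 *ccr, u32 *bank_settings)
    {
            const u32 *prop;

            *ccr = 0;            /* default 0 per the binding */
            *bank_settings = 0;  /* default 0 per the binding */

            prop = of_get_property(np, "ccr", NULL);
            if (prop)
                    *ccr = *prop;

            prop = of_get_property(np, "bank-settings", NULL);
            if (prop)
                    *bank_settings = *prop;
    }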
+ +Notes: +- partition(s) - follows the OF MTD standard for partitions + +Example: + +ndfc@1,0 { + compatible = "ibm,ndfc"; + reg = <0x00000001 0x00000000 0x00002000>; + ccr = <0x00001000>; + bank-settings = <0x80002222>; + #address-cells = <1>; + #size-cells = <1>; + + nand { + #address-cells = <1>; + #size-cells = <1>; + + partition@0 { + label = "kernel"; + reg = <0x00000000 0x00200000>; + }; + partition@200000 { + label = "root"; + reg = <0x00200000 0x03E00000>; + }; + }; +}; + + diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c index c5e28a46b29..a8d91b6c136 100644 --- a/arch/arm/mach-pxa/corgi.c +++ b/arch/arm/mach-pxa/corgi.c @@ -27,6 +27,7 @@ #include <linux/spi/spi.h> #include <linux/spi/ads7846.h> #include <linux/spi/corgi_lcd.h> +#include <linux/mtd/sharpsl.h> #include <video/w100fb.h> #include <asm/setup.h> @@ -542,6 +543,55 @@ err_free_1: static inline void corgi_init_spi(void) {} #endif +static struct mtd_partition sharpsl_nand_partitions[] = { + { + .name = "System Area", + .offset = 0, + .size = 7 * 1024 * 1024, + }, + { + .name = "Root Filesystem", + .offset = 7 * 1024 * 1024, + .size = 25 * 1024 * 1024, + }, + { + .name = "Home Filesystem", + .offset = MTDPART_OFS_APPEND, + .size = MTDPART_SIZ_FULL, + }, +}; + +static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; + +static struct nand_bbt_descr sharpsl_bbt = { + .options = 0, + .offs = 4, + .len = 2, + .pattern = scan_ff_pattern +}; + +static struct sharpsl_nand_platform_data sharpsl_nand_platform_data = { + .badblock_pattern = &sharpsl_bbt, + .partitions = sharpsl_nand_partitions, + .nr_partitions = ARRAY_SIZE(sharpsl_nand_partitions), +}; + +static struct resource sharpsl_nand_resources[] = { + { + .start = 0x0C000000, + .end = 0x0C000FFF, + .flags = IORESOURCE_MEM, + }, +}; + +static struct platform_device sharpsl_nand_device = { + .name = "sharpsl-nand", + .id = -1, + .resource = sharpsl_nand_resources, + .num_resources = ARRAY_SIZE(sharpsl_nand_resources), + .dev.platform_data = &sharpsl_nand_platform_data, +}; + static struct mtd_partition sharpsl_rom_parts[] = { { .name ="Boot PROM Filesystem", @@ -577,6 +627,7 @@ static struct platform_device *devices[] __initdata = { &corgifb_device, &corgikbd_device, &corgiled_device, + &sharpsl_nand_device, &sharpsl_rom_device, }; @@ -617,6 +668,9 @@ static void __init corgi_init(void) platform_scoop_config = &corgi_pcmcia_config; + if (machine_is_husky()) + sharpsl_nand_partitions[1].size = 53 * 1024 * 1024; + platform_add_devices(devices, ARRAY_SIZE(devices)); } diff --git a/arch/arm/mach-pxa/poodle.c b/arch/arm/mach-pxa/poodle.c index ae88855bf97..f9093beba75 100644 --- a/arch/arm/mach-pxa/poodle.c +++ b/arch/arm/mach-pxa/poodle.c @@ -24,6 +24,7 @@ #include <linux/gpio.h> #include <linux/spi/spi.h> #include <linux/spi/ads7846.h> +#include <linux/mtd/sharpsl.h> #include <mach/hardware.h> #include <asm/mach-types.h> @@ -414,6 +415,55 @@ static struct pxafb_mach_info poodle_fb_info = { .lcd_conn = LCD_COLOR_TFT_16BPP, }; +static struct mtd_partition sharpsl_nand_partitions[] = { + { + .name = "System Area", + .offset = 0, + .size = 7 * 1024 * 1024, + }, + { + .name = "Root Filesystem", + .offset = 7 * 1024 * 1024, + .size = 22 * 1024 * 1024, + }, + { + .name = "Home Filesystem", + .offset = MTDPART_OFS_APPEND, + .size = MTDPART_SIZ_FULL, + }, +}; + +static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; + +static struct nand_bbt_descr sharpsl_bbt = { + .options = 0, + .offs = 4, + .len = 2, + .pattern = scan_ff_pattern +}; + +static struct sharpsl_nand_platform_data 
sharpsl_nand_platform_data = { + .badblock_pattern = &sharpsl_bbt, + .partitions = sharpsl_nand_partitions, + .nr_partitions = ARRAY_SIZE(sharpsl_nand_partitions), +}; + +static struct resource sharpsl_nand_resources[] = { + { + .start = 0x0C000000, + .end = 0x0C000FFF, + .flags = IORESOURCE_MEM, + }, +}; + +static struct platform_device sharpsl_nand_device = { + .name = "sharpsl-nand", + .id = -1, + .resource = sharpsl_nand_resources, + .num_resources = ARRAY_SIZE(sharpsl_nand_resources), + .dev.platform_data = &sharpsl_nand_platform_data, +}; + static struct mtd_partition sharpsl_rom_parts[] = { { .name ="Boot PROM Filesystem", @@ -447,6 +497,7 @@ static struct platform_device sharpsl_rom_device = { static struct platform_device *devices[] __initdata = { &poodle_locomo_device, &poodle_scoop_device, + &sharpsl_nand_device, &sharpsl_rom_device, }; diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index 7299d87a1cb..6d447c9ce8a 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -31,6 +31,7 @@ #include <linux/spi/spi.h> #include <linux/spi/ads7846.h> #include <linux/spi/corgi_lcd.h> +#include <linux/mtd/sharpsl.h> #include <asm/setup.h> #include <asm/memory.h> @@ -613,6 +614,54 @@ static struct pxafb_mach_info spitz_pxafb_info = { .lcd_conn = LCD_COLOR_TFT_16BPP | LCD_ALTERNATE_MAPPING, }; +static struct mtd_partition sharpsl_nand_partitions[] = { + { + .name = "System Area", + .offset = 0, + .size = 7 * 1024 * 1024, + }, + { + .name = "Root Filesystem", + .offset = 7 * 1024 * 1024, + }, + { + .name = "Home Filesystem", + .offset = MTDPART_OFS_APPEND, + .size = MTDPART_SIZ_FULL, + }, +}; + +static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; + +static struct nand_bbt_descr sharpsl_bbt = { + .options = 0, + .offs = 4, + .len = 2, + .pattern = scan_ff_pattern +}; + +static struct sharpsl_nand_platform_data sharpsl_nand_platform_data = { + .badblock_pattern = &sharpsl_bbt, + .partitions = sharpsl_nand_partitions, + .nr_partitions = ARRAY_SIZE(sharpsl_nand_partitions), +}; + +static struct resource sharpsl_nand_resources[] = { + { + .start = 0x0C000000, + .end = 0x0C000FFF, + .flags = IORESOURCE_MEM, + }, +}; + +static struct platform_device sharpsl_nand_device = { + .name = "sharpsl-nand", + .id = -1, + .resource = sharpsl_nand_resources, + .num_resources = ARRAY_SIZE(sharpsl_nand_resources), + .dev.platform_data = &sharpsl_nand_platform_data, +}; + static struct mtd_partition sharpsl_rom_parts[] = { { @@ -648,6 +697,7 @@ static struct platform_device *devices[] __initdata = { &spitzscoop_device, &spitzkbd_device, &spitzled_device, + &sharpsl_nand_device, &sharpsl_rom_device, }; @@ -671,6 +721,14 @@ static void __init common_init(void) pm_power_off = spitz_poweroff; arm_pm_restart = spitz_restart; + if (machine_is_spitz()) { + sharpsl_nand_partitions[1].size = 5 * 1024 * 1024; + } else if (machine_is_akita()) { + sharpsl_nand_partitions[1].size = 58 * 1024 * 1024; + } else if (machine_is_borzoi()) { + sharpsl_nand_partitions[1].size = 32 * 1024 * 1024; + } + PMCR = 0x00; /* Stop 3.6MHz and drive HIGH to PCMCIA and CS */ @@ -715,10 +773,29 @@ static struct i2c_board_info akita_i2c_board_info[] = { }, }; +static struct nand_bbt_descr sharpsl_akita_bbt = { + .options = 0, + .offs = 4, + .len = 1, + .pattern = scan_ff_pattern +}; + +static struct nand_ecclayout akita_oobinfo = { + .eccbytes = 24, + .eccpos = { + 0x5, 0x1, 0x2, 0x3, 0x6, 0x7, 0x15, 0x11, + 0x12, 0x13, 0x16, 0x17, 0x25, 0x21, 0x22, 0x23, + 0x26, 0x27, 0x35, 0x31, 0x32, 0x33, 0x36, 0x37}, + 
.oobfree = {{0x08, 0x09}} +}; + static void __init akita_init(void) { spitz_ficp_platform_data.transceiver_mode = akita_irda_transceiver_mode; + sharpsl_nand_platform_data.badblock_pattern = &sharpsl_akita_bbt; + sharpsl_nand_platform_data.ecc_layout = &akita_oobinfo; + /* We just pretend the second element of the array doesn't exist */ spitz_pcmcia_config.num_devs = 1; platform_scoop_config = &spitz_pcmcia_config; diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c index ea7bc1e8562..3fbfd1e32a9 100644 --- a/arch/avr32/mach-at32ap/at32ap700x.c +++ b/arch/avr32/mach-at32ap/at32ap700x.c @@ -1305,7 +1305,7 @@ struct platform_device *__init at32_add_device_mci(unsigned int id, struct mci_platform_data *data) { struct platform_device *pdev; - struct dw_dma_slave *dws; + struct dw_dma_slave *dws = &data->dma_slave; u32 pioa_mask; u32 piob_mask; @@ -1324,22 +1324,13 @@ at32_add_device_mci(unsigned int id, struct mci_platform_data *data) ARRAY_SIZE(atmel_mci0_resource))) goto fail; - if (data->dma_slave) - dws = kmemdup(to_dw_dma_slave(data->dma_slave), - sizeof(struct dw_dma_slave), GFP_KERNEL); - else - dws = kzalloc(sizeof(struct dw_dma_slave), GFP_KERNEL); - - dws->slave.dev = &pdev->dev; - dws->slave.dma_dev = &dw_dmac0_device.dev; - dws->slave.reg_width = DMA_SLAVE_WIDTH_32BIT; + dws->dma_dev = &dw_dmac0_device.dev; + dws->reg_width = DW_DMA_SLAVE_WIDTH_32BIT; dws->cfg_hi = (DWC_CFGH_SRC_PER(0) | DWC_CFGH_DST_PER(1)); dws->cfg_lo &= ~(DWC_CFGL_HS_DST_POL | DWC_CFGL_HS_SRC_POL); - data->dma_slave = &dws->slave; - if (platform_device_add_data(pdev, data, sizeof(struct mci_platform_data))) goto fail; diff --git a/arch/ia64/include/asm/acpi-ext.h b/arch/ia64/include/asm/acpi-ext.h index 734d137dda6..7f8362b379e 100644 --- a/arch/ia64/include/asm/acpi-ext.h +++ b/arch/ia64/include/asm/acpi-ext.h @@ -14,7 +14,6 @@ #define _ASM_IA64_ACPI_EXT_H #include <linux/types.h> -#include <acpi/actypes.h> extern acpi_status hp_acpi_csr_space (acpi_handle, u64 *base, u64 *length); diff --git a/arch/ia64/include/asm/sn/acpi.h b/arch/ia64/include/asm/sn/acpi.h index 9ce2801cbd5..fd480db2556 100644 --- a/arch/ia64/include/asm/sn/acpi.h +++ b/arch/ia64/include/asm/sn/acpi.h @@ -9,8 +9,6 @@ #ifndef _ASM_IA64_SN_ACPI_H #define _ASM_IA64_SN_ACPI_H -#include "acpi/acglobal.h" - extern int sn_acpi_rev; #define SN_ACPI_BASE_SUPPORT() (sn_acpi_rev >= 0x20101) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 0553648b759..d541671caf4 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -65,6 +65,7 @@ EXPORT_SYMBOL(pm_idle); void (*pm_power_off) (void); EXPORT_SYMBOL(pm_power_off); +u32 acpi_rsdt_forced; unsigned int acpi_cpei_override; unsigned int acpi_cpei_phys_cpuid; diff --git a/arch/ia64/sn/kernel/io_acpi_init.c b/arch/ia64/sn/kernel/io_acpi_init.c index bc610a6c785..c5a214026a7 100644 --- a/arch/ia64/sn/kernel/io_acpi_init.c +++ b/arch/ia64/sn/kernel/io_acpi_init.c @@ -13,7 +13,6 @@ #include <asm/sn/sn_sal.h> #include "xtalk/hubdev.h" #include <linux/acpi.h> -#include <acpi/acnamesp.h> /* @@ -64,6 +63,7 @@ static acpi_status __init sn_acpi_hubdev_init(acpi_handle handle, u32 depth, void *context, void **ret) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; u64 addr; struct hubdev_info *hubdev; struct hubdev_info *hubdev_ptr; @@ -77,11 +77,12 @@ sn_acpi_hubdev_init(acpi_handle handle, u32 depth, void *context, void **ret) status = acpi_get_vendor_resource(handle, 
METHOD_NAME__CRS, &sn_uuid, &buffer); if (ACPI_FAILURE(status)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR "sn_acpi_hubdev_init: acpi_get_vendor_resource() " - "(0x%x) failed for: ", status); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "(0x%x) failed for: %s\n", status, + (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return AE_OK; /* Continue walking namespace */ } @@ -89,11 +90,12 @@ sn_acpi_hubdev_init(acpi_handle handle, u32 depth, void *context, void **ret) vendor = &resource->data.vendor_typed; if ((vendor->byte_length - sizeof(struct acpi_vendor_uuid)) != sizeof(struct hubdev_info *)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR - "sn_acpi_hubdev_init: Invalid vendor data length: %d for: ", - vendor->byte_length); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "sn_acpi_hubdev_init: Invalid vendor data length: " + "%d for: %s\n", + vendor->byte_length, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); goto exit; } @@ -120,6 +122,7 @@ sn_get_bussoft_ptr(struct pci_bus *bus) { u64 addr; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; acpi_handle handle; struct pcibus_bussoft *prom_bussoft_ptr; struct acpi_resource *resource; @@ -131,11 +134,11 @@ sn_get_bussoft_ptr(struct pci_bus *bus) status = acpi_get_vendor_resource(handle, METHOD_NAME__CRS, &sn_uuid, &buffer); if (ACPI_FAILURE(status)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR "%s: " - "acpi_get_vendor_resource() failed (0x%x) for: ", - __func__, status); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "acpi_get_vendor_resource() failed (0x%x) for: %s\n", + __func__, status, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return NULL; } resource = buffer.pointer; @@ -168,6 +171,7 @@ sn_extract_device_info(acpi_handle handle, struct pcidev_info **pcidev_info, { u64 addr; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; struct sn_irq_info *irq_info, *irq_info_prom; struct pcidev_info *pcidev_ptr, *pcidev_prom_ptr; struct acpi_resource *resource; @@ -182,11 +186,11 @@ sn_extract_device_info(acpi_handle handle, struct pcidev_info **pcidev_info, status = acpi_get_vendor_resource(handle, METHOD_NAME__CRS, &sn_uuid, &buffer); if (ACPI_FAILURE(status)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR - "%s: acpi_get_vendor_resource() failed (0x%x) for: ", - __func__, status); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "%s: acpi_get_vendor_resource() failed (0x%x) for: %s\n", + __func__, status, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return 1; } @@ -194,11 +198,12 @@ sn_extract_device_info(acpi_handle handle, struct pcidev_info **pcidev_info, vendor = &resource->data.vendor_typed; if ((vendor->byte_length - sizeof(struct acpi_vendor_uuid)) != sizeof(struct pci_devdev_info *)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR - "%s: Invalid vendor data length: %d for: ", - __func__, vendor->byte_length); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "%s: Invalid vendor data length: %d for: %s\n", + __func__, vendor->byte_length, + (char *)name_buffer.pointer); + kfree(name_buffer.pointer); ret = 1; goto exit; } @@ -239,6 +244,9 @@ get_host_devfn(acpi_handle device_handle, acpi_handle 
rootbus_handle) acpi_handle parent; int slot; acpi_status status; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + + acpi_get_name(device_handle, ACPI_FULL_PATHNAME, &name_buffer); /* * Do an upward search to find the root bus device, and @@ -249,9 +257,8 @@ get_host_devfn(acpi_handle device_handle, acpi_handle rootbus_handle) status = acpi_get_parent(child, &parent); if (ACPI_FAILURE(status)) { printk(KERN_ERR "%s: acpi_get_parent() failed " - "(0x%x) for: ", __func__, status); - acpi_ns_print_node_pathname(child, NULL); - printk("\n"); + "(0x%x) for: %s\n", __func__, status, + (char *)name_buffer.pointer); panic("%s: Unable to find host devfn\n", __func__); } if (parent == rootbus_handle) @@ -259,22 +266,20 @@ get_host_devfn(acpi_handle device_handle, acpi_handle rootbus_handle) child = parent; } if (!child) { - printk(KERN_ERR "%s: Unable to find root bus for: ", - __func__); - acpi_ns_print_node_pathname(device_handle, NULL); - printk("\n"); + printk(KERN_ERR "%s: Unable to find root bus for: %s\n", + __func__, (char *)name_buffer.pointer); BUG(); } status = acpi_evaluate_integer(child, METHOD_NAME__ADR, NULL, &adr); if (ACPI_FAILURE(status)) { - printk(KERN_ERR "%s: Unable to get _ADR (0x%x) for: ", - __func__, status); - acpi_ns_print_node_pathname(child, NULL); - printk("\n"); + printk(KERN_ERR "%s: Unable to get _ADR (0x%x) for: %s\n", + __func__, status, (char *)name_buffer.pointer); panic("%s: Unable to find host devfn\n", __func__); } + kfree(name_buffer.pointer); + slot = (adr >> 16) & 0xffff; function = adr & 0xffff; devfn = PCI_DEVFN(slot, function); @@ -300,27 +305,28 @@ find_matching_device(acpi_handle handle, u32 lvl, void *context, void **rv) int function; int slot; struct sn_pcidev_match *info = context; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; status = acpi_evaluate_integer(handle, METHOD_NAME__ADR, NULL, &adr); if (ACPI_SUCCESS(status)) { status = acpi_get_parent(handle, &parent); if (ACPI_FAILURE(status)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR - "%s: acpi_get_parent() failed (0x%x) for: ", - __func__, status); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "%s: acpi_get_parent() failed (0x%x) for: %s\n", + __func__, status, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return AE_OK; } status = acpi_evaluate_integer(parent, METHOD_NAME__BBN, NULL, &bbn); if (ACPI_FAILURE(status)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR - "%s: Failed to find _BBN in parent of: ", - __func__); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "%s: Failed to find _BBN in parent of: %s\n", + __func__, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return AE_OK; } @@ -350,24 +356,27 @@ sn_acpi_get_pcidev_info(struct pci_dev *dev, struct pcidev_info **pcidev_info, acpi_handle rootbus_handle; unsigned long long segment; acpi_status status; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; rootbus_handle = PCI_CONTROLLER(dev)->acpi_handle; status = acpi_evaluate_integer(rootbus_handle, METHOD_NAME__SEG, NULL, &segment); if (ACPI_SUCCESS(status)) { if (segment != pci_domain_nr(dev)) { + acpi_get_name(rootbus_handle, ACPI_FULL_PATHNAME, + &name_buffer); printk(KERN_ERR - "%s: Segment number mismatch, 0x%llx vs 0x%x for: ", - __func__, segment, pci_domain_nr(dev)); - acpi_ns_print_node_pathname(rootbus_handle, NULL); - printk("\n"); + "%s: Segment number mismatch, 0x%llx vs 0x%x for: %s\n", + __func__, 
segment, pci_domain_nr(dev), + (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return 1; } } else { - printk(KERN_ERR "%s: Unable to get __SEG from: ", - __func__); - acpi_ns_print_node_pathname(rootbus_handle, NULL); - printk("\n"); + acpi_get_name(rootbus_handle, ACPI_FULL_PATHNAME, &name_buffer); + printk(KERN_ERR "%s: Unable to get __SEG from: %s\n", + __func__, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return 1; } diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c index 8a924a5661d..0d4ffa4da1d 100644 --- a/arch/ia64/sn/kernel/io_common.c +++ b/arch/ia64/sn/kernel/io_common.c @@ -26,7 +26,6 @@ #include <linux/acpi.h> #include <asm/sn/sn2/sn_hwperf.h> #include <asm/sn/acpi.h> -#include "acpi/acglobal.h" extern void sn_init_cpei_timer(void); extern void register_sn_procfs(void); @@ -473,7 +472,7 @@ sn_io_early_init(void) { struct acpi_table_header *header = NULL; - acpi_get_table_by_index(ACPI_TABLE_INDEX_DSDT, &header); + acpi_get_table(ACPI_SIG_DSDT, 1, &header); BUG_ON(header == NULL); sn_acpi_rev = header->oem_revision; } @@ -505,7 +504,7 @@ sn_io_early_init(void) { struct acpi_table_header *header; - (void)acpi_get_table_by_index(ACPI_TABLE_INDEX_DSDT, &header); + (void)acpi_get_table(ACPI_SIG_DSDT, 1, &header); printk(KERN_INFO "ACPI DSDT OEM Rev 0x%x\n", header->oem_revision); } diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index 5ddad7bd60a..0d428278356 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -77,7 +77,7 @@ libs-y += arch/parisc/lib/ `$(CC) -print-libgcc-file-name` drivers-$(CONFIG_OPROFILE) += arch/parisc/oprofile/ -PALO := $(shell if which palo; then : ; \ +PALO := $(shell if (which palo 2>&1); then : ; \ elif [ -x /sbin/palo ]; then echo /sbin/palo; \ fi) diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index f88b252e419..2121d99f836 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -1,3 +1,4 @@ include include/asm-generic/Kbuild.asm unifdef-y += pdc.h +unifdef-y += swab.h diff --git a/arch/parisc/include/asm/byteorder.h b/arch/parisc/include/asm/byteorder.h index db148313de5..da66029c4cb 100644 --- a/arch/parisc/include/asm/byteorder.h +++ b/arch/parisc/include/asm/byteorder.h @@ -1,82 +1,7 @@ #ifndef _PARISC_BYTEORDER_H #define _PARISC_BYTEORDER_H -#include <asm/types.h> -#include <linux/compiler.h> - -#ifdef __GNUC__ - -static __inline__ __attribute_const__ __u16 ___arch__swab16(__u16 x) -{ - __asm__("dep %0, 15, 8, %0\n\t" /* deposit 00ab -> 0bab */ - "shd %%r0, %0, 8, %0" /* shift 000000ab -> 00ba */ - : "=r" (x) - : "0" (x)); - return x; -} - -static __inline__ __attribute_const__ __u32 ___arch__swab24(__u32 x) -{ - __asm__("shd %0, %0, 8, %0\n\t" /* shift xabcxabc -> cxab */ - "dep %0, 15, 8, %0\n\t" /* deposit cxab -> cbab */ - "shd %%r0, %0, 8, %0" /* shift 0000cbab -> 0cba */ - : "=r" (x) - : "0" (x)); - return x; -} - -static __inline__ __attribute_const__ __u32 ___arch__swab32(__u32 x) -{ - unsigned int temp; - __asm__("shd %0, %0, 16, %1\n\t" /* shift abcdabcd -> cdab */ - "dep %1, 15, 8, %1\n\t" /* deposit cdab -> cbab */ - "shd %0, %1, 8, %0" /* shift abcdcbab -> dcba */ - : "=r" (x), "=&r" (temp) - : "0" (x)); - return x; -} - - -#if BITS_PER_LONG > 32 -/* -** From "PA-RISC 2.0 Architecture", HP Professional Books. -** See Appendix I page 8 , "Endian Byte Swapping". 
-** -** Pretty cool algorithm: (* == zero'd bits) -** PERMH 01234567 -> 67452301 into %0 -** HSHL 67452301 -> 7*5*3*1* into %1 -** HSHR 67452301 -> *6*4*2*0 into %0 -** OR %0 | %1 -> 76543210 into %0 (all done!) -*/ -static __inline__ __attribute_const__ __u64 ___arch__swab64(__u64 x) { - __u64 temp; - __asm__("permh,3210 %0, %0\n\t" - "hshl %0, 8, %1\n\t" - "hshr,u %0, 8, %0\n\t" - "or %1, %0, %0" - : "=r" (x), "=&r" (temp) - : "0" (x)); - return x; -} -#define __arch__swab64(x) ___arch__swab64(x) -#define __BYTEORDER_HAS_U64__ -#elif !defined(__STRICT_ANSI__) -static __inline__ __attribute_const__ __u64 ___arch__swab64(__u64 x) -{ - __u32 t1 = ___arch__swab32((__u32) x); - __u32 t2 = ___arch__swab32((__u32) (x >> 32)); - return (((__u64) t1 << 32) | t2); -} -#define __arch__swab64(x) ___arch__swab64(x) -#define __BYTEORDER_HAS_U64__ -#endif - -#define __arch__swab16(x) ___arch__swab16(x) -#define __arch__swab24(x) ___arch__swab24(x) -#define __arch__swab32(x) ___arch__swab32(x) - -#endif /* __GNUC__ */ - +#include <asm/swab.h> #include <linux/byteorder/big_endian.h> #endif /* _PARISC_BYTEORDER_H */ diff --git a/arch/parisc/include/asm/checksum.h b/arch/parisc/include/asm/checksum.h index e9639ccc3fc..c84b2fcb18a 100644 --- a/arch/parisc/include/asm/checksum.h +++ b/arch/parisc/include/asm/checksum.h @@ -182,7 +182,7 @@ static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, #endif : "=r" (sum), "=r" (saddr), "=r" (daddr), "=r" (len) : "0" (sum), "1" (saddr), "2" (daddr), "3" (len), "r" (proto) - : "r19", "r20", "r21", "r22"); + : "r19", "r20", "r21", "r22", "memory"); return csum_fold(sum); } diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h index 55ddb184210..d3031d1f9d0 100644 --- a/arch/parisc/include/asm/io.h +++ b/arch/parisc/include/asm/io.h @@ -4,12 +4,6 @@ #include <linux/types.h> #include <asm/pgtable.h> -extern unsigned long parisc_vmerge_boundary; -extern unsigned long parisc_vmerge_max_size; - -#define BIO_VMERGE_BOUNDARY parisc_vmerge_boundary -#define BIO_VMERGE_MAX_SIZE parisc_vmerge_max_size - #define virt_to_phys(a) ((unsigned long)__pa(a)) #define phys_to_virt(a) __va(a) #define virt_to_bus virt_to_phys @@ -182,9 +176,9 @@ static inline void __raw_writeq(unsigned long long b, volatile void __iomem *add /* readb can never be const, so use __fswab instead of le*_to_cpu */ #define readb(addr) __raw_readb(addr) -#define readw(addr) __fswab16(__raw_readw(addr)) -#define readl(addr) __fswab32(__raw_readl(addr)) -#define readq(addr) __fswab64(__raw_readq(addr)) +#define readw(addr) le16_to_cpu(__raw_readw(addr)) +#define readl(addr) le32_to_cpu(__raw_readl(addr)) +#define readq(addr) le64_to_cpu(__raw_readq(addr)) #define writeb(b, addr) __raw_writeb(b, addr) #define writew(b, addr) __raw_writew(cpu_to_le16(b), addr) #define writel(b, addr) __raw_writel(cpu_to_le32(b), addr) diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h index 85856c74ad1..354b2aca990 100644 --- a/arch/parisc/include/asm/mmu_context.h +++ b/arch/parisc/include/asm/mmu_context.h @@ -34,16 +34,21 @@ destroy_context(struct mm_struct *mm) mm->context = 0; } -static inline void load_context(mm_context_t context) +static inline unsigned long __space_to_prot(mm_context_t context) { - mtsp(context, 3); #if SPACEID_SHIFT == 0 - mtctl(context << 1,8); + return context << 1; #else - mtctl(context >> (SPACEID_SHIFT - 1),8); + return context >> (SPACEID_SHIFT - 1); #endif } +static inline void load_context(mm_context_t context) +{ + 
mtsp(context, 3); + mtctl(__space_to_prot(context), 8); +} + static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 3c9d34844c8..9d64df8754b 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -17,6 +17,7 @@ #include <asm/ptrace.h> #include <asm/types.h> #include <asm/system.h> +#include <asm/percpu.h> #endif /* __ASSEMBLY__ */ #define KERNEL_STACK_SIZE (4*PAGE_SIZE) @@ -109,8 +110,7 @@ struct cpuinfo_parisc { }; extern struct system_cpuinfo_parisc boot_cpu_data; -extern struct cpuinfo_parisc cpu_data[NR_CPUS]; -#define current_cpu_data cpu_data[smp_processor_id()] +DECLARE_PER_CPU(struct cpuinfo_parisc, cpu_data); #define CPU_HVERSION ((boot_cpu_data.hversion >> 4) & 0x0FFF) diff --git a/arch/parisc/include/asm/swab.h b/arch/parisc/include/asm/swab.h new file mode 100644 index 00000000000..3ff16c5a335 --- /dev/null +++ b/arch/parisc/include/asm/swab.h @@ -0,0 +1,66 @@ +#ifndef _PARISC_SWAB_H +#define _PARISC_SWAB_H + +#include <asm/types.h> +#include <linux/compiler.h> + +#define __SWAB_64_THRU_32__ + +static inline __attribute_const__ __u16 __arch_swab16(__u16 x) +{ + __asm__("dep %0, 15, 8, %0\n\t" /* deposit 00ab -> 0bab */ + "shd %%r0, %0, 8, %0" /* shift 000000ab -> 00ba */ + : "=r" (x) + : "0" (x)); + return x; +} +#define __arch_swab16 __arch_swab16 + +static inline __attribute_const__ __u32 __arch_swab24(__u32 x) +{ + __asm__("shd %0, %0, 8, %0\n\t" /* shift xabcxabc -> cxab */ + "dep %0, 15, 8, %0\n\t" /* deposit cxab -> cbab */ + "shd %%r0, %0, 8, %0" /* shift 0000cbab -> 0cba */ + : "=r" (x) + : "0" (x)); + return x; +} + +static inline __attribute_const__ __u32 __arch_swab32(__u32 x) +{ + unsigned int temp; + __asm__("shd %0, %0, 16, %1\n\t" /* shift abcdabcd -> cdab */ + "dep %1, 15, 8, %1\n\t" /* deposit cdab -> cbab */ + "shd %0, %1, 8, %0" /* shift abcdcbab -> dcba */ + : "=r" (x), "=&r" (temp) + : "0" (x)); + return x; +} +#define __arch_swab32 __arch_swab32 + +#if BITS_PER_LONG > 32 +/* +** From "PA-RISC 2.0 Architecture", HP Professional Books. +** See Appendix I page 8 , "Endian Byte Swapping". +** +** Pretty cool algorithm: (* == zero'd bits) +** PERMH 01234567 -> 67452301 into %0 +** HSHL 67452301 -> 7*5*3*1* into %1 +** HSHR 67452301 -> *6*4*2*0 into %0 +** OR %0 | %1 -> 76543210 into %0 (all done!) 
+*/ +static inline __attribute_const__ __u64 __arch_swab64(__u64 x) +{ + __u64 temp; + __asm__("permh,3210 %0, %0\n\t" + "hshl %0, 8, %1\n\t" + "hshr,u %0, 8, %0\n\t" + "or %1, %0, %0" + : "=r" (x), "=&r" (temp) + : "0" (x)); + return x; +} +#define __arch_swab64 __arch_swab64 +#endif /* BITS_PER_LONG > 32 */ + +#endif /* _PARISC_SWAB_H */ diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 4878b9501f2..1c6dbb6f6e5 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -241,4 +241,6 @@ unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned lo #define __copy_to_user_inatomic __copy_to_user #define __copy_from_user_inatomic __copy_from_user +int fixup_exception(struct pt_regs *regs); + #endif /* __PARISC_UACCESS_H */ diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index 884b7ce16a3..994bcd98090 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -549,6 +549,38 @@ static int parisc_generic_match(struct device *dev, struct device_driver *drv) return match_device(to_parisc_driver(drv), to_parisc_device(dev)); } +static ssize_t make_modalias(struct device *dev, char *buf) +{ + const struct parisc_device *padev = to_parisc_device(dev); + const struct parisc_device_id *id = &padev->id; + + return sprintf(buf, "parisc:t%02Xhv%04Xrev%02Xsv%08X\n", + (u8)id->hw_type, (u16)id->hversion, (u8)id->hversion_rev, + (u32)id->sversion); +} + +static int parisc_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + const struct parisc_device *padev; + char modalias[40]; + + if (!dev) + return -ENODEV; + + padev = to_parisc_device(dev); + if (!padev) + return -ENODEV; + + if (add_uevent_var(env, "PARISC_NAME=%s", padev->name)) + return -ENOMEM; + + make_modalias(dev, modalias); + if (add_uevent_var(env, "MODALIAS=%s", modalias)) + return -ENOMEM; + + return 0; +} + #define pa_dev_attr(name, field, format_string) \ static ssize_t name##_show(struct device *dev, struct device_attribute *attr, char *buf) \ { \ @@ -566,12 +598,7 @@ pa_dev_attr_id(sversion, "0x%05x\n"); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct parisc_device *padev = to_parisc_device(dev); - struct parisc_device_id *id = &padev->id; - - return sprintf(buf, "parisc:t%02Xhv%04Xrev%02Xsv%08X\n", - (u8)id->hw_type, (u16)id->hversion, (u8)id->hversion_rev, - (u32)id->sversion); + return make_modalias(dev, buf); } static struct device_attribute parisc_device_attrs[] = { @@ -587,6 +614,7 @@ static struct device_attribute parisc_device_attrs[] = { struct bus_type parisc_bus_type = { .name = "parisc", .match = parisc_generic_match, + .uevent = parisc_uevent, .dev_attrs = parisc_device_attrs, .probe = parisc_driver_probe, .remove = parisc_driver_remove, diff --git a/arch/parisc/kernel/hpmc.S b/arch/parisc/kernel/hpmc.S index 2cbf13b3ef1..5595a2f3118 100644 --- a/arch/parisc/kernel/hpmc.S +++ b/arch/parisc/kernel/hpmc.S @@ -80,6 +80,7 @@ END(hpmc_pim_data) .import intr_save, code ENTRY(os_hpmc) +.os_hpmc: /* * registers modified: @@ -295,5 +296,10 @@ os_hpmc_6: b . 
nop ENDPROC(os_hpmc) -ENTRY(os_hpmc_end) /* this label used to compute os_hpmc checksum */ +.os_hpmc_end: nop +.data +.align 4 + .export os_hpmc_size +os_hpmc_size: + .word .os_hpmc_end-.os_hpmc diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 4cea935e2f9..ac2c822928c 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -298,7 +298,7 @@ unsigned long txn_affinity_addr(unsigned int irq, int cpu) irq_desc[irq].affinity = cpumask_of_cpu(cpu); #endif - return cpu_data[cpu].txn_addr; + return per_cpu(cpu_data, cpu).txn_addr; } @@ -309,8 +309,9 @@ unsigned long txn_alloc_addr(unsigned int virt_irq) next_cpu++; /* assign to "next" CPU we want this bugger on */ /* validate entry */ - while ((next_cpu < NR_CPUS) && (!cpu_data[next_cpu].txn_addr || - !cpu_online(next_cpu))) + while ((next_cpu < NR_CPUS) && + (!per_cpu(cpu_data, next_cpu).txn_addr || + !cpu_online(next_cpu))) next_cpu++; if (next_cpu >= NR_CPUS) @@ -359,7 +360,7 @@ void do_cpu_irq_mask(struct pt_regs *regs) printk(KERN_DEBUG "redirecting irq %d from CPU %d to %d\n", irq, smp_processor_id(), cpu); gsc_writel(irq + CPU_IRQ_BASE, - cpu_data[cpu].hpa); + per_cpu(cpu_data, cpu).hpa); goto set_out; } #endif @@ -421,5 +422,5 @@ void __init init_IRQ(void) void ack_bad_irq(unsigned int irq) { - printk("unexpected IRQ %d\n", irq); + printk(KERN_WARNING "unexpected IRQ %d\n", irq); } diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c index ccb68090781..1ff366cb968 100644 --- a/arch/parisc/kernel/pdc_cons.c +++ b/arch/parisc/kernel/pdc_cons.c @@ -52,7 +52,7 @@ #include <linux/tty.h> #include <asm/pdc.h> /* for iodc_call() proto and friends */ -static spinlock_t pdc_console_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(pdc_console_lock); static void pdc_console_write(struct console *co, const char *s, unsigned count) { diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index f696f57faa1..75099efb3bf 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -541,9 +541,9 @@ static int __init perf_init(void) spin_lock_init(&perf_lock); /* TODO: this only lets us access the first cpu.. what to do for SMP? */ - cpu_device = cpu_data[0].dev; + cpu_device = per_cpu(cpu_data, 0).dev; printk("Performance monitoring counters enabled for %s\n", - cpu_data[0].dev->name); + per_cpu(cpu_data, 0).dev->name); return 0; } diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 370086fb833..ecb609342fe 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -3,7 +3,7 @@ * Initial setup-routines for HP 9000 based hardware. * * Copyright (C) 1991, 1992, 1995 Linus Torvalds - * Modifications for PA-RISC (C) 1999 Helge Deller <deller@gmx.de> + * Modifications for PA-RISC (C) 1999-2008 Helge Deller <deller@gmx.de> * Modifications copyright 1999 SuSE GmbH (Philipp Rumpf) * Modifications copyright 2000 Martin K. Petersen <mkp@mkp.net> * Modifications copyright 2000 Philipp Rumpf <prumpf@tux.org> @@ -46,7 +46,7 @@ struct system_cpuinfo_parisc boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); -struct cpuinfo_parisc cpu_data[NR_CPUS] __read_mostly; +DEFINE_PER_CPU(struct cpuinfo_parisc, cpu_data); extern int update_cr16_clocksource(void); /* from time.c */ @@ -69,6 +69,23 @@ extern int update_cr16_clocksource(void); /* from time.c */ */ /** + * init_cpu_profiler - enable/setup per cpu profiling hooks. + * @cpunum: The processor instance. + * + * FIXME: doesn't do much yet... 
+ */ +static void __cpuinit +init_percpu_prof(unsigned long cpunum) +{ + struct cpuinfo_parisc *p; + + p = &per_cpu(cpu_data, cpunum); + p->prof_counter = 1; + p->prof_multiplier = 1; +} + + +/** * processor_probe - Determine if processor driver should claim this device. * @dev: The device which has been found. * @@ -147,7 +164,7 @@ static int __cpuinit processor_probe(struct parisc_device *dev) } #endif - p = &cpu_data[cpuid]; + p = &per_cpu(cpu_data, cpuid); boot_cpu_data.cpu_count++; /* initialize counters - CPU 0 gets it_value set in time_init() */ @@ -162,12 +179,9 @@ static int __cpuinit processor_probe(struct parisc_device *dev) #ifdef CONFIG_SMP /* ** FIXME: review if any other initialization is clobbered - ** for boot_cpu by the above memset(). + ** for boot_cpu by the above memset(). */ - - /* stolen from init_percpu_prof() */ - cpu_data[cpuid].prof_counter = 1; - cpu_data[cpuid].prof_multiplier = 1; + init_percpu_prof(cpuid); #endif /* @@ -261,19 +275,6 @@ void __init collect_boot_cpu_data(void) } -/** - * init_cpu_profiler - enable/setup per cpu profiling hooks. - * @cpunum: The processor instance. - * - * FIXME: doesn't do much yet... - */ -static inline void __init -init_percpu_prof(int cpunum) -{ - cpu_data[cpunum].prof_counter = 1; - cpu_data[cpunum].prof_multiplier = 1; -} - /** * init_per_cpu - Handle individual processor initializations. @@ -293,7 +294,7 @@ init_percpu_prof(int cpunum) * * o Enable CPU profiling hooks. */ -int __init init_per_cpu(int cpunum) +int __cpuinit init_per_cpu(int cpunum) { int ret; struct pdc_coproc_cfg coproc_cfg; @@ -307,8 +308,8 @@ int __init init_per_cpu(int cpunum) /* FWIW, FP rev/model is a more accurate way to determine ** CPU type. CPU rev/model has some ambiguous cases. */ - cpu_data[cpunum].fp_rev = coproc_cfg.revision; - cpu_data[cpunum].fp_model = coproc_cfg.model; + per_cpu(cpu_data, cpunum).fp_rev = coproc_cfg.revision; + per_cpu(cpu_data, cpunum).fp_model = coproc_cfg.model; printk(KERN_INFO "FP[%d] enabled: Rev %ld Model %ld\n", cpunum, coproc_cfg.revision, coproc_cfg.model); @@ -344,16 +345,17 @@ int __init init_per_cpu(int cpunum) int show_cpuinfo (struct seq_file *m, void *v) { - int n; + unsigned long cpu; - for(n=0; n<boot_cpu_data.cpu_count; n++) { + for_each_online_cpu(cpu) { + const struct cpuinfo_parisc *cpuinfo = &per_cpu(cpu_data, cpu); #ifdef CONFIG_SMP - if (0 == cpu_data[n].hpa) + if (0 == cpuinfo->hpa) continue; #endif - seq_printf(m, "processor\t: %d\n" + seq_printf(m, "processor\t: %lu\n" "cpu family\t: PA-RISC %s\n", - n, boot_cpu_data.family_name); + cpu, boot_cpu_data.family_name); seq_printf(m, "cpu\t\t: %s\n", boot_cpu_data.cpu_name ); @@ -365,8 +367,8 @@ show_cpuinfo (struct seq_file *m, void *v) seq_printf(m, "model\t\t: %s\n" "model name\t: %s\n", boot_cpu_data.pdc.sys_model_name, - cpu_data[n].dev ? - cpu_data[n].dev->name : "Unknown" ); + cpuinfo->dev ? 
+ cpuinfo->dev->name : "Unknown"); seq_printf(m, "hversion\t: 0x%08x\n" "sversion\t: 0x%08x\n", @@ -377,8 +379,8 @@ show_cpuinfo (struct seq_file *m, void *v) show_cache_info(m); seq_printf(m, "bogomips\t: %lu.%02lu\n", - cpu_data[n].loops_per_jiffy / (500000 / HZ), - (cpu_data[n].loops_per_jiffy / (5000 / HZ)) % 100); + cpuinfo->loops_per_jiffy / (500000 / HZ), + (cpuinfo->loops_per_jiffy / (5000 / HZ)) % 100); seq_printf(m, "software id\t: %ld\n\n", boot_cpu_data.pdc.model.sw_id); diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 7d27853ff8c..82131ca8e05 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -58,11 +58,6 @@ int parisc_bus_is_phys __read_mostly = 1; /* Assume no IOMMU is present */ EXPORT_SYMBOL(parisc_bus_is_phys); #endif -/* This sets the vmerge boundary and size, it's here because it has to - * be available on all platforms (zero means no-virtual merging) */ -unsigned long parisc_vmerge_boundary = 0; -unsigned long parisc_vmerge_max_size = 0; - void __init setup_cmdline(char **cmdline_p) { extern unsigned int boot_args[]; @@ -321,7 +316,7 @@ static int __init parisc_init(void) processor_init(); printk(KERN_INFO "CPU(s): %d x %s at %d.%06d MHz\n", - boot_cpu_data.cpu_count, + num_present_cpus(), boot_cpu_data.cpu_name, boot_cpu_data.cpu_hz / 1000000, boot_cpu_data.cpu_hz % 1000000 ); @@ -387,8 +382,8 @@ void start_parisc(void) if (ret >= 0 && coproc_cfg.ccr_functional) { mtctl(coproc_cfg.ccr_functional, 10); - cpu_data[cpunum].fp_rev = coproc_cfg.revision; - cpu_data[cpunum].fp_model = coproc_cfg.model; + per_cpu(cpu_data, cpunum).fp_rev = coproc_cfg.revision; + per_cpu(cpu_data, cpunum).fp_model = coproc_cfg.model; asm volatile ("fstd %fr0,8(%sp)"); } else { diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 80bc000523f..9995d7ed581 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -56,16 +56,17 @@ static int smp_debug_lvl = 0; if (lvl >= smp_debug_lvl) \ printk(printargs); #else -#define smp_debug(lvl, ...) +#define smp_debug(lvl, ...) 
do { } while(0) #endif /* DEBUG_SMP */ DEFINE_SPINLOCK(smp_lock); volatile struct task_struct *smp_init_current_idle_task; -static volatile int cpu_now_booting __read_mostly = 0; /* track which CPU is booting */ +/* track which CPU is booting */ +static volatile int cpu_now_booting __cpuinitdata; -static int parisc_max_cpus __read_mostly = 1; +static int parisc_max_cpus __cpuinitdata = 1; DEFINE_PER_CPU(spinlock_t, ipi_lock) = SPIN_LOCK_UNLOCKED; @@ -123,7 +124,7 @@ irqreturn_t ipi_interrupt(int irq, void *dev_id) { int this_cpu = smp_processor_id(); - struct cpuinfo_parisc *p = &cpu_data[this_cpu]; + struct cpuinfo_parisc *p = &per_cpu(cpu_data, this_cpu); unsigned long ops; unsigned long flags; @@ -202,13 +203,13 @@ ipi_interrupt(int irq, void *dev_id) static inline void ipi_send(int cpu, enum ipi_message_type op) { - struct cpuinfo_parisc *p = &cpu_data[cpu]; + struct cpuinfo_parisc *p = &per_cpu(cpu_data, cpu); spinlock_t *lock = &per_cpu(ipi_lock, cpu); unsigned long flags; spin_lock_irqsave(lock, flags); p->pending_ipi |= 1 << op; - gsc_writel(IPI_IRQ - CPU_IRQ_BASE, cpu_data[cpu].hpa); + gsc_writel(IPI_IRQ - CPU_IRQ_BASE, p->hpa); spin_unlock_irqrestore(lock, flags); } @@ -224,10 +225,7 @@ send_IPI_mask(cpumask_t mask, enum ipi_message_type op) static inline void send_IPI_single(int dest_cpu, enum ipi_message_type op) { - if (dest_cpu == NO_PROC_ID) { - BUG(); - return; - } + BUG_ON(dest_cpu == NO_PROC_ID); ipi_send(dest_cpu, op); } @@ -309,8 +307,7 @@ smp_cpu_init(int cpunum) /* Initialise the idle task for this CPU */ atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; - if(current->mm) - BUG(); + BUG_ON(current->mm); enter_lazy_tlb(&init_mm, current); init_IRQ(); /* make sure no IRQs are enabled or pending */ @@ -345,6 +342,7 @@ void __init smp_callin(void) */ int __cpuinit smp_boot_one_cpu(int cpuid) { + const struct cpuinfo_parisc *p = &per_cpu(cpu_data, cpuid); struct task_struct *idle; long timeout; @@ -376,7 +374,7 @@ int __cpuinit smp_boot_one_cpu(int cpuid) smp_init_current_idle_task = idle ; mb(); - printk("Releasing cpu %d now, hpa=%lx\n", cpuid, cpu_data[cpuid].hpa); + printk(KERN_INFO "Releasing cpu %d now, hpa=%lx\n", cpuid, p->hpa); /* ** This gets PDC to release the CPU from a very tight loop. @@ -387,7 +385,7 @@ int __cpuinit smp_boot_one_cpu(int cpuid) ** EIR{0}). MEM_RENDEZ is valid only when it is nonzero and the ** contents of memory are valid." 
*/ - gsc_writel(TIMER_IRQ - CPU_IRQ_BASE, cpu_data[cpuid].hpa); + gsc_writel(TIMER_IRQ - CPU_IRQ_BASE, p->hpa); mb(); /* @@ -419,12 +417,12 @@ alive: return 0; } -void __devinit smp_prepare_boot_cpu(void) +void __init smp_prepare_boot_cpu(void) { - int bootstrap_processor=cpu_data[0].cpuid; /* CPU ID of BSP */ + int bootstrap_processor = per_cpu(cpu_data, 0).cpuid; /* Setup BSP mappings */ - printk("SMP: bootstrap CPU ID is %d\n",bootstrap_processor); + printk(KERN_INFO "SMP: bootstrap CPU ID is %d\n", bootstrap_processor); cpu_set(bootstrap_processor, cpu_online_map); cpu_set(bootstrap_processor, cpu_present_map); diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 4d09203bc69..9d46c43a415 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -60,7 +60,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) unsigned long cycles_elapsed, ticks_elapsed; unsigned long cycles_remainder; unsigned int cpu = smp_processor_id(); - struct cpuinfo_parisc *cpuinfo = &cpu_data[cpu]; + struct cpuinfo_parisc *cpuinfo = &per_cpu(cpu_data, cpu); /* gcc can optimize for "read-only" case with a local clocktick */ unsigned long cpt = clocktick; @@ -213,7 +213,7 @@ void __init start_cpu_itimer(void) mtctl(next_tick, 16); /* kick off Interval Timer (CR16) */ - cpu_data[cpu].it_value = next_tick; + per_cpu(cpu_data, cpu).it_value = next_tick; } struct platform_device rtc_parisc_dev = { diff --git a/arch/parisc/kernel/topology.c b/arch/parisc/kernel/topology.c index d71cb018a21..f5159381fdd 100644 --- a/arch/parisc/kernel/topology.c +++ b/arch/parisc/kernel/topology.c @@ -22,14 +22,14 @@ #include <linux/cpu.h> #include <linux/cache.h> -static struct cpu cpu_devices[NR_CPUS] __read_mostly; +static DEFINE_PER_CPU(struct cpu, cpu_devices); static int __init topology_init(void) { int num; for_each_present_cpu(num) { - register_cpu(&cpu_devices[num], num); + register_cpu(&per_cpu(cpu_devices, num), num); } return 0; } diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 4c771cd580e..ba658d2086f 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -745,6 +745,10 @@ void handle_interruption(int code, struct pt_regs *regs) /* Fall Through */ case 27: /* Data memory protection ID trap */ + if (code == 27 && !user_mode(regs) && + fixup_exception(regs)) + return; + die_if_kernel("Protection id trap", regs, code); si.si_code = SEGV_MAPERR; si.si_signo = SIGSEGV; @@ -821,8 +825,8 @@ void handle_interruption(int code, struct pt_regs *regs) int __init check_ivt(void *iva) { + extern u32 os_hpmc_size; extern const u32 os_hpmc[]; - extern const u32 os_hpmc_end[]; int i; u32 check = 0; @@ -839,8 +843,7 @@ int __init check_ivt(void *iva) *ivap++ = 0; /* Compute Checksum for HPMC handler */ - - length = os_hpmc_end - os_hpmc; + length = os_hpmc_size; ivap[7] = length; hpmcp = (u32 *)os_hpmc; diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 6773c582e45..69dad5a850a 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -372,7 +372,7 @@ void unwind_frame_init_from_blocked_task(struct unwind_frame_info *info, struct struct pt_regs *r = &t->thread.regs; struct pt_regs *r2; - r2 = kmalloc(sizeof(struct pt_regs), GFP_KERNEL); + r2 = kmalloc(sizeof(struct pt_regs), GFP_ATOMIC); if (!r2) return; *r2 = *r; diff --git a/arch/parisc/lib/iomap.c b/arch/parisc/lib/iomap.c index 9abed07db7f..5069e8b2ca7 100644 --- a/arch/parisc/lib/iomap.c +++ b/arch/parisc/lib/iomap.c @@ -261,7 +261,7 @@ static const struct 
iomap_ops iomem_ops = { iomem_write32r, }; -const struct iomap_ops *iomap_ops[8] = { +static const struct iomap_ops *iomap_ops[8] = { [0] = &ioport_ops, [7] = &iomem_ops }; diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index 2d68431fc22..bbda909c866 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -275,7 +275,7 @@ handle_store_error: /* Returns 0 for success, otherwise, returns number of bytes not transferred. */ -unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) +static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) { register unsigned long src, dst, t1, t2, t3; register unsigned char *pcs, *pcd; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index b2e3e9a8cec..92c7fa4ecc3 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -139,13 +139,41 @@ parisc_acctyp(unsigned long code, unsigned int inst) } #endif +int fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *fix; + + fix = search_exception_tables(regs->iaoq[0]); + if (fix) { + struct exception_data *d; + d = &__get_cpu_var(exception_data); + d->fault_ip = regs->iaoq[0]; + d->fault_space = regs->isr; + d->fault_addr = regs->ior; + + regs->iaoq[0] = ((fix->fixup) & ~3); + /* + * NOTE: In some cases the faulting instruction + * may be in the delay slot of a branch. We + * don't want to take the branch, so we don't + * increment iaoq[1], instead we set it to be + * iaoq[0]+4, and clear the B bit in the PSW + */ + regs->iaoq[1] = regs->iaoq[0] + 4; + regs->gr[0] &= ~PSW_B; /* IPSW in gr[0] */ + + return 1; + } + + return 0; +} + void do_page_fault(struct pt_regs *regs, unsigned long code, unsigned long address) { struct vm_area_struct *vma, *prev_vma; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - const struct exception_table_entry *fix; unsigned long acc_type; int fault; @@ -229,32 +257,8 @@ bad_area: no_context: - if (!user_mode(regs)) { - fix = search_exception_tables(regs->iaoq[0]); - - if (fix) { - struct exception_data *d; - - d = &__get_cpu_var(exception_data); - d->fault_ip = regs->iaoq[0]; - d->fault_space = regs->isr; - d->fault_addr = regs->ior; - - regs->iaoq[0] = ((fix->fixup) & ~3); - - /* - * NOTE: In some cases the faulting instruction - * may be in the delay slot of a branch. We - * don't want to take the branch, so we don't - * increment iaoq[1], instead we set it to be - * iaoq[0]+4, and clear the B bit in the PSW - */ - - regs->iaoq[1] = regs->iaoq[0] + 4; - regs->gr[0] &= ~PSW_B; /* IPSW in gr[0] */ - - return; - } + if (!user_mode(regs) && fixup_exception(regs)) { + return; } parisc_terminate("Bad Address (null pointer deref?)", regs, code, address); diff --git a/arch/powerpc/include/asm/cell-pmu.h b/arch/powerpc/include/asm/cell-pmu.h index 8066eede3a0..b4b7338ad79 100644 --- a/arch/powerpc/include/asm/cell-pmu.h +++ b/arch/powerpc/include/asm/cell-pmu.h @@ -37,9 +37,11 @@ #define CBE_PM_STOP_AT_MAX 0x40000000 #define CBE_PM_TRACE_MODE_GET(pm_control) (((pm_control) >> 28) & 0x3) #define CBE_PM_TRACE_MODE_SET(mode) (((mode) & 0x3) << 28) +#define CBE_PM_TRACE_BUF_OVFLW(bit) (((bit) & 0x1) << 17) #define CBE_PM_COUNT_MODE_SET(count) (((count) & 0x3) << 18) #define CBE_PM_FREEZE_ALL_CTRS 0x00100000 #define CBE_PM_ENABLE_EXT_TRACE 0x00008000 +#define CBE_PM_SPU_ADDR_TRACE_SET(msk) (((msk) & 0x3) << 9) /* Macros for the trace_address register. 
*/ #define CBE_PM_TRACE_BUF_FULL 0x00000800 diff --git a/arch/powerpc/include/asm/oprofile_impl.h b/arch/powerpc/include/asm/oprofile_impl.h index 95035c602ba..639dc96077a 100644 --- a/arch/powerpc/include/asm/oprofile_impl.h +++ b/arch/powerpc/include/asm/oprofile_impl.h @@ -32,6 +32,12 @@ struct op_system_config { unsigned long mmcr0; unsigned long mmcr1; unsigned long mmcra; +#ifdef CONFIG_OPROFILE_CELL + /* Register for oprofile user tool to check cell kernel profiling + * suport. + */ + unsigned long cell_support; +#endif #endif unsigned long enable_kernel; unsigned long enable_user; diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h index dfdbffa0681..964b93974d8 100644 --- a/arch/powerpc/oprofile/cell/pr_util.h +++ b/arch/powerpc/oprofile/cell/pr_util.h @@ -30,6 +30,10 @@ extern struct delayed_work spu_work; extern int spu_prof_running; +#define TRACE_ARRAY_SIZE 1024 + +extern spinlock_t oprof_spu_smpl_arry_lck; + struct spu_overlay_info { /* map of sections within an SPU overlay */ unsigned int vma; /* SPU virtual memory address from elf */ unsigned int size; /* size of section from elf */ @@ -89,10 +93,11 @@ void vma_map_free(struct vma_to_fileoffset_map *map); * Entry point for SPU profiling. * cycles_reset is the SPU_CYCLES count value specified by the user. */ -int start_spu_profiling(unsigned int cycles_reset); - -void stop_spu_profiling(void); +int start_spu_profiling_cycles(unsigned int cycles_reset); +void start_spu_profiling_events(void); +void stop_spu_profiling_cycles(void); +void stop_spu_profiling_events(void); /* add the necessary profiling hooks */ int spu_sync_start(void); diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c index 83faa958b9d..9305ddaac51 100644 --- a/arch/powerpc/oprofile/cell/spu_profiler.c +++ b/arch/powerpc/oprofile/cell/spu_profiler.c @@ -18,11 +18,21 @@ #include <asm/cell-pmu.h> #include "pr_util.h" -#define TRACE_ARRAY_SIZE 1024 #define SCALE_SHIFT 14 static u32 *samples; +/* spu_prof_running is a flag used to indicate if spu profiling is enabled + * or not. It is set by the routines start_spu_profiling_cycles() and + * start_spu_profiling_events(). The flag is cleared by the routines + * stop_spu_profiling_cycles() and stop_spu_profiling_events(). These + * routines are called via global_start() and global_stop() which are called in + * op_powerpc_start() and op_powerpc_stop(). These routines are called once + * per system as a result of the user starting/stopping oprofile. Hence, only + * one CPU per user at a time will be changing the value of spu_prof_running. + * In general, OProfile does not protect against multiple users trying to run + * OProfile at a time. + */ int spu_prof_running; static unsigned int profiling_interval; @@ -31,8 +41,8 @@ static unsigned int profiling_interval; #define SPU_PC_MASK 0xFFFF -static DEFINE_SPINLOCK(sample_array_lock); -unsigned long sample_array_lock_flags; +DEFINE_SPINLOCK(oprof_spu_smpl_arry_lck); +unsigned long oprof_spu_smpl_arry_lck_flags; void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset) { @@ -145,13 +155,13 @@ static enum hrtimer_restart profile_spus(struct hrtimer *timer) * sample array must be loaded and then processed for a given * cpu. The sample array is not per cpu. 
*/ - spin_lock_irqsave(&sample_array_lock, - sample_array_lock_flags); + spin_lock_irqsave(&oprof_spu_smpl_arry_lck, + oprof_spu_smpl_arry_lck_flags); num_samples = cell_spu_pc_collection(cpu); if (num_samples == 0) { - spin_unlock_irqrestore(&sample_array_lock, - sample_array_lock_flags); + spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck, + oprof_spu_smpl_arry_lck_flags); continue; } @@ -162,8 +172,8 @@ static enum hrtimer_restart profile_spus(struct hrtimer *timer) num_samples); } - spin_unlock_irqrestore(&sample_array_lock, - sample_array_lock_flags); + spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck, + oprof_spu_smpl_arry_lck_flags); } smp_wmb(); /* insure spu event buffer updates are written */ @@ -182,13 +192,13 @@ static enum hrtimer_restart profile_spus(struct hrtimer *timer) static struct hrtimer timer; /* - * Entry point for SPU profiling. + * Entry point for SPU cycle profiling. * NOTE: SPU profiling is done system-wide, not per-CPU. * * cycles_reset is the count value specified by the user when * setting up OProfile to count SPU_CYCLES. */ -int start_spu_profiling(unsigned int cycles_reset) +int start_spu_profiling_cycles(unsigned int cycles_reset) { ktime_t kt; @@ -212,10 +222,30 @@ int start_spu_profiling(unsigned int cycles_reset) return 0; } -void stop_spu_profiling(void) +/* + * Entry point for SPU event profiling. + * NOTE: SPU profiling is done system-wide, not per-CPU. + * + * cycles_reset is the count value specified by the user when + * setting up OProfile to count SPU_CYCLES. + */ +void start_spu_profiling_events(void) +{ + spu_prof_running = 1; + schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE); + + return; +} + +void stop_spu_profiling_cycles(void) { spu_prof_running = 0; hrtimer_cancel(&timer); kfree(samples); - pr_debug("SPU_PROF: stop_spu_profiling issued\n"); + pr_debug("SPU_PROF: stop_spu_profiling_cycles issued\n"); +} + +void stop_spu_profiling_events(void) +{ + spu_prof_running = 0; } diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c index 17807acb05d..21f16edf6c8 100644 --- a/arch/powerpc/oprofile/common.c +++ b/arch/powerpc/oprofile/common.c @@ -132,6 +132,28 @@ static int op_powerpc_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_ulong(sb, root, "mmcr0", &sys.mmcr0); oprofilefs_create_ulong(sb, root, "mmcr1", &sys.mmcr1); oprofilefs_create_ulong(sb, root, "mmcra", &sys.mmcra); +#ifdef CONFIG_OPROFILE_CELL + /* create a file the user tool can check to see what level of profiling + * support exits with this kernel. Initialize bit mask to indicate + * what support the kernel has: + * bit 0 - Supports SPU event profiling in addition to PPU + * event and cycles; and SPU cycle profiling + * bits 1-31 - Currently unused. + * + * If the file does not exist, then the kernel only supports SPU + * cycle profiling, PPU event and cycle profiling. + */ + oprofilefs_create_ulong(sb, root, "cell_support", &sys.cell_support); + sys.cell_support = 0x1; /* Note, the user OProfile tool must check + * that this bit is set before attempting to + * user SPU event profiling. Older kernels + * will not have this file, hence the user + * tool is not allowed to do SPU event + * profiling on older kernels. Older kernels + * will accept SPU events but collected data + * is garbage. 
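The cell_support bitmask created above is meant to be read by the user-space OProfile tool before it requests SPU event profiling. A minimal user-space sketch of that check follows; the /dev/oprofile mount point and the decimal formatting of the oprofilefs ulong file are assumptions, only the meaning of bit 0 comes from the patch.

    #include <stdio.h>

    /* Hypothetical check: returns nonzero if the running kernel advertises
     * SPU event profiling support via the cell_support oprofilefs file.
     * Assumes oprofilefs is mounted at /dev/oprofile and that the file
     * prints a decimal value; a missing file means an older kernel that
     * only supports SPU cycle profiling. */
    static int spu_event_profiling_supported(void)
    {
            unsigned long mask = 0;
            FILE *f = fopen("/dev/oprofile/cell_support", "r");

            if (!f)
                    return 0;
            if (fscanf(f, "%lu", &mask) != 1)
                    mask = 0;
            fclose(f);
            return (mask & 0x1) != 0;
    }

    int main(void)
    {
            printf("SPU event profiling %ssupported\n",
                   spu_event_profiling_supported() ? "" : "not ");
            return 0;
    }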
+ */ +#endif #endif for (i = 0; i < model->num_counters; ++i) { diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c index 25a4ec2514a..ae06c6236d9 100644 --- a/arch/powerpc/oprofile/op_model_cell.c +++ b/arch/powerpc/oprofile/op_model_cell.c @@ -40,14 +40,15 @@ #include "../platforms/cell/interrupt.h" #include "cell/pr_util.h" -static void cell_global_stop_spu(void); +#define PPU_PROFILING 0 +#define SPU_PROFILING_CYCLES 1 +#define SPU_PROFILING_EVENTS 2 -/* - * spu_cycle_reset is the number of cycles between samples. - * This variable is used for SPU profiling and should ONLY be set - * at the beginning of cell_reg_setup; otherwise, it's read-only. - */ -static unsigned int spu_cycle_reset; +#define SPU_EVENT_NUM_START 4100 +#define SPU_EVENT_NUM_STOP 4399 +#define SPU_PROFILE_EVENT_ADDR 4363 /* spu, address trace, decimal */ +#define SPU_PROFILE_EVENT_ADDR_MASK_A 0x146 /* sub unit set to zero */ +#define SPU_PROFILE_EVENT_ADDR_MASK_B 0x186 /* sub unit set to zero */ #define NUM_SPUS_PER_NODE 8 #define SPU_CYCLES_EVENT_NUM 2 /* event number for SPU_CYCLES */ @@ -66,6 +67,21 @@ static unsigned int spu_cycle_reset; #define MAX_SPU_COUNT 0xFFFFFF /* maximum 24 bit LFSR value */ +/* Minumum HW interval timer setting to send value to trace buffer is 10 cycle. + * To configure counter to send value every N cycles set counter to + * 2^32 - 1 - N. + */ +#define NUM_INTERVAL_CYC 0xFFFFFFFF - 10 + +/* + * spu_cycle_reset is the number of cycles between samples. + * This variable is used for SPU profiling and should ONLY be set + * at the beginning of cell_reg_setup; otherwise, it's read-only. + */ +static unsigned int spu_cycle_reset; +static unsigned int profiling_mode; +static int spu_evnt_phys_spu_indx; + struct pmc_cntrl_data { unsigned long vcntr; unsigned long evnts; @@ -105,6 +121,8 @@ struct pm_cntrl { u16 trace_mode; u16 freeze; u16 count_mode; + u16 spu_addr_trace; + u8 trace_buf_ovflw; }; static struct { @@ -122,7 +140,7 @@ static struct { #define GET_INPUT_CONTROL(x) ((x & 0x00000004) >> 2) static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values); - +static unsigned long spu_pm_cnt[MAX_NUMNODES * NUM_SPUS_PER_NODE]; static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS]; /* @@ -152,6 +170,7 @@ static u32 hdw_thread; static u32 virt_cntr_inter_mask; static struct timer_list timer_virt_cntr; +static struct timer_list timer_spu_event_swap; /* * pm_signal needs to be global since it is initialized in @@ -165,7 +184,7 @@ static int spu_rtas_token; /* token for SPU cycle profiling */ static u32 reset_value[NR_PHYS_CTRS]; static int num_counters; static int oprofile_running; -static DEFINE_SPINLOCK(virt_cntr_lock); +static DEFINE_SPINLOCK(cntr_lock); static u32 ctr_enabled; @@ -336,13 +355,13 @@ static void set_pm_event(u32 ctr, int event, u32 unit_mask) for (i = 0; i < NUM_DEBUG_BUS_WORDS; i++) { if (bus_word & (1 << i)) { pm_regs.debug_bus_control |= - (bus_type << (30 - (2 * i))); + (bus_type << (30 - (2 * i))); for (j = 0; j < NUM_INPUT_BUS_WORDS; j++) { if (input_bus[j] == 0xff) { input_bus[j] = i; pm_regs.group_control |= - (i << (30 - (2 * j))); + (i << (30 - (2 * j))); break; } @@ -367,12 +386,16 @@ static void write_pm_cntrl(int cpu) if (pm_regs.pm_cntrl.stop_at_max == 1) val |= CBE_PM_STOP_AT_MAX; - if (pm_regs.pm_cntrl.trace_mode == 1) + if (pm_regs.pm_cntrl.trace_mode != 0) val |= CBE_PM_TRACE_MODE_SET(pm_regs.pm_cntrl.trace_mode); + if (pm_regs.pm_cntrl.trace_buf_ovflw == 1) + val |= 
CBE_PM_TRACE_BUF_OVFLW(pm_regs.pm_cntrl.trace_buf_ovflw); if (pm_regs.pm_cntrl.freeze == 1) val |= CBE_PM_FREEZE_ALL_CTRS; + val |= CBE_PM_SPU_ADDR_TRACE_SET(pm_regs.pm_cntrl.spu_addr_trace); + /* * Routine set_count_mode must be called previously to set * the count mode based on the user selection of user and kernel. @@ -441,7 +464,7 @@ static void cell_virtual_cntr(unsigned long data) * not both playing with the counters on the same node. */ - spin_lock_irqsave(&virt_cntr_lock, flags); + spin_lock_irqsave(&cntr_lock, flags); prev_hdw_thread = hdw_thread; @@ -480,7 +503,7 @@ static void cell_virtual_cntr(unsigned long data) cbe_disable_pm_interrupts(cpu); for (i = 0; i < num_counters; i++) { per_cpu(pmc_values, cpu + prev_hdw_thread)[i] - = cbe_read_ctr(cpu, i); + = cbe_read_ctr(cpu, i); if (per_cpu(pmc_values, cpu + next_hdw_thread)[i] == 0xFFFFFFFF) @@ -527,7 +550,7 @@ static void cell_virtual_cntr(unsigned long data) cbe_enable_pm(cpu); } - spin_unlock_irqrestore(&virt_cntr_lock, flags); + spin_unlock_irqrestore(&cntr_lock, flags); mod_timer(&timer_virt_cntr, jiffies + HZ / 10); } @@ -541,38 +564,146 @@ static void start_virt_cntrs(void) add_timer(&timer_virt_cntr); } -/* This function is called once for all cpus combined */ -static int cell_reg_setup(struct op_counter_config *ctr, +static int cell_reg_setup_spu_cycles(struct op_counter_config *ctr, struct op_system_config *sys, int num_ctrs) { - int i, j, cpu; - spu_cycle_reset = 0; + spu_cycle_reset = ctr[0].count; - if (ctr[0].event == SPU_CYCLES_EVENT_NUM) { - spu_cycle_reset = ctr[0].count; + /* + * Each node will need to make the rtas call to start + * and stop SPU profiling. Get the token once and store it. + */ + spu_rtas_token = rtas_token("ibm,cbe-spu-perftools"); + + if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) { + printk(KERN_ERR + "%s: rtas token ibm,cbe-spu-perftools unknown\n", + __func__); + return -EIO; + } + return 0; +} + +/* Unfortunately, the hardware will only support event profiling + * on one SPU per node at a time. Therefore, we must time slice + * the profiling across all SPUs in the node. Note, we do this + * in parallel for each node. The following routine is called + * periodically based on kernel timer to switch which SPU is + * being monitored in a round robbin fashion. + */ +static void spu_evnt_swap(unsigned long data) +{ + int node; + int cur_phys_spu, nxt_phys_spu, cur_spu_evnt_phys_spu_indx; + unsigned long flags; + int cpu; + int ret; + u32 interrupt_mask; + + + /* enable interrupts on cntr 0 */ + interrupt_mask = CBE_PM_CTR_OVERFLOW_INTR(0); + + hdw_thread = 0; + + /* Make sure spu event interrupt handler and spu event swap + * don't access the counters simultaneously. + */ + spin_lock_irqsave(&cntr_lock, flags); + + cur_spu_evnt_phys_spu_indx = spu_evnt_phys_spu_indx; + + if (++(spu_evnt_phys_spu_indx) == NUM_SPUS_PER_NODE) + spu_evnt_phys_spu_indx = 0; + + pm_signal[0].sub_unit = spu_evnt_phys_spu_indx; + pm_signal[1].sub_unit = spu_evnt_phys_spu_indx; + pm_signal[2].sub_unit = spu_evnt_phys_spu_indx; + + /* switch the SPU being profiled on each node */ + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + node = cbe_cpu_to_node(cpu); + cur_phys_spu = (node * NUM_SPUS_PER_NODE) + + cur_spu_evnt_phys_spu_indx; + nxt_phys_spu = (node * NUM_SPUS_PER_NODE) + + spu_evnt_phys_spu_indx; /* - * Each node will need to make the rtas call to start - * and stop SPU profiling. Get the token once and store it. 
+ * stop counters, save counter values, restore counts + * for previous physical SPU */ - spu_rtas_token = rtas_token("ibm,cbe-spu-perftools"); + cbe_disable_pm(cpu); + cbe_disable_pm_interrupts(cpu); - if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) { - printk(KERN_ERR - "%s: rtas token ibm,cbe-spu-perftools unknown\n", - __func__); - return -EIO; - } + spu_pm_cnt[cur_phys_spu] + = cbe_read_ctr(cpu, 0); + + /* restore previous count for the next spu to sample */ + /* NOTE, hardware issue, counter will not start if the + * counter value is at max (0xFFFFFFFF). + */ + if (spu_pm_cnt[nxt_phys_spu] >= 0xFFFFFFFF) + cbe_write_ctr(cpu, 0, 0xFFFFFFF0); + else + cbe_write_ctr(cpu, 0, spu_pm_cnt[nxt_phys_spu]); + + pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + + /* setup the debug bus measure the one event and + * the two events to route the next SPU's PC on + * the debug bus + */ + ret = pm_rtas_activate_signals(cbe_cpu_to_node(cpu), 3); + if (ret) + printk(KERN_ERR "%s: pm_rtas_activate_signals failed, " + "SPU event swap\n", __func__); + + /* clear the trace buffer, don't want to take PC for + * previous SPU*/ + cbe_write_pm(cpu, trace_address, 0); + + enable_ctr(cpu, 0, pm_regs.pm07_cntrl); + + /* Enable interrupts on the CPU thread that is starting */ + cbe_enable_pm_interrupts(cpu, hdw_thread, + interrupt_mask); + cbe_enable_pm(cpu); } - pm_rtas_token = rtas_token("ibm,cbe-perftools"); + spin_unlock_irqrestore(&cntr_lock, flags); + /* swap approximately every 0.1 seconds */ + mod_timer(&timer_spu_event_swap, jiffies + HZ / 25); +} + +static void start_spu_event_swap(void) +{ + init_timer(&timer_spu_event_swap); + timer_spu_event_swap.function = spu_evnt_swap; + timer_spu_event_swap.data = 0UL; + timer_spu_event_swap.expires = jiffies + HZ / 25; + add_timer(&timer_spu_event_swap); +} + +static int cell_reg_setup_spu_events(struct op_counter_config *ctr, + struct op_system_config *sys, int num_ctrs) +{ + int i; + + /* routine is called once for all nodes */ + + spu_evnt_phys_spu_indx = 0; /* - * For all events excetp PPU CYCLEs, each node will need to make + * For all events except PPU CYCLEs, each node will need to make * the rtas cbe-perftools call to setup and reset the debug bus. * Make the token lookup call once and store it in the global * variable pm_rtas_token. */ + pm_rtas_token = rtas_token("ibm,cbe-perftools"); + if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) { printk(KERN_ERR "%s: rtas token ibm,cbe-perftools unknown\n", @@ -580,6 +711,58 @@ static int cell_reg_setup(struct op_counter_config *ctr, return -EIO; } + /* setup the pm_control register settings, + * settings will be written per node by the + * cell_cpu_setup() function. + */ + pm_regs.pm_cntrl.trace_buf_ovflw = 1; + + /* Use the occurrence trace mode to have SPU PC saved + * to the trace buffer. Occurrence data in trace buffer + * is not used. Bit 2 must be set to store SPU addresses. + */ + pm_regs.pm_cntrl.trace_mode = 2; + + pm_regs.pm_cntrl.spu_addr_trace = 0x1; /* using debug bus + event 2 & 3 */ + + /* setup the debug bus event array with the SPU PC routing events. + * Note, pm_signal[0] will be filled in by set_pm_event() call below. 
+ */ + pm_signal[1].signal_group = SPU_PROFILE_EVENT_ADDR / 100; + pm_signal[1].bus_word = GET_BUS_WORD(SPU_PROFILE_EVENT_ADDR_MASK_A); + pm_signal[1].bit = SPU_PROFILE_EVENT_ADDR % 100; + pm_signal[1].sub_unit = spu_evnt_phys_spu_indx; + + pm_signal[2].signal_group = SPU_PROFILE_EVENT_ADDR / 100; + pm_signal[2].bus_word = GET_BUS_WORD(SPU_PROFILE_EVENT_ADDR_MASK_B); + pm_signal[2].bit = SPU_PROFILE_EVENT_ADDR % 100; + pm_signal[2].sub_unit = spu_evnt_phys_spu_indx; + + /* Set the user selected spu event to profile on, + * note, only one SPU profiling event is supported + */ + num_counters = 1; /* Only support one SPU event at a time */ + set_pm_event(0, ctr[0].event, ctr[0].unit_mask); + + reset_value[0] = 0xFFFFFFFF - ctr[0].count; + + /* global, used by cell_cpu_setup */ + ctr_enabled |= 1; + + /* Initialize the count for each SPU to the reset value */ + for (i=0; i < MAX_NUMNODES * NUM_SPUS_PER_NODE; i++) + spu_pm_cnt[i] = reset_value[0]; + + return 0; +} + +static int cell_reg_setup_ppu(struct op_counter_config *ctr, + struct op_system_config *sys, int num_ctrs) +{ + /* routine is called once for all nodes */ + int i, j, cpu; + num_counters = num_ctrs; if (unlikely(num_ctrs > NR_PHYS_CTRS)) { @@ -589,14 +772,6 @@ static int cell_reg_setup(struct op_counter_config *ctr, __func__); return -EIO; } - pm_regs.group_control = 0; - pm_regs.debug_bus_control = 0; - - /* setup the pm_control register */ - memset(&pm_regs.pm_cntrl, 0, sizeof(struct pm_cntrl)); - pm_regs.pm_cntrl.stop_at_max = 1; - pm_regs.pm_cntrl.trace_mode = 0; - pm_regs.pm_cntrl.freeze = 1; set_count_mode(sys->enable_kernel, sys->enable_user); @@ -665,6 +840,63 @@ static int cell_reg_setup(struct op_counter_config *ctr, } +/* This function is called once for all cpus combined */ +static int cell_reg_setup(struct op_counter_config *ctr, + struct op_system_config *sys, int num_ctrs) +{ + int ret=0; + spu_cycle_reset = 0; + + /* initialize the spu_arr_trace value, will be reset if + * doing spu event profiling. + */ + pm_regs.group_control = 0; + pm_regs.debug_bus_control = 0; + pm_regs.pm_cntrl.stop_at_max = 1; + pm_regs.pm_cntrl.trace_mode = 0; + pm_regs.pm_cntrl.freeze = 1; + pm_regs.pm_cntrl.trace_buf_ovflw = 0; + pm_regs.pm_cntrl.spu_addr_trace = 0; + + /* + * For all events except PPU CYCLEs, each node will need to make + * the rtas cbe-perftools call to setup and reset the debug bus. + * Make the token lookup call once and store it in the global + * variable pm_rtas_token. + */ + pm_rtas_token = rtas_token("ibm,cbe-perftools"); + + if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) { + printk(KERN_ERR + "%s: rtas token ibm,cbe-perftools unknown\n", + __func__); + return -EIO; + } + + if (ctr[0].event == SPU_CYCLES_EVENT_NUM) { + profiling_mode = SPU_PROFILING_CYCLES; + ret = cell_reg_setup_spu_cycles(ctr, sys, num_ctrs); + } else if ((ctr[0].event >= SPU_EVENT_NUM_START) && + (ctr[0].event <= SPU_EVENT_NUM_STOP)) { + profiling_mode = SPU_PROFILING_EVENTS; + spu_cycle_reset = ctr[0].count; + + /* for SPU event profiling, need to setup the + * pm_signal array with the events to route the + * SPU PC before making the FW call. Note, only + * one SPU event for profiling can be specified + * at a time. 
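Two of the calculations above are easy to miss: the sample period N chosen by the user is programmed as 0xFFFFFFFF - N so that the 32-bit counter overflows (and interrupts) after N SPU events, and the saved per-SPU counts live in spu_pm_cnt[] indexed by node * NUM_SPUS_PER_NODE + spu. A small stand-alone sketch, with assumed example values:

    #include <stdio.h>

    #define NUM_SPUS_PER_NODE 8

    /* Sketch only; N and the node/SPU indices are assumed example values. */
    int main(void)
    {
            unsigned int N = 100000;               /* events between samples */
            unsigned int reset_value = 0xFFFFFFFF - N;
            int node = 1, spu = 3;

            printf("counter reset value = 0x%08x\n", reset_value);
            printf("spu_pm_cnt[] slot for node %d, SPU %d = %d\n",
                   node, spu, node * NUM_SPUS_PER_NODE + spu);
            return 0;
    }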
+ */ + cell_reg_setup_spu_events(ctr, sys, num_ctrs); + } else { + profiling_mode = PPU_PROFILING; + ret = cell_reg_setup_ppu(ctr, sys, num_ctrs); + } + + return ret; +} + + /* This function is called once for each cpu */ static int cell_cpu_setup(struct op_counter_config *cntr) @@ -672,8 +904,13 @@ static int cell_cpu_setup(struct op_counter_config *cntr) u32 cpu = smp_processor_id(); u32 num_enabled = 0; int i; + int ret; - if (spu_cycle_reset) + /* Cycle based SPU profiling does not use the performance + * counters. The trace array is configured to collect + * the data. + */ + if (profiling_mode == SPU_PROFILING_CYCLES) return 0; /* There is one performance monitor per processor chip (i.e. node), @@ -686,7 +923,6 @@ static int cell_cpu_setup(struct op_counter_config *cntr) cbe_disable_pm(cpu); cbe_disable_pm_interrupts(cpu); - cbe_write_pm(cpu, pm_interval, 0); cbe_write_pm(cpu, pm_start_stop, 0); cbe_write_pm(cpu, group_control, pm_regs.group_control); cbe_write_pm(cpu, debug_bus_control, pm_regs.debug_bus_control); @@ -703,7 +939,20 @@ static int cell_cpu_setup(struct op_counter_config *cntr) * The pm_rtas_activate_signals will return -EIO if the FW * call failed. */ - return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled); + if (profiling_mode == SPU_PROFILING_EVENTS) { + /* For SPU event profiling also need to setup the + * pm interval timer + */ + ret = pm_rtas_activate_signals(cbe_cpu_to_node(cpu), + num_enabled+2); + /* store PC from debug bus to Trace buffer as often + * as possible (every 10 cycles) + */ + cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC); + return ret; + } else + return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), + num_enabled); } #define ENTRIES 303 @@ -885,7 +1134,122 @@ static struct notifier_block cpu_freq_notifier_block = { }; #endif -static int cell_global_start_spu(struct op_counter_config *ctr) +/* + * Note the generic OProfile stop calls do not support returning + * an error on stop. Hence, will not return an error if the FW + * calls fail on stop. Failure to reset the debug bus is not an issue. + * Failure to disable the SPU profiling is not an issue. The FW calls + * to enable the performance counters and debug bus will work even if + * the hardware was not cleanly reset. 
+ */ +static void cell_global_stop_spu_cycles(void) +{ + int subfunc, rtn_value; + unsigned int lfsr_value; + int cpu; + + oprofile_running = 0; + smp_wmb(); + +#ifdef CONFIG_CPU_FREQ + cpufreq_unregister_notifier(&cpu_freq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); +#endif + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + subfunc = 3; /* + * 2 - activate SPU tracing, + * 3 - deactivate + */ + lfsr_value = 0x8f100000; + + rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL, + subfunc, cbe_cpu_to_node(cpu), + lfsr_value); + + if (unlikely(rtn_value != 0)) { + printk(KERN_ERR + "%s: rtas call ibm,cbe-spu-perftools " \ + "failed, return = %d\n", + __func__, rtn_value); + } + + /* Deactivate the signals */ + pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + } + + stop_spu_profiling_cycles(); +} + +static void cell_global_stop_spu_events(void) +{ + int cpu; + oprofile_running = 0; + + stop_spu_profiling_events(); + smp_wmb(); + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + cbe_sync_irq(cbe_cpu_to_node(cpu)); + /* Stop the counters */ + cbe_disable_pm(cpu); + cbe_write_pm07_control(cpu, 0, 0); + + /* Deactivate the signals */ + pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + + /* Deactivate interrupts */ + cbe_disable_pm_interrupts(cpu); + } + del_timer_sync(&timer_spu_event_swap); +} + +static void cell_global_stop_ppu(void) +{ + int cpu; + + /* + * This routine will be called once for the system. + * There is one performance monitor per node, so we + * only need to perform this function once per node. + */ + del_timer_sync(&timer_virt_cntr); + oprofile_running = 0; + smp_wmb(); + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + cbe_sync_irq(cbe_cpu_to_node(cpu)); + /* Stop the counters */ + cbe_disable_pm(cpu); + + /* Deactivate the signals */ + pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + + /* Deactivate interrupts */ + cbe_disable_pm_interrupts(cpu); + } +} + +static void cell_global_stop(void) +{ + if (profiling_mode == PPU_PROFILING) + cell_global_stop_ppu(); + else if (profiling_mode == SPU_PROFILING_EVENTS) + cell_global_stop_spu_events(); + else + cell_global_stop_spu_cycles(); +} + +static int cell_global_start_spu_cycles(struct op_counter_config *ctr) { int subfunc; unsigned int lfsr_value; @@ -951,18 +1315,18 @@ static int cell_global_start_spu(struct op_counter_config *ctr) /* start profiling */ ret = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc, - cbe_cpu_to_node(cpu), lfsr_value); + cbe_cpu_to_node(cpu), lfsr_value); if (unlikely(ret != 0)) { printk(KERN_ERR - "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n", - __func__, ret); + "%s: rtas call ibm,cbe-spu-perftools failed, " \ + "return = %d\n", __func__, ret); rtas_error = -EIO; goto out; } } - rtas_error = start_spu_profiling(spu_cycle_reset); + rtas_error = start_spu_profiling_cycles(spu_cycle_reset); if (rtas_error) goto out_stop; @@ -970,11 +1334,74 @@ static int cell_global_start_spu(struct op_counter_config *ctr) return 0; out_stop: - cell_global_stop_spu(); /* clean up the PMU/debug bus */ + cell_global_stop_spu_cycles(); /* clean up the PMU/debug bus */ out: return rtas_error; } +static int cell_global_start_spu_events(struct op_counter_config *ctr) +{ + int cpu; + u32 interrupt_mask = 0; + int rtn = 0; + + hdw_thread = 0; + + /* spu event profiling, uses the performance counters to generate + * an interrupt. The hardware is setup to store the SPU program + * counter into the trace array. 
The occurrence mode is used to + * enable storing data to the trace buffer. The bits are set + * to send/store the SPU address in the trace buffer. The debug + * bus must be setup to route the SPU program counter onto the + * debug bus. The occurrence data in the trace buffer is not used. + */ + + /* This routine gets called once for the system. + * There is one performance monitor per node, so we + * only need to perform this function once per node. + */ + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + /* + * Setup SPU event-based profiling. + * Set perf_mon_control bit 0 to a zero before + * enabling spu collection hardware. + * + * Only support one SPU event on one SPU per node. + */ + if (ctr_enabled & 1) { + cbe_write_ctr(cpu, 0, reset_value[0]); + enable_ctr(cpu, 0, pm_regs.pm07_cntrl); + interrupt_mask |= + CBE_PM_CTR_OVERFLOW_INTR(0); + } else { + /* Disable counter */ + cbe_write_pm07_control(cpu, 0, 0); + } + + cbe_get_and_clear_pm_interrupts(cpu); + cbe_enable_pm_interrupts(cpu, hdw_thread, interrupt_mask); + cbe_enable_pm(cpu); + + /* clear the trace buffer */ + cbe_write_pm(cpu, trace_address, 0); + } + + /* Start the timer to time slice collecting the event profile + * on each of the SPUs. Note, can collect profile on one SPU + * per node at a time. + */ + start_spu_event_swap(); + start_spu_profiling_events(); + oprofile_running = 1; + smp_wmb(); + + return rtn; +} + static int cell_global_start_ppu(struct op_counter_config *ctr) { u32 cpu, i; @@ -994,8 +1421,7 @@ static int cell_global_start_ppu(struct op_counter_config *ctr) if (ctr_enabled & (1 << i)) { cbe_write_ctr(cpu, i, reset_value[i]); enable_ctr(cpu, i, pm_regs.pm07_cntrl); - interrupt_mask |= - CBE_PM_CTR_OVERFLOW_INTR(i); + interrupt_mask |= CBE_PM_CTR_OVERFLOW_INTR(i); } else { /* Disable counter */ cbe_write_pm07_control(cpu, i, 0); @@ -1024,99 +1450,162 @@ static int cell_global_start_ppu(struct op_counter_config *ctr) static int cell_global_start(struct op_counter_config *ctr) { - if (spu_cycle_reset) - return cell_global_start_spu(ctr); + if (profiling_mode == SPU_PROFILING_CYCLES) + return cell_global_start_spu_cycles(ctr); + else if (profiling_mode == SPU_PROFILING_EVENTS) + return cell_global_start_spu_events(ctr); else return cell_global_start_ppu(ctr); } -/* - * Note the generic OProfile stop calls do not support returning - * an error on stop. Hence, will not return an error if the FW - * calls fail on stop. Failure to reset the debug bus is not an issue. - * Failure to disable the SPU profiling is not an issue. The FW calls - * to enable the performance counters and debug bus will work even if - * the hardware was not cleanly reset. + +/* The SPU interrupt handler + * + * SPU event profiling works as follows: + * The pm_signal[0] holds the one SPU event to be measured. It is routed on + * the debug bus using word 0 or 1. The value of pm_signal[1] and + * pm_signal[2] contain the necessary events to route the SPU program + * counter for the selected SPU onto the debug bus using words 2 and 3. + * The pm_interval register is setup to write the SPU PC value into the + * trace buffer at the maximum rate possible. The trace buffer is configured + * to store the PCs, wrapping when it is full. The performance counter is + * intialized to the max hardware count minus the number of events, N, between + * samples. 
Once the N events have occured, a HW counter overflow occurs + * causing the generation of a HW counter interrupt which also stops the + * writing of the SPU PC values to the trace buffer. Hence the last PC + * written to the trace buffer is the SPU PC that we want. Unfortunately, + * we have to read from the beginning of the trace buffer to get to the + * last value written. We just hope the PPU has nothing better to do then + * service this interrupt. The PC for the specific SPU being profiled is + * extracted from the trace buffer processed and stored. The trace buffer + * is cleared, interrupts are cleared, the counter is reset to max - N. + * A kernel timer is used to periodically call the routine spu_evnt_swap() + * to switch to the next physical SPU in the node to profile in round robbin + * order. This way data is collected for all SPUs on the node. It does mean + * that we need to use a relatively small value of N to ensure enough samples + * on each SPU are collected each SPU is being profiled 1/8 of the time. + * It may also be necessary to use a longer sample collection period. */ -static void cell_global_stop_spu(void) +static void cell_handle_interrupt_spu(struct pt_regs *regs, + struct op_counter_config *ctr) { - int subfunc, rtn_value; - unsigned int lfsr_value; - int cpu; + u32 cpu, cpu_tmp; + u64 trace_entry; + u32 interrupt_mask; + u64 trace_buffer[2]; + u64 last_trace_buffer; + u32 sample; + u32 trace_addr; + unsigned long sample_array_lock_flags; + int spu_num; + unsigned long flags; - oprofile_running = 0; + /* Make sure spu event interrupt handler and spu event swap + * don't access the counters simultaneously. + */ + cpu = smp_processor_id(); + spin_lock_irqsave(&cntr_lock, flags); -#ifdef CONFIG_CPU_FREQ - cpufreq_unregister_notifier(&cpu_freq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); -#endif + cpu_tmp = cpu; + cbe_disable_pm(cpu); - for_each_online_cpu(cpu) { - if (cbe_get_hw_thread_id(cpu)) - continue; + interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu); - subfunc = 3; /* - * 2 - activate SPU tracing, - * 3 - deactivate - */ - lfsr_value = 0x8f100000; + sample = 0xABCDEF; + trace_entry = 0xfedcba; + last_trace_buffer = 0xdeadbeaf; - rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL, - subfunc, cbe_cpu_to_node(cpu), - lfsr_value); + if ((oprofile_running == 1) && (interrupt_mask != 0)) { + /* disable writes to trace buff */ + cbe_write_pm(cpu, pm_interval, 0); - if (unlikely(rtn_value != 0)) { - printk(KERN_ERR - "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n", - __func__, rtn_value); + /* only have one perf cntr being used, cntr 0 */ + if ((interrupt_mask & CBE_PM_CTR_OVERFLOW_INTR(0)) + && ctr[0].enabled) + /* The SPU PC values will be read + * from the trace buffer, reset counter + */ + + cbe_write_ctr(cpu, 0, reset_value[0]); + + trace_addr = cbe_read_pm(cpu, trace_address); + + while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) { + /* There is data in the trace buffer to process + * Read the buffer until you get to the last + * entry. This is the value we want. 
+ */ + + cbe_read_trace_buffer(cpu, trace_buffer); + trace_addr = cbe_read_pm(cpu, trace_address); } - /* Deactivate the signals */ - pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); - } + /* SPU Address 16 bit count format for 128 bit + * HW trace buffer is used for the SPU PC storage + * HDR bits 0:15 + * SPU Addr 0 bits 16:31 + * SPU Addr 1 bits 32:47 + * unused bits 48:127 + * + * HDR: bit4 = 1 SPU Address 0 valid + * HDR: bit5 = 1 SPU Address 1 valid + * - unfortunately, the valid bits don't seem to work + * + * Note trace_buffer[0] holds bits 0:63 of the HW + * trace buffer, trace_buffer[1] holds bits 64:127 + */ - stop_spu_profiling(); -} + trace_entry = trace_buffer[0] + & 0x00000000FFFF0000; -static void cell_global_stop_ppu(void) -{ - int cpu; + /* only top 16 of the 18 bit SPU PC address + * is stored in trace buffer, hence shift right + * by 16 -2 bits */ + sample = trace_entry >> 14; + last_trace_buffer = trace_buffer[0]; - /* - * This routine will be called once for the system. - * There is one performance monitor per node, so we - * only need to perform this function once per node. - */ - del_timer_sync(&timer_virt_cntr); - oprofile_running = 0; - smp_wmb(); + spu_num = spu_evnt_phys_spu_indx + + (cbe_cpu_to_node(cpu) * NUM_SPUS_PER_NODE); - for_each_online_cpu(cpu) { - if (cbe_get_hw_thread_id(cpu)) - continue; + /* make sure only one process at a time is calling + * spu_sync_buffer() + */ + spin_lock_irqsave(&oprof_spu_smpl_arry_lck, + sample_array_lock_flags); + spu_sync_buffer(spu_num, &sample, 1); + spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck, + sample_array_lock_flags); - cbe_sync_irq(cbe_cpu_to_node(cpu)); - /* Stop the counters */ - cbe_disable_pm(cpu); + smp_wmb(); /* insure spu event buffer updates are written + * don't want events intermingled... */ - /* Deactivate the signals */ - pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + /* The counters were frozen by the interrupt. + * Reenable the interrupt and restart the counters. + */ + cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC); + cbe_enable_pm_interrupts(cpu, hdw_thread, + virt_cntr_inter_mask); - /* Deactivate interrupts */ - cbe_disable_pm_interrupts(cpu); - } -} + /* clear the trace buffer, re-enable writes to trace buff */ + cbe_write_pm(cpu, trace_address, 0); + cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC); -static void cell_global_stop(void) -{ - if (spu_cycle_reset) - cell_global_stop_spu(); - else - cell_global_stop_ppu(); + /* The writes to the various performance counters only writes + * to a latch. The new values (interrupt setting bits, reset + * counter value etc.) are not copied to the actual registers + * until the performance monitor is enabled. In order to get + * this to work as desired, the permormance monitor needs to + * be disabled while writing to the latches. This is a + * HW design issue. + */ + write_pm_cntrl(cpu); + cbe_enable_pm(cpu); + } + spin_unlock_irqrestore(&cntr_lock, flags); } -static void cell_handle_interrupt(struct pt_regs *regs, - struct op_counter_config *ctr) +static void cell_handle_interrupt_ppu(struct pt_regs *regs, + struct op_counter_config *ctr) { u32 cpu; u64 pc; @@ -1132,7 +1621,7 @@ static void cell_handle_interrupt(struct pt_regs *regs, * routine are not running at the same time. See the * cell_virtual_cntr() routine for additional comments. 
*/ - spin_lock_irqsave(&virt_cntr_lock, flags); + spin_lock_irqsave(&cntr_lock, flags); /* * Need to disable and reenable the performance counters @@ -1185,7 +1674,16 @@ static void cell_handle_interrupt(struct pt_regs *regs, */ cbe_enable_pm(cpu); } - spin_unlock_irqrestore(&virt_cntr_lock, flags); + spin_unlock_irqrestore(&cntr_lock, flags); +} + +static void cell_handle_interrupt(struct pt_regs *regs, + struct op_counter_config *ctr) +{ + if (profiling_mode == PPU_PROFILING) + cell_handle_interrupt_ppu(regs, ctr); + else + cell_handle_interrupt_spu(regs, ctr); } /* @@ -1195,7 +1693,8 @@ static void cell_handle_interrupt(struct pt_regs *regs, */ static int cell_sync_start(void) { - if (spu_cycle_reset) + if ((profiling_mode == SPU_PROFILING_CYCLES) || + (profiling_mode == SPU_PROFILING_EVENTS)) return spu_sync_start(); else return DO_GENERIC_SYNC; @@ -1203,7 +1702,8 @@ static int cell_sync_start(void) static int cell_sync_stop(void) { - if (spu_cycle_reset) + if ((profiling_mode == SPU_PROFILING_CYCLES) || + (profiling_mode == SPU_PROFILING_EVENTS)) return spu_sync_stop(); else return 1; diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 9fa9dcdf344..e02a359d2aa 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -300,7 +300,7 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr) return oldbit; } -static inline int constant_test_bit(int nr, const volatile unsigned long *addr) +static inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr) { return ((1UL << (nr % BITS_PER_LONG)) & (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 29dc0c89d4a..d37593c2f43 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -47,7 +47,7 @@ #endif static int __initdata acpi_force = 0; - +u32 acpi_rsdt_forced; #ifdef CONFIG_ACPI int acpi_disabled = 0; #else @@ -1374,6 +1374,17 @@ static void __init acpi_process_madt(void) "Invalid BIOS MADT, disabling ACPI\n"); disable_acpi(); } + } else { + /* + * ACPI found no MADT, and so ACPI wants UP PIC mode. + * In the event an MPS table was found, forget it. + * Boot with "acpi=off" to use MPS on such a system. 
+ */ + if (smp_found_config) { + printk(KERN_WARNING PREFIX + "No APIC-table, disabling MPS\n"); + smp_found_config = 0; + } } /* @@ -1809,6 +1820,10 @@ static int __init parse_acpi(char *arg) disable_acpi(); acpi_ht = 1; } + /* acpi=rsdt use RSDT instead of XSDT */ + else if (strcmp(arg, "rsdt") == 0) { + acpi_rsdt_forced = 1; + } /* "acpi=noirq" disables ACPI interrupt routing */ else if (strcmp(arg, "noirq") == 0) { acpi_noirq_set(); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index c2502eb9aa8..a4805b3b409 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -56,6 +56,7 @@ static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */ static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; #define MWAIT_SUBSTATE_MASK (0xf) +#define MWAIT_CSTATE_MASK (0xf) #define MWAIT_SUBSTATE_SIZE (4) #define CPUID_MWAIT_LEAF (5) @@ -98,7 +99,8 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); /* Check whether this particular cx_type (in CST) is supported or not */ - cstate_type = (cx->address >> MWAIT_SUBSTATE_SIZE) + 1; + cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) & + MWAIT_CSTATE_MASK) + 1; edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE); num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 806b4e9051b..707c1f6f95f 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -159,6 +159,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "s4_nonvs", 8) == 0) + acpi_s4_no_nvs(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 65a13943e09..e85826829cf 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -665,6 +665,27 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) } #endif +#ifdef CONFIG_HIBERNATION +/** + * Mark ACPI NVS memory region, so that we can save/restore it during + * hibernation and the subsequent resume. + */ +static int __init e820_mark_nvs_memory(void) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_NVS) + hibernate_nvs_register(ei->addr, ei->size); + } + + return 0; +} +core_initcall(e820_mark_nvs_memory); +#endif + /* * Early reserved memory areas. */ diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 744aa7fc49d..76b8cd953de 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -201,6 +201,12 @@ struct chipset { void (*f)(int num, int slot, int func); }; +/* + * Only works for devices on the root bus. If you add any devices + * not on bus 0 readd another loop level in early_quirks(). But + * be careful because at least the Nvidia quirk here relies on + * only matching on bus 0. 
+ */ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, @@ -267,17 +273,17 @@ static int __init check_dev_quirk(int num, int slot, int func) void __init early_quirks(void) { - int num, slot, func; + int slot, func; if (!early_pci_allowed()) return; /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) - for (slot = 0; slot < 32; slot++) - for (func = 0; func < 8; func++) { - /* Only probe function 0 on single fn devices */ - if (check_dev_quirk(num, slot, func)) - break; - } + /* Only scan the root bus */ + for (slot = 0; slot < 32; slot++) + for (func = 0; func < 8; func++) { + /* Only probe function 0 on single fn devices */ + if (check_dev_quirk(0, slot, func)) + break; + } } diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 98658f25f54..8fdf06e4edf 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -2,7 +2,7 @@ * @file op_model_amd.c * athlon / K7 / K8 / Family 10h model-specific MSR operations * - * @remark Copyright 2002-2008 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon @@ -10,7 +10,7 @@ * @author Graydon Hoare * @author Robert Richter <robert.richter@amd.com> * @author Barry Kasindorf -*/ + */ #include <linux/oprofile.h> #include <linux/device.h> @@ -60,53 +60,10 @@ static unsigned long reset_value[NUM_COUNTERS]; #define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ #define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ -/* Codes used in cpu_buffer.c */ -/* This produces duplicate code, need to be fixed */ -#define IBS_FETCH_BEGIN 3 -#define IBS_OP_BEGIN 4 - -/* - * The function interface needs to be fixed, something like add - * data. Should then be added to linux/oprofile.h. 
- */ -extern void -oprofile_add_ibs_sample(struct pt_regs * const regs, - unsigned int * const ibs_sample, int ibs_code); - -struct ibs_fetch_sample { - /* MSRC001_1031 IBS Fetch Linear Address Register */ - unsigned int ibs_fetch_lin_addr_low; - unsigned int ibs_fetch_lin_addr_high; - /* MSRC001_1030 IBS Fetch Control Register */ - unsigned int ibs_fetch_ctl_low; - unsigned int ibs_fetch_ctl_high; - /* MSRC001_1032 IBS Fetch Physical Address Register */ - unsigned int ibs_fetch_phys_addr_low; - unsigned int ibs_fetch_phys_addr_high; -}; - -struct ibs_op_sample { - /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ - unsigned int ibs_op_rip_low; - unsigned int ibs_op_rip_high; - /* MSRC001_1035 IBS Op Data Register */ - unsigned int ibs_op_data1_low; - unsigned int ibs_op_data1_high; - /* MSRC001_1036 IBS Op Data 2 Register */ - unsigned int ibs_op_data2_low; - unsigned int ibs_op_data2_high; - /* MSRC001_1037 IBS Op Data 3 Register */ - unsigned int ibs_op_data3_low; - unsigned int ibs_op_data3_high; - /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ - unsigned int ibs_dc_linear_low; - unsigned int ibs_dc_linear_high; - /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ - unsigned int ibs_dc_phys_low; - unsigned int ibs_dc_phys_high; -}; +#define IBS_FETCH_SIZE 6 +#define IBS_OP_SIZE 12 -static int ibs_allowed; /* AMD Family10h and later */ +static int has_ibs; /* AMD Family10h and later */ struct op_ibs_config { unsigned long op_enabled; @@ -197,31 +154,29 @@ static inline int op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - unsigned int low, high; - struct ibs_fetch_sample ibs_fetch; - struct ibs_op_sample ibs_op; + u32 low, high; + u64 msr; + struct op_entry entry; - if (!ibs_allowed) + if (!has_ibs) return 1; if (ibs_config.fetch_enabled) { rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); if (high & IBS_FETCH_HIGH_VALID_BIT) { - ibs_fetch.ibs_fetch_ctl_high = high; - ibs_fetch.ibs_fetch_ctl_low = low; - rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); - ibs_fetch.ibs_fetch_lin_addr_high = high; - ibs_fetch.ibs_fetch_lin_addr_low = low; - rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); - ibs_fetch.ibs_fetch_phys_addr_high = high; - ibs_fetch.ibs_fetch_phys_addr_low = low; - - oprofile_add_ibs_sample(regs, - (unsigned int *)&ibs_fetch, - IBS_FETCH_BEGIN); + rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); + oprofile_write_reserve(&entry, regs, msr, + IBS_FETCH_CODE, IBS_FETCH_SIZE); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, low); + oprofile_add_data(&entry, high); + rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_write_commit(&entry); /* reenable the IRQ */ - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); high &= ~IBS_FETCH_HIGH_VALID_BIT; high |= IBS_FETCH_HIGH_ENABLE; low &= IBS_FETCH_LOW_MAX_CNT_MASK; @@ -232,30 +187,29 @@ op_amd_handle_ibs(struct pt_regs * const regs, if (ibs_config.op_enabled) { rdmsr(MSR_AMD64_IBSOPCTL, low, high); if (low & IBS_OP_LOW_VALID_BIT) { - rdmsr(MSR_AMD64_IBSOPRIP, low, high); - ibs_op.ibs_op_rip_low = low; - ibs_op.ibs_op_rip_high = high; - rdmsr(MSR_AMD64_IBSOPDATA, low, high); - ibs_op.ibs_op_data1_low = low; - ibs_op.ibs_op_data1_high = high; - rdmsr(MSR_AMD64_IBSOPDATA2, low, high); - ibs_op.ibs_op_data2_low = low; - ibs_op.ibs_op_data2_high = high; - rdmsr(MSR_AMD64_IBSOPDATA3, low, high); - ibs_op.ibs_op_data3_low = low; - ibs_op.ibs_op_data3_high = 
high; - rdmsr(MSR_AMD64_IBSDCLINAD, low, high); - ibs_op.ibs_dc_linear_low = low; - ibs_op.ibs_dc_linear_high = high; - rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); - ibs_op.ibs_dc_phys_low = low; - ibs_op.ibs_dc_phys_high = high; + rdmsrl(MSR_AMD64_IBSOPRIP, msr); + oprofile_write_reserve(&entry, regs, msr, + IBS_OP_CODE, IBS_OP_SIZE); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA2, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA3, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSDCLINAD, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_write_commit(&entry); /* reenable the IRQ */ - oprofile_add_ibs_sample(regs, - (unsigned int *)&ibs_op, - IBS_OP_BEGIN); - rdmsr(MSR_AMD64_IBSOPCTL, low, high); high = 0; low &= ~IBS_OP_LOW_VALID_BIT; low |= IBS_OP_LOW_ENABLE; @@ -305,14 +259,14 @@ static void op_amd_start(struct op_msrs const * const msrs) } #ifdef CONFIG_OPROFILE_IBS - if (ibs_allowed && ibs_config.fetch_enabled) { + if (has_ibs && ibs_config.fetch_enabled) { low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ + IBS_FETCH_HIGH_ENABLE; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } - if (ibs_allowed && ibs_config.op_enabled) { + if (has_ibs && ibs_config.op_enabled) { low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ + IBS_OP_LOW_ENABLE; @@ -341,14 +295,14 @@ static void op_amd_stop(struct op_msrs const * const msrs) } #ifdef CONFIG_OPROFILE_IBS - if (ibs_allowed && ibs_config.fetch_enabled) { + if (has_ibs && ibs_config.fetch_enabled) { /* clear max count and enable */ low = 0; high = 0; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } - if (ibs_allowed && ibs_config.op_enabled) { + if (has_ibs && ibs_config.op_enabled) { /* clear max count and enable */ low = 0; high = 0; @@ -409,6 +363,7 @@ static int init_ibs_nmi(void) | IBSCTL_LVTOFFSETVAL); pci_read_config_dword(cpu_cfg, IBSCTL, &value); if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + pci_dev_put(cpu_cfg); printk(KERN_DEBUG "Failed to setup IBS LVT offset, " "IBSCTL = 0x%08x", value); return 1; @@ -436,20 +391,20 @@ static int init_ibs_nmi(void) /* uninitialize the APIC for the IBS interrupts if needed */ static void clear_ibs_nmi(void) { - if (ibs_allowed) + if (has_ibs) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } /* initialize the APIC for the IBS interrupts if available */ static void ibs_init(void) { - ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); + has_ibs = boot_cpu_has(X86_FEATURE_IBS); - if (!ibs_allowed) + if (!has_ibs) return; if (init_ibs_nmi()) { - ibs_allowed = 0; + has_ibs = 0; return; } @@ -458,7 +413,7 @@ static void ibs_init(void) static void ibs_exit(void) { - if (!ibs_allowed) + if (!has_ibs) return; clear_ibs_nmi(); @@ -478,7 +433,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) if (ret) return ret; - if (!ibs_allowed) + if (!has_ibs) return ret; /* model specific files */ diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index dcbf1be149f..f21147f3626 100644 --- 
a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -28,351 +28,18 @@ #include <linux/async_tx.h> #ifdef CONFIG_DMA_ENGINE -static enum dma_state_client -dma_channel_add_remove(struct dma_client *client, - struct dma_chan *chan, enum dma_state state); - -static struct dma_client async_tx_dma = { - .event_callback = dma_channel_add_remove, - /* .cap_mask == 0 defaults to all channels */ -}; - -/** - * dma_cap_mask_all - enable iteration over all operation types - */ -static dma_cap_mask_t dma_cap_mask_all; - -/** - * chan_ref_percpu - tracks channel allocations per core/opertion - */ -struct chan_ref_percpu { - struct dma_chan_ref *ref; -}; - -static int channel_table_initialized; -static struct chan_ref_percpu *channel_table[DMA_TX_TYPE_END]; - -/** - * async_tx_lock - protect modification of async_tx_master_list and serialize - * rebalance operations - */ -static spinlock_t async_tx_lock; - -static LIST_HEAD(async_tx_master_list); - -/* async_tx_issue_pending_all - start all transactions on all channels */ -void async_tx_issue_pending_all(void) -{ - struct dma_chan_ref *ref; - - rcu_read_lock(); - list_for_each_entry_rcu(ref, &async_tx_master_list, node) - ref->chan->device->device_issue_pending(ref->chan); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(async_tx_issue_pending_all); - -/* dma_wait_for_async_tx - spin wait for a transcation to complete - * @tx: transaction to wait on - */ -enum dma_status -dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) -{ - enum dma_status status; - struct dma_async_tx_descriptor *iter; - struct dma_async_tx_descriptor *parent; - - if (!tx) - return DMA_SUCCESS; - - /* poll through the dependency chain, return when tx is complete */ - do { - iter = tx; - - /* find the root of the unsubmitted dependency chain */ - do { - parent = iter->parent; - if (!parent) - break; - else - iter = parent; - } while (parent); - - /* there is a small window for ->parent == NULL and - * ->cookie == -EBUSY - */ - while (iter->cookie == -EBUSY) - cpu_relax(); - - status = dma_sync_wait(iter->chan, iter->cookie); - } while (status == DMA_IN_PROGRESS || (iter != tx)); - - return status; -} -EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); - -/* async_tx_run_dependencies - helper routine for dma drivers to process - * (start) dependent operations on their target channel - * @tx: transaction with dependencies - */ -void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx) -{ - struct dma_async_tx_descriptor *dep = tx->next; - struct dma_async_tx_descriptor *dep_next; - struct dma_chan *chan; - - if (!dep) - return; - - chan = dep->chan; - - /* keep submitting up until a channel switch is detected - * in that case we will be called again as a result of - * processing the interrupt from async_tx_channel_switch - */ - for (; dep; dep = dep_next) { - spin_lock_bh(&dep->lock); - dep->parent = NULL; - dep_next = dep->next; - if (dep_next && dep_next->chan == chan) - dep->next = NULL; /* ->next will be submitted */ - else - dep_next = NULL; /* submit current dep and terminate */ - spin_unlock_bh(&dep->lock); - - dep->tx_submit(dep); - } - - chan->device->device_issue_pending(chan); -} -EXPORT_SYMBOL_GPL(async_tx_run_dependencies); - -static void -free_dma_chan_ref(struct rcu_head *rcu) -{ - struct dma_chan_ref *ref; - ref = container_of(rcu, struct dma_chan_ref, rcu); - kfree(ref); -} - -static void -init_dma_chan_ref(struct dma_chan_ref *ref, struct dma_chan *chan) -{ - INIT_LIST_HEAD(&ref->node); - INIT_RCU_HEAD(&ref->rcu); - ref->chan = chan; - 
atomic_set(&ref->count, 0); -} - -/** - * get_chan_ref_by_cap - returns the nth channel of the given capability - * defaults to returning the channel with the desired capability and the - * lowest reference count if the index can not be satisfied - * @cap: capability to match - * @index: nth channel desired, passing -1 has the effect of forcing the - * default return value - */ -static struct dma_chan_ref * -get_chan_ref_by_cap(enum dma_transaction_type cap, int index) -{ - struct dma_chan_ref *ret_ref = NULL, *min_ref = NULL, *ref; - - rcu_read_lock(); - list_for_each_entry_rcu(ref, &async_tx_master_list, node) - if (dma_has_cap(cap, ref->chan->device->cap_mask)) { - if (!min_ref) - min_ref = ref; - else if (atomic_read(&ref->count) < - atomic_read(&min_ref->count)) - min_ref = ref; - - if (index-- == 0) { - ret_ref = ref; - break; - } - } - rcu_read_unlock(); - - if (!ret_ref) - ret_ref = min_ref; - - if (ret_ref) - atomic_inc(&ret_ref->count); - - return ret_ref; -} - -/** - * async_tx_rebalance - redistribute the available channels, optimize - * for cpu isolation in the SMP case, and opertaion isolation in the - * uniprocessor case - */ -static void async_tx_rebalance(void) -{ - int cpu, cap, cpu_idx = 0; - unsigned long flags; - - if (!channel_table_initialized) - return; - - spin_lock_irqsave(&async_tx_lock, flags); - - /* undo the last distribution */ - for_each_dma_cap_mask(cap, dma_cap_mask_all) - for_each_possible_cpu(cpu) { - struct dma_chan_ref *ref = - per_cpu_ptr(channel_table[cap], cpu)->ref; - if (ref) { - atomic_set(&ref->count, 0); - per_cpu_ptr(channel_table[cap], cpu)->ref = - NULL; - } - } - - for_each_dma_cap_mask(cap, dma_cap_mask_all) - for_each_online_cpu(cpu) { - struct dma_chan_ref *new; - if (NR_CPUS > 1) - new = get_chan_ref_by_cap(cap, cpu_idx++); - else - new = get_chan_ref_by_cap(cap, -1); - - per_cpu_ptr(channel_table[cap], cpu)->ref = new; - } - - spin_unlock_irqrestore(&async_tx_lock, flags); -} - -static enum dma_state_client -dma_channel_add_remove(struct dma_client *client, - struct dma_chan *chan, enum dma_state state) -{ - unsigned long found, flags; - struct dma_chan_ref *master_ref, *ref; - enum dma_state_client ack = DMA_DUP; /* default: take no action */ - - switch (state) { - case DMA_RESOURCE_AVAILABLE: - found = 0; - rcu_read_lock(); - list_for_each_entry_rcu(ref, &async_tx_master_list, node) - if (ref->chan == chan) { - found = 1; - break; - } - rcu_read_unlock(); - - pr_debug("async_tx: dma resource available [%s]\n", - found ? 
"old" : "new"); - - if (!found) - ack = DMA_ACK; - else - break; - - /* add the channel to the generic management list */ - master_ref = kmalloc(sizeof(*master_ref), GFP_KERNEL); - if (master_ref) { - /* keep a reference until async_tx is unloaded */ - dma_chan_get(chan); - init_dma_chan_ref(master_ref, chan); - spin_lock_irqsave(&async_tx_lock, flags); - list_add_tail_rcu(&master_ref->node, - &async_tx_master_list); - spin_unlock_irqrestore(&async_tx_lock, - flags); - } else { - printk(KERN_WARNING "async_tx: unable to create" - " new master entry in response to" - " a DMA_RESOURCE_ADDED event" - " (-ENOMEM)\n"); - return 0; - } - - async_tx_rebalance(); - break; - case DMA_RESOURCE_REMOVED: - found = 0; - spin_lock_irqsave(&async_tx_lock, flags); - list_for_each_entry(ref, &async_tx_master_list, node) - if (ref->chan == chan) { - /* permit backing devices to go away */ - dma_chan_put(ref->chan); - list_del_rcu(&ref->node); - call_rcu(&ref->rcu, free_dma_chan_ref); - found = 1; - break; - } - spin_unlock_irqrestore(&async_tx_lock, flags); - - pr_debug("async_tx: dma resource removed [%s]\n", - found ? "ours" : "not ours"); - - if (found) - ack = DMA_ACK; - else - break; - - async_tx_rebalance(); - break; - case DMA_RESOURCE_SUSPEND: - case DMA_RESOURCE_RESUME: - printk(KERN_WARNING "async_tx: does not support dma channel" - " suspend/resume\n"); - break; - default: - BUG(); - } - - return ack; -} - -static int __init -async_tx_init(void) +static int __init async_tx_init(void) { - enum dma_transaction_type cap; - - spin_lock_init(&async_tx_lock); - bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); - - /* an interrupt will never be an explicit operation type. - * clearing this bit prevents allocation to a slot in 'channel_table' - */ - clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); - - for_each_dma_cap_mask(cap, dma_cap_mask_all) { - channel_table[cap] = alloc_percpu(struct chan_ref_percpu); - if (!channel_table[cap]) - goto err; - } - - channel_table_initialized = 1; - dma_async_client_register(&async_tx_dma); - dma_async_client_chan_request(&async_tx_dma); + dmaengine_get(); printk(KERN_INFO "async_tx: api initialized (async)\n"); return 0; -err: - printk(KERN_ERR "async_tx: initialization failure\n"); - - while (--cap >= 0) - free_percpu(channel_table[cap]); - - return 1; } static void __exit async_tx_exit(void) { - enum dma_transaction_type cap; - - channel_table_initialized = 0; - - for_each_dma_cap_mask(cap, dma_cap_mask_all) - if (channel_table[cap]) - free_percpu(channel_table[cap]); - - dma_async_client_unregister(&async_tx_dma); + dmaengine_put(); } /** @@ -387,16 +54,9 @@ __async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, { /* see if we can keep the chain on one channel */ if (depend_tx && - dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) + dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) return depend_tx->chan; - else if (likely(channel_table_initialized)) { - struct dma_chan_ref *ref; - int cpu = get_cpu(); - ref = per_cpu_ptr(channel_table[tx_type], cpu)->ref; - put_cpu(); - return ref ? 
ref->chan : NULL; - } else - return NULL; + return dma_find_channel(tx_type); } EXPORT_SYMBOL_GPL(__async_tx_find_channel); #else diff --git a/drivers/Kconfig b/drivers/Kconfig index 2f557f570ad..00cf9553f74 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -107,4 +107,6 @@ source "drivers/uio/Kconfig" source "drivers/xen/Kconfig" source "drivers/staging/Kconfig" + +source "drivers/platform/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 6326f4dbbda..c1bf4173793 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -105,3 +105,4 @@ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ obj-$(CONFIG_VIRTIO) += virtio/ obj-$(CONFIG_STAGING) += staging/ +obj-y += platform/ diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index b0243fd55ac..d7f9839ba26 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -196,90 +196,6 @@ config ACPI_NUMA depends on (X86 || IA64) default y if IA64_GENERIC || IA64_SGI_SN2 -config ACPI_WMI - tristate "WMI (EXPERIMENTAL)" - depends on X86 - depends on EXPERIMENTAL - help - This driver adds support for the ACPI-WMI (Windows Management - Instrumentation) mapper device (PNP0C14) found on some systems. - - ACPI-WMI is a proprietary extension to ACPI to expose parts of the - ACPI firmware to userspace - this is done through various vendor - defined methods and data blocks in a PNP0C14 device, which are then - made available for userspace to call. - - The implementation of this in Linux currently only exposes this to - other kernel space drivers. - - This driver is a required dependency to build the firmware specific - drivers needed on many machines, including Acer and HP laptops. - - It is safe to enable this driver even if your DSDT doesn't define - any ACPI-WMI devices. - -config ACPI_ASUS - tristate "ASUS/Medion Laptop Extras" - depends on X86 - select BACKLIGHT_CLASS_DEVICE - ---help--- - This driver provides support for extra features of ACPI-compatible - ASUS laptops. As some of Medion laptops are made by ASUS, it may also - support some Medion laptops (such as 9675 for example). It makes all - the extra buttons generate standard ACPI events that go through - /proc/acpi/events, and (on some models) adds support for changing the - display brightness and output, switching the LCD backlight on and off, - and most importantly, allows you to blink those fancy LEDs intended - for reporting mail and wireless status. - - Note: display switching code is currently considered EXPERIMENTAL, - toying with these values may even lock your machine. - - All settings are changed via /proc/acpi/asus directory entries. Owner - and group for these entries can be set with asus_uid and asus_gid - parameters. - - More information and a userspace daemon for handling the extra buttons - at <http://sourceforge.net/projects/acpi4asus/>. - - If you have an ACPI-compatible ASUS laptop, say Y or M here. This - driver is still under development, so if your laptop is unsupported or - something works not quite as expected, please use the mailing list - available on the above page (acpi4asus-user@lists.sourceforge.net). - - NOTE: This driver is deprecated and will probably be removed soon, - use asus-laptop instead. - -config ACPI_TOSHIBA - tristate "Toshiba Laptop Extras" - depends on X86 && INPUT - select INPUT_POLLDEV - select NET - select RFKILL - select BACKLIGHT_CLASS_DEVICE - ---help--- - This driver adds support for access to certain system settings - on "legacy free" Toshiba laptops. 
These laptops can be recognized by - their lack of a BIOS setup menu and APM support. - - On these machines, all system configuration is handled through the - ACPI. This driver is required for access to controls not covered - by the general ACPI drivers, such as LCD brightness, video output, - etc. - - This driver differs from the non-ACPI Toshiba laptop driver (located - under "Processor type and features") in several aspects. - Configuration is accessed by reading and writing text files in the - /proc tree instead of by program interface to /dev. Furthermore, no - power management functions are exposed, as those are handled by the - general ACPI drivers. - - More information about this driver is available at - <http://memebeam.org/toys/ToshibaAcpiDriver>. - - If you have a legacy free Toshiba laptop (such as the Libretto L1 - series), say Y. - config ACPI_CUSTOM_DSDT_FILE string "Custom DSDT Table file to include" default "" diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 3c0c93300f1..d80f4cc2e0d 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -2,15 +2,8 @@ # Makefile for the Linux ACPI interpreter # -export ACPI_CFLAGS - -ACPI_CFLAGS := -Os - -ifdef CONFIG_ACPI_DEBUG - ACPI_CFLAGS += -DACPI_DEBUG_OUTPUT -endif - -EXTRA_CFLAGS += $(ACPI_CFLAGS) +ccflags-y := -Os +ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT # # ACPI Boot-Time Table Parsing @@ -22,9 +15,13 @@ obj-$(CONFIG_X86) += blacklist.o # ACPI Core Subsystem (Interpreter) # obj-y += osl.o utils.o reboot.o\ - dispatcher/ events/ executer/ hardware/ \ - namespace/ parser/ resources/ tables/ \ - utilities/ + acpica/ + +# sleep related files +obj-y += wakeup.o +obj-y += main.o +obj-$(CONFIG_ACPI_SLEEP) += proc.o + # # ACPI Bus and Device Drivers @@ -35,7 +32,6 @@ ifdef CONFIG_CPU_FREQ processor-objs += processor_perflib.o endif -obj-y += sleep/ obj-y += bus.o glue.o obj-y += scan.o # Keep EC driver first. Initialization of others depend on it. 
@@ -59,9 +55,6 @@ obj-y += power.o obj-$(CONFIG_ACPI_SYSTEM) += system.o event.o obj-$(CONFIG_ACPI_DEBUG) += debug.o obj-$(CONFIG_ACPI_NUMA) += numa.o -obj-$(CONFIG_ACPI_WMI) += wmi.o -obj-$(CONFIG_ACPI_ASUS) += asus_acpi.o -obj-$(CONFIG_ACPI_TOSHIBA) += toshiba_acpi.o obj-$(CONFIG_ACPI_HOTPLUG_MEMORY) += acpi_memhotplug.o obj-$(CONFIG_ACPI_PROCFS_POWER) += cm_sbs.o obj-$(CONFIG_ACPI_SBS) += sbshc.o diff --git a/drivers/acpi/acpica/Makefile b/drivers/acpi/acpica/Makefile new file mode 100644 index 00000000000..3f23298ee3f --- /dev/null +++ b/drivers/acpi/acpica/Makefile @@ -0,0 +1,44 @@ +# +# Makefile for ACPICA Core interpreter +# + +ccflags-y := -Os +ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT + +obj-y := dsfield.o dsmthdat.o dsopcode.o dswexec.o dswscope.o \ + dsmethod.o dsobject.o dsutils.o dswload.o dswstate.o \ + dsinit.o + +obj-y += evevent.o evregion.o evsci.o evxfevnt.o \ + evmisc.o evrgnini.o evxface.o evxfregn.o \ + evgpe.o evgpeblk.o + +obj-y += exconfig.o exfield.o exnames.o exoparg6.o exresolv.o exstorob.o\ + exconvrt.o exfldio.o exoparg1.o exprep.o exresop.o exsystem.o\ + excreate.o exmisc.o exoparg2.o exregion.o exstore.o exutils.o \ + exdump.o exmutex.o exoparg3.o exresnte.o exstoren.o + +obj-y += hwacpi.o hwgpe.o hwregs.o hwsleep.o hwxface.o + +obj-$(ACPI_FUTURE_USAGE) += hwtimer.o + +obj-y += nsaccess.o nsload.o nssearch.o nsxfeval.o \ + nsalloc.o nseval.o nsnames.o nsutils.o nsxfname.o \ + nsdump.o nsinit.o nsobject.o nswalk.o nsxfobj.o \ + nsparse.o nspredef.o + +obj-$(ACPI_FUTURE_USAGE) += nsdumpdv.o + +obj-y += psargs.o psparse.o psloop.o pstree.o pswalk.o \ + psopcode.o psscope.o psutils.o psxface.o + +obj-y += rsaddr.o rscreate.o rsinfo.o rsio.o rslist.o rsmisc.o rsxface.o \ + rscalc.o rsirq.o rsmemory.o rsutils.o + +obj-$(ACPI_FUTURE_USAGE) += rsdump.o + +obj-y += tbxface.o tbinstal.o tbutils.o tbfind.o tbfadt.o tbxfroot.o + +obj-y += utalloc.o utdebug.o uteval.o utinit.o utmisc.o utxface.o \ + utcopy.o utdelete.o utglobal.o utmath.o utobject.o \ + utstate.o utmutex.o utobject.o utresrc.o diff --git a/drivers/acpi/acpica/accommon.h b/drivers/acpi/acpica/accommon.h new file mode 100644 index 00000000000..3b20786cbb0 --- /dev/null +++ b/drivers/acpi/acpica/accommon.h @@ -0,0 +1,63 @@ +/****************************************************************************** + * + * Name: accommon.h - Common include files for generation of ACPICA source + * + *****************************************************************************/ + +/* + * Copyright (C) 2000 - 2008, Intel Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + */ + +#ifndef __ACCOMMON_H__ +#define __ACCOMMON_H__ + +/* + * Common set of includes for all ACPICA source files. + * We put them here because we don't want to duplicate them + * in the the source code again and again. + * + * Note: The order of these include files is important. + */ +#include "acconfig.h" /* Global configuration constants */ +#include "acmacros.h" /* C macros */ +#include "aclocal.h" /* Internal data types */ +#include "acobject.h" /* ACPI internal object */ +#include "acstruct.h" /* Common structures */ +#include "acglobal.h" /* All global variables */ +#include "achware.h" /* Hardware defines and interfaces */ +#include "acutils.h" /* Utility interfaces */ + +#endif /* __ACCOMMON_H__ */ diff --git a/include/acpi/acconfig.h b/drivers/acpi/acpica/acconfig.h index 29feee27f0e..e6777fb883d 100644 --- a/include/acpi/acconfig.h +++ b/drivers/acpi/acpica/acconfig.h @@ -61,10 +61,6 @@ * */ -/* Current ACPICA subsystem version in YYYYMMDD format */ - -#define ACPI_CA_VERSION 0x20080926 - /* * OS name, used for the _OS object. 
The _OS object is essentially obsolete, * but there is a large base of ASL/AML code in existing machines that check @@ -119,6 +115,10 @@ #define ACPI_ROOT_TABLE_SIZE_INCREMENT 4 +/* Maximum number of While() loop iterations before forced abort */ + +#define ACPI_MAX_LOOP_ITERATIONS 0xFFFF + /****************************************************************************** * * ACPI Specification constants (Do not change unless the specification changes) diff --git a/include/acpi/acdebug.h b/drivers/acpi/acpica/acdebug.h index 62c59df3b86..62c59df3b86 100644 --- a/include/acpi/acdebug.h +++ b/drivers/acpi/acpica/acdebug.h diff --git a/include/acpi/acdispat.h b/drivers/acpi/acpica/acdispat.h index 6291904be01..6291904be01 100644 --- a/include/acpi/acdispat.h +++ b/drivers/acpi/acpica/acdispat.h diff --git a/include/acpi/acevents.h b/drivers/acpi/acpica/acevents.h index d5d099bf349..07e20135f01 100644 --- a/include/acpi/acevents.h +++ b/drivers/acpi/acpica/acevents.h @@ -93,11 +93,13 @@ struct acpi_gpe_event_info *acpi_ev_get_gpe_event_info(acpi_handle gpe_device, */ u8 acpi_ev_valid_gpe_event(struct acpi_gpe_event_info *gpe_event_info); -acpi_status acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback); +acpi_status +acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback, void *context); acpi_status acpi_ev_delete_gpe_handlers(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block); + struct acpi_gpe_block_info *gpe_block, + void *context); acpi_status acpi_ev_create_gpe_block(struct acpi_namespace_node *gpe_device, diff --git a/include/acpi/acglobal.h b/drivers/acpi/acpica/acglobal.h index 15dda46b70d..ddb40f5c68f 100644 --- a/include/acpi/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -102,6 +102,12 @@ ACPI_EXTERN u8 ACPI_INIT_GLOBAL(acpi_gbl_create_osi_method, TRUE); */ ACPI_EXTERN u8 ACPI_INIT_GLOBAL(acpi_gbl_leave_wake_gpes_disabled, TRUE); +/* + * Optionally use default values for the ACPI register widths. Set this to + * TRUE to use the defaults, if an FADT contains incorrect widths/lengths. 
+ */ +ACPI_EXTERN u8 ACPI_INIT_GLOBAL(acpi_gbl_use_default_register_widths, TRUE); + /***************************************************************************** * * Debug support @@ -140,7 +146,7 @@ ACPI_EXTERN u32 acpi_gbl_trace_flags; */ ACPI_EXTERN struct acpi_internal_rsdt acpi_gbl_root_table_list; ACPI_EXTERN struct acpi_table_fadt acpi_gbl_FADT; -extern u8 acpi_gbl_permanent_mmap; +ACPI_EXTERN struct acpi_table_facs *acpi_gbl_FACS; /* These addresses are calculated from FADT address values */ @@ -326,6 +332,7 @@ ACPI_EXTERN struct acpi_fixed_event_handler ACPI_EXTERN struct acpi_gpe_xrupt_info *acpi_gbl_gpe_xrupt_list_head; ACPI_EXTERN struct acpi_gpe_block_info *acpi_gbl_gpe_fadt_blocks[ACPI_MAX_GPE_BLOCKS]; +ACPI_EXTERN u32 acpi_current_gpe_count; /***************************************************************************** * diff --git a/include/acpi/achware.h b/drivers/acpi/acpica/achware.h index 97a72b19327..58c69dc49ab 100644 --- a/include/acpi/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -44,11 +44,7 @@ #ifndef __ACHWARE_H__ #define __ACHWARE_H__ -/* PM Timer ticks per second (HZ) */ - -#define PM_TIMER_FREQUENCY 3579545 - -/* Values for the _SST reserved method */ +/* Values for the _SST predefined method */ #define ACPI_SST_INDICATOR_OFF 0 #define ACPI_SST_WORKING 1 @@ -56,8 +52,6 @@ #define ACPI_SST_SLEEPING 3 #define ACPI_SST_SLEEP_CONTEXT 4 -/* Prototypes */ - /* * hwacpi - high level functions */ @@ -75,13 +69,6 @@ acpi_hw_register_read(u32 register_id, u32 * return_value); acpi_status acpi_hw_register_write(u32 register_id, u32 value); -acpi_status -acpi_hw_low_level_read(u32 width, - u32 * value, struct acpi_generic_address *reg); - -acpi_status -acpi_hw_low_level_write(u32 width, u32 value, struct acpi_generic_address *reg); - acpi_status acpi_hw_clear_acpi_status(void); /* @@ -94,13 +81,13 @@ acpi_hw_write_gpe_enable_reg(struct acpi_gpe_event_info *gpe_event_info); acpi_status acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block); + struct acpi_gpe_block_info *gpe_block, void *context); acpi_status acpi_hw_clear_gpe(struct acpi_gpe_event_info *gpe_event_info); acpi_status acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block); + struct acpi_gpe_block_info *gpe_block, void *context); acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, @@ -114,7 +101,8 @@ acpi_status acpi_hw_enable_all_wakeup_gpes(void); acpi_status acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block); + struct acpi_gpe_block_info *gpe_block, + void *context); #ifdef ACPI_FUTURE_USAGE /* diff --git a/include/acpi/acinterp.h b/drivers/acpi/acpica/acinterp.h index e8db7a3143a..e8db7a3143a 100644 --- a/include/acpi/acinterp.h +++ b/drivers/acpi/acpica/acinterp.h diff --git a/include/acpi/aclocal.h b/drivers/acpi/acpica/aclocal.h index ecab527cf78..492d02761bb 100644 --- a/include/acpi/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -46,8 +46,6 @@ /* acpisrc:struct_defs -- for acpisrc conversion */ -#define ACPI_WAIT_FOREVER 0xFFFF /* u16, as per ACPI spec */ -#define ACPI_DO_NOT_WAIT 0 #define ACPI_SERIALIZED 0xFF typedef u32 acpi_mutex_handle; @@ -120,11 +118,6 @@ static char *acpi_gbl_mutex_names[ACPI_NUM_MUTEX] = { #define ACPI_MAX_LOCK 1 #define ACPI_NUM_LOCK ACPI_MAX_LOCK+1 -/* Owner IDs are used to track namespace nodes for selective deletion */ - -typedef u8 acpi_owner_id; -#define 
ACPI_OWNER_ID_MAX 0xFF - /* This Thread ID means that the mutex is not in use (unlocked) */ #define ACPI_MUTEX_NOT_ACQUIRED (acpi_thread_id) 0 @@ -165,11 +158,6 @@ typedef enum { ACPI_IMODE_EXECUTE = 0x03 } acpi_interpreter_mode; -union acpi_name_union { - u32 integer; - char ascii[4]; -}; - /* * The Namespace Node describes a named object that appears in the AML. * descriptor_type is used to differentiate between internal descriptors. @@ -216,26 +204,6 @@ struct acpi_namespace_node { #define ANOBJ_IS_BIT_OFFSET 0x40 /* i_aSL only: Reference is a bit offset */ #define ANOBJ_IS_REFERENCED 0x80 /* i_aSL only: Object was referenced */ -/* - * ACPI Table Descriptor. One per ACPI table - */ -struct acpi_table_desc { - acpi_physical_address address; - struct acpi_table_header *pointer; - u32 length; /* Length fixed at 32 bits */ - union acpi_name_union signature; - acpi_owner_id owner_id; - u8 flags; -}; - -/* Flags for above */ - -#define ACPI_TABLE_ORIGIN_UNKNOWN (0) -#define ACPI_TABLE_ORIGIN_MAPPED (1) -#define ACPI_TABLE_ORIGIN_ALLOCATED (2) -#define ACPI_TABLE_ORIGIN_MASK (3) -#define ACPI_TABLE_IS_LOADED (4) - /* One internal RSDT for table management */ struct acpi_internal_rsdt { @@ -266,15 +234,6 @@ struct acpi_ns_search_data { struct acpi_namespace_node *node; }; -/* - * Predefined Namespace items - */ -struct acpi_predefined_names { - char *name; - u8 type; - char *val; -}; - /* Object types used during package copies */ #define ACPI_COPY_TYPE_SIMPLE 0 @@ -487,10 +446,15 @@ struct acpi_gpe_walk_info { struct acpi_gpe_block_info *gpe_block; }; -typedef acpi_status(*acpi_gpe_callback) (struct acpi_gpe_xrupt_info * - gpe_xrupt_info, - struct acpi_gpe_block_info * - gpe_block); +struct acpi_gpe_device_info { + u32 index; + u32 next_block_base_index; + acpi_status status; + struct acpi_namespace_node *gpe_device; +}; + +typedef acpi_status(*acpi_gpe_callback) (struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context); /* Information about each particular fixed event */ @@ -566,6 +530,7 @@ struct acpi_control_state { union acpi_parse_object *predicate_op; u8 *aml_predicate_start; /* Start of if/while predicate */ u8 *package_end; /* End of if/while block */ + u32 loop_count; /* While() loop counter */ }; /* @@ -671,6 +636,12 @@ union acpi_parse_value { union acpi_parse_object *arg; /* arguments and contained ops */ }; +#ifdef ACPI_DISASSEMBLER +#define ACPI_DISASM_ONLY_MEMBERS(a) a; +#else +#define ACPI_DISASM_ONLY_MEMBERS(a) +#endif + #define ACPI_PARSE_COMMON \ union acpi_parse_object *parent; /* Parent op */\ u8 descriptor_type; /* To differentiate various internal objs */\ @@ -790,9 +761,6 @@ struct acpi_parse_state { * ****************************************************************************/ -#define PCI_ROOT_HID_STRING "PNP0A03" -#define PCI_EXPRESS_ROOT_HID_STRING "PNP0A08" - struct acpi_bit_register_info { u8 parent_register; u8 bit_position; @@ -1019,26 +987,4 @@ struct acpi_debug_mem_block { #define ACPI_MEM_LIST_MAX 1 #define ACPI_NUM_MEM_LISTS 2 -struct acpi_memory_list { - char *list_name; - void *list_head; - u16 object_size; - u16 max_depth; - u16 current_depth; - u16 link_offset; - -#ifdef ACPI_DBG_TRACK_ALLOCATIONS - - /* Statistics for debug memory tracking only */ - - u32 total_allocated; - u32 total_freed; - u32 max_occupied; - u32 total_size; - u32 current_total_size; - u32 requests; - u32 hits; -#endif -}; - #endif /* __ACLOCAL_H__ */ diff --git a/include/acpi/acmacros.h b/drivers/acpi/acpica/acmacros.h index 
1954c9d1d01..9c127e8e2d6 100644 --- a/include/acpi/acmacros.h +++ b/drivers/acpi/acpica/acmacros.h @@ -45,23 +45,6 @@ #define __ACMACROS_H__ /* - * Data manipulation macros - */ -#define ACPI_LOWORD(l) ((u16)(u32)(l)) -#define ACPI_HIWORD(l) ((u16)((((u32)(l)) >> 16) & 0xFFFF)) -#define ACPI_LOBYTE(l) ((u8)(u16)(l)) -#define ACPI_HIBYTE(l) ((u8)((((u16)(l)) >> 8) & 0xFF)) - -#define ACPI_SET_BIT(target,bit) ((target) |= (bit)) -#define ACPI_CLEAR_BIT(target,bit) ((target) &= ~(bit)) -#define ACPI_MIN(a,b) (((a)<(b))?(a):(b)) -#define ACPI_MAX(a,b) (((a)>(b))?(a):(b)) - -/* Size calculation */ - -#define ACPI_ARRAY_LENGTH(x) (sizeof(x) / sizeof((x)[0])) - -/* * Extract data using a pointer. Any more than a byte and we * get into potential aligment issues -- see the STORE macros below. * Use with care. @@ -76,39 +59,6 @@ #define ACPI_SET64(ptr) *ACPI_CAST_PTR (u64, ptr) /* - * Pointer manipulation - */ -#define ACPI_CAST_PTR(t, p) ((t *) (acpi_uintptr_t) (p)) -#define ACPI_CAST_INDIRECT_PTR(t, p) ((t **) (acpi_uintptr_t) (p)) -#define ACPI_ADD_PTR(t, a, b) ACPI_CAST_PTR (t, (ACPI_CAST_PTR (u8, (a)) + (acpi_size)(b))) -#define ACPI_PTR_DIFF(a, b) (acpi_size) (ACPI_CAST_PTR (u8, (a)) - ACPI_CAST_PTR (u8, (b))) - -/* Pointer/Integer type conversions */ - -#define ACPI_TO_POINTER(i) ACPI_ADD_PTR (void, (void *) NULL, (acpi_size) i) -#define ACPI_TO_INTEGER(p) ACPI_PTR_DIFF (p, (void *) NULL) -#define ACPI_OFFSET(d, f) (acpi_size) ACPI_PTR_DIFF (&(((d *)0)->f), (void *) NULL) -#define ACPI_PHYSADDR_TO_PTR(i) ACPI_TO_POINTER(i) -#define ACPI_PTR_TO_PHYSADDR(i) ACPI_TO_INTEGER(i) - -#ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED -#define ACPI_COMPARE_NAME(a, b) (*ACPI_CAST_PTR (u32, (a)) == *ACPI_CAST_PTR (u32, (b))) -#else -#define ACPI_COMPARE_NAME(a, b) (!ACPI_STRNCMP (ACPI_CAST_PTR (char, (a)), ACPI_CAST_PTR (char, (b)), ACPI_NAME_SIZE)) -#endif - -/* - * Full 64-bit integer must be available on both 32-bit and 64-bit platforms - */ -struct acpi_integer_overlay { - u32 lo_dword; - u32 hi_dword; -}; - -#define ACPI_LODWORD(integer) (ACPI_CAST_PTR (struct acpi_integer_overlay, &integer)->lo_dword) -#define ACPI_HIDWORD(integer) (ACPI_CAST_PTR (struct acpi_integer_overlay, &integer)->hi_dword) - -/* * printf() format helpers */ @@ -209,7 +159,7 @@ struct acpi_integer_overlay { /* * The hardware does not support unaligned transfers. We must move the * data one byte at a time. These macros work whether the source or - * the destination (or both) is/are unaligned. (Little-endian move) + * the destination (or both) is/are unaligned. (Little-endian move) */ /* 16-bit source, 16/32/64 destination */ @@ -357,12 +307,6 @@ struct acpi_integer_overlay { {(u32)(Pargs), (u32)(Iargs), (u32)(flags), obj_type, class, type} #endif -#ifdef ACPI_DISASSEMBLER -#define ACPI_DISASM_ONLY_MEMBERS(a) a; -#else -#define ACPI_DISASM_ONLY_MEMBERS(a) -#endif - #define ARG_TYPE_WIDTH 5 #define ARG_1(x) ((u32)(x)) #define ARG_2(x) ((u32)(x) << (1 * ARG_TYPE_WIDTH)) @@ -388,32 +332,16 @@ struct acpi_integer_overlay { #define GET_CURRENT_ARG_TYPE(list) (list & ((u32) 0x1F)) #define INCREMENT_ARG_LIST(list) (list >>= ((u32) ARG_TYPE_WIDTH)) -#if defined (ACPI_DEBUG_OUTPUT) || !defined (ACPI_NO_ERROR_MESSAGES) -/* - * Module name is include in both debug and non-debug versions primarily for - * error messages. 
The __FILE__ macro is not very useful for this, because it - * often includes the entire pathname to the module - */ -#define ACPI_MODULE_NAME(name) static const char ACPI_UNUSED_VAR _acpi_module_name[] = name; -#else -#define ACPI_MODULE_NAME(name) -#endif - /* * Ascii error messages can be configured out */ #ifndef ACPI_NO_ERROR_MESSAGES -#define AE_INFO _acpi_module_name, __LINE__ /* * Error reporting. Callers module and line number are inserted by AE_INFO, * the plist contains a set of parens to allow variable-length lists. * These macros are used for both the debug and non-debug versions of the code. */ -#define ACPI_INFO(plist) acpi_ut_info plist -#define ACPI_WARNING(plist) acpi_ut_warning plist -#define ACPI_EXCEPTION(plist) acpi_ut_exception plist -#define ACPI_ERROR(plist) acpi_ut_error plist #define ACPI_ERROR_NAMESPACE(s, e) acpi_ns_report_error (AE_INFO, s, e); #define ACPI_ERROR_METHOD(s, n, p, e) acpi_ns_report_method_error (AE_INFO, s, n, p, e); @@ -421,13 +349,9 @@ struct acpi_integer_overlay { /* No error messages */ -#define ACPI_INFO(plist) -#define ACPI_WARNING(plist) -#define ACPI_EXCEPTION(plist) -#define ACPI_ERROR(plist) #define ACPI_ERROR_NAMESPACE(s, e) #define ACPI_ERROR_METHOD(s, n, p, e) -#endif +#endif /* ACPI_NO_ERROR_MESSAGES */ /* * Debug macros that are conditionally compiled @@ -435,36 +359,8 @@ struct acpi_integer_overlay { #ifdef ACPI_DEBUG_OUTPUT /* - * Common parameters used for debug output functions: - * line number, function name, module(file) name, component ID - */ -#define ACPI_DEBUG_PARAMETERS __LINE__, ACPI_GET_FUNCTION_NAME, _acpi_module_name, _COMPONENT - -/* * Function entry tracing */ - -/* - * If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header, - * define it now. This is the case where there the compiler does not support - * a __func__ macro or equivalent. - */ -#ifndef ACPI_GET_FUNCTION_NAME -#define ACPI_GET_FUNCTION_NAME _acpi_function_name -/* - * The Name parameter should be the procedure name as a quoted string. - * The function name is also used by the function exit macros below. - * Note: (const char) is used to be compatible with the debug interfaces - * and macros such as __func__. 
- */ -#define ACPI_FUNCTION_NAME(name) static const char _acpi_function_name[] = #name; - -#else -/* Compiler supports __func__ (or equivalent) -- Ignore this macro */ - -#define ACPI_FUNCTION_NAME(name) -#endif - #ifdef CONFIG_ACPI_DEBUG_FUNC_TRACE #define ACPI_FUNCTION_TRACE(a) ACPI_FUNCTION_NAME(a) \ @@ -584,15 +480,6 @@ struct acpi_integer_overlay { #define ACPI_DUMP_RESOURCE_LIST(a) acpi_rs_dump_resource_list(a) #define ACPI_DUMP_BUFFER(a, b) acpi_ut_dump_buffer((u8 *) a, b, DB_BYTE_DISPLAY, _COMPONENT) -/* - * Master debug print macros - * Print iff: - * 1) Debug print for the current component is enabled - * 2) Debug error level or trace level for the print statement is enabled - */ -#define ACPI_DEBUG_PRINT(plist) acpi_ut_debug_print plist -#define ACPI_DEBUG_PRINT_RAW(plist) acpi_ut_debug_print_raw plist - #else /* * This is the non-debug case -- make everything go away, @@ -603,7 +490,6 @@ struct acpi_integer_overlay { #define ACPI_DEBUG_DEFINE(a) do { } while(0) #define ACPI_DEBUG_ONLY_MEMBERS(a) do { } while(0) -#define ACPI_FUNCTION_NAME(a) do { } while(0) #define ACPI_FUNCTION_TRACE(a) do { } while(0) #define ACPI_FUNCTION_TRACE_PTR(a, b) do { } while(0) #define ACPI_FUNCTION_TRACE_U32(a, b) do { } while(0) @@ -619,8 +505,6 @@ struct acpi_integer_overlay { #define ACPI_DUMP_PATHNAME(a, b, c, d) do { } while(0) #define ACPI_DUMP_RESOURCE_LIST(a) do { } while(0) #define ACPI_DUMP_BUFFER(a, b) do { } while(0) -#define ACPI_DEBUG_PRINT(pl) do { } while(0) -#define ACPI_DEBUG_PRINT_RAW(pl) do { } while(0) #define return_VOID return #define return_ACPI_STATUS(s) return(s) @@ -629,7 +513,7 @@ struct acpi_integer_overlay { #define return_UINT32(s) return(s) #define return_PTR(s) return(s) -#endif +#endif /* ACPI_DEBUG_OUTPUT */ /* * Some code only gets executed when the debugger is built in. 
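The acmacros.h hunk above strips the module-name and debug-print macro definitions out of this header, yet the macros are still used by every ACPICA source file touched later in this series, so the definitions presumably move to another header not included in this excerpt. For orientation only, the consumer-side pattern as it appears in the ACPICA files below looks roughly like this; the module name, function and message are made up for illustration and are not part of the patch:

#include <acpi/acpi.h>
#include "accommon.h"

#define _COMPONENT          ACPI_DISPATCHER
ACPI_MODULE_NAME("dsexample")	/* hypothetical module, illustration only */

static acpi_status acpi_ds_example_op(struct acpi_walk_state *walk_state)
{
	ACPI_FUNCTION_TRACE(ds_example_op);

	/* _COMPONENT and the module name feed the conditional debug output */
	ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "Walk state %p\n", walk_state));

	return_ACPI_STATUS(AE_OK);
}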
diff --git a/include/acpi/acnamesp.h b/drivers/acpi/acpica/acnamesp.h index db4e6f67785..46cb5b46d28 100644 --- a/include/acpi/acnamesp.h +++ b/drivers/acpi/acpica/acnamesp.h @@ -182,7 +182,9 @@ acpi_status acpi_ns_evaluate(struct acpi_evaluate_info *info); */ acpi_status acpi_ns_check_predefined_names(struct acpi_namespace_node *node, - union acpi_operand_object *return_object); + u32 user_param_count, + acpi_status return_status, + union acpi_operand_object **return_object); const union acpi_predefined_info *acpi_ns_check_for_predefined_name(struct acpi_namespace_node @@ -191,6 +193,7 @@ const union acpi_predefined_info *acpi_ns_check_for_predefined_name(struct void acpi_ns_check_parameter_count(char *pathname, struct acpi_namespace_node *node, + u32 user_param_count, const union acpi_predefined_info *info); /* diff --git a/include/acpi/acobject.h b/drivers/acpi/acpica/acobject.h index eb6f038b03d..eb6f038b03d 100644 --- a/include/acpi/acobject.h +++ b/drivers/acpi/acpica/acobject.h diff --git a/include/acpi/acopcode.h b/drivers/acpi/acpica/acopcode.h index dfdf6332788..dfdf6332788 100644 --- a/include/acpi/acopcode.h +++ b/drivers/acpi/acpica/acopcode.h diff --git a/include/acpi/acparser.h b/drivers/acpi/acpica/acparser.h index 23ee0fbf561..23ee0fbf561 100644 --- a/include/acpi/acparser.h +++ b/drivers/acpi/acpica/acparser.h diff --git a/include/acpi/acpredef.h b/drivers/acpi/acpica/acpredef.h index 16a9ca9a66e..16a9ca9a66e 100644 --- a/include/acpi/acpredef.h +++ b/drivers/acpi/acpica/acpredef.h diff --git a/include/acpi/acresrc.h b/drivers/acpi/acpica/acresrc.h index eef5bd7a59f..eef5bd7a59f 100644 --- a/include/acpi/acresrc.h +++ b/drivers/acpi/acpica/acresrc.h diff --git a/include/acpi/acstruct.h b/drivers/acpi/acpica/acstruct.h index 7980a26bad3..7980a26bad3 100644 --- a/include/acpi/acstruct.h +++ b/drivers/acpi/acpica/acstruct.h diff --git a/include/acpi/actables.h b/drivers/acpi/acpica/actables.h index 0cbe1b9ab52..7ce6e33c7f7 100644 --- a/include/acpi/actables.h +++ b/drivers/acpi/acpica/actables.h @@ -94,6 +94,8 @@ void acpi_tb_set_table_loaded_flag(u32 table_index, u8 is_loaded); /* * tbutils - table manager utilities */ +acpi_status acpi_tb_initialize_facs(void); + u8 acpi_tb_tables_loaded(void); void diff --git a/include/acpi/acutils.h b/drivers/acpi/acpica/acutils.h index d8307b2987e..80d8813484f 100644 --- a/include/acpi/acutils.h +++ b/drivers/acpi/acpica/acutils.h @@ -297,42 +297,6 @@ void acpi_ut_report_info(char *module_name, u32 line_number); void acpi_ut_report_warning(char *module_name, u32 line_number); -/* Error and message reporting interfaces */ - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_debug_print(u32 requested_debug_level, - u32 line_number, - const char *function_name, - const char *module_name, - u32 component_id, - const char *format, ...) ACPI_PRINTF_LIKE(6); - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_debug_print_raw(u32 requested_debug_level, - u32 line_number, - const char *function_name, - const char *module_name, - u32 component_id, - const char *format, ...) ACPI_PRINTF_LIKE(6); - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_error(const char *module_name, - u32 line_number, const char *format, ...) ACPI_PRINTF_LIKE(3); - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_exception(const char *module_name, - u32 line_number, - acpi_status status, - const char *format, ...) ACPI_PRINTF_LIKE(4); - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_warning(const char *module_name, - u32 line_number, const char *format, ...) 
ACPI_PRINTF_LIKE(3); - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_info(const char *module_name, - u32 line_number, const char *format, ...) ACPI_PRINTF_LIKE(3); - /* * utdelete - Object deletion and reference counts */ diff --git a/include/acpi/amlcode.h b/drivers/acpi/acpica/amlcode.h index ff851c5df69..ff851c5df69 100644 --- a/include/acpi/amlcode.h +++ b/drivers/acpi/acpica/amlcode.h diff --git a/include/acpi/amlresrc.h b/drivers/acpi/acpica/amlresrc.h index 7b070e42b7c..7b070e42b7c 100644 --- a/include/acpi/amlresrc.h +++ b/drivers/acpi/acpica/amlresrc.h diff --git a/drivers/acpi/dispatcher/dsfield.c b/drivers/acpi/acpica/dsfield.c index f988a5e7d2b..53e27bc5a73 100644 --- a/drivers/acpi/dispatcher/dsfield.c +++ b/drivers/acpi/acpica/dsfield.c @@ -42,11 +42,12 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acparser.h> +#include "accommon.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acparser.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsfield") diff --git a/drivers/acpi/dispatcher/dsinit.c b/drivers/acpi/acpica/dsinit.c index 949f7c75029..eb144b13d8f 100644 --- a/drivers/acpi/dispatcher/dsinit.c +++ b/drivers/acpi/acpica/dsinit.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> -#include <acpi/acnamesp.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acdispat.h" +#include "acnamesp.h" +#include "actables.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsinit") diff --git a/drivers/acpi/dispatcher/dsmethod.c b/drivers/acpi/acpica/dsmethod.c index 279a5a60a0d..14b8b8ed802 100644 --- a/drivers/acpi/dispatcher/dsmethod.c +++ b/drivers/acpi/acpica/dsmethod.c @@ -42,11 +42,14 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#ifdef ACPI_DISASSEMBLER #include <acpi/acdisasm.h> +#endif #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsmethod") @@ -412,6 +415,9 @@ acpi_ds_call_control_method(struct acpi_thread_state *thread, if (obj_desc->method.method_flags & AML_METHOD_INTERNAL_ONLY) { status = obj_desc->method.implementation(next_walk_state); + if (status == AE_OK) { + status = AE_CTRL_TERMINATE; + } } return_ACPI_STATUS(status); diff --git a/drivers/acpi/dispatcher/dsmthdat.c b/drivers/acpi/acpica/dsmthdat.c index d03f81bd1bc..da0f5468184 100644 --- a/drivers/acpi/dispatcher/dsmthdat.c +++ b/drivers/acpi/acpica/dsmthdat.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acdispat.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsmthdat") diff --git a/drivers/acpi/dispatcher/dsobject.c b/drivers/acpi/acpica/dsobject.c index 4f08e599d07..15c628e6aa0 100644 --- a/drivers/acpi/dispatcher/dsobject.c +++ b/drivers/acpi/acpica/dsobject.c @@ -42,11 +42,12 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_DISPATCHER 
ACPI_MODULE_NAME("dsobject") diff --git a/drivers/acpi/dispatcher/dsopcode.c b/drivers/acpi/acpica/dsopcode.c index 69fae5905bb..0c3b4dd60e8 100644 --- a/drivers/acpi/dispatcher/dsopcode.c +++ b/drivers/acpi/acpica/dsopcode.c @@ -43,13 +43,14 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acevents.h" +#include "actables.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsopcode") @@ -1140,10 +1141,29 @@ acpi_ds_exec_begin_control_op(struct acpi_walk_state *walk_state, op->common.aml_opcode, walk_state)); switch (op->common.aml_opcode) { - case AML_IF_OP: case AML_WHILE_OP: /* + * If this is an additional iteration of a while loop, continue. + * There is no need to allocate a new control state. + */ + if (walk_state->control_state) { + if (walk_state->control_state->control.aml_predicate_start + == (walk_state->parser_state.aml - 1)) { + + /* Reset the state to start-of-loop */ + + walk_state->control_state->common.state = + ACPI_CONTROL_CONDITIONAL_EXECUTING; + break; + } + } + + /*lint -fallthrough */ + + case AML_IF_OP: + + /* * IF/WHILE: Create a new control state to manage these * constructs. We need to manage these as a stack, in order * to handle nesting. @@ -1243,13 +1263,36 @@ acpi_ds_exec_end_control_op(struct acpi_walk_state * walk_state, ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "[WHILE_OP] Op=%p\n", op)); - if (walk_state->control_state->common.value) { + control_state = walk_state->control_state; + if (control_state->common.value) { - /* Predicate was true, go back and evaluate it again! */ + /* Predicate was true, the body of the loop was just executed */ + /* + * This loop counter mechanism allows the interpreter to escape + * possibly infinite loops. This can occur in poorly written AML + * when the hardware does not respond within a while loop and the + * loop does not implement a timeout. + */ + control_state->control.loop_count++; + if (control_state->control.loop_count > + ACPI_MAX_LOOP_ITERATIONS) { + status = AE_AML_INFINITE_LOOP; + break; + } + + /* + * Go back and evaluate the predicate and maybe execute the loop + * another time + */ status = AE_CTRL_PENDING; + walk_state->aml_last_while = + control_state->control.aml_predicate_start; + break; } + /* Predicate was false, terminate this while loop */ + ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "[WHILE_OP] termination! 
Op=%p\n", op)); @@ -1257,9 +1300,6 @@ acpi_ds_exec_end_control_op(struct acpi_walk_state * walk_state, control_state = acpi_ut_pop_generic_state(&walk_state->control_state); - - walk_state->aml_last_while = - control_state->control.aml_predicate_start; acpi_ut_delete_generic_state(control_state); break; diff --git a/drivers/acpi/dispatcher/dsutils.c b/drivers/acpi/acpica/dsutils.c index b398982f0d8..dabc23a4617 100644 --- a/drivers/acpi/dispatcher/dsutils.c +++ b/drivers/acpi/acpica/dsutils.c @@ -42,12 +42,13 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acdebug.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acdebug.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsutils") diff --git a/drivers/acpi/dispatcher/dswexec.c b/drivers/acpi/acpica/dswexec.c index 396fe12078c..350e6656bc8 100644 --- a/drivers/acpi/dispatcher/dswexec.c +++ b/drivers/acpi/acpica/dswexec.c @@ -43,12 +43,13 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acdebug.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acdebug.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dswexec") diff --git a/drivers/acpi/dispatcher/dswload.c b/drivers/acpi/acpica/dswload.c index dff7a3e445a..3023ceaa8d5 100644 --- a/drivers/acpi/dispatcher/dswload.c +++ b/drivers/acpi/acpica/dswload.c @@ -42,12 +42,13 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acevents.h" #ifdef ACPI_ASL_COMPILER #include <acpi/acdisasm.h> diff --git a/drivers/acpi/dispatcher/dswscope.c b/drivers/acpi/acpica/dswscope.c index 9e607326587..908645e72f0 100644 --- a/drivers/acpi/dispatcher/dswscope.c +++ b/drivers/acpi/acpica/dswscope.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> +#include "accommon.h" +#include "acdispat.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dswscope") diff --git a/drivers/acpi/dispatcher/dswstate.c b/drivers/acpi/acpica/dswstate.c index b00d4af791a..40f92bf7dce 100644 --- a/drivers/acpi/dispatcher/dswstate.c +++ b/drivers/acpi/acpica/dswstate.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acparser.h" +#include "acdispat.h" +#include "acnamesp.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dswstate") diff --git a/drivers/acpi/events/evevent.c b/drivers/acpi/acpica/evevent.c index c56c5c6ea77..803edd9e3f6 100644 --- a/drivers/acpi/events/evevent.c +++ b/drivers/acpi/acpica/evevent.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acevents.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evevent") @@ -72,8 +73,8 @@ acpi_status acpi_ev_initialize_events(void) /* * Initialize the Fixed and General Purpose Events. 
This is done prior to - * enabling SCIs to prevent interrupts from occurring before the handlers are - * installed. + * enabling SCIs to prevent interrupts from occurring before the handlers + * are installed. */ status = acpi_ev_fixed_event_initialize(); if (ACPI_FAILURE(status)) { @@ -192,8 +193,8 @@ static acpi_status acpi_ev_fixed_event_initialize(void) acpi_status status; /* - * Initialize the structure that keeps track of fixed event handlers - * and enable the fixed events. + * Initialize the structure that keeps track of fixed event handlers and + * enable the fixed events. */ for (i = 0; i < ACPI_NUM_FIXED_EVENTS; i++) { acpi_gbl_fixed_event_handlers[i].handler = NULL; @@ -237,7 +238,7 @@ u32 acpi_ev_fixed_event_detect(void) /* * Read the fixed feature status and enable registers, as all the cases - * depend on their values. Ignore errors here. + * depend on their values. Ignore errors here. */ (void)acpi_hw_register_read(ACPI_REGISTER_PM1_STATUS, &fixed_status); (void)acpi_hw_register_read(ACPI_REGISTER_PM1_ENABLE, &fixed_enable); @@ -291,8 +292,8 @@ static u32 acpi_ev_fixed_event_dispatch(u32 event) status_register_id, 1); /* - * Make sure we've got a handler. If not, report an error. - * The event is disabled to prevent further interrupts. + * Make sure we've got a handler. If not, report an error. The event is + * disabled to prevent further interrupts. */ if (NULL == acpi_gbl_fixed_event_handlers[event].handler) { (void)acpi_set_register(acpi_gbl_fixed_event_info[event]. diff --git a/drivers/acpi/events/evgpe.c b/drivers/acpi/acpica/evgpe.c index f45c74fe745..f345ced3647 100644 --- a/drivers/acpi/events/evgpe.c +++ b/drivers/acpi/acpica/evgpe.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evgpe") @@ -125,7 +126,7 @@ acpi_ev_update_gpe_enable_masks(struct acpi_gpe_event_info *gpe_event_info, (1 << (gpe_event_info->gpe_number - gpe_register_info->base_gpe_number)); - /* 1) Disable case. Simply clear all enable bits */ + /* 1) Disable case. Simply clear all enable bits */ if (type == ACPI_GPE_DISABLE) { ACPI_CLEAR_BIT(gpe_register_info->enable_for_wake, @@ -134,7 +135,7 @@ acpi_ev_update_gpe_enable_masks(struct acpi_gpe_event_info *gpe_event_info, return_ACPI_STATUS(AE_OK); } - /* 2) Enable case. Set/Clear the appropriate enable bits */ + /* 2) Enable case. Set/Clear the appropriate enable bits */ switch (gpe_event_info->flags & ACPI_GPE_TYPE_MASK) { case ACPI_GPE_TYPE_WAKE: @@ -295,7 +296,7 @@ acpi_status acpi_ev_disable_gpe(struct acpi_gpe_event_info *gpe_event_info) * * FUNCTION: acpi_ev_get_gpe_event_info * - * PARAMETERS: gpe_device - Device node. NULL for GPE0/GPE1 + * PARAMETERS: gpe_device - Device node. NULL for GPE0/GPE1 * gpe_number - Raw GPE number * * RETURN: A GPE event_info struct. NULL if not a valid GPE @@ -372,7 +373,7 @@ struct acpi_gpe_event_info *acpi_ev_get_gpe_event_info(acpi_handle gpe_device, * * RETURN: INTERRUPT_HANDLED or INTERRUPT_NOT_HANDLED * - * DESCRIPTION: Detect if any GP events have occurred. This function is + * DESCRIPTION: Detect if any GP events have occurred. This function is * executed at interrupt level. 
* ******************************************************************************/ @@ -400,8 +401,8 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) /* * We need to obtain the GPE lock for both the data structs and registers - * Note: Not necessary to obtain the hardware lock, since the GPE registers - * are owned by the gpe_lock. + * Note: Not necessary to obtain the hardware lock, since the GPE + * registers are owned by the gpe_lock. */ flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); @@ -410,9 +411,8 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) gpe_block = gpe_xrupt_list->gpe_block_list_head; while (gpe_block) { /* - * Read all of the 8-bit GPE status and enable registers - * in this GPE block, saving all of them. - * Find all currently active GP events. + * Read all of the 8-bit GPE status and enable registers in this GPE + * block, saving all of them. Find all currently active GP events. */ for (i = 0; i < gpe_block->register_count; i++) { @@ -423,10 +423,8 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) /* Read the Status Register */ status = - acpi_hw_low_level_read(ACPI_GPE_REGISTER_WIDTH, - &status_reg, - &gpe_register_info-> - status_address); + acpi_read(&status_reg, + &gpe_register_info->status_address); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } @@ -434,10 +432,8 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) /* Read the Enable Register */ status = - acpi_hw_low_level_read(ACPI_GPE_REGISTER_WIDTH, - &enable_reg, - &gpe_register_info-> - enable_address); + acpi_read(&enable_reg, + &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } @@ -527,8 +523,8 @@ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_execute_gpe_method(void *context) (void)acpi_ev_enable_gpe(gpe_event_info, FALSE); /* - * Take a snapshot of the GPE info for this level - we copy the - * info to prevent a race condition with remove_handler/remove_block. + * Take a snapshot of the GPE info for this level - we copy the info to + * prevent a race condition with remove_handler/remove_block. */ ACPI_MEMCPY(&local_gpe_event_info, gpe_event_info, sizeof(struct acpi_gpe_event_info)); @@ -539,8 +535,8 @@ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_execute_gpe_method(void *context) } /* - * Must check for control method type dispatch one more - * time to avoid race with ev_gpe_install_handler + * Must check for control method type dispatch one more time to avoid a + * race with ev_gpe_install_handler */ if ((local_gpe_event_info.flags & ACPI_GPE_DISPATCH_MASK) == ACPI_GPE_DISPATCH_METHOD) { @@ -584,8 +580,8 @@ static void acpi_ev_asynch_enable_gpe(void *context) if ((gpe_event_info->flags & ACPI_GPE_XRUPT_TYPE_MASK) == ACPI_GPE_LEVEL_TRIGGERED) { /* - * GPE is level-triggered, we clear the GPE status bit after - * handling the event. + * GPE is level-triggered, we clear the GPE status bit after handling + * the event. */ status = acpi_hw_clear_gpe(gpe_event_info); if (ACPI_FAILURE(status)) { @@ -624,7 +620,7 @@ acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) acpi_os_gpe_count(gpe_number); /* - * If edge-triggered, clear the GPE status bit now. Note that + * If edge-triggered, clear the GPE status bit now. Note that * level-triggered events are cleared after the GPE is serviced. 
*/ if ((gpe_event_info->flags & ACPI_GPE_XRUPT_TYPE_MASK) == @@ -650,7 +646,8 @@ acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) /* * Invoke the installed handler (at interrupt level) - * Ignore return status for now. TBD: leave GPE disabled on error? + * Ignore return status for now. + * TBD: leave GPE disabled on error? */ (void)gpe_event_info->dispatch.handler->address(gpe_event_info-> dispatch. @@ -708,7 +705,7 @@ acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) gpe_number)); /* - * Disable the GPE. The GPE will remain disabled until the ACPI + * Disable the GPE. The GPE will remain disabled until the ACPICA * Core Subsystem is restarted, or a handler is installed. */ status = acpi_ev_disable_gpe(gpe_event_info); diff --git a/drivers/acpi/events/evgpeblk.c b/drivers/acpi/acpica/evgpeblk.c index 73c058e2f5c..484cc0565d5 100644 --- a/drivers/acpi/events/evgpeblk.c +++ b/drivers/acpi/acpica/evgpeblk.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evgpeblk") @@ -124,6 +125,7 @@ u8 acpi_ev_valid_gpe_event(struct acpi_gpe_event_info *gpe_event_info) * FUNCTION: acpi_ev_walk_gpe_list * * PARAMETERS: gpe_walk_callback - Routine called for each GPE block + * Context - Value passed to callback * * RETURN: Status * @@ -131,7 +133,8 @@ u8 acpi_ev_valid_gpe_event(struct acpi_gpe_event_info *gpe_event_info) * ******************************************************************************/ -acpi_status acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback) +acpi_status +acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback, void *context) { struct acpi_gpe_block_info *gpe_block; struct acpi_gpe_xrupt_info *gpe_xrupt_info; @@ -154,8 +157,13 @@ acpi_status acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback) /* One callback per GPE block */ - status = gpe_walk_callback(gpe_xrupt_info, gpe_block); + status = + gpe_walk_callback(gpe_xrupt_info, gpe_block, + context); if (ACPI_FAILURE(status)) { + if (status == AE_CTRL_END) { /* Callback abort */ + status = AE_OK; + } goto unlock_and_exit; } @@ -186,7 +194,8 @@ acpi_status acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback) acpi_status acpi_ev_delete_gpe_handlers(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block) + struct acpi_gpe_block_info *gpe_block, + void *context) { struct acpi_gpe_event_info *gpe_event_info; u32 i; @@ -309,17 +318,17 @@ acpi_ev_save_method_info(acpi_handle obj_handle, (gpe_block->block_base_number + (gpe_block->register_count * 8)))) { /* - * Not valid for this GPE block, just ignore it - * However, it may be valid for a different GPE block, since GPE0 and GPE1 - * methods both appear under \_GPE. + * Not valid for this GPE block, just ignore it. However, it may be + * valid for a different GPE block, since GPE0 and GPE1 methods both + * appear under \_GPE. */ return_ACPI_STATUS(AE_OK); } /* - * Now we can add this information to the gpe_event_info block - * for use during dispatch of this GPE. Default type is RUNTIME, although - * this may change when the _PRW methods are executed later. + * Now we can add this information to the gpe_event_info block for use + * during dispatch of this GPE. Default type is RUNTIME, although this may + * change when the _PRW methods are executed later. 
*/ gpe_event_info = &gpe_block->event_info[gpe_number - gpe_block->block_base_number]; @@ -394,8 +403,8 @@ acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, gpe_block = gpe_info->gpe_block; /* - * The _PRW object must return a package, we are only interested - * in the first element + * The _PRW object must return a package, we are only interested in the + * first element */ obj_desc = pkg_desc->package.elements[0]; @@ -434,7 +443,7 @@ acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, /* * Is this GPE within this block? * - * TRUE iff these conditions are true: + * TRUE if and only if these conditions are true: * 1) The GPE devices match. * 2) The GPE index(number) is within the range of the Gpe Block * associated with the GPE device. @@ -457,6 +466,7 @@ acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, if (ACPI_FAILURE(status)) { goto cleanup; } + status = acpi_ev_update_gpe_enable_masks(gpe_event_info, ACPI_GPE_DISABLE); @@ -476,9 +486,9 @@ acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, * RETURN: A GPE interrupt block * * DESCRIPTION: Get or Create a GPE interrupt block. There is one interrupt - * block per unique interrupt level used for GPEs. - * Should be called only when the GPE lists are semaphore locked - * and not subject to change. + * block per unique interrupt level used for GPEs. Should be + * called only when the GPE lists are semaphore locked and not + * subject to change. * ******************************************************************************/ @@ -608,8 +618,9 @@ acpi_ev_delete_gpe_xrupt(struct acpi_gpe_xrupt_info *gpe_xrupt) * * FUNCTION: acpi_ev_install_gpe_block * - * PARAMETERS: gpe_block - New GPE block - * interrupt_number - Xrupt to be associated with this GPE block + * PARAMETERS: gpe_block - New GPE block + * interrupt_number - Xrupt to be associated with this + * GPE block * * RETURN: Status * @@ -666,7 +677,7 @@ acpi_ev_install_gpe_block(struct acpi_gpe_block_info *gpe_block, * * FUNCTION: acpi_ev_delete_gpe_block * - * PARAMETERS: gpe_block - Existing GPE block + * PARAMETERS: gpe_block - Existing GPE block * * RETURN: Status * @@ -688,7 +699,8 @@ acpi_status acpi_ev_delete_gpe_block(struct acpi_gpe_block_info *gpe_block) /* Disable all GPEs in this block */ - status = acpi_hw_disable_gpe_block(gpe_block->xrupt_block, gpe_block); + status = + acpi_hw_disable_gpe_block(gpe_block->xrupt_block, gpe_block, NULL); if (!gpe_block->previous && !gpe_block->next) { @@ -715,6 +727,9 @@ acpi_status acpi_ev_delete_gpe_block(struct acpi_gpe_block_info *gpe_block) acpi_os_release_lock(acpi_gbl_gpe_lock, flags); } + acpi_current_gpe_count -= + gpe_block->register_count * ACPI_GPE_REGISTER_WIDTH; + /* Free the gpe_block */ ACPI_FREE(gpe_block->register_info); @@ -786,9 +801,9 @@ acpi_ev_create_gpe_info_blocks(struct acpi_gpe_block_info *gpe_block) /* * Initialize the GPE Register and Event structures. A goal of these - * tables is to hide the fact that there are two separate GPE register sets - * in a given GPE hardware block, the status registers occupy the first half, - * and the enable registers occupy the second half. + * tables is to hide the fact that there are two separate GPE register + * sets in a given GPE hardware block, the status registers occupy the + * first half, and the enable registers occupy the second half. 
*/ this_register = gpe_register_info; this_event = gpe_event_info; @@ -816,10 +831,8 @@ acpi_ev_create_gpe_info_blocks(struct acpi_gpe_block_info *gpe_block) ACPI_GPE_REGISTER_WIDTH; this_register->enable_address.bit_width = ACPI_GPE_REGISTER_WIDTH; - this_register->status_address.bit_offset = - ACPI_GPE_REGISTER_WIDTH; - this_register->enable_address.bit_offset = - ACPI_GPE_REGISTER_WIDTH; + this_register->status_address.bit_offset = 0; + this_register->enable_address.bit_offset = 0; /* Init the event_info for each GPE within this register */ @@ -832,18 +845,14 @@ acpi_ev_create_gpe_info_blocks(struct acpi_gpe_block_info *gpe_block) /* Disable all GPEs within this register */ - status = acpi_hw_low_level_write(ACPI_GPE_REGISTER_WIDTH, 0x00, - &this_register-> - enable_address); + status = acpi_write(0x00, &this_register->enable_address); if (ACPI_FAILURE(status)) { goto error_exit; } /* Clear any pending GPE events within this register */ - status = acpi_hw_low_level_write(ACPI_GPE_REGISTER_WIDTH, 0xFF, - &this_register-> - status_address); + status = acpi_write(0xFF, &this_register->status_address); if (ACPI_FAILURE(status)) { goto error_exit; } @@ -956,6 +965,9 @@ acpi_ev_create_gpe_block(struct acpi_namespace_node *gpe_device, gpe_device->name.ascii, gpe_block->register_count, interrupt_number)); + /* Update global count of currently available GPEs */ + + acpi_current_gpe_count += register_count * ACPI_GPE_REGISTER_WIDTH; return_ACPI_STATUS(AE_OK); } @@ -1055,7 +1067,7 @@ acpi_ev_initialize_gpe_block(struct acpi_namespace_node *gpe_device, /* Enable all valid runtime GPEs found above */ - status = acpi_hw_enable_runtime_gpe_block(NULL, gpe_block); + status = acpi_hw_enable_runtime_gpe_block(NULL, gpe_block, NULL); if (ACPI_FAILURE(status)) { ACPI_ERROR((AE_INFO, "Could not enable GPEs in GpeBlock %p", gpe_block)); diff --git a/drivers/acpi/events/evmisc.c b/drivers/acpi/acpica/evmisc.c index 1d5670be729..5f893057bcc 100644 --- a/drivers/acpi/events/evmisc.c +++ b/drivers/acpi/acpica/evmisc.c @@ -42,18 +42,15 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evmisc") -/* Pointer to FACS needed for the Global Lock */ -static struct acpi_table_facs *facs = NULL; - /* Local prototypes */ - static void ACPI_SYSTEM_XFACE acpi_ev_notify_dispatch(void *context); static u32 acpi_ev_global_lock_handler(void *context); @@ -152,7 +149,9 @@ acpi_ev_queue_notify_request(struct acpi_namespace_node * node, break; default: + /* All other types are not supported */ + return (AE_TYPE); } } @@ -193,9 +192,8 @@ acpi_ev_queue_notify_request(struct acpi_namespace_node * node, acpi_ut_delete_generic_state(notify_info); } } else { - /* - * There is no notify handler (per-device or system) for this device. - */ + /* There is no notify handler (per-device or system) for this device */ + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No notify handler for Notify (%4.4s, %X) node %p\n", acpi_ut_get_node_name(node), notify_value, @@ -229,9 +227,8 @@ static void ACPI_SYSTEM_XFACE acpi_ev_notify_dispatch(void *context) ACPI_FUNCTION_ENTRY(); /* - * We will invoke a global notify handler if installed. - * This is done _before_ we invoke the per-device handler attached - * to the device. + * We will invoke a global notify handler if installed. This is done + * _before_ we invoke the per-device handler attached to the device. 
*/ if (notify_info->notify.value <= ACPI_MAX_SYS_NOTIFY) { @@ -299,7 +296,7 @@ static u32 acpi_ev_global_lock_handler(void *context) * If we don't get it now, it will be marked pending and we will * take another interrupt when it becomes free. */ - ACPI_ACQUIRE_GLOBAL_LOCK(facs, acquired); + ACPI_ACQUIRE_GLOBAL_LOCK(acpi_gbl_FACS, acquired); if (acquired) { /* Got the lock, now wake all threads waiting for it */ @@ -336,34 +333,27 @@ acpi_status acpi_ev_init_global_lock_handler(void) ACPI_FUNCTION_TRACE(ev_init_global_lock_handler); - status = acpi_get_table_by_index(ACPI_TABLE_INDEX_FACS, - ACPI_CAST_INDIRECT_PTR(struct - acpi_table_header, - &facs)); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } + /* Attempt installation of the global lock handler */ - acpi_gbl_global_lock_present = TRUE; status = acpi_install_fixed_event_handler(ACPI_EVENT_GLOBAL, acpi_ev_global_lock_handler, NULL); /* - * If the global lock does not exist on this platform, the attempt - * to enable GBL_STATUS will fail (the GBL_ENABLE bit will not stick) - * Map to AE_OK, but mark global lock as not present. - * Any attempt to actually use the global lock will be flagged - * with an error. + * If the global lock does not exist on this platform, the attempt to + * enable GBL_STATUS will fail (the GBL_ENABLE bit will not stick). + * Map to AE_OK, but mark global lock as not present. Any attempt to + * actually use the global lock will be flagged with an error. */ if (status == AE_NO_HARDWARE_RESPONSE) { ACPI_ERROR((AE_INFO, "No response from Global Lock hardware, disabling lock")); acpi_gbl_global_lock_present = FALSE; - status = AE_OK; + return_ACPI_STATUS(AE_OK); } + acpi_gbl_global_lock_present = TRUE; return_ACPI_STATUS(status); } @@ -462,8 +452,8 @@ acpi_status acpi_ev_acquire_global_lock(u16 timeout) } /* - * Make sure that a global lock actually exists. If not, just treat - * the lock as a standard mutex. + * Make sure that a global lock actually exists. If not, just treat the + * lock as a standard mutex. */ if (!acpi_gbl_global_lock_present) { acpi_gbl_global_lock_acquired = TRUE; @@ -472,7 +462,7 @@ acpi_status acpi_ev_acquire_global_lock(u16 timeout) /* Attempt to acquire the actual hardware lock */ - ACPI_ACQUIRE_GLOBAL_LOCK(facs, acquired); + ACPI_ACQUIRE_GLOBAL_LOCK(acpi_gbl_FACS, acquired); if (acquired) { /* We got the lock */ @@ -536,7 +526,7 @@ acpi_status acpi_ev_release_global_lock(void) /* Allow any thread to release the lock */ - ACPI_RELEASE_GLOBAL_LOCK(facs, pending); + ACPI_RELEASE_GLOBAL_LOCK(acpi_gbl_FACS, pending); /* * If the pending bit was set, we must write GBL_RLS to the control @@ -582,8 +572,8 @@ void acpi_ev_terminate(void) if (acpi_gbl_events_initialized) { /* - * Disable all event-related functionality. - * In all cases, on error, print a message but obviously we don't abort. + * Disable all event-related functionality. In all cases, on error, + * print a message but obviously we don't abort. 
*/ /* Disable all fixed events */ @@ -599,7 +589,7 @@ void acpi_ev_terminate(void) /* Disable all GPEs in all GPE blocks */ - status = acpi_ev_walk_gpe_list(acpi_hw_disable_gpe_block); + status = acpi_ev_walk_gpe_list(acpi_hw_disable_gpe_block, NULL); /* Remove SCI handler */ @@ -617,7 +607,7 @@ void acpi_ev_terminate(void) /* Deallocate all handler objects installed within GPE info structs */ - status = acpi_ev_walk_gpe_list(acpi_ev_delete_gpe_handlers); + status = acpi_ev_walk_gpe_list(acpi_ev_delete_gpe_handlers, NULL); /* Return to original mode if necessary */ diff --git a/drivers/acpi/events/evregion.c b/drivers/acpi/acpica/evregion.c index 236fbd1ca43..665c0887ab4 100644 --- a/drivers/acpi/events/evregion.c +++ b/drivers/acpi/acpica/evregion.c @@ -42,22 +42,15 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evregion") -#define ACPI_NUM_DEFAULT_SPACES 4 -static u8 acpi_gbl_default_address_spaces[ACPI_NUM_DEFAULT_SPACES] = { - ACPI_ADR_SPACE_SYSTEM_MEMORY, - ACPI_ADR_SPACE_SYSTEM_IO, - ACPI_ADR_SPACE_PCI_CONFIG, - ACPI_ADR_SPACE_DATA_TABLE -}; /* Local prototypes */ - static acpi_status acpi_ev_reg_run(acpi_handle obj_handle, u32 level, void *context, void **return_value); @@ -66,6 +59,17 @@ static acpi_status acpi_ev_install_handler(acpi_handle obj_handle, u32 level, void *context, void **return_value); +/* These are the address spaces that will get default handlers */ + +#define ACPI_NUM_DEFAULT_SPACES 4 + +static u8 acpi_gbl_default_address_spaces[ACPI_NUM_DEFAULT_SPACES] = { + ACPI_ADR_SPACE_SYSTEM_MEMORY, + ACPI_ADR_SPACE_SYSTEM_IO, + ACPI_ADR_SPACE_PCI_CONFIG, + ACPI_ADR_SPACE_DATA_TABLE +}; + /******************************************************************************* * * FUNCTION: acpi_ev_install_region_handlers @@ -91,18 +95,19 @@ acpi_status acpi_ev_install_region_handlers(void) } /* - * All address spaces (PCI Config, EC, SMBus) are scope dependent - * and registration must occur for a specific device. + * All address spaces (PCI Config, EC, SMBus) are scope dependent and + * registration must occur for a specific device. * - * In the case of the system memory and IO address spaces there is currently - * no device associated with the address space. For these we use the root. + * In the case of the system memory and IO address spaces there is + * currently no device associated with the address space. For these we + * use the root. * - * We install the default PCI config space handler at the root so - * that this space is immediately available even though the we have - * not enumerated all the PCI Root Buses yet. This is to conform - * to the ACPI specification which states that the PCI config - * space must be always available -- even though we are nowhere - * near ready to find the PCI root buses at this point. + * We install the default PCI config space handler at the root so that + * this space is immediately available even though the we have not + * enumerated all the PCI Root Buses yet. This is to conform to the ACPI + * specification which states that the PCI config space must be always + * available -- even though we are nowhere near ready to find the PCI root + * buses at this point. * * NOTE: We ignore AE_ALREADY_EXISTS because this means that a handler * has already been installed (via acpi_install_address_space_handler). 
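The NOTE above refers to the public acpi_install_address_space_handler() interface. As a minimal sketch (the wrapper function and its error handling are illustrative assumptions, not part of this patch), this is how a caller outside ACPICA would request the default SystemMemory handler at the namespace root, which is the same operation acpi_ev_install_region_handlers() performs internally:

#include <acpi/acpi.h>

/*
 * Hypothetical helper: request the default SystemMemory operation region
 * handler at the namespace root. ACPI_DEFAULT_HANDLER tells ACPICA to use
 * its built-in handler/setup pair for this space ID, so the Setup and
 * Context arguments can be NULL. AE_ALREADY_EXISTS (or AE_SAME_HANDLER)
 * only means a handler was installed earlier, per the NOTE above, and is
 * not treated as a failure here.
 */
static acpi_status my_install_default_memory_handler(void)
{
	acpi_status status;

	status = acpi_install_address_space_handler(ACPI_ROOT_OBJECT,
						    ACPI_ADR_SPACE_SYSTEM_MEMORY,
						    ACPI_DEFAULT_HANDLER,
						    NULL,	/* Setup */
						    NULL);	/* Context */
	if (status == AE_ALREADY_EXISTS || status == AE_SAME_HANDLER)
		return AE_OK;

	return status;
}
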
@@ -160,12 +165,11 @@ acpi_status acpi_ev_initialize_op_regions(void) return_ACPI_STATUS(status); } - /* - * Run the _REG methods for op_regions in each default address space - */ - for (i = 0; i < ACPI_NUM_DEFAULT_SPACES; i++) { + /* Run the _REG methods for op_regions in each default address space */ - /* TBD: Make sure handler is the DEFAULT handler, otherwise + for (i = 0; i < ACPI_NUM_DEFAULT_SPACES; i++) { + /* + * TBD: Make sure handler is the DEFAULT handler, otherwise * _REG will have already been run. */ status = acpi_ev_execute_reg_methods(acpi_gbl_root_node, @@ -318,13 +322,13 @@ acpi_ev_address_space_dispatch(union acpi_operand_object *region_obj, } /* - * It may be the case that the region has never been initialized + * It may be the case that the region has never been initialized. * Some types of regions require special init code */ if (!(region_obj->region.flags & AOPOBJ_SETUP_COMPLETE)) { - /* - * This region has not been initialized yet, do it - */ + + /* This region has not been initialized yet, do it */ + region_setup = handler_desc->address_space.setup; if (!region_setup) { @@ -339,9 +343,9 @@ acpi_ev_address_space_dispatch(union acpi_operand_object *region_obj, } /* - * We must exit the interpreter because the region - * setup will potentially execute control methods - * (e.g., _REG method for this region) + * We must exit the interpreter because the region setup will + * potentially execute control methods (for example, the _REG method + * for this region) */ acpi_ex_exit_interpreter(); @@ -364,9 +368,8 @@ acpi_ev_address_space_dispatch(union acpi_operand_object *region_obj, return_ACPI_STATUS(status); } - /* - * Region initialization may have been completed by region_setup - */ + /* Region initialization may have been completed by region_setup */ + if (!(region_obj->region.flags & AOPOBJ_SETUP_COMPLETE)) { region_obj->region.flags |= AOPOBJ_SETUP_COMPLETE; @@ -521,8 +524,8 @@ acpi_ev_detach_region(union acpi_operand_object *region_obj, } /* - * If the region has been activated, call the setup handler - * with the deactivate notification + * If the region has been activated, call the setup handler with + * the deactivate notification */ if (region_obj->region.flags & AOPOBJ_SETUP_COMPLETE) { region_setup = handler_obj->address_space.setup; @@ -668,8 +671,8 @@ acpi_ev_install_handler(acpi_handle obj_handle, } /* - * We only care about regions.and objects - * that are allowed to have address space handlers + * We only care about regions and objects that are allowed to have + * address space handlers */ if ((node->type != ACPI_TYPE_DEVICE) && (node->type != ACPI_TYPE_REGION) && (node != acpi_gbl_root_node)) { @@ -710,9 +713,9 @@ acpi_ev_install_handler(acpi_handle obj_handle, /* * Since the object we found it on was a device, then it * means that someone has already installed a handler for - * the branch of the namespace from this device on. Just + * the branch of the namespace from this device on. Just * bail out telling the walk routine to not traverse this - * branch. This preserves the scoping rule for handlers. + * branch. This preserves the scoping rule for handlers. */ return (AE_CTRL_DEPTH); } @@ -723,9 +726,8 @@ acpi_ev_install_handler(acpi_handle obj_handle, } /* - * As long as the device didn't have a handler for this - * space we don't care about it. We just ignore it and - * proceed. + * As long as the device didn't have a handler for this space we + * don't care about it. We just ignore it and proceed. 
*/ return (AE_OK); } @@ -733,16 +735,14 @@ acpi_ev_install_handler(acpi_handle obj_handle, /* Object is a Region */ if (obj_desc->region.space_id != handler_obj->address_space.space_id) { - /* - * This region is for a different address space - * -- just ignore it - */ + + /* This region is for a different address space, just ignore it */ + return (AE_OK); } /* - * Now we have a region and it is for the handler's address - * space type. + * Now we have a region and it is for the handler's address space type. * * First disconnect region for any previous handler (if any) */ @@ -786,9 +786,8 @@ acpi_ev_install_space_handler(struct acpi_namespace_node * node, ACPI_FUNCTION_TRACE(ev_install_space_handler); /* - * This registration is valid for only the types below - * and the root. This is where the default handlers - * get placed. + * This registration is valid for only the types below and the root. This + * is where the default handlers get placed. */ if ((node->type != ACPI_TYPE_DEVICE) && (node->type != ACPI_TYPE_PROCESSOR) && @@ -848,8 +847,8 @@ acpi_ev_install_space_handler(struct acpi_namespace_node * node, obj_desc = acpi_ns_get_attached_object(node); if (obj_desc) { /* - * The attached device object already exists. - * Make sure the handler is not already installed. + * The attached device object already exists. Make sure the handler + * is not already installed. */ handler_obj = obj_desc->device.handler; @@ -864,8 +863,8 @@ acpi_ev_install_space_handler(struct acpi_namespace_node * node, handler) { /* * It is (relatively) OK to attempt to install the SAME - * handler twice. This can easily happen - * with PCI_Config space. + * handler twice. This can easily happen with the + * PCI_Config space. */ status = AE_SAME_HANDLER; goto unlock_and_exit; @@ -925,9 +924,8 @@ acpi_ev_install_space_handler(struct acpi_namespace_node * node, /* * Install the handler * - * At this point there is no existing handler. - * Just allocate the object for the handler and link it - * into the list. + * At this point there is no existing handler. Just allocate the object + * for the handler and link it into the list. */ handler_obj = acpi_ut_create_internal_object(ACPI_TYPE_LOCAL_ADDRESS_HANDLER); @@ -1000,11 +998,10 @@ acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, ACPI_FUNCTION_TRACE(ev_execute_reg_methods); /* - * Run all _REG methods for all Operation Regions for this - * space ID. This is a separate walk in order to handle any - * interdependencies between regions and _REG methods. (i.e. handlers - * must be installed for all regions of this Space ID before we - * can run any _REG methods) + * Run all _REG methods for all Operation Regions for this space ID. This + * is a separate walk in order to handle any interdependencies between + * regions and _REG methods. (i.e. 
handlers must be installed for all + * regions of this Space ID before we can run any _REG methods) */ status = acpi_ns_walk_namespace(ACPI_TYPE_ANY, node, ACPI_UINT32_MAX, ACPI_NS_WALK_UNLOCK, acpi_ev_reg_run, @@ -1042,8 +1039,8 @@ acpi_ev_reg_run(acpi_handle obj_handle, } /* - * We only care about regions.and objects - * that are allowed to have address space handlers + * We only care about regions.and objects that are allowed to have address + * space handlers */ if ((node->type != ACPI_TYPE_REGION) && (node != acpi_gbl_root_node)) { return (AE_OK); @@ -1062,10 +1059,9 @@ acpi_ev_reg_run(acpi_handle obj_handle, /* Object is a Region */ if (obj_desc->region.space_id != space_id) { - /* - * This region is for a different address space - * -- just ignore it - */ + + /* This region is for a different address space, just ignore it */ + return (AE_OK); } diff --git a/drivers/acpi/events/evrgnini.c b/drivers/acpi/acpica/evrgnini.c index 6b94b38df07..f3f1fb45c3d 100644 --- a/drivers/acpi/events/evrgnini.c +++ b/drivers/acpi/acpica/evrgnini.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evrgnini") @@ -233,9 +234,9 @@ acpi_ev_pci_config_region_setup(acpi_handle handle, if (ACPI_FAILURE(status)) { if (status == AE_SAME_HANDLER) { /* - * It is OK if the handler is already installed on the root - * bridge. Still need to return a context object for the - * new PCI_Config operation region, however. + * It is OK if the handler is already installed on the + * root bridge. Still need to return a context object + * for the new PCI_Config operation region, however. */ status = AE_OK; } else { @@ -272,8 +273,8 @@ acpi_ev_pci_config_region_setup(acpi_handle handle, } /* - * For PCI_Config space access, we need the segment, bus, - * device and function numbers. Acquire them here. + * For PCI_Config space access, we need the segment, bus, device and + * function numbers. Acquire them here. * * Find the parent device object. (This allows the operation region to be * within a subscope under the device, such as a control method.) @@ -289,16 +290,16 @@ acpi_ev_pci_config_region_setup(acpi_handle handle, } /* - * Get the PCI device and function numbers from the _ADR object - * contained in the parent's scope. + * Get the PCI device and function numbers from the _ADR object contained + * in the parent's scope. */ status = acpi_ut_evaluate_numeric_object(METHOD_NAME__ADR, pci_device_node, &pci_value); /* - * The default is zero, and since the allocation above zeroed - * the data, just do nothing on failure. + * The default is zero, and since the allocation above zeroed the data, + * just do nothing on failure. */ if (ACPI_SUCCESS(status)) { pci_id->device = ACPI_HIWORD(ACPI_LODWORD(pci_value)); @@ -382,9 +383,8 @@ static u8 acpi_ev_is_pci_root_bridge(struct acpi_namespace_node *node) struct acpi_compatible_id_list *cid; u32 i; - /* - * Get the _HID and check for a PCI Root Bridge - */ + /* Get the _HID and check for a PCI Root Bridge */ + status = acpi_ut_execute_HID(node, &hid); if (ACPI_FAILURE(status)) { return (FALSE); @@ -394,10 +394,8 @@ static u8 acpi_ev_is_pci_root_bridge(struct acpi_namespace_node *node) return (TRUE); } - /* - * The _HID did not match. - * Get the _CID and check for a PCI Root Bridge - */ + /* The _HID did not match. 
Get the _CID and check for a PCI Root Bridge */ + status = acpi_ut_execute_CID(node, &cid); if (ACPI_FAILURE(status)) { return (FALSE); @@ -516,9 +514,9 @@ acpi_ev_default_region_setup(acpi_handle handle, * Get the appropriate address space handler for a newly * created region. * - * This also performs address space specific initialization. For + * This also performs address space specific initialization. For * example, PCI regions must have an _ADR object that contains - * a PCI address in the scope of the definition. This address is + * a PCI address in the scope of the definition. This address is * required to perform an access to PCI config space. * * MUTEX: Interpreter should be unlocked, because we may run the _REG @@ -572,7 +570,7 @@ acpi_ev_initialize_region(union acpi_operand_object *region_obj, if (ACPI_SUCCESS(status)) { /* * The _REG method is optional and there can be only one per region - * definition. This will be executed when the handler is attached + * definition. This will be executed when the handler is attached * or removed */ region_obj2->extra.method_REG = method_node; @@ -670,10 +668,8 @@ acpi_ev_initialize_region(union acpi_operand_object *region_obj, } } - /* - * This node does not have the handler we need; - * Pop up one level - */ + /* This node does not have the handler we need; Pop up one level */ + node = acpi_ns_get_parent_node(node); } diff --git a/drivers/acpi/events/evsci.c b/drivers/acpi/acpica/evsci.c index 2a8b7787761..567b356c85a 100644 --- a/drivers/acpi/events/evsci.c +++ b/drivers/acpi/acpica/evsci.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acevents.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evsci") @@ -115,10 +116,8 @@ u32 ACPI_SYSTEM_XFACE acpi_ev_gpe_xrupt_handler(void *context) * if this interrupt handler is installed, ACPI is enabled. */ - /* - * GPEs: - * Check for and dispatch any GPEs that have occurred - */ + /* GPEs: Check for and dispatch any GPEs that have occurred */ + interrupt_handled |= acpi_ev_gpe_detect(gpe_xrupt_list); return_UINT32(interrupt_handled); @@ -158,11 +157,11 @@ u32 acpi_ev_install_sci_handler(void) * RETURN: E_OK if handler uninstalled OK, E_ERROR if handler was not * installed to begin with * - * DESCRIPTION: Remove the SCI interrupt handler. No further SCIs will be + * DESCRIPTION: Remove the SCI interrupt handler. No further SCIs will be * taken. * * Note: It doesn't seem important to disable all events or set the event - * enable registers to their original values. The OS should disable + * enable registers to their original values. The OS should disable * the SCI interrupt level when the handler is removed, so no more * events will come in. * diff --git a/drivers/acpi/events/evxface.c b/drivers/acpi/acpica/evxface.c index 94a6efe020b..3aca9010a11 100644 --- a/drivers/acpi/events/evxface.c +++ b/drivers/acpi/acpica/evxface.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acevents.h" +#include "acinterp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evxface") @@ -267,7 +268,7 @@ acpi_install_notify_handler(acpi_handle device, /* * Root Object: * Registering a notify handler on the root object indicates that the - * caller wishes to receive notifications for all objects. Note that + * caller wishes to receive notifications for all objects. 
Note that * only one <external> global handler can be regsitered (per notify type). */ if (device == ACPI_ROOT_OBJECT) { diff --git a/drivers/acpi/events/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c index 41554f736b6..35485e4b60a 100644 --- a/drivers/acpi/events/evxfevnt.c +++ b/drivers/acpi/acpica/evxfevnt.c @@ -42,13 +42,19 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" +#include "actables.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evxfevnt") +/* Local prototypes */ +acpi_status +acpi_ev_get_gpe_device(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context); + /******************************************************************************* * * FUNCTION: acpi_enable @@ -60,6 +66,7 @@ ACPI_MODULE_NAME("evxfevnt") * DESCRIPTION: Transfers the system into ACPI mode. * ******************************************************************************/ + acpi_status acpi_enable(void) { acpi_status status = AE_OK; @@ -161,8 +168,8 @@ acpi_status acpi_enable_event(u32 event, u32 flags) } /* - * Enable the requested fixed event (by writing a one to the - * enable register bit) + * Enable the requested fixed event (by writing a one to the enable + * register bit) */ status = acpi_set_register(acpi_gbl_fixed_event_info[event]. @@ -343,8 +350,8 @@ acpi_status acpi_disable_event(u32 event, u32 flags) } /* - * Disable the requested fixed event (by writing a zero to the - * enable register bit) + * Disable the requested fixed event (by writing a zero to the enable + * register bit) */ status = acpi_set_register(acpi_gbl_fixed_event_info[event]. @@ -396,8 +403,8 @@ acpi_status acpi_clear_event(u32 event) } /* - * Clear the requested fixed event (By writing a one to the - * status register bit) + * Clear the requested fixed event (By writing a one to the status + * register bit) */ status = acpi_set_register(acpi_gbl_fixed_event_info[event]. @@ -717,3 +724,148 @@ acpi_status acpi_remove_gpe_block(acpi_handle gpe_device) } ACPI_EXPORT_SYMBOL(acpi_remove_gpe_block) + +/******************************************************************************* + * + * FUNCTION: acpi_get_gpe_device + * + * PARAMETERS: Index - System GPE index (0-current_gpe_count) + * gpe_device - Where the parent GPE Device is returned + * + * RETURN: Status + * + * DESCRIPTION: Obtain the GPE device associated with the input index. A NULL + * gpe device indicates that the gpe number is contained in one of + * the FADT-defined gpe blocks. Otherwise, the GPE block device. 
+ * + ******************************************************************************/ +acpi_status +acpi_get_gpe_device(u32 index, acpi_handle *gpe_device) +{ + struct acpi_gpe_device_info info; + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_get_gpe_device); + + if (!gpe_device) { + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + + if (index >= acpi_current_gpe_count) { + return_ACPI_STATUS(AE_NOT_EXIST); + } + + /* Setup and walk the GPE list */ + + info.index = index; + info.status = AE_NOT_EXIST; + info.gpe_device = NULL; + info.next_block_base_index = 0; + + status = acpi_ev_walk_gpe_list(acpi_ev_get_gpe_device, &info); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + *gpe_device = info.gpe_device; + return_ACPI_STATUS(info.status); +} + +ACPI_EXPORT_SYMBOL(acpi_get_gpe_device) + +/******************************************************************************* + * + * FUNCTION: acpi_ev_get_gpe_device + * + * PARAMETERS: GPE_WALK_CALLBACK + * + * RETURN: Status + * + * DESCRIPTION: Matches the input GPE index (0-current_gpe_count) with a GPE + * block device. NULL if the GPE is one of the FADT-defined GPEs. + * + ******************************************************************************/ +acpi_status +acpi_ev_get_gpe_device(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context) +{ + struct acpi_gpe_device_info *info = context; + + /* Increment Index by the number of GPEs in this block */ + + info->next_block_base_index += + (gpe_block->register_count * ACPI_GPE_REGISTER_WIDTH); + + if (info->index < info->next_block_base_index) { + /* + * The GPE index is within this block, get the node. Leave the node + * NULL for the FADT-defined GPEs + */ + if ((gpe_block->node)->type == ACPI_TYPE_DEVICE) { + info->gpe_device = gpe_block->node; + } + + info->status = AE_OK; + return (AE_CTRL_END); + } + + return (AE_OK); +} + +/****************************************************************************** + * + * FUNCTION: acpi_disable_all_gpes + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Disable and clear all GPEs in all GPE blocks + * + ******************************************************************************/ + +acpi_status acpi_disable_all_gpes(void) +{ + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_disable_all_gpes); + + status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + status = acpi_hw_disable_all_gpes(); + (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); + + return_ACPI_STATUS(status); +} + +/****************************************************************************** + * + * FUNCTION: acpi_enable_all_runtime_gpes + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Enable all "runtime" GPEs, in all GPE blocks + * + ******************************************************************************/ + +acpi_status acpi_enable_all_runtime_gpes(void) +{ + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_enable_all_runtime_gpes); + + status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + status = acpi_hw_enable_all_runtime_gpes(); + (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); + + return_ACPI_STATUS(status); +} diff --git a/drivers/acpi/events/evxfregn.c b/drivers/acpi/acpica/evxfregn.c index e8750807e57..479e7a3721b 100644 --- a/drivers/acpi/events/evxfregn.c +++ b/drivers/acpi/acpica/evxfregn.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include 
<acpi/acnamesp.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acevents.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evxfregn") diff --git a/drivers/acpi/executer/exconfig.c b/drivers/acpi/acpica/exconfig.c index 74da6fa52ef..932bbc26aa0 100644 --- a/drivers/acpi/executer/exconfig.c +++ b/drivers/acpi/acpica/exconfig.c @@ -42,10 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/actables.h> -#include <acpi/acdispat.h> +#include "accommon.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "actables.h" +#include "acdispat.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exconfig") diff --git a/drivers/acpi/executer/exconvrt.c b/drivers/acpi/acpica/exconvrt.c index 1d1f35adddd..0be10188316 100644 --- a/drivers/acpi/executer/exconvrt.c +++ b/drivers/acpi/acpica/exconvrt.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exconvrt") diff --git a/drivers/acpi/executer/excreate.c b/drivers/acpi/acpica/excreate.c index ad09696d506..a57ad2564ab 100644 --- a/drivers/acpi/executer/excreate.c +++ b/drivers/acpi/acpica/excreate.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("excreate") diff --git a/drivers/acpi/executer/exdump.c b/drivers/acpi/acpica/exdump.c index d087a7d28aa..aa313574b0d 100644 --- a/drivers/acpi/executer/exdump.c +++ b/drivers/acpi/acpica/exdump.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exdump") diff --git a/drivers/acpi/executer/exfield.c b/drivers/acpi/acpica/exfield.c index 3e440d84226..a352d023385 100644 --- a/drivers/acpi/executer/exfield.c +++ b/drivers/acpi/acpica/exfield.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acdispat.h" +#include "acinterp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exfield") diff --git a/drivers/acpi/executer/exfldio.c b/drivers/acpi/acpica/exfldio.c index 9ff9d1f4615..ef58ac4e687 100644 --- a/drivers/acpi/executer/exfldio.c +++ b/drivers/acpi/acpica/exfldio.c @@ -42,10 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acevents.h> -#include <acpi/acdispat.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acevents.h" +#include "acdispat.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exfldio") @@ -498,14 +499,13 @@ acpi_ex_field_datum_io(union acpi_operand_object *obj_desc, return_ACPI_STATUS(status); } - ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, - "I/O to Data Register: ValuePtr %p\n", - value)); - if (read_write == ACPI_READ) { /* Read the datum from the data_register */ + ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, + "Read from Data Register\n")); + status = acpi_ex_extract_from_field(obj_desc->index_field. 
data_obj, value, @@ -513,6 +513,10 @@ acpi_ex_field_datum_io(union acpi_operand_object *obj_desc, } else { /* Write the datum to the data_register */ + ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, + "Write to Data Register: Value %8.8X%8.8X\n", + ACPI_FORMAT_UINT64(*value))); + status = acpi_ex_insert_into_field(obj_desc->index_field. data_obj, value, diff --git a/drivers/acpi/executer/exmisc.c b/drivers/acpi/acpica/exmisc.c index efb19134005..6b0747ac683 100644 --- a/drivers/acpi/executer/exmisc.c +++ b/drivers/acpi/acpica/exmisc.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/amlresrc.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" +#include "amlresrc.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exmisc") diff --git a/drivers/acpi/executer/exmutex.c b/drivers/acpi/acpica/exmutex.c index a8bf3d713e2..d301c1f363e 100644 --- a/drivers/acpi/executer/exmutex.c +++ b/drivers/acpi/acpica/exmutex.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acinterp.h" +#include "acevents.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exmutex") diff --git a/drivers/acpi/executer/exnames.c b/drivers/acpi/acpica/exnames.c index 817e67be369..ffdae122d94 100644 --- a/drivers/acpi/executer/exnames.c +++ b/drivers/acpi/acpica/exnames.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exnames") diff --git a/drivers/acpi/executer/exoparg1.c b/drivers/acpi/acpica/exoparg1.c index f622f9eac8a..b530480cc7d 100644 --- a/drivers/acpi/executer/exoparg1.c +++ b/drivers/acpi/acpica/exoparg1.c @@ -43,11 +43,12 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acparser.h" +#include "acdispat.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exoparg1") diff --git a/drivers/acpi/executer/exoparg2.c b/drivers/acpi/acpica/exoparg2.c index 368def5dffc..0b4f513ca88 100644 --- a/drivers/acpi/executer/exoparg2.c +++ b/drivers/acpi/acpica/exoparg2.c @@ -42,10 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acinterp.h> -#include <acpi/acevents.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acparser.h" +#include "acinterp.h" +#include "acevents.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exoparg2") diff --git a/drivers/acpi/executer/exoparg3.c b/drivers/acpi/acpica/exoparg3.c index 9cb4197681a..c6520bbf882 100644 --- a/drivers/acpi/executer/exoparg3.c +++ b/drivers/acpi/acpica/exoparg3.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acinterp.h" +#include "acparser.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exoparg3") diff --git a/drivers/acpi/executer/exoparg6.c b/drivers/acpi/acpica/exoparg6.c index 67d48737af5..ae43f7670a6 100644 --- a/drivers/acpi/executer/exoparg6.c +++ b/drivers/acpi/acpica/exoparg6.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> +#include 
"accommon.h" +#include "acinterp.h" +#include "acparser.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exoparg6") diff --git a/drivers/acpi/executer/exprep.c b/drivers/acpi/acpica/exprep.c index a7dc87ecee3..a226f74d4a5 100644 --- a/drivers/acpi/executer/exprep.c +++ b/drivers/acpi/acpica/exprep.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exprep") diff --git a/drivers/acpi/executer/exregion.c b/drivers/acpi/acpica/exregion.c index 7a41c409ae4..76ec8ff903b 100644 --- a/drivers/acpi/executer/exregion.c +++ b/drivers/acpi/acpica/exregion.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acinterp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exregion") diff --git a/drivers/acpi/executer/exresnte.c b/drivers/acpi/acpica/exresnte.c index 423ad3635f3..a063a74006f 100644 --- a/drivers/acpi/executer/exresnte.c +++ b/drivers/acpi/acpica/exresnte.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exresnte") diff --git a/drivers/acpi/executer/exresolv.c b/drivers/acpi/acpica/exresolv.c index 60e8c47128e..f6105a6d612 100644 --- a/drivers/acpi/executer/exresolv.c +++ b/drivers/acpi/acpica/exresolv.c @@ -43,10 +43,11 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exresolv") diff --git a/drivers/acpi/executer/exresop.c b/drivers/acpi/acpica/exresop.c index 0bb82593da7..3c3802764bf 100644 --- a/drivers/acpi/executer/exresop.c +++ b/drivers/acpi/acpica/exresop.c @@ -43,10 +43,11 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acparser.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "amlcode.h" +#include "acparser.h" +#include "acinterp.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exresop") diff --git a/drivers/acpi/executer/exstore.c b/drivers/acpi/acpica/exstore.c index 1c118ba78ad..e35e9b4f6a4 100644 --- a/drivers/acpi/executer/exstore.c +++ b/drivers/acpi/acpica/exstore.c @@ -43,10 +43,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acdispat.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exstore") diff --git a/drivers/acpi/executer/exstoren.c b/drivers/acpi/acpica/exstoren.c index eef61a00803..145d15305f7 100644 --- a/drivers/acpi/executer/exstoren.c +++ b/drivers/acpi/acpica/exstoren.c @@ -44,8 +44,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exstoren") diff --git a/drivers/acpi/executer/exstorob.c b/drivers/acpi/acpica/exstorob.c index 
9a75ff09fb0..67340cc7014 100644 --- a/drivers/acpi/executer/exstorob.c +++ b/drivers/acpi/acpica/exstorob.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acinterp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exstorob") diff --git a/drivers/acpi/executer/exsystem.c b/drivers/acpi/acpica/exsystem.c index 68990f1df37..3d00b935723 100644 --- a/drivers/acpi/executer/exsystem.c +++ b/drivers/acpi/acpica/exsystem.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acinterp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exsystem") diff --git a/drivers/acpi/executer/exutils.c b/drivers/acpi/acpica/exutils.c index 86c03880b52..32b85d68e75 100644 --- a/drivers/acpi/executer/exutils.c +++ b/drivers/acpi/acpica/exutils.c @@ -59,8 +59,9 @@ #define DEFINE_AML_GLOBALS #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exutils") diff --git a/drivers/acpi/hardware/hwacpi.c b/drivers/acpi/acpica/hwacpi.c index 816894ea839..a9d4fea4167 100644 --- a/drivers/acpi/hardware/hwacpi.c +++ b/drivers/acpi/acpica/hwacpi.c @@ -43,6 +43,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwacpi") diff --git a/drivers/acpi/hardware/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index 0b80db9d919..2013b66745d 100644 --- a/drivers/acpi/hardware/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acevents.h" #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwgpe") @@ -51,7 +52,8 @@ ACPI_MODULE_NAME("hwgpe") /* Local prototypes */ static acpi_status acpi_hw_enable_wakeup_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block); + struct acpi_gpe_block_info *gpe_block, + void *context); /****************************************************************************** * @@ -80,8 +82,7 @@ acpi_status acpi_hw_low_disable_gpe(struct acpi_gpe_event_info *gpe_event_info) /* Get current value of the enable register that contains this GPE */ - status = acpi_hw_low_level_read(ACPI_GPE_REGISTER_WIDTH, &enable_mask, - &gpe_register_info->enable_address); + status = acpi_read(&enable_mask, &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { return (status); } @@ -95,9 +96,7 @@ acpi_status acpi_hw_low_disable_gpe(struct acpi_gpe_event_info *gpe_event_info) /* Write the updated enable mask */ - status = acpi_hw_low_level_write(ACPI_GPE_REGISTER_WIDTH, enable_mask, - &gpe_register_info->enable_address); - + status = acpi_write(enable_mask, &gpe_register_info->enable_address); return (status); } @@ -132,8 +131,8 @@ acpi_hw_write_gpe_enable_reg(struct acpi_gpe_event_info * gpe_event_info) /* Write the entire GPE (runtime) enable register */ - status = acpi_hw_low_level_write(8, gpe_register_info->enable_for_run, - &gpe_register_info->enable_address); + status = acpi_write(gpe_register_info->enable_for_run, + &gpe_register_info->enable_address); return (status); } @@ -166,9 +165,8 @@ acpi_status acpi_hw_clear_gpe(struct acpi_gpe_event_info * gpe_event_info) * Write a one to the appropriate bit in the status register to * clear this GPE. 
*/ - status = acpi_hw_low_level_write(8, register_bit, - &gpe_event_info->register_info-> - status_address); + status = acpi_write(register_bit, + &gpe_event_info->register_info->status_address); return (status); } @@ -227,9 +225,7 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, /* GPE currently active (status bit == 1)? */ - status = - acpi_hw_low_level_read(8, &in_byte, - &gpe_register_info->status_address); + status = acpi_read(&in_byte, &gpe_register_info->status_address); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } @@ -260,8 +256,8 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, ******************************************************************************/ acpi_status -acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, - struct acpi_gpe_block_info * gpe_block) +acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context) { u32 i; acpi_status status; @@ -272,9 +268,9 @@ acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, /* Disable all GPEs in this register */ - status = acpi_hw_low_level_write(8, 0x00, - &gpe_block->register_info[i]. - enable_address); + status = + acpi_write(0x00, + &gpe_block->register_info[i].enable_address); if (ACPI_FAILURE(status)) { return (status); } @@ -297,8 +293,8 @@ acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, ******************************************************************************/ acpi_status -acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, - struct acpi_gpe_block_info * gpe_block) +acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context) { u32 i; acpi_status status; @@ -309,9 +305,9 @@ acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, /* Clear status on all GPEs in this register */ - status = acpi_hw_low_level_write(8, 0xFF, - &gpe_block->register_info[i]. - status_address); + status = + acpi_write(0xFF, + &gpe_block->register_info[i].status_address); if (ACPI_FAILURE(status)) { return (status); } @@ -335,8 +331,8 @@ acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, ******************************************************************************/ acpi_status -acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, - struct acpi_gpe_block_info * gpe_block) +acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context) { u32 i; acpi_status status; @@ -352,12 +348,9 @@ acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, /* Enable all "runtime" GPEs in this register */ - status = - acpi_hw_low_level_write(8, - gpe_block->register_info[i]. - enable_for_run, - &gpe_block->register_info[i]. - enable_address); + status = acpi_write(gpe_block->register_info[i].enable_for_run, + &gpe_block->register_info[i]. 
+ enable_address); if (ACPI_FAILURE(status)) { return (status); } @@ -382,7 +375,8 @@ acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, static acpi_status acpi_hw_enable_wakeup_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block) + struct acpi_gpe_block_info *gpe_block, + void *context) { u32 i; acpi_status status; @@ -396,11 +390,9 @@ acpi_hw_enable_wakeup_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, /* Enable all "wake" GPEs in this register */ - status = acpi_hw_low_level_write(8, - gpe_block->register_info[i]. - enable_for_wake, - &gpe_block->register_info[i]. - enable_address); + status = acpi_write(gpe_block->register_info[i].enable_for_wake, + &gpe_block->register_info[i]. + enable_address); if (ACPI_FAILURE(status)) { return (status); } @@ -427,8 +419,8 @@ acpi_status acpi_hw_disable_all_gpes(void) ACPI_FUNCTION_TRACE(hw_disable_all_gpes); - status = acpi_ev_walk_gpe_list(acpi_hw_disable_gpe_block); - status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block); + status = acpi_ev_walk_gpe_list(acpi_hw_disable_gpe_block, NULL); + status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block, NULL); return_ACPI_STATUS(status); } @@ -450,7 +442,7 @@ acpi_status acpi_hw_enable_all_runtime_gpes(void) ACPI_FUNCTION_TRACE(hw_enable_all_runtime_gpes); - status = acpi_ev_walk_gpe_list(acpi_hw_enable_runtime_gpe_block); + status = acpi_ev_walk_gpe_list(acpi_hw_enable_runtime_gpe_block, NULL); return_ACPI_STATUS(status); } @@ -472,6 +464,6 @@ acpi_status acpi_hw_enable_all_wakeup_gpes(void) ACPI_FUNCTION_TRACE(hw_enable_all_wakeup_gpes); - status = acpi_ev_walk_gpe_list(acpi_hw_enable_wakeup_gpe_block); + status = acpi_ev_walk_gpe_list(acpi_hw_enable_wakeup_gpe_block, NULL); return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c new file mode 100644 index 00000000000..4dc43b01851 --- /dev/null +++ b/drivers/acpi/acpica/hwregs.c @@ -0,0 +1,353 @@ + +/******************************************************************************* + * + * Module Name: hwregs - Read/write access functions for the various ACPI + * control and status registers. + * + ******************************************************************************/ + +/* + * Copyright (C) 2000 - 2008, Intel Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + */ + +#include <acpi/acpi.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acevents.h" + +#define _COMPONENT ACPI_HARDWARE +ACPI_MODULE_NAME("hwregs") + +/******************************************************************************* + * + * FUNCTION: acpi_hw_clear_acpi_status + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Clears all fixed and general purpose status bits + * THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED + * + ******************************************************************************/ +acpi_status acpi_hw_clear_acpi_status(void) +{ + acpi_status status; + acpi_cpu_flags lock_flags = 0; + + ACPI_FUNCTION_TRACE(hw_clear_acpi_status); + + ACPI_DEBUG_PRINT((ACPI_DB_IO, "About to write %04X to %04X\n", + ACPI_BITMASK_ALL_FIXED_STATUS, + (u16) acpi_gbl_FADT.xpm1a_event_block.address)); + + lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + + status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, + ACPI_BITMASK_ALL_FIXED_STATUS); + if (ACPI_FAILURE(status)) { + goto unlock_and_exit; + } + + /* Clear the fixed events */ + + if (acpi_gbl_FADT.xpm1b_event_block.address) { + status = acpi_write(ACPI_BITMASK_ALL_FIXED_STATUS, + &acpi_gbl_FADT.xpm1b_event_block); + if (ACPI_FAILURE(status)) { + goto unlock_and_exit; + } + } + + /* Clear the GPE Bits in all GPE registers in all GPE blocks */ + + status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block, NULL); + + unlock_and_exit: + acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + return_ACPI_STATUS(status); +} + +/******************************************************************************* + * + * FUNCTION: acpi_hw_get_register_bit_mask + * + * PARAMETERS: register_id - Index of ACPI Register to access + * + * RETURN: The bitmask to be used when accessing the register + * + * DESCRIPTION: Map register_id into a register bitmask. + * + ******************************************************************************/ + +struct acpi_bit_register_info *acpi_hw_get_bit_register_info(u32 register_id) +{ + ACPI_FUNCTION_ENTRY(); + + if (register_id > ACPI_BITREG_MAX) { + ACPI_ERROR((AE_INFO, "Invalid BitRegister ID: %X", + register_id)); + return (NULL); + } + + return (&acpi_gbl_bit_register_info[register_id]); +} + +/****************************************************************************** + * + * FUNCTION: acpi_hw_register_read + * + * PARAMETERS: register_id - ACPI Register ID + * return_value - Where the register value is returned + * + * RETURN: Status and the value read. 
+ * + * DESCRIPTION: Read from the specified ACPI register + * + ******************************************************************************/ +acpi_status +acpi_hw_register_read(u32 register_id, u32 * return_value) +{ + u32 value1 = 0; + u32 value2 = 0; + acpi_status status; + + ACPI_FUNCTION_TRACE(hw_register_read); + + switch (register_id) { + case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ + + status = acpi_read(&value1, &acpi_gbl_FADT.xpm1a_event_block); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* PM1B is optional */ + + status = acpi_read(&value2, &acpi_gbl_FADT.xpm1b_event_block); + value1 |= value2; + break; + + case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ + + status = acpi_read(&value1, &acpi_gbl_xpm1a_enable); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* PM1B is optional */ + + status = acpi_read(&value2, &acpi_gbl_xpm1b_enable); + value1 |= value2; + break; + + case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ + + status = acpi_read(&value1, &acpi_gbl_FADT.xpm1a_control_block); + if (ACPI_FAILURE(status)) { + goto exit; + } + + status = acpi_read(&value2, &acpi_gbl_FADT.xpm1b_control_block); + value1 |= value2; + break; + + case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ + + status = acpi_read(&value1, &acpi_gbl_FADT.xpm2_control_block); + break; + + case ACPI_REGISTER_PM_TIMER: /* 32-bit access */ + + status = acpi_read(&value1, &acpi_gbl_FADT.xpm_timer_block); + break; + + case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ + + status = + acpi_os_read_port(acpi_gbl_FADT.smi_command, &value1, 8); + break; + + default: + ACPI_ERROR((AE_INFO, "Unknown Register ID: %X", register_id)); + status = AE_BAD_PARAMETER; + break; + } + + exit: + + if (ACPI_SUCCESS(status)) { + *return_value = value1; + } + + return_ACPI_STATUS(status); +} + +/****************************************************************************** + * + * FUNCTION: acpi_hw_register_write + * + * PARAMETERS: register_id - ACPI Register ID + * Value - The value to write + * + * RETURN: Status + * + * DESCRIPTION: Write to the specified ACPI register + * + * NOTE: In accordance with the ACPI specification, this function automatically + * preserves the value of the following bits, meaning that these bits cannot be + * changed via this interface: + * + * PM1_CONTROL[0] = SCI_EN + * PM1_CONTROL[9] + * PM1_STATUS[11] + * + * ACPI References: + * 1) Hardware Ignored Bits: When software writes to a register with ignored + * bit fields, it preserves the ignored bit fields + * 2) SCI_EN: OSPM always preserves this bit position + * + ******************************************************************************/ + +acpi_status acpi_hw_register_write(u32 register_id, u32 value) +{ + acpi_status status; + u32 read_value; + + ACPI_FUNCTION_TRACE(hw_register_write); + + switch (register_id) { + case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ + + /* Perform a read first to preserve certain bits (per ACPI spec) */ + + status = acpi_hw_register_read(ACPI_REGISTER_PM1_STATUS, + &read_value); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* Insert the bits to be preserved */ + + ACPI_INSERT_BITS(value, ACPI_PM1_STATUS_PRESERVED_BITS, + read_value); + + /* Now we can write the data */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm1a_event_block); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* PM1B is optional */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm1b_event_block); + break; + + case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ + + status = acpi_write(value, 
&acpi_gbl_xpm1a_enable); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* PM1B is optional */ + + status = acpi_write(value, &acpi_gbl_xpm1b_enable); + break; + + case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ + + /* + * Perform a read first to preserve certain bits (per ACPI spec) + */ + status = acpi_hw_register_read(ACPI_REGISTER_PM1_CONTROL, + &read_value); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* Insert the bits to be preserved */ + + ACPI_INSERT_BITS(value, ACPI_PM1_CONTROL_PRESERVED_BITS, + read_value); + + /* Now we can write the data */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm1a_control_block); + if (ACPI_FAILURE(status)) { + goto exit; + } + + status = acpi_write(value, &acpi_gbl_FADT.xpm1b_control_block); + break; + + case ACPI_REGISTER_PM1A_CONTROL: /* 16-bit access */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm1a_control_block); + break; + + case ACPI_REGISTER_PM1B_CONTROL: /* 16-bit access */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm1b_control_block); + break; + + case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm2_control_block); + break; + + case ACPI_REGISTER_PM_TIMER: /* 32-bit access */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm_timer_block); + break; + + case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ + + /* SMI_CMD is currently always in IO space */ + + status = + acpi_os_write_port(acpi_gbl_FADT.smi_command, value, 8); + break; + + default: + status = AE_BAD_PARAMETER; + break; + } + + exit: + return_ACPI_STATUS(status); +} diff --git a/drivers/acpi/hardware/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index 25dccdf179b..a2af2a4f2f2 100644 --- a/drivers/acpi/hardware/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "actables.h" #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwsleep") @@ -52,31 +53,19 @@ ACPI_MODULE_NAME("hwsleep") * * FUNCTION: acpi_set_firmware_waking_vector * - * PARAMETERS: physical_address - Physical address of ACPI real mode + * PARAMETERS: physical_address - 32-bit physical address of ACPI real mode * entry point. * * RETURN: Status * - * DESCRIPTION: Access function for the firmware_waking_vector field in FACS + * DESCRIPTION: Sets the 32-bit firmware_waking_vector field of the FACS * ******************************************************************************/ acpi_status -acpi_set_firmware_waking_vector(acpi_physical_address physical_address) +acpi_set_firmware_waking_vector(u32 physical_address) { - struct acpi_table_facs *facs; - acpi_status status; - ACPI_FUNCTION_TRACE(acpi_set_firmware_waking_vector); - /* Get the FACS */ - - status = acpi_get_table_by_index(ACPI_TABLE_INDEX_FACS, - ACPI_CAST_INDIRECT_PTR(struct - acpi_table_header, - &facs)); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } /* * According to the ACPI specification 2.0c and later, the 64-bit @@ -85,10 +74,16 @@ acpi_set_firmware_waking_vector(acpi_physical_address physical_address) * Protected Mode. Some systems (for example HP dv5-1004nr) are known * to fail to resume if the 64-bit vector is used. 
*/ - if (facs->version >= 1) - facs->xfirmware_waking_vector = 0; - facs->firmware_waking_vector = (u32)physical_address; + /* Set the 32-bit vector */ + + acpi_gbl_FACS->firmware_waking_vector = physical_address; + + /* Clear the 64-bit vector if it exists */ + + if ((acpi_gbl_FACS->length > 32) && (acpi_gbl_FACS->version >= 1)) { + acpi_gbl_FACS->xfirmware_waking_vector = 0; + } return_ACPI_STATUS(AE_OK); } @@ -97,48 +92,39 @@ ACPI_EXPORT_SYMBOL(acpi_set_firmware_waking_vector) /******************************************************************************* * - * FUNCTION: acpi_get_firmware_waking_vector + * FUNCTION: acpi_set_firmware_waking_vector64 * - * PARAMETERS: *physical_address - Where the contents of - * the firmware_waking_vector field of - * the FACS will be returned. + * PARAMETERS: physical_address - 64-bit physical address of ACPI protected + * mode entry point. * - * RETURN: Status, vector + * RETURN: Status * - * DESCRIPTION: Access function for the firmware_waking_vector field in FACS + * DESCRIPTION: Sets the 64-bit X_firmware_waking_vector field of the FACS, if + * it exists in the table. * ******************************************************************************/ -#ifdef ACPI_FUTURE_USAGE acpi_status -acpi_get_firmware_waking_vector(acpi_physical_address * physical_address) +acpi_set_firmware_waking_vector64(u64 physical_address) { - struct acpi_table_facs *facs; - acpi_status status; + ACPI_FUNCTION_TRACE(acpi_set_firmware_waking_vector64); - ACPI_FUNCTION_TRACE(acpi_get_firmware_waking_vector); - - if (!physical_address) { - return_ACPI_STATUS(AE_BAD_PARAMETER); - } - /* Get the FACS */ + /* Determine if the 64-bit vector actually exists */ - status = acpi_get_table_by_index(ACPI_TABLE_INDEX_FACS, - ACPI_CAST_INDIRECT_PTR(struct - acpi_table_header, - &facs)); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); + if ((acpi_gbl_FACS->length <= 32) || (acpi_gbl_FACS->version < 1)) { + return_ACPI_STATUS(AE_NOT_EXIST); } - /* Get the vector */ - *physical_address = (acpi_physical_address)facs->firmware_waking_vector; + /* Clear 32-bit vector, set the 64-bit X_ vector */ + + acpi_gbl_FACS->firmware_waking_vector = 0; + acpi_gbl_FACS->xfirmware_waking_vector = physical_address; return_ACPI_STATUS(AE_OK); } -ACPI_EXPORT_SYMBOL(acpi_get_firmware_waking_vector) -#endif +ACPI_EXPORT_SYMBOL(acpi_set_firmware_waking_vector64) + /******************************************************************************* * * FUNCTION: acpi_enter_sleep_state_prep diff --git a/drivers/acpi/hardware/hwtimer.c b/drivers/acpi/acpica/hwtimer.c index b53d575491b..b7f522c8f02 100644 --- a/drivers/acpi/hardware/hwtimer.c +++ b/drivers/acpi/acpica/hwtimer.c @@ -43,6 +43,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwtimer") diff --git a/drivers/acpi/hardware/hwregs.c b/drivers/acpi/acpica/hwxface.c index ddf792adcf9..ae597c0ab53 100644 --- a/drivers/acpi/hardware/hwregs.c +++ b/drivers/acpi/acpica/hwxface.c @@ -1,10 +1,9 @@ -/******************************************************************************* +/****************************************************************************** * - * Module Name: hwregs - Read/write access functions for the various ACPI - * control and status registers. 
+ * Module Name: hwxface - Public ACPICA hardware interfaces * - ******************************************************************************/ + *****************************************************************************/ /* * Copyright (C) 2000 - 2008, Intel Corp. @@ -44,209 +43,208 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_HARDWARE -ACPI_MODULE_NAME("hwregs") +ACPI_MODULE_NAME("hwxface") -/******************************************************************************* +/****************************************************************************** * - * FUNCTION: acpi_hw_clear_acpi_status + * FUNCTION: acpi_reset * * PARAMETERS: None * - * RETURN: None + * RETURN: Status * - * DESCRIPTION: Clears all fixed and general purpose status bits - * THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED + * DESCRIPTION: Set reset register in memory or IO space. Note: Does not + * support reset register in PCI config space, this must be + * handled separately. * ******************************************************************************/ -acpi_status acpi_hw_clear_acpi_status(void) +acpi_status acpi_reset(void) { + struct acpi_generic_address *reset_reg; acpi_status status; - acpi_cpu_flags lock_flags = 0; - ACPI_FUNCTION_TRACE(hw_clear_acpi_status); + ACPI_FUNCTION_TRACE(acpi_reset); - ACPI_DEBUG_PRINT((ACPI_DB_IO, "About to write %04X to %04X\n", - ACPI_BITMASK_ALL_FIXED_STATUS, - (u16) acpi_gbl_FADT.xpm1a_event_block.address)); + reset_reg = &acpi_gbl_FADT.reset_register; - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + /* Check if the reset register is supported */ - status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, - ACPI_BITMASK_ALL_FIXED_STATUS); - if (ACPI_FAILURE(status)) { - goto unlock_and_exit; + if (!(acpi_gbl_FADT.flags & ACPI_FADT_RESET_REGISTER) || + !reset_reg->address) { + return_ACPI_STATUS(AE_NOT_EXIST); } - /* Clear the fixed events */ - - if (acpi_gbl_FADT.xpm1b_event_block.address) { - status = - acpi_hw_low_level_write(16, ACPI_BITMASK_ALL_FIXED_STATUS, - &acpi_gbl_FADT.xpm1b_event_block); - if (ACPI_FAILURE(status)) { - goto unlock_and_exit; - } - } - - /* Clear the GPE Bits in all GPE registers in all GPE blocks */ - - status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block); + /* Write the reset value to the reset register */ - unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + status = acpi_write(acpi_gbl_FADT.reset_value, reset_reg); return_ACPI_STATUS(status); } -/******************************************************************************* +ACPI_EXPORT_SYMBOL(acpi_reset) + +/****************************************************************************** * - * FUNCTION: acpi_get_sleep_type_data + * FUNCTION: acpi_read * - * PARAMETERS: sleep_state - Numeric sleep state - * *sleep_type_a - Where SLP_TYPa is returned - * *sleep_type_b - Where SLP_TYPb is returned + * PARAMETERS: Value - Where the value is returned + * Reg - GAS register structure * - * RETURN: Status - ACPI status + * RETURN: Status * - * DESCRIPTION: Obtain the SLP_TYPa and SLP_TYPb values for the requested sleep - * state. + * DESCRIPTION: Read from either memory or IO space. 
* ******************************************************************************/ - -acpi_status -acpi_get_sleep_type_data(u8 sleep_state, u8 * sleep_type_a, u8 * sleep_type_b) +acpi_status acpi_read(u32 *value, struct acpi_generic_address *reg) { - acpi_status status = AE_OK; - struct acpi_evaluate_info *info; - - ACPI_FUNCTION_TRACE(acpi_get_sleep_type_data); - - /* Validate parameters */ - - if ((sleep_state > ACPI_S_STATES_MAX) || !sleep_type_a || !sleep_type_b) { - return_ACPI_STATUS(AE_BAD_PARAMETER); - } + u32 width; + u64 address; + acpi_status status; - /* Allocate the evaluation information block */ + ACPI_FUNCTION_NAME(acpi_read); - info = ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_evaluate_info)); - if (!info) { - return_ACPI_STATUS(AE_NO_MEMORY); + /* + * Must have a valid pointer to a GAS structure, and + * a non-zero address within. However, don't return an error + * because the PM1A/B code must not fail if B isn't present. + */ + if (!reg) { + return (AE_OK); } - info->pathname = - ACPI_CAST_PTR(char, acpi_gbl_sleep_state_names[sleep_state]); - - /* Evaluate the namespace object containing the values for this state */ - - status = acpi_ns_evaluate(info); - if (ACPI_FAILURE(status)) { - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "%s while evaluating SleepState [%s]\n", - acpi_format_exception(status), - info->pathname)); + /* Get a local copy of the address. Handles possible alignment issues */ - goto cleanup; + ACPI_MOVE_64_TO_64(&address, ®->address); + if (!address) { + return (AE_OK); } - /* Must have a return object */ + /* Supported widths are 8/16/32 */ - if (!info->return_object) { - ACPI_ERROR((AE_INFO, "No Sleep State object returned from [%s]", - info->pathname)); - status = AE_NOT_EXIST; + width = reg->bit_width; + if ((width != 8) && (width != 16) && (width != 32)) { + return (AE_SUPPORT); } - /* It must be of type Package */ + /* Initialize entire 32-bit return value to zero */ - else if (ACPI_GET_OBJECT_TYPE(info->return_object) != ACPI_TYPE_PACKAGE) { - ACPI_ERROR((AE_INFO, - "Sleep State return object is not a Package")); - status = AE_AML_OPERAND_TYPE; - } + *value = 0; /* - * The package must have at least two elements. NOTE (March 2005): This - * goes against the current ACPI spec which defines this object as a - * package with one encoded DWORD element. However, existing practice - * by BIOS vendors seems to be to have 2 or more elements, at least - * one per sleep type (A/B). + * Two address spaces supported: Memory or IO. 
+ * PCI_Config is not supported here because the GAS struct is insufficient */ - else if (info->return_object->package.count < 2) { - ACPI_ERROR((AE_INFO, - "Sleep State return package does not have at least two elements")); - status = AE_AML_NO_OPERAND; - } + switch (reg->space_id) { + case ACPI_ADR_SPACE_SYSTEM_MEMORY: - /* The first two elements must both be of type Integer */ + status = acpi_os_read_memory((acpi_physical_address) address, + value, width); + break; - else if ((ACPI_GET_OBJECT_TYPE(info->return_object->package.elements[0]) - != ACPI_TYPE_INTEGER) || - (ACPI_GET_OBJECT_TYPE(info->return_object->package.elements[1]) - != ACPI_TYPE_INTEGER)) { - ACPI_ERROR((AE_INFO, - "Sleep State return package elements are not both Integers (%s, %s)", - acpi_ut_get_object_type_name(info->return_object-> - package.elements[0]), - acpi_ut_get_object_type_name(info->return_object-> - package.elements[1]))); - status = AE_AML_OPERAND_TYPE; - } else { - /* Valid _Sx_ package size, type, and value */ + case ACPI_ADR_SPACE_SYSTEM_IO: - *sleep_type_a = (u8) - (info->return_object->package.elements[0])->integer.value; - *sleep_type_b = (u8) - (info->return_object->package.elements[1])->integer.value; - } + status = + acpi_os_read_port((acpi_io_address) address, value, width); + break; - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, - "While evaluating SleepState [%s], bad Sleep object %p type %s", - info->pathname, info->return_object, - acpi_ut_get_object_type_name(info-> - return_object))); + default: + ACPI_ERROR((AE_INFO, + "Unsupported address space: %X", reg->space_id)); + return (AE_BAD_PARAMETER); } - acpi_ut_remove_reference(info->return_object); + ACPI_DEBUG_PRINT((ACPI_DB_IO, + "Read: %8.8X width %2d from %8.8X%8.8X (%s)\n", + *value, width, ACPI_FORMAT_UINT64(address), + acpi_ut_get_region_name(reg->space_id))); - cleanup: - ACPI_FREE(info); - return_ACPI_STATUS(status); + return (status); } -ACPI_EXPORT_SYMBOL(acpi_get_sleep_type_data) +ACPI_EXPORT_SYMBOL(acpi_read) -/******************************************************************************* +/****************************************************************************** * - * FUNCTION: acpi_hw_get_register_bit_mask + * FUNCTION: acpi_write * - * PARAMETERS: register_id - Index of ACPI Register to access + * PARAMETERS: Value - To be written + * Reg - GAS register structure * - * RETURN: The bitmask to be used when accessing the register + * RETURN: Status * - * DESCRIPTION: Map register_id into a register bitmask. + * DESCRIPTION: Write to either memory or IO space. * ******************************************************************************/ -struct acpi_bit_register_info *acpi_hw_get_bit_register_info(u32 register_id) +acpi_status acpi_write(u32 value, struct acpi_generic_address *reg) { - ACPI_FUNCTION_ENTRY(); + u32 width; + u64 address; + acpi_status status; - if (register_id > ACPI_BITREG_MAX) { - ACPI_ERROR((AE_INFO, "Invalid BitRegister ID: %X", - register_id)); - return (NULL); + ACPI_FUNCTION_NAME(acpi_write); + + /* + * Must have a valid pointer to a GAS structure, and + * a non-zero address within. However, don't return an error + * because the PM1A/B code must not fail if B isn't present. + */ + if (!reg) { + return (AE_OK); } - return (&acpi_gbl_bit_register_info[register_id]); + /* Get a local copy of the address. 
Handles possible alignment issues */ + + ACPI_MOVE_64_TO_64(&address, ®->address); + if (!address) { + return (AE_OK); + } + + /* Supported widths are 8/16/32 */ + + width = reg->bit_width; + if ((width != 8) && (width != 16) && (width != 32)) { + return (AE_SUPPORT); + } + + /* + * Two address spaces supported: Memory or IO. + * PCI_Config is not supported here because the GAS struct is insufficient + */ + switch (reg->space_id) { + case ACPI_ADR_SPACE_SYSTEM_MEMORY: + + status = acpi_os_write_memory((acpi_physical_address) address, + value, width); + break; + + case ACPI_ADR_SPACE_SYSTEM_IO: + + status = acpi_os_write_port((acpi_io_address) address, value, + width); + break; + + default: + ACPI_ERROR((AE_INFO, + "Unsupported address space: %X", reg->space_id)); + return (AE_BAD_PARAMETER); + } + + ACPI_DEBUG_PRINT((ACPI_DB_IO, + "Wrote: %8.8X width %2d to %8.8X%8.8X (%s)\n", + value, width, ACPI_FORMAT_UINT64(address), + acpi_ut_get_region_name(reg->space_id))); + + return (status); } +ACPI_EXPORT_SYMBOL(acpi_write) + /******************************************************************************* * - * FUNCTION: acpi_get_register + * FUNCTION: acpi_get_register_unlocked * * PARAMETERS: register_id - ID of ACPI bit_register to access * return_value - Value that was read from the register @@ -254,17 +252,16 @@ struct acpi_bit_register_info *acpi_hw_get_bit_register_info(u32 register_id) * RETURN: Status and the value read from specified Register. Value * returned is normalized to bit0 (is shifted all the way right) * - * DESCRIPTION: ACPI bit_register read function. + * DESCRIPTION: ACPI bit_register read function. Does not acquire the HW lock. * ******************************************************************************/ - -acpi_status acpi_get_register_unlocked(u32 register_id, u32 * return_value) +acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value) { u32 register_value = 0; struct acpi_bit_register_info *bit_reg_info; acpi_status status; - ACPI_FUNCTION_TRACE(acpi_get_register); + ACPI_FUNCTION_TRACE(acpi_get_register_unlocked); /* Get the info structure corresponding to the requested ACPI Register */ @@ -296,14 +293,31 @@ acpi_status acpi_get_register_unlocked(u32 register_id, u32 * return_value) return_ACPI_STATUS(status); } -acpi_status acpi_get_register(u32 register_id, u32 * return_value) +ACPI_EXPORT_SYMBOL(acpi_get_register_unlocked) + +/******************************************************************************* + * + * FUNCTION: acpi_get_register + * + * PARAMETERS: register_id - ID of ACPI bit_register to access + * return_value - Value that was read from the register + * + * RETURN: Status and the value read from specified Register. Value + * returned is normalized to bit0 (is shifted all the way right) + * + * DESCRIPTION: ACPI bit_register read function. 
+ * + ******************************************************************************/ +acpi_status acpi_get_register(u32 register_id, u32 *return_value) { acpi_status status; acpi_cpu_flags flags; + flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); status = acpi_get_register_unlocked(register_id, return_value); acpi_os_release_lock(acpi_gbl_hardware_lock, flags); - return status; + + return (status); } ACPI_EXPORT_SYMBOL(acpi_get_register) @@ -370,8 +384,9 @@ acpi_status acpi_set_register(u32 register_id, u32 value) bit_reg_info-> access_bit_mask); if (value) { - status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, - (u16) value); + status = + acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, + (u16) value); register_value = 0; } break; @@ -459,399 +474,120 @@ acpi_status acpi_set_register(u32 register_id, u32 value) ACPI_EXPORT_SYMBOL(acpi_set_register) -/****************************************************************************** +/******************************************************************************* * - * FUNCTION: acpi_hw_register_read + * FUNCTION: acpi_get_sleep_type_data * - * PARAMETERS: register_id - ACPI Register ID - * return_value - Where the register value is returned + * PARAMETERS: sleep_state - Numeric sleep state + * *sleep_type_a - Where SLP_TYPa is returned + * *sleep_type_b - Where SLP_TYPb is returned * - * RETURN: Status and the value read. + * RETURN: Status - ACPI status * - * DESCRIPTION: Read from the specified ACPI register + * DESCRIPTION: Obtain the SLP_TYPa and SLP_TYPb values for the requested sleep + * state. * ******************************************************************************/ acpi_status -acpi_hw_register_read(u32 register_id, u32 * return_value) +acpi_get_sleep_type_data(u8 sleep_state, u8 *sleep_type_a, u8 *sleep_type_b) { - u32 value1 = 0; - u32 value2 = 0; - acpi_status status; - - ACPI_FUNCTION_TRACE(hw_register_read); - - switch (register_id) { - case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ - - status = - acpi_hw_low_level_read(16, &value1, - &acpi_gbl_FADT.xpm1a_event_block); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = - acpi_hw_low_level_read(16, &value2, - &acpi_gbl_FADT.xpm1b_event_block); - value1 |= value2; - break; - - case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ - - status = - acpi_hw_low_level_read(16, &value1, &acpi_gbl_xpm1a_enable); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = - acpi_hw_low_level_read(16, &value2, &acpi_gbl_xpm1b_enable); - value1 |= value2; - break; - - case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ - - status = - acpi_hw_low_level_read(16, &value1, - &acpi_gbl_FADT.xpm1a_control_block); - if (ACPI_FAILURE(status)) { - goto exit; - } - - status = - acpi_hw_low_level_read(16, &value2, - &acpi_gbl_FADT.xpm1b_control_block); - value1 |= value2; - break; - - case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ - - status = - acpi_hw_low_level_read(8, &value1, - &acpi_gbl_FADT.xpm2_control_block); - break; - - case ACPI_REGISTER_PM_TIMER: /* 32-bit access */ - - status = - acpi_hw_low_level_read(32, &value1, - &acpi_gbl_FADT.xpm_timer_block); - break; + acpi_status status = AE_OK; + struct acpi_evaluate_info *info; - case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ + ACPI_FUNCTION_TRACE(acpi_get_sleep_type_data); - status = - acpi_os_read_port(acpi_gbl_FADT.smi_command, &value1, 8); - break; + /* Validate parameters */ - default: - ACPI_ERROR((AE_INFO, "Unknown Register ID: %X", 
register_id)); - status = AE_BAD_PARAMETER; - break; + if ((sleep_state > ACPI_S_STATES_MAX) || !sleep_type_a || !sleep_type_b) { + return_ACPI_STATUS(AE_BAD_PARAMETER); } - exit: + /* Allocate the evaluation information block */ - if (ACPI_SUCCESS(status)) { - *return_value = value1; + info = ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_evaluate_info)); + if (!info) { + return_ACPI_STATUS(AE_NO_MEMORY); } - return_ACPI_STATUS(status); -} - -/****************************************************************************** - * - * FUNCTION: acpi_hw_register_write - * - * PARAMETERS: register_id - ACPI Register ID - * Value - The value to write - * - * RETURN: Status - * - * DESCRIPTION: Write to the specified ACPI register - * - * NOTE: In accordance with the ACPI specification, this function automatically - * preserves the value of the following bits, meaning that these bits cannot be - * changed via this interface: - * - * PM1_CONTROL[0] = SCI_EN - * PM1_CONTROL[9] - * PM1_STATUS[11] - * - * ACPI References: - * 1) Hardware Ignored Bits: When software writes to a register with ignored - * bit fields, it preserves the ignored bit fields - * 2) SCI_EN: OSPM always preserves this bit position - * - ******************************************************************************/ - -acpi_status acpi_hw_register_write(u32 register_id, u32 value) -{ - acpi_status status; - u32 read_value; - - ACPI_FUNCTION_TRACE(hw_register_write); - - switch (register_id) { - case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ - - /* Perform a read first to preserve certain bits (per ACPI spec) */ - - status = acpi_hw_register_read(ACPI_REGISTER_PM1_STATUS, - &read_value); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* Insert the bits to be preserved */ - - ACPI_INSERT_BITS(value, ACPI_PM1_STATUS_PRESERVED_BITS, - read_value); - - /* Now we can write the data */ - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1a_event_block); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1b_event_block); - break; - - case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ - - status = - acpi_hw_low_level_write(16, value, &acpi_gbl_xpm1a_enable); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = - acpi_hw_low_level_write(16, value, &acpi_gbl_xpm1b_enable); - break; - - case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ - - /* - * Perform a read first to preserve certain bits (per ACPI spec) - */ - status = acpi_hw_register_read(ACPI_REGISTER_PM1_CONTROL, - &read_value); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* Insert the bits to be preserved */ - - ACPI_INSERT_BITS(value, ACPI_PM1_CONTROL_PRESERVED_BITS, - read_value); - - /* Now we can write the data */ - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1a_control_block); - if (ACPI_FAILURE(status)) { - goto exit; - } - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1b_control_block); - break; - - case ACPI_REGISTER_PM1A_CONTROL: /* 16-bit access */ - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1a_control_block); - break; - - case ACPI_REGISTER_PM1B_CONTROL: /* 16-bit access */ - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1b_control_block); - break; - - case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ - - status = - acpi_hw_low_level_write(8, value, - &acpi_gbl_FADT.xpm2_control_block); - break; - - case ACPI_REGISTER_PM_TIMER: 
/* 32-bit access */ - - status = - acpi_hw_low_level_write(32, value, - &acpi_gbl_FADT.xpm_timer_block); - break; - - case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ + info->pathname = + ACPI_CAST_PTR(char, acpi_gbl_sleep_state_names[sleep_state]); - /* SMI_CMD is currently always in IO space */ + /* Evaluate the namespace object containing the values for this state */ - status = - acpi_os_write_port(acpi_gbl_FADT.smi_command, value, 8); - break; + status = acpi_ns_evaluate(info); + if (ACPI_FAILURE(status)) { + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, + "%s while evaluating SleepState [%s]\n", + acpi_format_exception(status), + info->pathname)); - default: - status = AE_BAD_PARAMETER; - break; + goto cleanup; } - exit: - return_ACPI_STATUS(status); -} - -/****************************************************************************** - * - * FUNCTION: acpi_hw_low_level_read - * - * PARAMETERS: Width - 8, 16, or 32 - * Value - Where the value is returned - * Reg - GAS register structure - * - * RETURN: Status - * - * DESCRIPTION: Read from either memory or IO space. - * - ******************************************************************************/ - -acpi_status -acpi_hw_low_level_read(u32 width, u32 * value, struct acpi_generic_address *reg) -{ - u64 address; - acpi_status status; - - ACPI_FUNCTION_NAME(hw_low_level_read); + /* Must have a return object */ - /* - * Must have a valid pointer to a GAS structure, and - * a non-zero address within. However, don't return an error - * because the PM1A/B code must not fail if B isn't present. - */ - if (!reg) { - return (AE_OK); + if (!info->return_object) { + ACPI_ERROR((AE_INFO, "No Sleep State object returned from [%s]", + info->pathname)); + status = AE_NOT_EXIST; } - /* Get a local copy of the address. Handles possible alignment issues */ + /* It must be of type Package */ - ACPI_MOVE_64_TO_64(&address, ®->address); - if (!address) { - return (AE_OK); + else if (ACPI_GET_OBJECT_TYPE(info->return_object) != ACPI_TYPE_PACKAGE) { + ACPI_ERROR((AE_INFO, + "Sleep State return object is not a Package")); + status = AE_AML_OPERAND_TYPE; } - *value = 0; /* - * Two address spaces supported: Memory or IO. - * PCI_Config is not supported here because the GAS struct is insufficient + * The package must have at least two elements. NOTE (March 2005): This + * goes against the current ACPI spec which defines this object as a + * package with one encoded DWORD element. However, existing practice + * by BIOS vendors seems to be to have 2 or more elements, at least + * one per sleep type (A/B). 
*/ - switch (reg->space_id) { - case ACPI_ADR_SPACE_SYSTEM_MEMORY: - - status = acpi_os_read_memory((acpi_physical_address) address, - value, width); - break; - - case ACPI_ADR_SPACE_SYSTEM_IO: - - status = - acpi_os_read_port((acpi_io_address) address, value, width); - break; - - default: + else if (info->return_object->package.count < 2) { ACPI_ERROR((AE_INFO, - "Unsupported address space: %X", reg->space_id)); - return (AE_BAD_PARAMETER); + "Sleep State return package does not have at least two elements")); + status = AE_AML_NO_OPERAND; } - ACPI_DEBUG_PRINT((ACPI_DB_IO, - "Read: %8.8X width %2d from %8.8X%8.8X (%s)\n", - *value, width, ACPI_FORMAT_UINT64(address), - acpi_ut_get_region_name(reg->space_id))); - - return (status); -} - -/****************************************************************************** - * - * FUNCTION: acpi_hw_low_level_write - * - * PARAMETERS: Width - 8, 16, or 32 - * Value - To be written - * Reg - GAS register structure - * - * RETURN: Status - * - * DESCRIPTION: Write to either memory or IO space. - * - ******************************************************************************/ - -acpi_status -acpi_hw_low_level_write(u32 width, u32 value, struct acpi_generic_address * reg) -{ - u64 address; - acpi_status status; - - ACPI_FUNCTION_NAME(hw_low_level_write); - - /* - * Must have a valid pointer to a GAS structure, and - * a non-zero address within. However, don't return an error - * because the PM1A/B code must not fail if B isn't present. - */ - if (!reg) { - return (AE_OK); - } + /* The first two elements must both be of type Integer */ - /* Get a local copy of the address. Handles possible alignment issues */ + else if ((ACPI_GET_OBJECT_TYPE(info->return_object->package.elements[0]) + != ACPI_TYPE_INTEGER) || + (ACPI_GET_OBJECT_TYPE(info->return_object->package.elements[1]) + != ACPI_TYPE_INTEGER)) { + ACPI_ERROR((AE_INFO, + "Sleep State return package elements are not both Integers (%s, %s)", + acpi_ut_get_object_type_name(info->return_object-> + package.elements[0]), + acpi_ut_get_object_type_name(info->return_object-> + package.elements[1]))); + status = AE_AML_OPERAND_TYPE; + } else { + /* Valid _Sx_ package size, type, and value */ - ACPI_MOVE_64_TO_64(&address, ®->address); - if (!address) { - return (AE_OK); + *sleep_type_a = (u8) + (info->return_object->package.elements[0])->integer.value; + *sleep_type_b = (u8) + (info->return_object->package.elements[1])->integer.value; } - /* - * Two address spaces supported: Memory or IO. 
- * PCI_Config is not supported here because the GAS struct is insufficient - */ - switch (reg->space_id) { - case ACPI_ADR_SPACE_SYSTEM_MEMORY: - - status = acpi_os_write_memory((acpi_physical_address) address, - value, width); - break; - - case ACPI_ADR_SPACE_SYSTEM_IO: - - status = acpi_os_write_port((acpi_io_address) address, value, - width); - break; - - default: - ACPI_ERROR((AE_INFO, - "Unsupported address space: %X", reg->space_id)); - return (AE_BAD_PARAMETER); + if (ACPI_FAILURE(status)) { + ACPI_EXCEPTION((AE_INFO, status, + "While evaluating SleepState [%s], bad Sleep object %p type %s", + info->pathname, info->return_object, + acpi_ut_get_object_type_name(info-> + return_object))); } - ACPI_DEBUG_PRINT((ACPI_DB_IO, - "Wrote: %8.8X width %2d to %8.8X%8.8X (%s)\n", - value, width, ACPI_FORMAT_UINT64(address), - acpi_ut_get_region_name(reg->space_id))); + acpi_ut_remove_reference(info->return_object); - return (status); + cleanup: + ACPI_FREE(info); + return_ACPI_STATUS(status); } + +ACPI_EXPORT_SYMBOL(acpi_get_sleep_type_data) diff --git a/drivers/acpi/namespace/nsaccess.c b/drivers/acpi/acpica/nsaccess.c index c39a7f68b88..88303ebe924 100644 --- a/drivers/acpi/namespace/nsaccess.c +++ b/drivers/acpi/acpica/nsaccess.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> -#include <acpi/acdispat.h> +#include "accommon.h" +#include "amlcode.h" +#include "acnamesp.h" +#include "acdispat.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsaccess") @@ -165,12 +166,9 @@ acpi_status acpi_ns_root_initialize(void) obj_desc->method.method_flags = AML_METHOD_INTERNAL_ONLY; - -#ifndef ACPI_DUMP_APP obj_desc->method.implementation = acpi_ut_osi_implementation; #endif -#endif break; case ACPI_TYPE_INTEGER: @@ -521,11 +519,11 @@ acpi_ns_lookup(union acpi_generic_state *scope_info, } /* - * Search namespace for each segment of the name. Loop through and + * Search namespace for each segment of the name. Loop through and * verify (or add to the namespace) each name segment. * * The object type is significant only at the last name - * segment. (We don't care about the types along the path, only + * segment. (We don't care about the types along the path, only * the type of the final target object.) */ this_search_type = ACPI_TYPE_ANY; @@ -591,6 +589,10 @@ acpi_ns_lookup(union acpi_generic_state *scope_info, * segments). 
*/ if (this_node->type == ACPI_TYPE_LOCAL_ALIAS) { + if (!this_node->object) { + return_ACPI_STATUS(AE_NOT_EXIST); + } + if (acpi_ns_opens_scope (((struct acpi_namespace_node *)this_node-> object)->type)) { diff --git a/drivers/acpi/namespace/nsalloc.c b/drivers/acpi/acpica/nsalloc.c index 3a1740ac2ed..f976d848fe8 100644 --- a/drivers/acpi/namespace/nsalloc.c +++ b/drivers/acpi/acpica/nsalloc.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsalloc") diff --git a/drivers/acpi/namespace/nsdump.c b/drivers/acpi/acpica/nsdump.c index cc0ae39440e..0da33c8e9ba 100644 --- a/drivers/acpi/namespace/nsdump.c +++ b/drivers/acpi/acpica/nsdump.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsdump") diff --git a/drivers/acpi/namespace/nsdumpdv.c b/drivers/acpi/acpica/nsdumpdv.c index 428f50fde11..41994fe7fbb 100644 --- a/drivers/acpi/namespace/nsdumpdv.c +++ b/drivers/acpi/acpica/nsdumpdv.c @@ -42,6 +42,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" /* TBD: This entire module is apparently obsolete and should be removed */ @@ -49,7 +50,7 @@ ACPI_MODULE_NAME("nsdumpdv") #ifdef ACPI_OBSOLETE_FUNCTIONS #if defined(ACPI_DEBUG_OUTPUT) || defined(ACPI_DEBUGGER) -#include <acpi/acnamesp.h> +#include "acnamesp.h" /******************************************************************************* * * FUNCTION: acpi_ns_dump_one_device diff --git a/drivers/acpi/namespace/nseval.c b/drivers/acpi/acpica/nseval.c index 4cdf03ac2b4..0f3d5f9b596 100644 --- a/drivers/acpi/namespace/nseval.c +++ b/drivers/acpi/acpica/nseval.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acparser.h" +#include "acinterp.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nseval") @@ -89,6 +90,7 @@ acpi_status acpi_ns_evaluate(struct acpi_evaluate_info * info) /* Initialize the return value to an invalid object */ info->return_object = NULL; + info->param_count = 0; /* * Get the actual namespace node for the target object. Handles these cases: @@ -141,41 +143,17 @@ acpi_status acpi_ns_evaluate(struct acpi_evaluate_info * info) return_ACPI_STATUS(AE_NULL_OBJECT); } - /* - * Calculate the number of arguments being passed to the method - */ + /* Count the number of arguments being passed to the method */ - info->param_count = 0; if (info->parameters) { - while (info->parameters[info->param_count]) + while (info->parameters[info->param_count]) { + if (info->param_count > ACPI_METHOD_MAX_ARG) { + return_ACPI_STATUS(AE_LIMIT); + } info->param_count++; + } } - /* - * Warning if too few or too many arguments have been passed by the - * caller. We don't want to abort here with an error because an - * incorrect number of arguments may not cause the method to fail. - * However, the method will fail if there are too few arguments passed - * and the method attempts to use one of the missing ones. 
- */ - - if (info->param_count < info->obj_desc->method.param_count) { - ACPI_WARNING((AE_INFO, - "Insufficient arguments - " - "method [%4.4s] needs %d, found %d", - acpi_ut_get_node_name(info->resolved_node), - info->obj_desc->method.param_count, - info->param_count)); - } else if (info->param_count > - info->obj_desc->method.param_count) { - ACPI_WARNING((AE_INFO, - "Excess arguments - " - "method [%4.4s] needs %d, found %d", - acpi_ut_get_node_name(info-> - resolved_node), - info->obj_desc->method.param_count, - info->param_count)); - } ACPI_DUMP_PATHNAME(info->resolved_node, "Execute Method:", ACPI_LV_INFO, _COMPONENT); @@ -264,32 +242,13 @@ acpi_status acpi_ns_evaluate(struct acpi_evaluate_info * info) } } - /* Validation of return values for ACPI-predefined methods and objects */ - - if ((status == AE_OK) || (status == AE_CTRL_RETURN_VALUE)) { - /* - * If this is the first evaluation, check the return value. This - * ensures that any warnings will only be emitted during the very - * first evaluation of the object. - */ - if (!(node->flags & ANOBJ_EVALUATED)) { - /* - * Check for a predefined ACPI name. If found, validate the - * returned object. - * - * Note: Ignore return status for now, emit warnings if there are - * problems with the returned object. May change later to abort - * the method on invalid return object. - */ - (void)acpi_ns_check_predefined_names(node, - info-> - return_object); - } - - /* Mark the node as having been evaluated */ - - node->flags |= ANOBJ_EVALUATED; - } + /* + * Check input argument count against the ASL-defined count for a method. + * Also check predefined names: argument count and return value against + * the ACPI specification. Some incorrect return value types are repaired. + */ + (void)acpi_ns_check_predefined_names(node, info->param_count, + status, &info->return_object); /* Check if there is a return value that must be dealt with */ diff --git a/drivers/acpi/namespace/nsinit.c b/drivers/acpi/acpica/nsinit.c index e4c57510d79..13501cb8186 100644 --- a/drivers/acpi/namespace/nsinit.c +++ b/drivers/acpi/acpica/nsinit.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acdispat.h" +#include "acinterp.h" #include <linux/nmi.h> #define _COMPONENT ACPI_NAMESPACE diff --git a/drivers/acpi/namespace/nsload.c b/drivers/acpi/acpica/nsload.c index a4a412b7c02..a0ba9e12379 100644 --- a/drivers/acpi/namespace/nsload.c +++ b/drivers/acpi/acpica/nsload.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acdispat.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acdispat.h" +#include "actables.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsload") diff --git a/drivers/acpi/namespace/nsnames.c b/drivers/acpi/acpica/nsnames.c index 42a39a7c96e..ae3dc10a7e8 100644 --- a/drivers/acpi/namespace/nsnames.c +++ b/drivers/acpi/acpica/nsnames.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsnames") diff --git a/drivers/acpi/namespace/nsobject.c b/drivers/acpi/acpica/nsobject.c index 15fe09e24f7..08a97a57f8f 100644 --- a/drivers/acpi/namespace/nsobject.c +++ b/drivers/acpi/acpica/nsobject.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include 
"accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsobject") diff --git a/drivers/acpi/namespace/nsparse.c b/drivers/acpi/acpica/nsparse.c index a82271a9dbb..b9e8d0070b6 100644 --- a/drivers/acpi/namespace/nsparse.c +++ b/drivers/acpi/acpica/nsparse.c @@ -42,10 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acparser.h" +#include "acdispat.h" +#include "actables.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsparse") diff --git a/drivers/acpi/namespace/nspredef.c b/drivers/acpi/acpica/nspredef.c index 0f17cf0898c..452703290d3 100644 --- a/drivers/acpi/namespace/nspredef.c +++ b/drivers/acpi/acpica/nspredef.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acpredef.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acpredef.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nspredef") @@ -72,7 +73,7 @@ ACPI_MODULE_NAME("nspredef") /* Local prototypes */ static acpi_status acpi_ns_check_package(char *pathname, - union acpi_operand_object *return_object, + union acpi_operand_object **return_object_ptr, const union acpi_predefined_info *predefined); static acpi_status @@ -82,13 +83,18 @@ acpi_ns_check_package_elements(char *pathname, static acpi_status acpi_ns_check_object_type(char *pathname, - union acpi_operand_object *return_object, + union acpi_operand_object **return_object_ptr, u32 expected_btypes, u32 package_index); static acpi_status acpi_ns_check_reference(char *pathname, union acpi_operand_object *return_object); +static acpi_status +acpi_ns_repair_object(u32 expected_btypes, + u32 package_index, + union acpi_operand_object **return_object_ptr); + /* * Names for the types that can be returned by the predefined objects. * Used for warning messages. Must be in the same order as the ACPI_RTYPEs @@ -108,8 +114,8 @@ static const char *acpi_rtype_names[] = { * FUNCTION: acpi_ns_check_predefined_names * * PARAMETERS: Node - Namespace node for the method/object - * return_object - Object returned from the evaluation of this - * method/object + * return_object_ptr - Pointer to the object returned from the + * evaluation of a method or object * * RETURN: Status * @@ -119,8 +125,11 @@ static const char *acpi_rtype_names[] = { acpi_status acpi_ns_check_predefined_names(struct acpi_namespace_node *node, - union acpi_operand_object *return_object) + u32 user_param_count, + acpi_status return_status, + union acpi_operand_object **return_object_ptr) { + union acpi_operand_object *return_object = *return_object_ptr; acpi_status status = AE_OK; const union acpi_predefined_info *predefined; char *pathname; @@ -128,12 +137,6 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, /* Match the name for this method/object against the predefined list */ predefined = acpi_ns_check_for_predefined_name(node); - if (!predefined) { - - /* Name was not one of the predefined names */ - - return (AE_OK); - } /* Get the full pathname to the object, for use in error messages */ @@ -143,10 +146,37 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, } /* - * Check that the parameter count for this method is in accordance - * with the ACPI specification. + * Check that the parameter count for this method matches the ASL + * definition. 
For predefined names, ensure that both the caller and + * the method itself are in accordance with the ACPI specification. */ - acpi_ns_check_parameter_count(pathname, node, predefined); + acpi_ns_check_parameter_count(pathname, node, user_param_count, + predefined); + + /* If not a predefined name, we cannot validate the return object */ + + if (!predefined) { + goto exit; + } + + /* If the method failed, we cannot validate the return object */ + + if ((return_status != AE_OK) && (return_status != AE_CTRL_RETURN_VALUE)) { + goto exit; + } + + /* + * Only validate the return value on the first successful evaluation of + * the method. This ensures that any warnings will only be emitted during + * the very first evaluation of the method/object. + */ + if (node->flags & ANOBJ_EVALUATED) { + goto exit; + } + + /* Mark the node as having been successfully evaluated */ + + node->flags |= ANOBJ_EVALUATED; /* * If there is no return value, check if we require a return value for @@ -171,7 +201,7 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, * We have a return value, but if one wasn't expected, just exit, this is * not a problem * - * For example, if "Implicit return value" is enabled, methods will + * For example, if the "Implicit Return" feature is enabled, methods will * always return a value */ if (!predefined->info.expected_btypes) { @@ -182,7 +212,7 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, * Check that the type of the return object is what is expected for * this predefined name */ - status = acpi_ns_check_object_type(pathname, return_object, + status = acpi_ns_check_object_type(pathname, return_object_ptr, predefined->info.expected_btypes, ACPI_NOT_PACKAGE); if (ACPI_FAILURE(status)) { @@ -193,11 +223,12 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, if (ACPI_GET_OBJECT_TYPE(return_object) == ACPI_TYPE_PACKAGE) { status = - acpi_ns_check_package(pathname, return_object, predefined); + acpi_ns_check_package(pathname, return_object_ptr, + predefined); } exit: - if (pathname) { + if (pathname != predefined->info.name) { ACPI_FREE(pathname); } @@ -210,6 +241,7 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, * * PARAMETERS: Pathname - Full pathname to the node (for error msgs) * Node - Namespace node for the method/object + * user_param_count - Number of args passed in by the caller * Predefined - Pointer to entry in predefined name table * * RETURN: None @@ -223,32 +255,76 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, void acpi_ns_check_parameter_count(char *pathname, struct acpi_namespace_node *node, + u32 user_param_count, const union acpi_predefined_info *predefined) { u32 param_count; u32 required_params_current; u32 required_params_old; - /* - * Check that the ASL-defined parameter count is what is expected for - * this predefined name. - * - * Methods have 0-7 parameters. All other types have zero. - */ + /* Methods have 0-7 parameters. All other types have zero. */ + param_count = 0; if (node->type == ACPI_TYPE_METHOD) { param_count = node->object->method.param_count; } - /* Validate parameter count - allow two different legal counts (_SCP) */ + /* Argument count check for non-predefined methods/objects */ + + if (!predefined) { + /* + * Warning if too few or too many arguments have been passed by the + * caller. An incorrect number of arguments may not cause the method + * to fail. 
However, the method will fail if there are too few + * arguments and the method attempts to use one of the missing ones. + */ + if (user_param_count < param_count) { + ACPI_WARNING((AE_INFO, + "%s: Insufficient arguments - needs %d, found %d", + pathname, param_count, user_param_count)); + } else if (user_param_count > param_count) { + ACPI_WARNING((AE_INFO, + "%s: Excess arguments - needs %d, found %d", + pathname, param_count, user_param_count)); + } + return; + } + + /* Allow two different legal argument counts (_SCP, etc.) */ required_params_current = predefined->info.param_count & 0x0F; required_params_old = predefined->info.param_count >> 4; + if (user_param_count != ACPI_UINT32_MAX) { + + /* Validate the user-supplied parameter count */ + + if ((user_param_count != required_params_current) && + (user_param_count != required_params_old)) { + ACPI_WARNING((AE_INFO, + "%s: Parameter count mismatch - caller passed %d, ACPI requires %d", + pathname, user_param_count, + required_params_current)); + } + } + + /* + * Only validate the argument count on the first successful evaluation of + * the method. This ensures that any warnings will only be emitted during + * the very first evaluation of the method/object. + */ + if (node->flags & ANOBJ_EVALUATED) { + return; + } + + /* + * Check that the ASL-defined parameter count is what is expected for + * this predefined name. + */ if ((param_count != required_params_current) && (param_count != required_params_old)) { ACPI_WARNING((AE_INFO, - "%s: Parameter count mismatch - ASL declared %d, expected %d", + "%s: Parameter count mismatch - ASL declared %d, ACPI requires %d", pathname, param_count, required_params_current)); } } @@ -307,8 +383,8 @@ const union acpi_predefined_info *acpi_ns_check_for_predefined_name(struct * FUNCTION: acpi_ns_check_package * * PARAMETERS: Pathname - Full pathname to the node (for error msgs) - * return_object - Object returned from the evaluation of a - * method or object + * return_object_ptr - Pointer to the object returned from the + * evaluation of a method or object * Predefined - Pointer to entry in predefined name table * * RETURN: Status @@ -320,9 +396,10 @@ const union acpi_predefined_info *acpi_ns_check_for_predefined_name(struct static acpi_status acpi_ns_check_package(char *pathname, - union acpi_operand_object *return_object, + union acpi_operand_object **return_object_ptr, const union acpi_predefined_info *predefined) { + union acpi_operand_object *return_object = *return_object_ptr; const union acpi_predefined_info *package; union acpi_operand_object *sub_package; union acpi_operand_object **elements; @@ -408,7 +485,7 @@ acpi_ns_check_package(char *pathname, * elements must be of the same type */ for (i = 0; i < count; i++) { - status = acpi_ns_check_object_type(pathname, *elements, + status = acpi_ns_check_object_type(pathname, elements, package->ret_info. object_type1, i); if (ACPI_FAILURE(status)) { @@ -441,7 +518,7 @@ acpi_ns_check_package(char *pathname, status = acpi_ns_check_object_type(pathname, - *elements, + elements, package-> ret_info3. object_type[i], @@ -454,7 +531,7 @@ acpi_ns_check_package(char *pathname, status = acpi_ns_check_object_type(pathname, - *elements, + elements, package-> ret_info3. 
tail_object_type, @@ -471,7 +548,7 @@ acpi_ns_check_package(char *pathname, /* First element is the (Integer) count of sub-packages to follow */ - status = acpi_ns_check_object_type(pathname, *elements, + status = acpi_ns_check_object_type(pathname, elements, ACPI_RTYPE_INTEGER, 0); if (ACPI_FAILURE(status)) { return (status); @@ -509,7 +586,7 @@ acpi_ns_check_package(char *pathname, /* Each sub-object must be of type Package */ status = - acpi_ns_check_object_type(pathname, sub_package, + acpi_ns_check_object_type(pathname, &sub_package, ACPI_RTYPE_PACKAGE, i); if (ACPI_FAILURE(status)) { return (status); @@ -567,12 +644,8 @@ acpi_ns_check_package(char *pathname, for (j = 0; j < expected_count; j++) { status = acpi_ns_check_object_type(pathname, - sub_elements - [j], - package-> - ret_info2. - object_type - [j], j); + &sub_elements[j], + package->ret_info2.object_type[j], j); if (ACPI_FAILURE(status)) { return (status); } @@ -611,7 +684,7 @@ acpi_ns_check_package(char *pathname, status = acpi_ns_check_object_type(pathname, - *sub_elements, + sub_elements, ACPI_RTYPE_INTEGER, 0); if (ACPI_FAILURE(status)) { @@ -708,7 +781,7 @@ acpi_ns_check_package_elements(char *pathname, * The second group can have a count of zero. */ for (i = 0; i < count1; i++) { - status = acpi_ns_check_object_type(pathname, *this_element, + status = acpi_ns_check_object_type(pathname, this_element, type1, i); if (ACPI_FAILURE(status)) { return (status); @@ -717,7 +790,7 @@ acpi_ns_check_package_elements(char *pathname, } for (i = 0; i < count2; i++) { - status = acpi_ns_check_object_type(pathname, *this_element, + status = acpi_ns_check_object_type(pathname, this_element, type2, (i + count1)); if (ACPI_FAILURE(status)) { return (status); @@ -733,8 +806,8 @@ acpi_ns_check_package_elements(char *pathname, * FUNCTION: acpi_ns_check_object_type * * PARAMETERS: Pathname - Full pathname to the node (for error msgs) - * return_object - Object return from the execution of this - * method/object + * return_object_ptr - Pointer to the object returned from the + * evaluation of a method or object * expected_btypes - Bitmap of expected return type(s) * package_index - Index of object within parent package (if * applicable - ACPI_NOT_PACKAGE otherwise) @@ -748,9 +821,10 @@ acpi_ns_check_package_elements(char *pathname, static acpi_status acpi_ns_check_object_type(char *pathname, - union acpi_operand_object *return_object, + union acpi_operand_object **return_object_ptr, u32 expected_btypes, u32 package_index) { + union acpi_operand_object *return_object = *return_object_ptr; acpi_status status = AE_OK; u32 return_btype; char type_buffer[48]; /* Room for 5 types */ @@ -814,6 +888,14 @@ acpi_ns_check_object_type(char *pathname, /* Is the object one of the expected types? */ if (!(return_btype & expected_btypes)) { + + /* Type mismatch -- attempt repair of the returned object */ + + status = acpi_ns_repair_object(expected_btypes, package_index, + return_object_ptr); + if (ACPI_SUCCESS(status)) { + return (status); + } goto type_error_exit; } @@ -898,3 +980,86 @@ acpi_ns_check_reference(char *pathname, return (AE_AML_OPERAND_TYPE); } + +/******************************************************************************* + * + * FUNCTION: acpi_ns_repair_object + * + * PARAMETERS: Pathname - Full pathname to the node (for error msgs) + * package_index - Used to determine if target is in a package + * return_object_ptr - Pointer to the object returned from the + * evaluation of a method or object + * + * RETURN: Status. 
AE_OK if repair was successful. + * + * DESCRIPTION: Attempt to repair/convert a return object of a type that was + * not expected. + * + ******************************************************************************/ + +static acpi_status +acpi_ns_repair_object(u32 expected_btypes, + u32 package_index, + union acpi_operand_object **return_object_ptr) +{ + union acpi_operand_object *return_object = *return_object_ptr; + union acpi_operand_object *new_object; + acpi_size length; + + switch (ACPI_GET_OBJECT_TYPE(return_object)) { + case ACPI_TYPE_BUFFER: + + if (!(expected_btypes & ACPI_RTYPE_STRING)) { + return (AE_AML_OPERAND_TYPE); + } + + /* + * Have a Buffer, expected a String, convert. Use a to_string + * conversion, no transform performed on the buffer data. The best + * example of this is the _BIF method, where the string data from + * the battery is often (incorrectly) returned as buffer object(s). + */ + length = 0; + while ((length < return_object->buffer.length) && + (return_object->buffer.pointer[length])) { + length++; + } + + /* Allocate a new string object */ + + new_object = acpi_ut_create_string_object(length); + if (!new_object) { + return (AE_NO_MEMORY); + } + + /* + * Copy the raw buffer data with no transform. String is already NULL + * terminated at Length+1. + */ + ACPI_MEMCPY(new_object->string.pointer, + return_object->buffer.pointer, length); + + /* Install the new return object */ + + acpi_ut_remove_reference(return_object); + *return_object_ptr = new_object; + + /* + * If the object is a package element, we need to: + * 1. Decrement the reference count of the orignal object, it was + * incremented when building the package + * 2. Increment the reference count of the new object, it will be + * decremented when releasing the package + */ + if (package_index != ACPI_NOT_PACKAGE) { + acpi_ut_remove_reference(return_object); + acpi_ut_add_reference(new_object); + } + return (AE_OK); + + default: + break; + } + + return (AE_AML_OPERAND_TYPE); +} diff --git a/drivers/acpi/namespace/nssearch.c b/drivers/acpi/acpica/nssearch.c index a9a80bf811b..6fea13f3f52 100644 --- a/drivers/acpi/namespace/nssearch.c +++ b/drivers/acpi/acpica/nssearch.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nssearch") diff --git a/drivers/acpi/namespace/nsutils.c b/drivers/acpi/acpica/nsutils.c index b0817e1127b..3e1149bf4aa 100644 --- a/drivers/acpi/namespace/nsutils.c +++ b/drivers/acpi/acpica/nsutils.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/amlcode.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "amlcode.h" +#include "actables.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsutils") @@ -314,9 +315,15 @@ void acpi_ns_get_internal_name_length(struct acpi_namestring_info *info) * * strlen() + 1 covers the first name_seg, which has no path separator */ - if (acpi_ns_valid_root_prefix(next_external_char[0])) { + if (acpi_ns_valid_root_prefix(*next_external_char)) { info->fully_qualified = TRUE; next_external_char++; + + /* Skip redundant root_prefix, like \\_SB.PCI0.SBRG.EC0 */ + + while (acpi_ns_valid_root_prefix(*next_external_char)) { + next_external_char++; + } } else { /* * Handle Carat prefixes diff --git a/drivers/acpi/namespace/nswalk.c b/drivers/acpi/acpica/nswalk.c index 3c905ce26d7..200895fa272 100644 --- a/drivers/acpi/namespace/nswalk.c +++ 
b/drivers/acpi/acpica/nswalk.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nswalk") diff --git a/drivers/acpi/namespace/nsxfeval.c b/drivers/acpi/acpica/nsxfeval.c index a085cc39c05..22a7171ac1e 100644 --- a/drivers/acpi/namespace/nsxfeval.c +++ b/drivers/acpi/acpica/nsxfeval.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsxfeval") diff --git a/drivers/acpi/namespace/nsxfname.c b/drivers/acpi/acpica/nsxfname.c index 5efa4e7ddb0..9589fea2499 100644 --- a/drivers/acpi/namespace/nsxfname.c +++ b/drivers/acpi/acpica/nsxfname.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsxfname") diff --git a/drivers/acpi/namespace/nsxfobj.c b/drivers/acpi/acpica/nsxfobj.c index 2b375ee80ce..1c7efc15225 100644 --- a/drivers/acpi/namespace/nsxfobj.c +++ b/drivers/acpi/acpica/nsxfobj.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsxfobj") diff --git a/drivers/acpi/parser/psargs.c b/drivers/acpi/acpica/psargs.c index d830b29b85b..b161f3544b5 100644 --- a/drivers/acpi/parser/psargs.c +++ b/drivers/acpi/acpica/psargs.c @@ -42,10 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> -#include <acpi/acdispat.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acnamesp.h" +#include "acdispat.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psargs") diff --git a/drivers/acpi/parser/psloop.c b/drivers/acpi/acpica/psloop.c index 4647039a0d8..c5f6ce19a40 100644 --- a/drivers/acpi/parser/psloop.c +++ b/drivers/acpi/acpica/psloop.c @@ -50,9 +50,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acparser.h" +#include "acdispat.h" +#include "amlcode.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psloop") diff --git a/drivers/acpi/parser/psopcode.c b/drivers/acpi/acpica/psopcode.c index f425ab30eae..3bc3a60194d 100644 --- a/drivers/acpi/parser/psopcode.c +++ b/drivers/acpi/acpica/psopcode.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acopcode.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acparser.h" +#include "acopcode.h" +#include "amlcode.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psopcode") diff --git a/drivers/acpi/parser/psparse.c b/drivers/acpi/acpica/psparse.c index 68e932f215e..70838e9b608 100644 --- a/drivers/acpi/parser/psparse.c +++ b/drivers/acpi/acpica/psparse.c @@ -51,11 +51,12 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acparser.h" +#include "acdispat.h" +#include "amlcode.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psparse") @@ -447,10 +448,22 @@ acpi_status acpi_ps_parse_aml(struct acpi_walk_state *walk_state) walk_state, walk_state->parser_state.aml, walk_state->parser_state.aml_size)); + 
if (!walk_state->parser_state.aml) { + return_ACPI_STATUS(AE_NULL_OBJECT); + } + /* Create and initialize a new thread state */ thread = acpi_ut_create_thread_state(); if (!thread) { + if (walk_state->method_desc) { + + /* Executing a control method - additional cleanup */ + + acpi_ds_terminate_control_method( + walk_state->method_desc, walk_state); + } + acpi_ds_delete_walk_state(walk_state); return_ACPI_STATUS(AE_NO_MEMORY); } diff --git a/drivers/acpi/parser/psscope.c b/drivers/acpi/acpica/psscope.c index ee50e67c944..2feca5ca958 100644 --- a/drivers/acpi/parser/psscope.c +++ b/drivers/acpi/acpica/psscope.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> +#include "accommon.h" +#include "acparser.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psscope") diff --git a/drivers/acpi/parser/pstree.c b/drivers/acpi/acpica/pstree.c index 1dd355ddd18..4d3389118ec 100644 --- a/drivers/acpi/parser/pstree.c +++ b/drivers/acpi/acpica/pstree.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("pstree") diff --git a/drivers/acpi/parser/psutils.c b/drivers/acpi/acpica/psutils.c index 7cf1f65cd5b..e636e078ad3 100644 --- a/drivers/acpi/parser/psutils.c +++ b/drivers/acpi/acpica/psutils.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psutils") diff --git a/drivers/acpi/parser/pswalk.c b/drivers/acpi/acpica/pswalk.c index 8b86ad5a320..78b8b791f2a 100644 --- a/drivers/acpi/parser/pswalk.c +++ b/drivers/acpi/acpica/pswalk.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> +#include "accommon.h" +#include "acparser.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("pswalk") diff --git a/drivers/acpi/parser/psxface.c b/drivers/acpi/acpica/psxface.c index 270469aae84..ff06032c0f0 100644 --- a/drivers/acpi/parser/psxface.c +++ b/drivers/acpi/acpica/psxface.c @@ -42,9 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acparser.h" +#include "acdispat.h" +#include "acinterp.h" +#include "amlcode.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psxface") @@ -278,6 +280,38 @@ acpi_status acpi_ps_execute_method(struct acpi_evaluate_info *info) goto cleanup; } + /* Invoke an internal method if necessary */ + + if (info->obj_desc->method.method_flags & AML_METHOD_INTERNAL_ONLY) { + status = info->obj_desc->method.implementation(walk_state); + info->return_object = walk_state->return_desc; + + /* Cleanup states */ + + acpi_ds_scope_stack_clear(walk_state); + acpi_ps_cleanup_scope(&walk_state->parser_state); + acpi_ds_terminate_control_method(walk_state->method_desc, + walk_state); + acpi_ds_delete_walk_state(walk_state); + goto cleanup; + } + + /* + * Start method evaluation with an implicit return of zero. + * This is done for Windows compatibility. 
+ */ + if (acpi_gbl_enable_interpreter_slack) { + walk_state->implicit_return_obj = + acpi_ut_create_internal_object(ACPI_TYPE_INTEGER); + if (!walk_state->implicit_return_obj) { + status = AE_NO_MEMORY; + acpi_ds_delete_walk_state(walk_state); + goto cleanup; + } + + walk_state->implicit_return_obj->integer.value = 0; + } + /* Parse the AML */ status = acpi_ps_parse_aml(walk_state); diff --git a/drivers/acpi/resources/rsaddr.c b/drivers/acpi/acpica/rsaddr.c index 7f96332822b..1e437bfd8db 100644 --- a/drivers/acpi/resources/rsaddr.c +++ b/drivers/acpi/acpica/rsaddr.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsaddr") diff --git a/drivers/acpi/resources/rscalc.c b/drivers/acpi/acpica/rscalc.c index 8eaaecf9200..52865ee6bc7 100644 --- a/drivers/acpi/resources/rscalc.c +++ b/drivers/acpi/acpica/rscalc.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acresrc.h" +#include "acnamesp.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rscalc") diff --git a/drivers/acpi/resources/rscreate.c b/drivers/acpi/acpica/rscreate.c index 08b8d73e6ee..61566b1a061 100644 --- a/drivers/acpi/resources/rscreate.c +++ b/drivers/acpi/acpica/rscreate.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acresrc.h" +#include "acnamesp.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rscreate") diff --git a/drivers/acpi/resources/rsdump.c b/drivers/acpi/acpica/rsdump.c index 6bbbb7b8941..3f0ca5a12d3 100644 --- a/drivers/acpi/resources/rsdump.c +++ b/drivers/acpi/acpica/rsdump.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsdump") diff --git a/drivers/acpi/resources/rsinfo.c b/drivers/acpi/acpica/rsinfo.c index 3f0a1fedbe0..77b25fdb459 100644 --- a/drivers/acpi/resources/rsinfo.c +++ b/drivers/acpi/acpica/rsinfo.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsinfo") diff --git a/drivers/acpi/resources/rsio.c b/drivers/acpi/acpica/rsio.c index b66d42e7402..35a49aa9560 100644 --- a/drivers/acpi/resources/rsio.c +++ b/drivers/acpi/acpica/rsio.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsio") diff --git a/drivers/acpi/resources/rsirq.c b/drivers/acpi/acpica/rsirq.c index a8805efc036..2e0256983aa 100644 --- a/drivers/acpi/resources/rsirq.c +++ b/drivers/acpi/acpica/rsirq.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsirq") diff --git a/drivers/acpi/resources/rslist.c b/drivers/acpi/acpica/rslist.c index b78c7e797a1..1b1dbc69f08 100644 --- a/drivers/acpi/resources/rslist.c +++ b/drivers/acpi/acpica/rslist.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rslist") diff --git a/drivers/acpi/resources/rsmemory.c b/drivers/acpi/acpica/rsmemory.c index 63b21abd90b..ddc76cebdc9 100644 --- a/drivers/acpi/resources/rsmemory.c 
+++ b/drivers/acpi/acpica/rsmemory.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsmemory") diff --git a/drivers/acpi/resources/rsmisc.c b/drivers/acpi/acpica/rsmisc.c index 96a6c035325..5bc49a55328 100644 --- a/drivers/acpi/resources/rsmisc.c +++ b/drivers/acpi/acpica/rsmisc.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsmisc") diff --git a/drivers/acpi/resources/rsutils.c b/drivers/acpi/acpica/rsutils.c index f7b3bcd59ba..bc03d596682 100644 --- a/drivers/acpi/resources/rsutils.c +++ b/drivers/acpi/acpica/rsutils.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsutils") diff --git a/drivers/acpi/resources/rsxface.c b/drivers/acpi/acpica/rsxface.c index f59f4c4e034..69a2aa5b5d8 100644 --- a/drivers/acpi/resources/rsxface.c +++ b/drivers/acpi/acpica/rsxface.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acresrc.h" +#include "acnamesp.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsxface") diff --git a/drivers/acpi/tables/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index 2817158fb6a..3636e4f8fb7 100644 --- a/drivers/acpi/tables/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -42,15 +42,16 @@ */ #include <acpi/acpi.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbfadt") /* Local prototypes */ -static void inline +static inline void acpi_tb_init_generic_address(struct acpi_generic_address *generic_address, - u8 byte_width, u64 address); + u8 space_id, u8 byte_width, u64 address); static void acpi_tb_convert_fadt(void); @@ -60,9 +61,10 @@ static void acpi_tb_validate_fadt(void); typedef struct acpi_fadt_info { char *name; - u8 target; - u8 source; + u8 address64; + u8 address32; u8 length; + u8 default_length; u8 type; } acpi_fadt_info; @@ -71,37 +73,61 @@ typedef struct acpi_fadt_info { #define ACPI_FADT_SEPARATE_LENGTH 2 static struct acpi_fadt_info fadt_info_table[] = { - {"Pm1aEventBlock", ACPI_FADT_OFFSET(xpm1a_event_block), + {"Pm1aEventBlock", + ACPI_FADT_OFFSET(xpm1a_event_block), ACPI_FADT_OFFSET(pm1a_event_block), - ACPI_FADT_OFFSET(pm1_event_length), ACPI_FADT_REQUIRED}, + ACPI_FADT_OFFSET(pm1_event_length), + ACPI_PM1_REGISTER_WIDTH * 2, /* Enable + Status register */ + ACPI_FADT_REQUIRED}, - {"Pm1bEventBlock", ACPI_FADT_OFFSET(xpm1b_event_block), + {"Pm1bEventBlock", + ACPI_FADT_OFFSET(xpm1b_event_block), ACPI_FADT_OFFSET(pm1b_event_block), - ACPI_FADT_OFFSET(pm1_event_length), 0}, + ACPI_FADT_OFFSET(pm1_event_length), + ACPI_PM1_REGISTER_WIDTH * 2, /* Enable + Status register */ + 0}, - {"Pm1aControlBlock", ACPI_FADT_OFFSET(xpm1a_control_block), + {"Pm1aControlBlock", + ACPI_FADT_OFFSET(xpm1a_control_block), ACPI_FADT_OFFSET(pm1a_control_block), - ACPI_FADT_OFFSET(pm1_control_length), ACPI_FADT_REQUIRED}, + ACPI_FADT_OFFSET(pm1_control_length), + ACPI_PM1_REGISTER_WIDTH, + ACPI_FADT_REQUIRED}, - {"Pm1bControlBlock", ACPI_FADT_OFFSET(xpm1b_control_block), + {"Pm1bControlBlock", + ACPI_FADT_OFFSET(xpm1b_control_block), ACPI_FADT_OFFSET(pm1b_control_block), - ACPI_FADT_OFFSET(pm1_control_length), 0}, + 
ACPI_FADT_OFFSET(pm1_control_length), + ACPI_PM1_REGISTER_WIDTH, + 0}, - {"Pm2ControlBlock", ACPI_FADT_OFFSET(xpm2_control_block), + {"Pm2ControlBlock", + ACPI_FADT_OFFSET(xpm2_control_block), ACPI_FADT_OFFSET(pm2_control_block), - ACPI_FADT_OFFSET(pm2_control_length), ACPI_FADT_SEPARATE_LENGTH}, + ACPI_FADT_OFFSET(pm2_control_length), + ACPI_PM2_REGISTER_WIDTH, + ACPI_FADT_SEPARATE_LENGTH}, - {"PmTimerBlock", ACPI_FADT_OFFSET(xpm_timer_block), + {"PmTimerBlock", + ACPI_FADT_OFFSET(xpm_timer_block), ACPI_FADT_OFFSET(pm_timer_block), - ACPI_FADT_OFFSET(pm_timer_length), ACPI_FADT_REQUIRED}, + ACPI_FADT_OFFSET(pm_timer_length), + ACPI_PM_TIMER_WIDTH, + ACPI_FADT_REQUIRED}, - {"Gpe0Block", ACPI_FADT_OFFSET(xgpe0_block), + {"Gpe0Block", + ACPI_FADT_OFFSET(xgpe0_block), ACPI_FADT_OFFSET(gpe0_block), - ACPI_FADT_OFFSET(gpe0_block_length), ACPI_FADT_SEPARATE_LENGTH}, + ACPI_FADT_OFFSET(gpe0_block_length), + 0, + ACPI_FADT_SEPARATE_LENGTH}, - {"Gpe1Block", ACPI_FADT_OFFSET(xgpe1_block), + {"Gpe1Block", + ACPI_FADT_OFFSET(xgpe1_block), ACPI_FADT_OFFSET(gpe1_block), - ACPI_FADT_OFFSET(gpe1_block_length), ACPI_FADT_SEPARATE_LENGTH} + ACPI_FADT_OFFSET(gpe1_block_length), + 0, + ACPI_FADT_SEPARATE_LENGTH} }; #define ACPI_FADT_INFO_ENTRIES (sizeof (fadt_info_table) / sizeof (struct acpi_fadt_info)) @@ -122,9 +148,9 @@ static struct acpi_fadt_info fadt_info_table[] = { * ******************************************************************************/ -static void inline +static inline void acpi_tb_init_generic_address(struct acpi_generic_address *generic_address, - u8 byte_width, u64 address) + u8 space_id, u8 byte_width, u64 address) { /* @@ -135,10 +161,10 @@ acpi_tb_init_generic_address(struct acpi_generic_address *generic_address, /* All other fields are byte-wide */ - generic_address->space_id = ACPI_ADR_SPACE_SYSTEM_IO; - generic_address->bit_width = byte_width << 3; + generic_address->space_id = space_id; + generic_address->bit_width = (u8)ACPI_MUL_8(byte_width); generic_address->bit_offset = 0; - generic_address->access_width = 0; + generic_address->access_width = 0; /* Access width ANY */ } /******************************************************************************* @@ -225,7 +251,8 @@ void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length) */ if (length > sizeof(struct acpi_table_fadt)) { ACPI_WARNING((AE_INFO, - "FADT (revision %u) is longer than ACPI 2.0 version, truncating length 0x%X to 0x%zX", + "FADT (revision %u) is longer than ACPI 2.0 version, " + "truncating length 0x%X to 0x%zX", table->revision, (unsigned)length, sizeof(struct acpi_table_fadt))); } @@ -244,7 +271,6 @@ void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length) * 2) Validate some of the important values within the FADT */ acpi_tb_convert_fadt(); - acpi_tb_validate_fadt(); } /******************************************************************************* @@ -278,22 +304,36 @@ void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length) static void acpi_tb_convert_fadt(void) { - u8 pm1_register_length; - struct acpi_generic_address *target; + u8 pm1_register_bit_width; + u8 pm1_register_byte_width; + struct acpi_generic_address *target64; u32 i; /* Update the local FADT table header length */ acpi_gbl_FADT.header.length = sizeof(struct acpi_table_fadt); - /* Expand the 32-bit FACS and DSDT addresses to 64-bit as necessary */ - + /* + * Expand the 32-bit FACS and DSDT addresses to 64-bit as necessary. + * Later code will always use the X 64-bit field. 
Also, check for an + * address mismatch between the 32-bit and 64-bit address fields + * (FIRMWARE_CTRL/X_FIRMWARE_CTRL, DSDT/X_DSDT) which would indicate + * the presence of two FACS or two DSDT tables. + */ if (!acpi_gbl_FADT.Xfacs) { acpi_gbl_FADT.Xfacs = (u64) acpi_gbl_FADT.facs; + } else if (acpi_gbl_FADT.facs && + (acpi_gbl_FADT.Xfacs != (u64) acpi_gbl_FADT.facs)) { + ACPI_WARNING((AE_INFO, + "32/64 FACS address mismatch in FADT - two FACS tables!")); } if (!acpi_gbl_FADT.Xdsdt) { acpi_gbl_FADT.Xdsdt = (u64) acpi_gbl_FADT.dsdt; + } else if (acpi_gbl_FADT.dsdt && + (acpi_gbl_FADT.Xdsdt != (u64) acpi_gbl_FADT.dsdt)) { + ACPI_WARNING((AE_INFO, + "32/64 DSDT address mismatch in FADT - two DSDT tables!")); } /* @@ -312,18 +352,23 @@ static void acpi_tb_convert_fadt(void) } /* - * Expand the ACPI 1.0 32-bit V1.0 addresses to the ACPI 2.0 64-bit "X" - * generic address structures as necessary. + * Expand the ACPI 1.0 32-bit addresses to the ACPI 2.0 64-bit "X" + * generic address structures as necessary. Later code will always use + * the 64-bit address structures. */ for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { - target = + target64 = ACPI_ADD_PTR(struct acpi_generic_address, &acpi_gbl_FADT, - fadt_info_table[i].target); + fadt_info_table[i].address64); - /* Expand only if the X target is null */ + /* Expand only if the 64-bit X target is null */ - if (!target->address) { - acpi_tb_init_generic_address(target, + if (!target64->address) { + + /* The space_id is always I/O for the 32-bit legacy address fields */ + + acpi_tb_init_generic_address(target64, + ACPI_ADR_SPACE_SYSTEM_IO, *ACPI_ADD_PTR(u8, &acpi_gbl_FADT, fadt_info_table @@ -332,11 +377,64 @@ static void acpi_tb_convert_fadt(void) &acpi_gbl_FADT, fadt_info_table [i]. - source)); + address32)); + } + } + + /* Validate FADT values now, before we make any changes */ + + acpi_tb_validate_fadt(); + + /* + * Optionally check all register lengths against the default values and + * update them if they are incorrect. + */ + if (acpi_gbl_use_default_register_widths) { + for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { + target64 = + ACPI_ADD_PTR(struct acpi_generic_address, + &acpi_gbl_FADT, + fadt_info_table[i].address64); + + /* + * If a valid register (Address != 0) and the (default_length > 0) + * (Not a GPE register), then check the width against the default. + */ + if ((target64->address) && + (fadt_info_table[i].default_length > 0) && + (fadt_info_table[i].default_length != + target64->bit_width)) { + ACPI_WARNING((AE_INFO, + "Invalid length for %s: %d, using default %d", + fadt_info_table[i].name, + target64->bit_width, + fadt_info_table[i]. + default_length)); + + /* Incorrect size, set width to the default */ + + target64->bit_width = + fadt_info_table[i].default_length; + } } } /* + * Get the length of the individual PM1 registers (enable and status). + * Each register is defined to be (event block length / 2). + */ + pm1_register_bit_width = + (u8)ACPI_DIV_2(acpi_gbl_FADT.xpm1a_event_block.bit_width); + pm1_register_byte_width = (u8)ACPI_DIV_8(pm1_register_bit_width); + + /* + * Adjust the lengths of the PM1 Event Blocks so that they can be used to + * access the PM1 status register(s). Use (width / 2) + */ + acpi_gbl_FADT.xpm1a_event_block.bit_width = pm1_register_bit_width; + acpi_gbl_FADT.xpm1b_event_block.bit_width = pm1_register_bit_width; + + /* * Calculate separate GAS structs for the PM1 Enable registers. * These addresses do not appear (directly) in the FADT, so it is * useful to calculate them once, here. 
@@ -356,14 +454,14 @@ static void acpi_tb_convert_fadt(void) " PM1_EVT_LEN (%u)\n", acpi_gbl_FADT.xpm1a_event_block.bit_width, acpi_gbl_FADT.pm1_event_length); - pm1_register_length = (u8) ACPI_DIV_2(acpi_gbl_FADT.pm1_event_length); /* The PM1A register block is required */ acpi_tb_init_generic_address(&acpi_gbl_xpm1a_enable, - pm1_register_length, + acpi_gbl_FADT.xpm1a_event_block.space_id, + pm1_register_byte_width, (acpi_gbl_FADT.xpm1a_event_block.address + - pm1_register_length)); + pm1_register_byte_width)); /* Don't forget to copy space_id of the GAS */ acpi_gbl_xpm1a_enable.space_id = acpi_gbl_FADT.xpm1a_event_block.space_id; @@ -379,9 +477,10 @@ static void acpi_tb_convert_fadt(void) acpi_gbl_FADT.xpm1b_event_block.bit_width, acpi_gbl_FADT.pm1_event_length); acpi_tb_init_generic_address(&acpi_gbl_xpm1b_enable, - pm1_register_length, + acpi_gbl_FADT.xpm1b_event_block.space_id, + pm1_register_byte_width, (acpi_gbl_FADT.xpm1b_event_block. - address + pm1_register_length)); + address + pm1_register_byte_width)); /* Don't forget to copy space_id of the GAS */ acpi_gbl_xpm1b_enable.space_id = acpi_gbl_FADT.xpm1b_event_block.space_id; @@ -411,26 +510,63 @@ static void acpi_tb_convert_fadt(void) static void acpi_tb_validate_fadt(void) { + char *name; u32 *address32; struct acpi_generic_address *address64; u8 length; u32 i; - /* Examine all of the 64-bit extended address fields (X fields) */ + /* + * Check for FACS and DSDT address mismatches. An address mismatch between + * the 32-bit and 64-bit address fields (FIRMWARE_CTRL/X_FIRMWARE_CTRL and + * DSDT/X_DSDT) would indicate the presence of two FACS or two DSDT tables. + */ + if (acpi_gbl_FADT.facs && + (acpi_gbl_FADT.Xfacs != (u64) acpi_gbl_FADT.facs)) { + ACPI_WARNING((AE_INFO, + "32/64X FACS address mismatch in FADT - " + "two FACS tables! %8.8X/%8.8X%8.8X", + acpi_gbl_FADT.facs, + ACPI_FORMAT_UINT64(acpi_gbl_FADT.Xfacs))); + } - for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { + if (acpi_gbl_FADT.dsdt && + (acpi_gbl_FADT.Xdsdt != (u64) acpi_gbl_FADT.dsdt)) { + ACPI_WARNING((AE_INFO, + "32/64X DSDT address mismatch in FADT - " + "two DSDT tables! %8.8X/%8.8X%8.8X", + acpi_gbl_FADT.dsdt, + ACPI_FORMAT_UINT64(acpi_gbl_FADT.Xdsdt))); + } - /* Generate pointers to the 32-bit and 64-bit addresses and get the length */ + /* Examine all of the 64-bit extended address fields (X fields) */ - address64 = - ACPI_ADD_PTR(struct acpi_generic_address, &acpi_gbl_FADT, - fadt_info_table[i].target); + for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { + /* + * Generate pointers to the 32-bit and 64-bit addresses, get the + * register length (width), and the register name + */ + address64 = ACPI_ADD_PTR(struct acpi_generic_address, + &acpi_gbl_FADT, + fadt_info_table[i].address64); address32 = ACPI_ADD_PTR(u32, &acpi_gbl_FADT, - fadt_info_table[i].source); + fadt_info_table[i].address32); length = *ACPI_ADD_PTR(u8, &acpi_gbl_FADT, fadt_info_table[i].length); + name = fadt_info_table[i].name; + + /* + * For each extended field, check for length mismatch between the + * legacy length field and the corresponding 64-bit X length field. 
+ */ + if (address64 && (address64->bit_width != ACPI_MUL_8(length))) { + ACPI_WARNING((AE_INFO, + "32/64X length mismatch in %s: %d/%d", + name, ACPI_MUL_8(length), + address64->bit_width)); + } if (fadt_info_table[i].type & ACPI_FADT_REQUIRED) { /* @@ -439,8 +575,8 @@ static void acpi_tb_validate_fadt(void) */ if (!address64->address || !length) { ACPI_ERROR((AE_INFO, - "Required field \"%s\" has zero address and/or length: %8.8X%8.8X/%X", - fadt_info_table[i].name, + "Required field %s has zero address and/or length: %8.8X%8.8X/%X", + name, ACPI_FORMAT_UINT64(address64-> address), length)); @@ -453,8 +589,8 @@ static void acpi_tb_validate_fadt(void) if ((address64->address && !length) || (!address64->address && length)) { ACPI_WARNING((AE_INFO, - "Optional field \"%s\" has zero address or length: %8.8X%8.8X/%X", - fadt_info_table[i].name, + "Optional field %s has zero address or length: %8.8X%8.8X/%X", + name, ACPI_FORMAT_UINT64(address64-> address), length)); @@ -466,8 +602,8 @@ static void acpi_tb_validate_fadt(void) if (address64->address && *address32 && (address64->address != (u64) * address32)) { ACPI_ERROR((AE_INFO, - "32/64X address mismatch in \"%s\": [%8.8X] [%8.8X%8.8X], using 64X", - fadt_info_table[i].name, *address32, + "32/64X address mismatch in %s: %8.8X/%8.8X%8.8X, using 64X", + name, *address32, ACPI_FORMAT_UINT64(address64->address))); } } diff --git a/drivers/acpi/tables/tbfind.c b/drivers/acpi/acpica/tbfind.c index 531584defbb..1054dfd4920 100644 --- a/drivers/acpi/tables/tbfind.c +++ b/drivers/acpi/acpica/tbfind.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbfind") diff --git a/drivers/acpi/tables/tbinstal.c b/drivers/acpi/acpica/tbinstal.c index 18747ce8dd2..37374b21969 100644 --- a/drivers/acpi/tables/tbinstal.c +++ b/drivers/acpi/acpica/tbinstal.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbinstal") diff --git a/drivers/acpi/tables/tbutils.c b/drivers/acpi/acpica/tbutils.c index 0cc92ef5236..9684cc82793 100644 --- a/drivers/acpi/tables/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbutils") @@ -113,6 +114,30 @@ acpi_tb_check_xsdt(acpi_physical_address address) /******************************************************************************* * + * FUNCTION: acpi_tb_initialize_facs + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Create a permanent mapping for the FADT and save it in a global + * for accessing the Global Lock and Firmware Waking Vector + * + ******************************************************************************/ + +acpi_status acpi_tb_initialize_facs(void) +{ + acpi_status status; + + status = acpi_get_table_by_index(ACPI_TABLE_INDEX_FACS, + ACPI_CAST_INDIRECT_PTR(struct + acpi_table_header, + &acpi_gbl_FACS)); + return status; +} + +/******************************************************************************* + * * FUNCTION: acpi_tb_tables_loaded * * PARAMETERS: None @@ -420,7 +445,8 @@ acpi_tb_parse_root_table(acpi_physical_address rsdp_address, u8 flags) /* Differentiate between RSDT and XSDT root tables */ - if (rsdp->revision > 1 && 
rsdp->xsdt_physical_address) { + if (rsdp->revision > 1 && rsdp->xsdt_physical_address + && !acpi_rsdt_forced) { /* * Root table is an XSDT (64-bit physical addresses). We must use the * XSDT if the revision is > 1 and the XSDT pointer is present, as per diff --git a/drivers/acpi/tables/tbxface.c b/drivers/acpi/acpica/tbxface.c index fd7770aa106..c3e841f3cde 100644 --- a/drivers/acpi/tables/tbxface.c +++ b/drivers/acpi/acpica/tbxface.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbxface") diff --git a/drivers/acpi/tables/tbxfroot.c b/drivers/acpi/acpica/tbxfroot.c index 2d157e0f98d..b7fc8dd4334 100644 --- a/drivers/acpi/tables/tbxfroot.c +++ b/drivers/acpi/acpica/tbxfroot.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbxfroot") diff --git a/drivers/acpi/utilities/utalloc.c b/drivers/acpi/acpica/utalloc.c index 241c535c175..7580f6b3069 100644 --- a/drivers/acpi/utilities/utalloc.c +++ b/drivers/acpi/acpica/utalloc.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acdebug.h> +#include "accommon.h" +#include "acdebug.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utalloc") diff --git a/drivers/acpi/utilities/utcopy.c b/drivers/acpi/acpica/utcopy.c index 5b2f7c27b70..b0dcfd3c872 100644 --- a/drivers/acpi/utilities/utcopy.c +++ b/drivers/acpi/acpica/utcopy.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_UTILITIES diff --git a/drivers/acpi/utilities/utdebug.c b/drivers/acpi/acpica/utdebug.c index fd66ecb6741..38821f53042 100644 --- a/drivers/acpi/utilities/utdebug.c +++ b/drivers/acpi/acpica/utdebug.c @@ -42,6 +42,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utdebug") @@ -136,7 +137,7 @@ static const char *acpi_ut_trim_function_name(const char *function_name) /******************************************************************************* * - * FUNCTION: acpi_ut_debug_print + * FUNCTION: acpi_debug_print * * PARAMETERS: requested_debug_level - Requested debug print level * line_number - Caller's line number (for error output) @@ -154,11 +155,11 @@ static const char *acpi_ut_trim_function_name(const char *function_name) ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE -acpi_ut_debug_print(u32 requested_debug_level, - u32 line_number, - const char *function_name, - const char *module_name, - u32 component_id, const char *format, ...) +acpi_debug_print(u32 requested_debug_level, + u32 line_number, + const char *function_name, + const char *module_name, + u32 component_id, const char *format, ...) 
{ acpi_thread_id thread_id; va_list args; @@ -205,11 +206,11 @@ acpi_ut_debug_print(u32 requested_debug_level, va_end(args); } -ACPI_EXPORT_SYMBOL(acpi_ut_debug_print) +ACPI_EXPORT_SYMBOL(acpi_debug_print) /******************************************************************************* * - * FUNCTION: acpi_ut_debug_print_raw + * FUNCTION: acpi_debug_print_raw * * PARAMETERS: requested_debug_level - Requested debug print level * line_number - Caller's line number @@ -226,11 +227,11 @@ ACPI_EXPORT_SYMBOL(acpi_ut_debug_print) * ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE -acpi_ut_debug_print_raw(u32 requested_debug_level, - u32 line_number, - const char *function_name, - const char *module_name, - u32 component_id, const char *format, ...) +acpi_debug_print_raw(u32 requested_debug_level, + u32 line_number, + const char *function_name, + const char *module_name, + u32 component_id, const char *format, ...) { va_list args; @@ -244,7 +245,7 @@ acpi_ut_debug_print_raw(u32 requested_debug_level, va_end(args); } -ACPI_EXPORT_SYMBOL(acpi_ut_debug_print_raw) +ACPI_EXPORT_SYMBOL(acpi_debug_print_raw) /******************************************************************************* * @@ -270,9 +271,9 @@ acpi_ut_trace(u32 line_number, acpi_gbl_nesting_level++; acpi_ut_track_stack_ptr(); - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s\n", acpi_gbl_fn_entry_str); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s\n", acpi_gbl_fn_entry_str); } ACPI_EXPORT_SYMBOL(acpi_ut_trace) @@ -301,10 +302,9 @@ acpi_ut_trace_ptr(u32 line_number, acpi_gbl_nesting_level++; acpi_ut_track_stack_ptr(); - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %p\n", acpi_gbl_fn_entry_str, - pointer); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s %p\n", acpi_gbl_fn_entry_str, pointer); } /******************************************************************************* @@ -333,10 +333,9 @@ acpi_ut_trace_str(u32 line_number, acpi_gbl_nesting_level++; acpi_ut_track_stack_ptr(); - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %s\n", acpi_gbl_fn_entry_str, - string); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s %s\n", acpi_gbl_fn_entry_str, string); } /******************************************************************************* @@ -365,10 +364,9 @@ acpi_ut_trace_u32(u32 line_number, acpi_gbl_nesting_level++; acpi_ut_track_stack_ptr(); - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %08X\n", acpi_gbl_fn_entry_str, - integer); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s %08X\n", acpi_gbl_fn_entry_str, integer); } /******************************************************************************* @@ -393,9 +391,9 @@ acpi_ut_exit(u32 line_number, const char *module_name, u32 component_id) { - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s\n", acpi_gbl_fn_exit_str); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s\n", acpi_gbl_fn_exit_str); acpi_gbl_nesting_level--; } @@ -426,17 +424,16 @@ acpi_ut_status_exit(u32 line_number, { if 
(ACPI_SUCCESS(status)) { - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %s\n", - acpi_gbl_fn_exit_str, - acpi_format_exception(status)); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, + component_id, "%s %s\n", acpi_gbl_fn_exit_str, + acpi_format_exception(status)); } else { - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s ****Exception****: %s\n", - acpi_gbl_fn_exit_str, - acpi_format_exception(status)); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, + component_id, "%s ****Exception****: %s\n", + acpi_gbl_fn_exit_str, + acpi_format_exception(status)); } acpi_gbl_nesting_level--; @@ -467,10 +464,10 @@ acpi_ut_value_exit(u32 line_number, u32 component_id, acpi_integer value) { - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %8.8X%8.8X\n", - acpi_gbl_fn_exit_str, ACPI_FORMAT_UINT64(value)); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s %8.8X%8.8X\n", acpi_gbl_fn_exit_str, + ACPI_FORMAT_UINT64(value)); acpi_gbl_nesting_level--; } @@ -499,9 +496,9 @@ acpi_ut_ptr_exit(u32 line_number, const char *module_name, u32 component_id, u8 *ptr) { - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %p\n", acpi_gbl_fn_exit_str, ptr); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s %p\n", acpi_gbl_fn_exit_str, ptr); acpi_gbl_nesting_level--; } diff --git a/drivers/acpi/utilities/utdelete.c b/drivers/acpi/acpica/utdelete.c index d197c6b29e1..a0be9e39531 100644 --- a/drivers/acpi/utilities/utdelete.c +++ b/drivers/acpi/acpica/utdelete.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acevents.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utdelete") diff --git a/drivers/acpi/utilities/uteval.c b/drivers/acpi/acpica/uteval.c index 352747e49c7..da9450bc60f 100644 --- a/drivers/acpi/utilities/uteval.c +++ b/drivers/acpi/acpica/uteval.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("uteval") @@ -129,7 +130,7 @@ acpi_status acpi_ut_osi_implementation(struct acpi_walk_state *walk_state) /* The interface is supported */ - return_ACPI_STATUS(AE_CTRL_TERMINATE); + return_ACPI_STATUS(AE_OK); } } @@ -143,13 +144,13 @@ acpi_status acpi_ut_osi_implementation(struct acpi_walk_state *walk_state) /* The interface is supported */ - return_ACPI_STATUS(AE_CTRL_TERMINATE); + return_ACPI_STATUS(AE_OK); } /* The interface is not supported */ return_desc->integer.value = 0; - return_ACPI_STATUS(AE_CTRL_TERMINATE); + return_ACPI_STATUS(AE_OK); } /******************************************************************************* diff --git a/drivers/acpi/utilities/utglobal.c b/drivers/acpi/acpica/utglobal.c index 17ed5ac840f..a3ab9d9da29 100644 --- a/drivers/acpi/utilities/utglobal.c +++ b/drivers/acpi/acpica/utglobal.c @@ -44,11 +44,11 @@ #define DEFINE_ACPI_GLOBALS #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" -ACPI_EXPORT_SYMBOL(acpi_gbl_FADT) #define 
_COMPONENT ACPI_UTILITIES - ACPI_MODULE_NAME("utglobal") +ACPI_MODULE_NAME("utglobal") /******************************************************************************* * @@ -352,7 +352,7 @@ const char *acpi_gbl_region_types[ACPI_NUM_PREDEFINED_REGIONS] = { "PCI_Config", "EmbeddedControl", "SMBus", - "CMOS", + "SystemCMOS", "PCIBARTarget", "DataTable" }; @@ -756,6 +756,7 @@ acpi_status acpi_ut_init_globals(void) acpi_gbl_gpe_xrupt_list_head = NULL; acpi_gbl_gpe_fadt_blocks[0] = NULL; acpi_gbl_gpe_fadt_blocks[1] = NULL; + acpi_current_gpe_count = 0; /* Global handlers */ @@ -771,6 +772,7 @@ acpi_status acpi_ut_init_globals(void) acpi_gbl_global_lock_mutex = NULL; acpi_gbl_global_lock_acquired = FALSE; acpi_gbl_global_lock_handle = 0; + acpi_gbl_global_lock_present = FALSE; /* Miscellaneous variables */ @@ -815,5 +817,7 @@ acpi_status acpi_ut_init_globals(void) return_ACPI_STATUS(AE_OK); } +ACPI_EXPORT_SYMBOL(acpi_gbl_FADT) ACPI_EXPORT_SYMBOL(acpi_dbg_level) ACPI_EXPORT_SYMBOL(acpi_dbg_layer) +ACPI_EXPORT_SYMBOL(acpi_current_gpe_count) diff --git a/drivers/acpi/utilities/utinit.c b/drivers/acpi/acpica/utinit.c index cae515fc02d..a54ca84eb36 100644 --- a/drivers/acpi/utilities/utinit.c +++ b/drivers/acpi/acpica/utinit.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acevents.h" +#include "actables.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utinit") diff --git a/drivers/acpi/utilities/utmath.c b/drivers/acpi/acpica/utmath.c index c927324fdd2..c9f682d640e 100644 --- a/drivers/acpi/utilities/utmath.c +++ b/drivers/acpi/acpica/utmath.c @@ -42,6 +42,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utmath") diff --git a/drivers/acpi/utilities/utmisc.c b/drivers/acpi/acpica/utmisc.c index 9089a158a87..c1f7f4e1a72 100644 --- a/drivers/acpi/utilities/utmisc.c +++ b/drivers/acpi/acpica/utmisc.c @@ -44,7 +44,8 @@ #include <linux/module.h> #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utmisc") @@ -1016,7 +1017,7 @@ acpi_ut_walk_package_tree(union acpi_operand_object * source_object, /******************************************************************************* * - * FUNCTION: acpi_ut_error, acpi_ut_warning, acpi_ut_info + * FUNCTION: acpi_error, acpi_exception, acpi_warning, acpi_info * * PARAMETERS: module_name - Caller's module name (for error output) * line_number - Caller's line number (for error output) @@ -1029,7 +1030,7 @@ acpi_ut_walk_package_tree(union acpi_operand_object * source_object, ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE -acpi_ut_error(const char *module_name, u32 line_number, const char *format, ...) +acpi_error(const char *module_name, u32 line_number, const char *format, ...) { va_list args; @@ -1042,8 +1043,8 @@ acpi_ut_error(const char *module_name, u32 line_number, const char *format, ...) } void ACPI_INTERNAL_VAR_XFACE -acpi_ut_exception(const char *module_name, - u32 line_number, acpi_status status, const char *format, ...) +acpi_exception(const char *module_name, + u32 line_number, acpi_status status, const char *format, ...) 
{ va_list args; @@ -1056,11 +1057,8 @@ acpi_ut_exception(const char *module_name, va_end(args); } -EXPORT_SYMBOL(acpi_ut_exception); - void ACPI_INTERNAL_VAR_XFACE -acpi_ut_warning(const char *module_name, - u32 line_number, const char *format, ...) +acpi_warning(const char *module_name, u32 line_number, const char *format, ...) { va_list args; @@ -1073,7 +1071,7 @@ acpi_ut_warning(const char *module_name, } void ACPI_INTERNAL_VAR_XFACE -acpi_ut_info(const char *module_name, u32 line_number, const char *format, ...) +acpi_info(const char *module_name, u32 line_number, const char *format, ...) { va_list args; @@ -1088,3 +1086,8 @@ acpi_ut_info(const char *module_name, u32 line_number, const char *format, ...) acpi_os_printf("\n"); va_end(args); } + +ACPI_EXPORT_SYMBOL(acpi_error) +ACPI_EXPORT_SYMBOL(acpi_exception) +ACPI_EXPORT_SYMBOL(acpi_warning) +ACPI_EXPORT_SYMBOL(acpi_info) diff --git a/drivers/acpi/utilities/utmutex.c b/drivers/acpi/acpica/utmutex.c index 7331dde9e1b..14eb52c4d64 100644 --- a/drivers/acpi/utilities/utmutex.c +++ b/drivers/acpi/acpica/utmutex.c @@ -42,6 +42,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utmutex") diff --git a/drivers/acpi/utilities/utobject.c b/drivers/acpi/acpica/utobject.c index 4bef3cfbacc..fd5ea7543e5 100644 --- a/drivers/acpi/utilities/utobject.c +++ b/drivers/acpi/acpica/utobject.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utobject") diff --git a/drivers/acpi/utilities/utresrc.c b/drivers/acpi/acpica/utresrc.c index c3e3e1308ed..91b7c00236f 100644 --- a/drivers/acpi/utilities/utresrc.c +++ b/drivers/acpi/acpica/utresrc.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/amlresrc.h> +#include "accommon.h" +#include "amlresrc.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utresrc") diff --git a/drivers/acpi/utilities/utstate.c b/drivers/acpi/acpica/utstate.c index 63a6d3d77d8..0440c958f5a 100644 --- a/drivers/acpi/utilities/utstate.c +++ b/drivers/acpi/acpica/utstate.c @@ -42,6 +42,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utstate") diff --git a/drivers/acpi/utilities/utxface.c b/drivers/acpi/acpica/utxface.c index c198a4d4058..078a22728c6 100644 --- a/drivers/acpi/utilities/utxface.c +++ b/drivers/acpi/acpica/utxface.c @@ -42,9 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> -#include <acpi/acdebug.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" +#include "acdebug.h" +#include "actables.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utxface") @@ -148,6 +150,16 @@ acpi_status acpi_enable_subsystem(u32 flags) } /* + * Obtain a permanent mapping for the FACS. This is required for the + * Global Lock and the Firmware Waking Vector + */ + status = acpi_tb_initialize_facs(); + if (ACPI_FAILURE(status)) { + ACPI_WARNING((AE_INFO, "Could not map the FACS table")); + return_ACPI_STATUS(status); + } + + /* * Install the default op_region handlers. These are installed unless * other handlers have already been installed via the * install_address_space_handler interface. 
diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 1423b0c0cd2..65132f92045 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -471,7 +471,7 @@ static void sysfs_remove_battery(struct acpi_battery *battery) static int acpi_battery_update(struct acpi_battery *battery) { - int result; + int result, old_present = acpi_battery_present(battery); result = acpi_battery_get_status(battery); if (result) return result; @@ -482,7 +482,8 @@ static int acpi_battery_update(struct acpi_battery *battery) return 0; } #endif - if (!battery->update_time) { + if (!battery->update_time || + old_present != acpi_battery_present(battery)) { result = acpi_battery_get_info(battery); if (result) return result; diff --git a/drivers/acpi/cm_sbs.c b/drivers/acpi/cm_sbs.c index 307963bd104..332fe4b2170 100644 --- a/drivers/acpi/cm_sbs.c +++ b/drivers/acpi/cm_sbs.c @@ -27,9 +27,6 @@ #include <linux/seq_file.h> #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> -#include <acpi/acmacros.h> -#include <acpi/actypes.h> -#include <acpi/acutils.h> ACPI_MODULE_NAME("cm_sbs"); #define ACPI_AC_CLASS "ac_adapter" diff --git a/drivers/acpi/debug.c b/drivers/acpi/debug.c index c4839689200..20223cbd0d1 100644 --- a/drivers/acpi/debug.c +++ b/drivers/acpi/debug.c @@ -9,7 +9,6 @@ #include <linux/moduleparam.h> #include <asm/uaccess.h> #include <acpi/acpi_drivers.h> -#include <acpi/acglobal.h> #define _COMPONENT ACPI_SYSTEM_COMPONENT ACPI_MODULE_NAME("debug"); diff --git a/drivers/acpi/dispatcher/Makefile b/drivers/acpi/dispatcher/Makefile deleted file mode 100644 index eb7e602a83c..00000000000 --- a/drivers/acpi/dispatcher/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := dsfield.o dsmthdat.o dsopcode.o dswexec.o dswscope.o \ - dsmethod.o dsobject.o dsutils.o dswload.o dswstate.o \ - dsinit.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 30f3ef236ec..8dfcbb8aff7 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -42,7 +42,6 @@ #include <asm/io.h> #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> -#include <acpi/actypes.h> #define ACPI_EC_CLASS "embedded_controller" #define ACPI_EC_DEVICE_NAME "Embedded Controller" @@ -370,7 +369,7 @@ unlock: * Note: samsung nv5000 doesn't work with ec burst mode. 
* http://bugzilla.kernel.org/show_bug.cgi?id=4980 */ -int acpi_ec_burst_enable(struct acpi_ec *ec) +static int acpi_ec_burst_enable(struct acpi_ec *ec) { u8 d; struct transaction t = {.command = ACPI_EC_BURST_ENABLE, @@ -380,7 +379,7 @@ int acpi_ec_burst_enable(struct acpi_ec *ec) return acpi_ec_transaction(ec, &t, 0); } -int acpi_ec_burst_disable(struct acpi_ec *ec) +static int acpi_ec_burst_disable(struct acpi_ec *ec) { struct transaction t = {.command = ACPI_EC_BURST_DISABLE, .wdata = NULL, .rdata = NULL, @@ -756,10 +755,15 @@ static acpi_status acpi_ec_register_query_methods(acpi_handle handle, u32 level, void *context, void **return_value) { - struct acpi_namespace_node *node = handle; + char node_name[5]; + struct acpi_buffer buffer = { sizeof(node_name), node_name }; struct acpi_ec *ec = context; int value = 0; - if (sscanf(node->name.ascii, "_Q%x", &value) == 1) { + acpi_status status; + + status = acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer); + + if (ACPI_SUCCESS(status) && sscanf(node_name, "_Q%x", &value) == 1) { acpi_ec_add_query_handler(ec, value, handle, NULL, NULL); } return AE_OK; @@ -978,9 +982,9 @@ static const struct acpi_device_id ec_device_ids[] = { int __init acpi_ec_ecdt_probe(void) { - int ret; acpi_status status; struct acpi_table_ecdt *ecdt_ptr; + acpi_handle dummy; boot_ec = make_acpi_ec(); if (!boot_ec) @@ -1006,30 +1010,31 @@ int __init acpi_ec_ecdt_probe(void) boot_ec->gpe = ecdt_ptr->gpe; boot_ec->handle = ACPI_ROOT_OBJECT; acpi_get_handle(ACPI_ROOT_OBJECT, ecdt_ptr->id, &boot_ec->handle); - } else { - /* This workaround is needed only on some broken machines, - * which require early EC, but fail to provide ECDT */ - acpi_handle x; - printk(KERN_DEBUG PREFIX "Look up EC in DSDT\n"); - status = acpi_get_devices(ec_device_ids[0].id, ec_parse_device, - boot_ec, NULL); - /* Check that acpi_get_devices actually find something */ - if (ACPI_FAILURE(status) || !boot_ec->handle) - goto error; - /* We really need to limit this workaround, the only ASUS, - * which needs it, has fake EC._INI method, so use it as flag. - * Keep boot_ec struct as it will be needed soon. - */ - if (ACPI_FAILURE(acpi_get_handle(boot_ec->handle, "_INI", &x))) - return -ENODEV; + /* Add some basic check against completely broken table */ + if (boot_ec->data_addr != boot_ec->command_addr) + goto install; + /* fall through */ } - - ret = ec_install_handlers(boot_ec); - if (!ret) { + /* This workaround is needed only on some broken machines, + * which require early EC, but fail to provide ECDT */ + printk(KERN_DEBUG PREFIX "Look up EC in DSDT\n"); + status = acpi_get_devices(ec_device_ids[0].id, ec_parse_device, + boot_ec, NULL); + /* Check that acpi_get_devices actually find something */ + if (ACPI_FAILURE(status) || !boot_ec->handle) + goto error; + /* We really need to limit this workaround, the only ASUS, + * which needs it, has fake EC._INI method, so use it as flag. + * Keep boot_ec struct as it will be needed soon. 
+ */ + if (ACPI_FAILURE(acpi_get_handle(boot_ec->handle, "_INI", &dummy))) + return -ENODEV; +install: + if (!ec_install_handlers(boot_ec)) { first_ec = boot_ec; return 0; } - error: +error: kfree(boot_ec); boot_ec = NULL; return -ENODEV; diff --git a/drivers/acpi/events/Makefile b/drivers/acpi/events/Makefile deleted file mode 100644 index d29f2ee449c..00000000000 --- a/drivers/acpi/events/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := evevent.o evregion.o evsci.o evxfevnt.o \ - evmisc.o evrgnini.o evxface.o evxfregn.o \ - evgpe.o evgpeblk.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/executer/Makefile b/drivers/acpi/executer/Makefile deleted file mode 100644 index e09998aa012..00000000000 --- a/drivers/acpi/executer/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := exconfig.o exfield.o exnames.o exoparg6.o exresolv.o exstorob.o\ - exconvrt.o exfldio.o exoparg1.o exprep.o exresop.o exsystem.o\ - excreate.o exmisc.o exoparg2.o exregion.o exstore.o exutils.o \ - exdump.o exmutex.o exoparg3.o exresnte.o exstoren.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/hardware/Makefile b/drivers/acpi/hardware/Makefile deleted file mode 100644 index 438ad373b9a..00000000000 --- a/drivers/acpi/hardware/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := hwacpi.o hwgpe.o hwregs.o hwsleep.o - -obj-$(ACPI_FUTURE_USAGE) += hwtimer.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/main.c index 28a691cc625..7e3c609cbef 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/main.c @@ -101,13 +101,26 @@ void __init acpi_old_suspend_ordering(void) * cases. */ static bool set_sci_en_on_resume; +/* + * The ACPI specification wants us to save NVS memory regions during hibernation + * and to restore them during the subsequent resume. However, it is not certain + * if this mechanism is going to work on all machines, so we allow the user to + * disable this mechanism using the 'acpi_sleep=s4_nonvs' kernel command line + * option. + */ +static bool s4_no_nvs; + +void __init acpi_s4_no_nvs(void) +{ + s4_no_nvs = true; +} /** * acpi_pm_disable_gpes - Disable the GPEs. */ static int acpi_pm_disable_gpes(void) { - acpi_hw_disable_all_gpes(); + acpi_disable_all_gpes(); return 0; } @@ -135,7 +148,7 @@ static int acpi_pm_prepare(void) int error = __acpi_pm_prepare(); if (!error) - acpi_hw_disable_all_gpes(); + acpi_disable_all_gpes(); return error; } @@ -267,7 +280,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) * (like wakeup GPE) haven't handler, this can avoid such GPE misfire. * acpi_leave_sleep_state will reenable specific GPEs later */ - acpi_hw_disable_all_gpes(); + acpi_disable_all_gpes(); local_irq_restore(flags); printk(KERN_DEBUG "Back to C!\n"); @@ -394,9 +407,25 @@ void __init acpi_no_s4_hw_signature(void) static int acpi_hibernation_begin(void) { - acpi_target_sleep_state = ACPI_STATE_S4; - acpi_sleep_tts_switch(acpi_target_sleep_state); - return 0; + int error; + + error = s4_no_nvs ? 
0 : hibernate_nvs_alloc(); + if (!error) { + acpi_target_sleep_state = ACPI_STATE_S4; + acpi_sleep_tts_switch(acpi_target_sleep_state); + } + + return error; +} + +static int acpi_hibernation_pre_snapshot(void) +{ + int error = acpi_pm_prepare(); + + if (!error) + hibernate_nvs_save(); + + return error; } static int acpi_hibernation_enter(void) @@ -417,6 +446,12 @@ static int acpi_hibernation_enter(void) return ACPI_SUCCESS(status) ? 0 : -EFAULT; } +static void acpi_hibernation_finish(void) +{ + hibernate_nvs_free(); + acpi_pm_finish(); +} + static void acpi_hibernation_leave(void) { /* @@ -432,18 +467,20 @@ static void acpi_hibernation_leave(void) "cannot resume!\n"); panic("ACPI S4 hardware signature mismatch"); } + /* Restore the NVS memory area */ + hibernate_nvs_restore(); } static void acpi_pm_enable_gpes(void) { - acpi_hw_enable_all_runtime_gpes(); + acpi_enable_all_runtime_gpes(); } static struct platform_hibernation_ops acpi_hibernation_ops = { .begin = acpi_hibernation_begin, .end = acpi_pm_end, - .pre_snapshot = acpi_pm_prepare, - .finish = acpi_pm_finish, + .pre_snapshot = acpi_hibernation_pre_snapshot, + .finish = acpi_hibernation_finish, .prepare = acpi_pm_prepare, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, @@ -469,8 +506,22 @@ static int acpi_hibernation_begin_old(void) error = acpi_sleep_prepare(ACPI_STATE_S4); + if (!error) { + if (!s4_no_nvs) + error = hibernate_nvs_alloc(); + if (!error) + acpi_target_sleep_state = ACPI_STATE_S4; + } + return error; +} + +static int acpi_hibernation_pre_snapshot_old(void) +{ + int error = acpi_pm_disable_gpes(); + if (!error) - acpi_target_sleep_state = ACPI_STATE_S4; + hibernate_nvs_save(); + return error; } @@ -481,8 +532,8 @@ static int acpi_hibernation_begin_old(void) static struct platform_hibernation_ops acpi_hibernation_ops_old = { .begin = acpi_hibernation_begin_old, .end = acpi_pm_end, - .pre_snapshot = acpi_pm_disable_gpes, - .finish = acpi_pm_finish, + .pre_snapshot = acpi_hibernation_pre_snapshot_old, + .finish = acpi_hibernation_finish, .prepare = acpi_pm_disable_gpes, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, @@ -622,7 +673,7 @@ static void acpi_power_off_prepare(void) { /* Prepare to power off the system */ acpi_sleep_prepare(ACPI_STATE_S5); - acpi_hw_disable_all_gpes(); + acpi_disable_all_gpes(); } static void acpi_power_off(void) @@ -671,7 +722,7 @@ int __init acpi_sleep_init(void) sleep_states[ACPI_STATE_S4] = 1; printk(" S4"); if (!nosigcheck) { - acpi_get_table_by_index(ACPI_TABLE_INDEX_FACS, + acpi_get_table(ACPI_SIG_FACS, 1, (struct acpi_table_header **)&facs); if (facs) s4_hardware_signature = diff --git a/drivers/acpi/namespace/Makefile b/drivers/acpi/namespace/Makefile deleted file mode 100644 index 371a2daf837..00000000000 --- a/drivers/acpi/namespace/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := nsaccess.o nsload.o nssearch.o nsxfeval.o \ - nsalloc.o nseval.o nsnames.o nsutils.o nsxfname.o \ - nsdump.o nsinit.o nsobject.o nswalk.o nsxfobj.o \ - nsparse.o nspredef.o - -obj-$(ACPI_FUTURE_USAGE) += nsdumpdv.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 25ceae9191e..c5e292aab0e 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -29,7 +29,6 @@ #include <linux/errno.h> #include <linux/acpi.h> #include <acpi/acpi_bus.h> -#include <acpi/acmacros.h> #define ACPI_NUMA 0x80000000 #define _COMPONENT ACPI_NUMA diff --git a/drivers/acpi/osl.c 
b/drivers/acpi/osl.c index c8111424dcb..6729a4992f2 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -726,7 +726,7 @@ static acpi_status __acpi_os_execute(acpi_execute_type type, dpc = kmalloc(sizeof(struct acpi_os_dpc), GFP_ATOMIC); if (!dpc) - return_ACPI_STATUS(AE_NO_MEMORY); + return AE_NO_MEMORY; dpc->function = function; dpc->context = context; @@ -747,7 +747,7 @@ static acpi_status __acpi_os_execute(acpi_execute_type type, status = AE_ERROR; kfree(dpc); } - return_ACPI_STATUS(status); + return status; } acpi_status acpi_os_execute(acpi_execute_type type, diff --git a/drivers/acpi/parser/Makefile b/drivers/acpi/parser/Makefile deleted file mode 100644 index db24ee09cf1..00000000000 --- a/drivers/acpi/parser/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := psargs.o psparse.o psloop.o pstree.o pswalk.o \ - psopcode.o psscope.o psutils.o psxface.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/pci_bind.c b/drivers/acpi/pci_bind.c index 4b252ea0e95..95650f83ce2 100644 --- a/drivers/acpi/pci_bind.c +++ b/drivers/acpi/pci_bind.c @@ -99,7 +99,7 @@ acpi_status acpi_get_pci_id(acpi_handle handle, struct acpi_pci_id *id) */ ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Device %s has PCI address %02x:%02x:%02x.%02x\n", + "Device %s has PCI address %04x:%02x:%02x.%d\n", acpi_device_bid(device), id->segment, id->bus, id->device, id->function)); @@ -111,12 +111,11 @@ EXPORT_SYMBOL(acpi_get_pci_id); int acpi_pci_bind(struct acpi_device *device) { int result = 0; - acpi_status status = AE_OK; - struct acpi_pci_data *data = NULL; - struct acpi_pci_data *pdata = NULL; - char *pathname = NULL; - struct acpi_buffer buffer = { 0, NULL }; - acpi_handle handle = NULL; + acpi_status status; + struct acpi_pci_data *data; + struct acpi_pci_data *pdata; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + acpi_handle handle; struct pci_dev *dev; struct pci_bus *bus; @@ -124,21 +123,18 @@ int acpi_pci_bind(struct acpi_device *device) if (!device || !device->parent) return -EINVAL; - pathname = kzalloc(ACPI_PATHNAME_MAX, GFP_KERNEL); - if (!pathname) - return -ENOMEM; - buffer.length = ACPI_PATHNAME_MAX; - buffer.pointer = pathname; - data = kzalloc(sizeof(struct acpi_pci_data), GFP_KERNEL); - if (!data) { - kfree(pathname); + if (!data) return -ENOMEM; + + status = acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); + if (ACPI_FAILURE(status)) { + kfree(data); + return -ENODEV; } - acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Binding PCI device [%s]...\n", - pathname)); + (char *)buffer.pointer)); /* * Segment & Bus @@ -166,7 +162,7 @@ int acpi_pci_bind(struct acpi_device *device) data->id.device = device->pnp.bus_address >> 16; data->id.function = device->pnp.bus_address & 0xFFFF; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "...to %02x:%02x:%02x.%02x\n", + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "...to %04x:%02x:%02x.%d\n", data->id.segment, data->id.bus, data->id.device, data->id.function)); @@ -196,7 +192,7 @@ int acpi_pci_bind(struct acpi_device *device) } if (!data->dev) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Device %02x:%02x:%02x.%02x not present in PCI namespace\n", + "Device %04x:%02x:%02x.%d not present in PCI namespace\n", data->id.segment, data->id.bus, data->id.device, data->id.function)); result = -ENODEV; @@ -204,7 +200,7 @@ int acpi_pci_bind(struct acpi_device *device) } if (!data->dev->bus) { printk(KERN_ERR PREFIX - "Device %02x:%02x:%02x.%02x has invalid 'bus' field\n", 
+ "Device %04x:%02x:%02x.%d has invalid 'bus' field\n", data->id.segment, data->id.bus, data->id.device, data->id.function); result = -ENODEV; @@ -219,7 +215,7 @@ int acpi_pci_bind(struct acpi_device *device) */ if (data->dev->subordinate) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Device %02x:%02x:%02x.%02x is a PCI bridge\n", + "Device %04x:%02x:%02x.%d is a PCI bridge\n", data->id.segment, data->id.bus, data->id.device, data->id.function)); data->bus = data->dev->subordinate; @@ -262,7 +258,7 @@ int acpi_pci_bind(struct acpi_device *device) } end: - kfree(pathname); + kfree(buffer.pointer); if (result) kfree(data); @@ -272,25 +268,21 @@ int acpi_pci_bind(struct acpi_device *device) static int acpi_pci_unbind(struct acpi_device *device) { int result = 0; - acpi_status status = AE_OK; - struct acpi_pci_data *data = NULL; - char *pathname = NULL; - struct acpi_buffer buffer = { 0, NULL }; + acpi_status status; + struct acpi_pci_data *data; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; if (!device || !device->parent) return -EINVAL; - pathname = kzalloc(ACPI_PATHNAME_MAX, GFP_KERNEL); - if (!pathname) - return -ENOMEM; + status = acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; - buffer.length = ACPI_PATHNAME_MAX; - buffer.pointer = pathname; - acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Unbinding PCI device [%s]...\n", - pathname)); - kfree(pathname); + (char *) buffer.pointer)); + kfree(buffer.pointer); status = acpi_get_data(device->handle, acpi_pci_data_handler, @@ -322,50 +314,44 @@ acpi_pci_bind_root(struct acpi_device *device, struct acpi_pci_id *id, struct pci_bus *bus) { int result = 0; - acpi_status status = AE_OK; + acpi_status status; struct acpi_pci_data *data = NULL; - char *pathname = NULL; - struct acpi_buffer buffer = { 0, NULL }; - - pathname = kzalloc(ACPI_PATHNAME_MAX, GFP_KERNEL); - if (!pathname) - return -ENOMEM; - - buffer.length = ACPI_PATHNAME_MAX; - buffer.pointer = pathname; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; if (!device || !id || !bus) { - kfree(pathname); return -EINVAL; } data = kzalloc(sizeof(struct acpi_pci_data), GFP_KERNEL); - if (!data) { - kfree(pathname); + if (!data) return -ENOMEM; - } data->id = *id; data->bus = bus; device->ops.bind = acpi_pci_bind; device->ops.unbind = acpi_pci_unbind; - acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); + status = acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); + if (ACPI_FAILURE(status)) { + kfree (data); + return -ENODEV; + } ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Binding PCI root bridge [%s] to " - "%02x:%02x\n", pathname, id->segment, id->bus)); + "%04x:%02x\n", (char *)buffer.pointer, + id->segment, id->bus)); status = acpi_attach_data(device->handle, acpi_pci_data_handler, data); if (ACPI_FAILURE(status)) { ACPI_EXCEPTION((AE_INFO, status, "Unable to attach ACPI-PCI context to device %s", - pathname)); + (char *)buffer.pointer)); result = -ENODEV; goto end; } end: - kfree(pathname); + kfree(buffer.pointer); if (result != 0) kfree(data); diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index bf79d83bdfb..891bdf6679f 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -4,6 +4,8 @@ * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> * Copyright (C) 2002 Dominik Brodowski <devel@brodo.de> + * (c) Copyright 2008 Hewlett-Packard Development 
Company, L.P. + * Bjorn Helgaas <bjorn.helgaas@hp.com> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * @@ -41,29 +43,36 @@ #define _COMPONENT ACPI_PCI_COMPONENT ACPI_MODULE_NAME("pci_irq"); -static struct acpi_prt_list acpi_prt; +struct acpi_prt_entry { + struct list_head list; + struct acpi_pci_id id; + u8 pin; + acpi_handle link; + u32 index; /* GSI, or link _CRS index */ +}; + +static LIST_HEAD(acpi_prt_list); static DEFINE_SPINLOCK(acpi_prt_lock); +static inline char pin_name(int pin) +{ + return 'A' + pin - 1; +} + /* -------------------------------------------------------------------------- PCI IRQ Routing Table (PRT) Support -------------------------------------------------------------------------- */ -static struct acpi_prt_entry *acpi_pci_irq_find_prt_entry(int segment, - int bus, - int device, int pin) +static struct acpi_prt_entry *acpi_pci_irq_find_prt_entry(struct pci_dev *dev, + int pin) { - struct acpi_prt_entry *entry = NULL; - - if (!acpi_prt.count) - return NULL; + struct acpi_prt_entry *entry; + int segment = pci_domain_nr(dev->bus); + int bus = dev->bus->number; + int device = PCI_SLOT(dev->devfn); - /* - * Parse through all PRT entries looking for a match on the specified - * PCI device's segment, bus, device, and pin (don't care about func). - * - */ spin_lock(&acpi_prt_lock); - list_for_each_entry(entry, &acpi_prt.entries, node) { + list_for_each_entry(entry, &acpi_prt_list, list) { if ((segment == entry->id.segment) && (bus == entry->id.bus) && (device == entry->id.device) @@ -72,7 +81,6 @@ static struct acpi_prt_entry *acpi_pci_irq_find_prt_entry(int segment, return entry; } } - spin_unlock(&acpi_prt_lock); return NULL; } @@ -124,25 +132,27 @@ struct prt_quirk { char *actual_source; }; +#define PCI_INTX_PIN(c) (c - 'A' + 1) + /* * These systems have incorrect _PRT entries. The BIOS claims the PCI * interrupt at the listed segment/bus/device/pin is connected to the first * link device, but it is actually connected to the second. 
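
A minimal sketch of the acpi_get_name() pattern the pci_bind.c hunks above switch to: instead of a caller-supplied ACPI_PATHNAME_MAX buffer, ACPICA sizes and allocates the pathname itself, so the caller only checks the status and frees buffer.pointer. The format-string changes alongside it print PCI addresses in the usual domain:bus:device.function form (%04x:%02x:%02x.%d). The helper name below is illustrative, not part of the patch:

static int example_print_path(acpi_handle handle)
{
	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
	acpi_status status;

	status = acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer);
	if (ACPI_FAILURE(status))
		return -ENODEV;

	printk(KERN_DEBUG "ACPI path: %s\n", (char *)buffer.pointer);
	kfree(buffer.pointer);	/* caller owns the ACPICA-allocated string */
	return 0;
}
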
*/ static struct prt_quirk prt_quirks[] = { - { medion_md9580, 0, 0, 9, 'A', + { medion_md9580, 0, 0, 9, PCI_INTX_PIN('A'), "\\_SB_.PCI0.ISA_.LNKA", "\\_SB_.PCI0.ISA_.LNKB"}, - { dell_optiplex, 0, 0, 0xd, 'A', + { dell_optiplex, 0, 0, 0xd, PCI_INTX_PIN('A'), "\\_SB_.LNKB", "\\_SB_.LNKA"}, - { hp_t5710, 0, 0, 1, 'A', + { hp_t5710, 0, 0, 1, PCI_INTX_PIN('A'), "\\_SB_.PCI0.LNK1", "\\_SB_.PCI0.LNK3"}, }; -static void -do_prt_fixups(struct acpi_prt_entry *entry, struct acpi_pci_routing_table *prt) +static void do_prt_fixups(struct acpi_prt_entry *entry, + struct acpi_pci_routing_table *prt) { int i; struct prt_quirk *quirk; @@ -158,42 +168,43 @@ do_prt_fixups(struct acpi_prt_entry *entry, struct acpi_pci_routing_table *prt) entry->id.segment == quirk->segment && entry->id.bus == quirk->bus && entry->id.device == quirk->device && - entry->pin + 'A' == quirk->pin && + entry->pin == quirk->pin && !strcmp(prt->source, quirk->source) && strlen(prt->source) >= strlen(quirk->actual_source)) { printk(KERN_WARNING PREFIX "firmware reports " "%04x:%02x:%02x PCI INT %c connected to %s; " "changing to %s\n", entry->id.segment, entry->id.bus, - entry->id.device, 'A' + entry->pin, + entry->id.device, pin_name(entry->pin), prt->source, quirk->actual_source); strcpy(prt->source, quirk->actual_source); } } } -static int -acpi_pci_irq_add_entry(acpi_handle handle, - int segment, int bus, struct acpi_pci_routing_table *prt) +static int acpi_pci_irq_add_entry(acpi_handle handle, int segment, int bus, + struct acpi_pci_routing_table *prt) { - struct acpi_prt_entry *entry = NULL; - - - if (!prt) - return -EINVAL; + struct acpi_prt_entry *entry; entry = kzalloc(sizeof(struct acpi_prt_entry), GFP_KERNEL); if (!entry) return -ENOMEM; + /* + * Note that the _PRT uses 0=INTA, 1=INTB, etc, while PCI uses + * 1=INTA, 2=INTB. We use the PCI encoding throughout, so convert + * it here. + */ entry->id.segment = segment; entry->id.bus = bus; entry->id.device = (prt->address >> 16) & 0xFFFF; - entry->id.function = prt->address & 0xFFFF; - entry->pin = prt->pin; + entry->pin = prt->pin + 1; do_prt_fixups(entry, prt); + entry->index = prt->source_index; + /* * Type 1: Dynamic * --------------- @@ -207,10 +218,9 @@ acpi_pci_irq_add_entry(acpi_handle handle, * (e.g. exists somewhere 'below' this _PRT entry in the ACPI * namespace). */ - if (prt->source[0]) { - acpi_get_handle(handle, prt->source, &entry->link.handle); - entry->link.index = prt->source_index; - } + if (prt->source[0]) + acpi_get_handle(handle, prt->source, &entry->link); + /* * Type 2: Static * -------------- @@ -218,84 +228,38 @@ acpi_pci_irq_add_entry(acpi_handle handle, * the IRQ value, which is hardwired to specific interrupt inputs on * the interrupt controller. 
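
The conversion note above is the key to the rest of the pci_irq.c rework: the _PRT encodes pins as 0=INTA..3=INTD, while PCI config space (and now this file) uses 1=INTA..4=INTD, so entries are converted once when they are added, and pin_name()/PCI_INTX_PIN() translate between numbers and letters. A small sketch, not taken from the patch:

static void pin_encoding_example(void)
{
	u8 prt_pin = 0;			/* INTA as it appears in a _PRT */
	u8 pci_pin = prt_pin + 1;	/* PCI encoding kept in ->pin   */

	/* pin_name(1) == 'A' and PCI_INTX_PIN('A') == 1 */
	printk(KERN_DEBUG "PCI INT %c\n", 'A' + pci_pin - 1);
}
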
*/ - else - entry->link.index = prt->source_index; ACPI_DEBUG_PRINT_RAW((ACPI_DB_INFO, - " %02X:%02X:%02X[%c] -> %s[%d]\n", + " %04x:%02x:%02x[%c] -> %s[%d]\n", entry->id.segment, entry->id.bus, - entry->id.device, ('A' + entry->pin), prt->source, - entry->link.index)); + entry->id.device, pin_name(entry->pin), + prt->source, entry->index)); spin_lock(&acpi_prt_lock); - list_add_tail(&entry->node, &acpi_prt.entries); - acpi_prt.count++; + list_add_tail(&entry->list, &acpi_prt_list); spin_unlock(&acpi_prt_lock); return 0; } -static void -acpi_pci_irq_del_entry(int segment, int bus, struct acpi_prt_entry *entry) -{ - if (segment == entry->id.segment && bus == entry->id.bus) { - acpi_prt.count--; - list_del(&entry->node); - kfree(entry); - } -} - int acpi_pci_irq_add_prt(acpi_handle handle, int segment, int bus) { - acpi_status status = AE_OK; - char *pathname = NULL; - struct acpi_buffer buffer = { 0, NULL }; - struct acpi_pci_routing_table *prt = NULL; - struct acpi_pci_routing_table *entry = NULL; - static int first_time = 1; - - - pathname = kzalloc(ACPI_PATHNAME_MAX, GFP_KERNEL); - if (!pathname) - return -ENOMEM; - - if (first_time) { - acpi_prt.count = 0; - INIT_LIST_HEAD(&acpi_prt.entries); - first_time = 0; - } - - /* - * NOTE: We're given a 'handle' to the _PRT object's parent device - * (either a PCI root bridge or PCI-PCI bridge). - */ + acpi_status status; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_pci_routing_table *entry; - buffer.length = ACPI_PATHNAME_MAX; - buffer.pointer = pathname; - acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); + /* 'handle' is the _PRT's parent (root bridge or PCI-PCI bridge) */ + status = acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; printk(KERN_DEBUG "ACPI: PCI Interrupt Routing Table [%s._PRT]\n", - pathname); + (char *) buffer.pointer); - /* - * Evaluate this _PRT and add its entries to our global list (acpi_prt). 
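
The acpi_pci_irq_add_prt() rework around this point replaces the old two-pass table fetch (probe for AE_BUFFER_OVERFLOW, kzalloc, fetch again) with a single ACPI_ALLOCATE_BUFFER request. A condensed sketch of that fetch-and-walk, with the per-entry work elided and the helper name made up:

static int prt_walk_example(acpi_handle handle)
{
	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
	struct acpi_pci_routing_table *entry;
	acpi_status status;

	status = acpi_get_irq_routing_table(handle, &buffer);
	if (ACPI_FAILURE(status))
		return -ENODEV;

	/* entries are variable length; a zero length terminates the table */
	for (entry = buffer.pointer;
	     entry && entry->length > 0;
	     entry = (void *)((unsigned long)entry + entry->length))
		;	/* acpi_pci_irq_add_entry(handle, segment, bus, entry) */

	kfree(buffer.pointer);
	return 0;
}
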
- */ + kfree(buffer.pointer); - buffer.length = 0; + buffer.length = ACPI_ALLOCATE_BUFFER; buffer.pointer = NULL; - kfree(pathname); - status = acpi_get_irq_routing_table(handle, &buffer); - if (status != AE_BUFFER_OVERFLOW) { - ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PRT [%s]", - acpi_format_exception(status))); - return -ENODEV; - } - - prt = kzalloc(buffer.length, GFP_KERNEL); - if (!prt) { - return -ENOMEM; - } - buffer.pointer = prt; status = acpi_get_irq_routing_table(handle, &buffer); if (ACPI_FAILURE(status)) { @@ -305,36 +269,30 @@ int acpi_pci_irq_add_prt(acpi_handle handle, int segment, int bus) return -ENODEV; } - entry = prt; - + entry = buffer.pointer; while (entry && (entry->length > 0)) { acpi_pci_irq_add_entry(handle, segment, bus, entry); entry = (struct acpi_pci_routing_table *) ((unsigned long)entry + entry->length); } - kfree(prt); - + kfree(buffer.pointer); return 0; } void acpi_pci_irq_del_prt(int segment, int bus) { - struct list_head *node = NULL, *n = NULL; - struct acpi_prt_entry *entry = NULL; - - if (!acpi_prt.count) { - return; - } + struct acpi_prt_entry *entry, *tmp; printk(KERN_DEBUG - "ACPI: Delete PCI Interrupt Routing Table for %x:%x\n", segment, - bus); + "ACPI: Delete PCI Interrupt Routing Table for %04x:%02x\n", + segment, bus); spin_lock(&acpi_prt_lock); - list_for_each_safe(node, n, &acpi_prt.entries) { - entry = list_entry(node, struct acpi_prt_entry, node); - - acpi_pci_irq_del_entry(segment, bus, entry); + list_for_each_entry_safe(entry, tmp, &acpi_prt_list, list) { + if (segment == entry->id.segment && bus == entry->id.bus) { + list_del(&entry->list); + kfree(entry); + } } spin_unlock(&acpi_prt_lock); } @@ -342,162 +300,26 @@ void acpi_pci_irq_del_prt(int segment, int bus) /* -------------------------------------------------------------------------- PCI Interrupt Routing Support -------------------------------------------------------------------------- */ -typedef int (*irq_lookup_func) (struct acpi_prt_entry *, int *, int *, char **); - -static int -acpi_pci_allocate_irq(struct acpi_prt_entry *entry, - int *triggering, int *polarity, char **link) -{ - int irq; - - - if (entry->link.handle) { - irq = acpi_pci_link_allocate_irq(entry->link.handle, - entry->link.index, triggering, - polarity, link); - if (irq < 0) { - printk(KERN_WARNING PREFIX - "Invalid IRQ link routing entry\n"); - return -1; - } - } else { - irq = entry->link.index; - *triggering = ACPI_LEVEL_SENSITIVE; - *polarity = ACPI_ACTIVE_LOW; - } - - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found IRQ %d\n", irq)); - return irq; -} - -static int -acpi_pci_free_irq(struct acpi_prt_entry *entry, - int *triggering, int *polarity, char **link) -{ - int irq; - - if (entry->link.handle) { - irq = acpi_pci_link_free_irq(entry->link.handle); - } else { - irq = entry->link.index; - } - return irq; -} - -#ifdef CONFIG_X86_IO_APIC -extern int noioapicquirk; - -static int bridge_has_boot_interrupt_variant(struct pci_bus *bus) +static struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin) { - struct pci_bus *bus_it; - - for (bus_it = bus ; bus_it ; bus_it = bus_it->parent) { - if (!bus_it->self) - return 0; - - printk(KERN_INFO "vendor=%04x device=%04x\n", bus_it->self->vendor, - bus_it->self->device); - - if (bus_it->self->irq_reroute_variant) - return bus_it->self->irq_reroute_variant; - } - return 0; -} -#endif /* CONFIG_X86_IO_APIC */ - -/* - * acpi_pci_irq_lookup - * success: return IRQ >= 0 - * failure: return -1 - */ -static int -acpi_pci_irq_lookup(struct pci_bus *bus, - int 
device, - int pin, - int *triggering, - int *polarity, char **link, irq_lookup_func func) -{ - struct acpi_prt_entry *entry = NULL; - int segment = pci_domain_nr(bus); - int bus_nr = bus->number; - int ret; - - - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Searching for PRT entry for %02x:%02x:%02x[%c]\n", - segment, bus_nr, device, ('A' + pin))); - - entry = acpi_pci_irq_find_prt_entry(segment, bus_nr, device, pin); - if (!entry) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "PRT entry not found\n")); - return -1; - } - - ret = func(entry, triggering, polarity, link); - -#ifdef CONFIG_X86_IO_APIC - /* - * Some chipsets (e.g. intel 6700PXH) generate a legacy INTx when the - * IRQ entry in the chipset's IO-APIC is masked (as, e.g. the RT kernel - * does during interrupt handling). When this INTx generation cannot be - * disabled, we reroute these interrupts to their legacy equivalent to - * get rid of spurious interrupts. - */ - if (!noioapicquirk) { - switch (bridge_has_boot_interrupt_variant(bus)) { - case 0: - /* no rerouting necessary */ - break; - - case INTEL_IRQ_REROUTE_VARIANT: - /* - * Remap according to INTx routing table in 6700PXH - * specs, intel order number 302628-002, section - * 2.15.2. Other chipsets (80332, ...) have the same - * mapping and are handled here as well. - */ - printk(KERN_INFO "pci irq %d -> rerouted to legacy " - "irq %d\n", ret, (ret % 4) + 16); - ret = (ret % 4) + 16; - break; - - default: - printk(KERN_INFO "not rerouting irq %d to legacy irq: " - "unknown mapping\n", ret); - break; - } + struct acpi_prt_entry *entry; + struct pci_dev *bridge; + u8 bridge_pin, orig_pin = pin; + + entry = acpi_pci_irq_find_prt_entry(dev, pin); + if (entry) { + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %s[%c] _PRT entry\n", + pci_name(dev), pin_name(pin))); + return entry; } -#endif /* CONFIG_X86_IO_APIC */ - - return ret; -} - -/* - * acpi_pci_irq_derive - * success: return IRQ >= 0 - * failure: return < 0 - */ -static int -acpi_pci_irq_derive(struct pci_dev *dev, - int pin, - int *triggering, - int *polarity, char **link, irq_lookup_func func) -{ - struct pci_dev *bridge = dev; - int irq = -1; - u8 bridge_pin = 0, orig_pin = pin; - - - if (!dev) - return -EINVAL; /* * Attempt to derive an IRQ for this device from a parent bridge's * PCI interrupt routing entry (eg. yenta bridge and add-in card bridge). 
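
With the 1-based pin encoding, the bridge walk that follows applies the standard INTx swizzle at each hop before searching the parent's _PRT:

	pin = (((pin - 1) + PCI_SLOT(dev->devfn)) % 4) + 1

For example, a device in slot 3 asserting INTB (pin 2) appears to its parent bridge as (((2 - 1) + 3) % 4) + 1 = 1, i.e. INTA, so the parent's _PRT is searched for INTA at the bridge's slot.
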
*/ - while (irq < 0 && bridge->bus->self) { - pin = (pin + PCI_SLOT(bridge->devfn)) % 4; - bridge = bridge->bus->self; + bridge = dev->bus->self; + while (bridge) { + pin = (((pin - 1) + PCI_SLOT(dev->devfn)) % 4) + 1; if ((bridge->class >> 8) == PCI_CLASS_BRIDGE_CARDBUS) { /* PC card has the same IRQ as its cardbridge */ @@ -506,50 +328,40 @@ acpi_pci_irq_derive(struct pci_dev *dev, ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No interrupt pin configured for device %s\n", pci_name(bridge))); - return -1; + return NULL; } - /* Pin is from 0 to 3 */ - bridge_pin--; pin = bridge_pin; } - irq = acpi_pci_irq_lookup(bridge->bus, PCI_SLOT(bridge->devfn), - pin, triggering, polarity, - link, func); - } + entry = acpi_pci_irq_find_prt_entry(bridge, pin); + if (entry) { + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Derived GSI for %s INT %c from %s\n", + pci_name(dev), pin_name(orig_pin), + pci_name(bridge))); + return entry; + } - if (irq < 0) { - dev_warn(&dev->dev, "can't derive routing for PCI INT %c\n", - 'A' + orig_pin); - return -1; + dev = bridge; + bridge = dev->bus->self; } - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Derive IRQ %d for device %s from %s\n", - irq, pci_name(dev), pci_name(bridge))); - - return irq; + dev_warn(&dev->dev, "can't derive routing for PCI INT %c\n", + pin_name(orig_pin)); + return NULL; } -/* - * acpi_pci_irq_enable - * success: return 0 - * failure: return < 0 - */ - int acpi_pci_irq_enable(struct pci_dev *dev) { - int irq = 0; - u8 pin = 0; + struct acpi_prt_entry *entry; + int gsi; + u8 pin; int triggering = ACPI_LEVEL_SENSITIVE; int polarity = ACPI_ACTIVE_LOW; char *link = NULL; char link_desc[16]; int rc; - - if (!dev) - return -EINVAL; - pin = dev->pin; if (!pin) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, @@ -557,31 +369,9 @@ int acpi_pci_irq_enable(struct pci_dev *dev) pci_name(dev))); return 0; } - pin--; - - if (!dev->bus) { - dev_err(&dev->dev, "invalid (NULL) 'bus' field\n"); - return -ENODEV; - } - - /* - * First we check the PCI IRQ routing table (PRT) for an IRQ. PRT - * values override any BIOS-assigned IRQs set during boot. - */ - irq = acpi_pci_irq_lookup(dev->bus, PCI_SLOT(dev->devfn), pin, - &triggering, &polarity, &link, - acpi_pci_allocate_irq); - - /* - * If no PRT entry was found, we'll try to derive an IRQ from the - * device's parent bridge. - */ - if (irq < 0) - irq = acpi_pci_irq_derive(dev, pin, &triggering, - &polarity, &link, - acpi_pci_allocate_irq); - if (irq < 0) { + entry = acpi_pci_irq_lookup(dev, pin); + if (!entry) { /* * IDE legacy mode controller IRQs are magic. Why do compat * extensions always make such a nasty mess. @@ -590,12 +380,24 @@ int acpi_pci_irq_enable(struct pci_dev *dev) (dev->class & 0x05) == 0) return 0; } + + if (entry) { + if (entry->link) + gsi = acpi_pci_link_allocate_irq(entry->link, + entry->index, + &triggering, &polarity, + &link); + else + gsi = entry->index; + } else + gsi = -1; + /* * No IRQ known to the ACPI subsystem - maybe the BIOS / * driver reported one, then use it. Exit in any case. 
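
Once a _PRT entry is found, acpi_pci_irq_enable() either asks the interrupt link device for a GSI or uses the hardwired index straight from the entry, then registers the GSI. A condensed, hedged sketch of that path (the helper name is made up; the IDE legacy special case and the dev->irq fallback are omitted):

static int gsi_selection_example(struct pci_dev *dev,
				 struct acpi_prt_entry *entry)
{
	int triggering = ACPI_LEVEL_SENSITIVE;
	int polarity = ACPI_ACTIVE_LOW;
	char *link = NULL;
	int gsi, rc;

	if (entry->link)
		gsi = acpi_pci_link_allocate_irq(entry->link, entry->index,
						 &triggering, &polarity, &link);
	else
		gsi = entry->index;	/* hardwired GSI from the _PRT */

	rc = acpi_register_gsi(gsi, triggering, polarity);
	if (rc < 0)
		return rc;

	dev->irq = rc;
	return 0;
}
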
*/ - if (irq < 0) { - dev_warn(&dev->dev, "PCI INT %c: no GSI", 'A' + pin); + if (gsi < 0) { + dev_warn(&dev->dev, "PCI INT %c: no GSI", pin_name(pin)); /* Interrupt Line values above 0xF are forbidden */ if (dev->irq > 0 && (dev->irq <= 0xF)) { printk(" - using IRQ %d\n", dev->irq); @@ -608,10 +410,10 @@ int acpi_pci_irq_enable(struct pci_dev *dev) } } - rc = acpi_register_gsi(irq, triggering, polarity); + rc = acpi_register_gsi(gsi, triggering, polarity); if (rc < 0) { dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n", - 'A' + pin); + pin_name(pin)); return rc; } dev->irq = rc; @@ -622,7 +424,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev) link_desc[0] = '\0'; dev_info(&dev->dev, "PCI INT %c%s -> GSI %u (%s, %s) -> IRQ %d\n", - 'A' + pin, link_desc, irq, + pin_name(pin), link_desc, gsi, (triggering == ACPI_LEVEL_SENSITIVE) ? "level" : "edge", (polarity == ACPI_ACTIVE_LOW) ? "low" : "high", dev->irq); @@ -636,42 +438,28 @@ void __attribute__ ((weak)) acpi_unregister_gsi(u32 i) void acpi_pci_irq_disable(struct pci_dev *dev) { - int gsi = 0; - u8 pin = 0; - int triggering = ACPI_LEVEL_SENSITIVE; - int polarity = ACPI_ACTIVE_LOW; - - - if (!dev || !dev->bus) - return; + struct acpi_prt_entry *entry; + int gsi; + u8 pin; pin = dev->pin; if (!pin) return; - pin--; - /* - * First we check the PCI IRQ routing table (PRT) for an IRQ. - */ - gsi = acpi_pci_irq_lookup(dev->bus, PCI_SLOT(dev->devfn), pin, - &triggering, &polarity, NULL, - acpi_pci_free_irq); - /* - * If no PRT entry was found, we'll try to derive an IRQ from the - * device's parent bridge. - */ - if (gsi < 0) - gsi = acpi_pci_irq_derive(dev, pin, - &triggering, &polarity, NULL, - acpi_pci_free_irq); - if (gsi < 0) + entry = acpi_pci_irq_lookup(dev, pin); + if (!entry) return; + if (entry->link) + gsi = acpi_pci_link_free_irq(entry->link); + else + gsi = entry->index; + /* * TBD: It might be worth clearing dev->irq by magic constant * (e.g. PCI_UNDEFINED_IRQ). */ - dev_info(&dev->dev, "PCI INT %c disabled\n", 'A' + pin); + dev_info(&dev->dev, "PCI INT %c disabled\n", pin_name(pin)); acpi_unregister_gsi(gsi); } diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index e52ad91ce2d..1c6e73c7865 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -796,10 +796,6 @@ static int irqrouter_resume(struct sys_device *dev) struct list_head *node = NULL; struct acpi_pci_link *link = NULL; - - /* Make sure SCI is enabled again (Apple firmware bug?) 
*/ - acpi_set_register(ACPI_BITREG_SCI_ENABLE, 1); - list_for_each(node, &acpi_link.entries) { link = list_entry(node, struct acpi_pci_link, node); if (!link) { @@ -912,7 +908,7 @@ static int __init acpi_irq_nobalance_set(char *str) __setup("acpi_irq_nobalance", acpi_irq_nobalance_set); -int __init acpi_irq_balance_set(char *str) +static int __init acpi_irq_balance_set(char *str) { acpi_irq_balance = 1; return 1; diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index bb7d50dd281..c926e7d4a0d 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -139,6 +139,8 @@ static int acpi_power_get_state(acpi_handle handle, int *state) { acpi_status status = AE_OK; unsigned long long sta = 0; + char node_name[5]; + struct acpi_buffer buffer = { sizeof(node_name), node_name }; if (!handle || !state) @@ -151,8 +153,10 @@ static int acpi_power_get_state(acpi_handle handle, int *state) *state = (sta & 0x01)?ACPI_POWER_RESOURCE_STATE_ON: ACPI_POWER_RESOURCE_STATE_OFF; + acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer); + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Resource [%s] is %s\n", - acpi_ut_get_node_name(handle), + node_name, *state ? "on" : "off")); return 0; diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/proc.c index 4dbc2271acf..428c911dba0 100644 --- a/drivers/acpi/sleep/proc.c +++ b/drivers/acpi/proc.c @@ -28,8 +28,6 @@ static int acpi_system_sleep_seq_show(struct seq_file *seq, void *offset) { int i; - ACPI_FUNCTION_TRACE("acpi_system_sleep_seq_show"); - for (i = 0; i <= ACPI_STATE_S5; i++) { if (sleep_states[i]) { seq_printf(seq, "S%d ", i); @@ -86,49 +84,44 @@ acpi_system_write_sleep(struct file *file, #ifdef HAVE_ACPI_LEGACY_ALARM +static u32 cmos_bcd_read(int offset, int rtc_control); + static int acpi_system_alarm_seq_show(struct seq_file *seq, void *offset) { u32 sec, min, hr; u32 day, mo, yr, cent = 0; + u32 today = 0; unsigned char rtc_control = 0; unsigned long flags; - ACPI_FUNCTION_TRACE("acpi_system_alarm_seq_show"); - spin_lock_irqsave(&rtc_lock, flags); - sec = CMOS_READ(RTC_SECONDS_ALARM); - min = CMOS_READ(RTC_MINUTES_ALARM); - hr = CMOS_READ(RTC_HOURS_ALARM); rtc_control = CMOS_READ(RTC_CONTROL); + sec = cmos_bcd_read(RTC_SECONDS_ALARM, rtc_control); + min = cmos_bcd_read(RTC_MINUTES_ALARM, rtc_control); + hr = cmos_bcd_read(RTC_HOURS_ALARM, rtc_control); /* If we ever get an FACP with proper values... 
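
The acpi/proc.c alarm rework above funnels every RTC read through a cmos_bcd_read() helper instead of reading raw values and converting them all afterwards. The helper is only forward-declared in this hunk; based on the conversion the removed code used to do inline, it presumably looks like this (an assumption, not copied from the patch):

static u32 cmos_bcd_read(int offset, int rtc_control)
{
	u32 val = CMOS_READ(offset);

	/* convert unless the RTC is already configured for binary mode */
	if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
		val = bcd2bin(val);
	return val;
}
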
*/ - if (acpi_gbl_FADT.day_alarm) + if (acpi_gbl_FADT.day_alarm) { /* ACPI spec: only low 6 its should be cared */ day = CMOS_READ(acpi_gbl_FADT.day_alarm) & 0x3F; - else - day = CMOS_READ(RTC_DAY_OF_MONTH); + if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + day = bcd2bin(day); + } else + day = cmos_bcd_read(RTC_DAY_OF_MONTH, rtc_control); if (acpi_gbl_FADT.month_alarm) - mo = CMOS_READ(acpi_gbl_FADT.month_alarm); - else - mo = CMOS_READ(RTC_MONTH); + mo = cmos_bcd_read(acpi_gbl_FADT.month_alarm, rtc_control); + else { + mo = cmos_bcd_read(RTC_MONTH, rtc_control); + today = cmos_bcd_read(RTC_DAY_OF_MONTH, rtc_control); + } if (acpi_gbl_FADT.century) - cent = CMOS_READ(acpi_gbl_FADT.century); + cent = cmos_bcd_read(acpi_gbl_FADT.century, rtc_control); - yr = CMOS_READ(RTC_YEAR); + yr = cmos_bcd_read(RTC_YEAR, rtc_control); spin_unlock_irqrestore(&rtc_lock, flags); - if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - sec = bcd2bin(sec); - min = bcd2bin(min); - hr = bcd2bin(hr); - day = bcd2bin(day); - mo = bcd2bin(mo); - yr = bcd2bin(yr); - cent = bcd2bin(cent); - } - /* we're trusting the FADT (see above) */ if (!acpi_gbl_FADT.century) /* If we're not trusting the FADT, we should at least make it @@ -153,6 +146,20 @@ static int acpi_system_alarm_seq_show(struct seq_file *seq, void *offset) else yr += cent * 100; + /* + * Show correct dates for alarms up to a month into the future. + * This solves issues for nearly all situations with the common + * 30-day alarm clocks in PC hardware. + */ + if (day < today) { + if (mo < 12) { + mo += 1; + } else { + mo = 1; + yr += 1; + } + } + seq_printf(seq, "%4.4u-", yr); (mo > 12) ? seq_puts(seq, "**-") : seq_printf(seq, "%2.2u-", mo); (day > 31) ? seq_puts(seq, "** ") : seq_printf(seq, "%2.2u ", day); @@ -227,13 +234,11 @@ acpi_system_write_alarm(struct file *file, int adjust = 0; unsigned char rtc_control = 0; - ACPI_FUNCTION_TRACE("acpi_system_write_alarm"); - if (count > sizeof(alarm_string) - 1) - return_VALUE(-EINVAL); + return -EINVAL; if (copy_from_user(alarm_string, buffer, count)) - return_VALUE(-EFAULT); + return -EFAULT; alarm_string[count] = '\0'; @@ -334,7 +339,7 @@ acpi_system_write_alarm(struct file *file, result = 0; end: - return_VALUE(result ? result : count); + return result ? 
result : count; } #endif /* HAVE_ACPI_LEGACY_ALARM */ diff --git a/drivers/acpi/reboot.c b/drivers/acpi/reboot.c index a6b662c00b6..93f91142d7a 100644 --- a/drivers/acpi/reboot.c +++ b/drivers/acpi/reboot.c @@ -42,7 +42,7 @@ void acpi_reboot(void) case ACPI_ADR_SPACE_SYSTEM_MEMORY: case ACPI_ADR_SPACE_SYSTEM_IO: printk(KERN_DEBUG "ACPI MEMORY or I/O RESET_REG.\n"); - acpi_hw_low_level_write(8, reset_value, rr); + acpi_reset(); break; } /* Wait ten seconds */ diff --git a/drivers/acpi/resources/Makefile b/drivers/acpi/resources/Makefile deleted file mode 100644 index 8de4f69dfa0..00000000000 --- a/drivers/acpi/resources/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := rsaddr.o rscreate.o rsinfo.o rsio.o rslist.o rsmisc.o rsxface.o \ - rscalc.o rsirq.o rsmemory.o rsutils.o - -obj-$(ACPI_FUTURE_USAGE) += rsdump.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c index e53e590252c..0619734895b 100644 --- a/drivers/acpi/sbshc.c +++ b/drivers/acpi/sbshc.c @@ -10,7 +10,6 @@ #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> -#include <acpi/actypes.h> #include <linux/wait.h> #include <linux/delay.h> #include <linux/interrupt.h> diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 39b7233c348..c54d7b6c406 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -10,7 +10,6 @@ #include <linux/kthread.h> #include <acpi/acpi_drivers.h> -#include <acpi/acinterp.h> /* for acpi_ex_eisa_id_to_string() */ #define _COMPONENT ACPI_BUS_COMPONENT ACPI_MODULE_NAME("scan"); diff --git a/drivers/acpi/sleep/sleep.h b/drivers/acpi/sleep.h index cfaf8f5b0a1..cfaf8f5b0a1 100644 --- a/drivers/acpi/sleep/sleep.h +++ b/drivers/acpi/sleep.h diff --git a/drivers/acpi/sleep/Makefile b/drivers/acpi/sleep/Makefile deleted file mode 100644 index f1fb888c2d2..00000000000 --- a/drivers/acpi/sleep/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -obj-y := wakeup.o -obj-y += main.o -obj-$(CONFIG_ACPI_SLEEP) += proc.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index 6e4107f8240..391d0358a59 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -192,65 +192,6 @@ static struct attribute_group interrupt_stats_attr_group = { }; static struct kobj_attribute *counter_attrs; -static int count_num_gpes(void) -{ - int count = 0; - struct acpi_gpe_xrupt_info *gpe_xrupt_info; - struct acpi_gpe_block_info *gpe_block; - acpi_cpu_flags flags; - - flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); - - gpe_xrupt_info = acpi_gbl_gpe_xrupt_list_head; - while (gpe_xrupt_info) { - gpe_block = gpe_xrupt_info->gpe_block_list_head; - while (gpe_block) { - count += gpe_block->register_count * - ACPI_GPE_REGISTER_WIDTH; - gpe_block = gpe_block->next; - } - gpe_xrupt_info = gpe_xrupt_info->next; - } - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - - return count; -} - -static int get_gpe_device(int index, acpi_handle *handle) -{ - struct acpi_gpe_xrupt_info *gpe_xrupt_info; - struct acpi_gpe_block_info *gpe_block; - acpi_cpu_flags flags; - struct acpi_namespace_node *node; - - flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); - - gpe_xrupt_info = acpi_gbl_gpe_xrupt_list_head; - while (gpe_xrupt_info) { - gpe_block = gpe_xrupt_info->gpe_block_list_head; - node = gpe_block->node; - while (gpe_block) { - index -= gpe_block->register_count * - ACPI_GPE_REGISTER_WIDTH; - if (index < 0) { - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - /* return NULL if it's FADT GPE */ - if 
(node->type != ACPI_TYPE_DEVICE) - *handle = NULL; - else - *handle = node; - return 0; - } - node = gpe_block->node; - gpe_block = gpe_block->next; - } - gpe_xrupt_info = gpe_xrupt_info->next; - } - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - - return -ENODEV; -} - static void delete_gpe_attr_array(void) { struct event_counter *tmp = all_counters; @@ -309,7 +250,7 @@ static int get_status(u32 index, acpi_event_status *status, acpi_handle *handle) goto end; if (index < num_gpes) { - result = get_gpe_device(index, handle); + result = acpi_get_gpe_device(index, handle); if (result) { ACPI_EXCEPTION((AE_INFO, AE_NOT_FOUND, "Invalid GPE 0x%x\n", index)); @@ -436,7 +377,7 @@ void acpi_irq_stats_init(void) if (all_counters) return; - num_gpes = count_num_gpes(); + num_gpes = acpi_current_gpe_count; num_counters = num_gpes + ACPI_NUM_FIXED_EVENTS + NUM_COUNTERS_EXTRA; all_attrs = kzalloc(sizeof(struct attribute *) * (num_counters + 1), diff --git a/drivers/acpi/tables/Makefile b/drivers/acpi/tables/Makefile deleted file mode 100644 index 7385efa6162..00000000000 --- a/drivers/acpi/tables/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := tbxface.o tbinstal.o tbutils.o tbfind.o tbfadt.o tbxfroot.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/utilities/Makefile b/drivers/acpi/utilities/Makefile deleted file mode 100644 index 88eff14c489..00000000000 --- a/drivers/acpi/utilities/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := utalloc.o utdebug.o uteval.o utinit.o utmisc.o utxface.o \ - utcopy.o utdelete.o utglobal.o utmath.o utobject.o \ - utstate.o utmutex.o utobject.o utcache.o utresrc.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/utilities/utcache.c b/drivers/acpi/utilities/utcache.c deleted file mode 100644 index 245fa80cf60..00000000000 --- a/drivers/acpi/utilities/utcache.c +++ /dev/null @@ -1,314 +0,0 @@ -/****************************************************************************** - * - * Module Name: utcache - local cache allocation routines - * - *****************************************************************************/ - -/* - * Copyright (C) 2000 - 2008, Intel Corp. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce at minimum a disclaimer - * substantially similar to the "NO WARRANTY" disclaimer below - * ("Disclaimer") and any redistribution must be conditioned upon - * including a substantially similar Disclaimer requirement for further - * binary redistribution. - * 3. Neither the names of the above-listed copyright holders nor the names - * of any contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. 
- * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGES. - */ - -#include <acpi/acpi.h> - -#define _COMPONENT ACPI_UTILITIES -ACPI_MODULE_NAME("utcache") -#ifdef ACPI_USE_LOCAL_CACHE -/******************************************************************************* - * - * FUNCTION: acpi_os_create_cache - * - * PARAMETERS: cache_name - Ascii name for the cache - * object_size - Size of each cached object - * max_depth - Maximum depth of the cache (in objects) - * return_cache - Where the new cache object is returned - * - * RETURN: Status - * - * DESCRIPTION: Create a cache object - * - ******************************************************************************/ -acpi_status -acpi_os_create_cache(char *cache_name, - u16 object_size, - u16 max_depth, struct acpi_memory_list ** return_cache) -{ - struct acpi_memory_list *cache; - - ACPI_FUNCTION_ENTRY(); - - if (!cache_name || !return_cache || (object_size < 16)) { - return (AE_BAD_PARAMETER); - } - - /* Create the cache object */ - - cache = acpi_os_allocate(sizeof(struct acpi_memory_list)); - if (!cache) { - return (AE_NO_MEMORY); - } - - /* Populate the cache object and return it */ - - ACPI_MEMSET(cache, 0, sizeof(struct acpi_memory_list)); - cache->link_offset = 8; - cache->list_name = cache_name; - cache->object_size = object_size; - cache->max_depth = max_depth; - - *return_cache = cache; - return (AE_OK); -} - -/******************************************************************************* - * - * FUNCTION: acpi_os_purge_cache - * - * PARAMETERS: Cache - Handle to cache object - * - * RETURN: Status - * - * DESCRIPTION: Free all objects within the requested cache. - * - ******************************************************************************/ - -acpi_status acpi_os_purge_cache(struct acpi_memory_list * cache) -{ - char *next; - - ACPI_FUNCTION_ENTRY(); - - if (!cache) { - return (AE_BAD_PARAMETER); - } - - /* Walk the list of objects in this cache */ - - while (cache->list_head) { - - /* Delete and unlink one cached state object */ - - next = *(ACPI_CAST_INDIRECT_PTR(char, - &(((char *)cache-> - list_head)[cache-> - link_offset]))); - ACPI_FREE(cache->list_head); - - cache->list_head = next; - cache->current_depth--; - } - - return (AE_OK); -} - -/******************************************************************************* - * - * FUNCTION: acpi_os_delete_cache - * - * PARAMETERS: Cache - Handle to cache object - * - * RETURN: Status - * - * DESCRIPTION: Free all objects within the requested cache and delete the - * cache object. 
- * - ******************************************************************************/ - -acpi_status acpi_os_delete_cache(struct acpi_memory_list * cache) -{ - acpi_status status; - - ACPI_FUNCTION_ENTRY(); - - /* Purge all objects in the cache */ - - status = acpi_os_purge_cache(cache); - if (ACPI_FAILURE(status)) { - return (status); - } - - /* Now we can delete the cache object */ - - ACPI_FREE(cache); - return (AE_OK); -} - -/******************************************************************************* - * - * FUNCTION: acpi_os_release_object - * - * PARAMETERS: Cache - Handle to cache object - * Object - The object to be released - * - * RETURN: None - * - * DESCRIPTION: Release an object to the specified cache. If cache is full, - * the object is deleted. - * - ******************************************************************************/ - -acpi_status -acpi_os_release_object(struct acpi_memory_list * cache, void *object) -{ - acpi_status status; - - ACPI_FUNCTION_ENTRY(); - - if (!cache || !object) { - return (AE_BAD_PARAMETER); - } - - /* If cache is full, just free this object */ - - if (cache->current_depth >= cache->max_depth) { - ACPI_FREE(object); - ACPI_MEM_TRACKING(cache->total_freed++); - } - - /* Otherwise put this object back into the cache */ - - else { - status = acpi_ut_acquire_mutex(ACPI_MTX_CACHES); - if (ACPI_FAILURE(status)) { - return (status); - } - - /* Mark the object as cached */ - - ACPI_MEMSET(object, 0xCA, cache->object_size); - ACPI_SET_DESCRIPTOR_TYPE(object, ACPI_DESC_TYPE_CACHED); - - /* Put the object at the head of the cache list */ - - *(ACPI_CAST_INDIRECT_PTR(char, - &(((char *)object)[cache-> - link_offset]))) = - cache->list_head; - cache->list_head = object; - cache->current_depth++; - - (void)acpi_ut_release_mutex(ACPI_MTX_CACHES); - } - - return (AE_OK); -} - -/******************************************************************************* - * - * FUNCTION: acpi_os_acquire_object - * - * PARAMETERS: Cache - Handle to cache object - * - * RETURN: the acquired object. NULL on error - * - * DESCRIPTION: Get an object from the specified cache. If cache is empty, - * the object is allocated. 
- * - ******************************************************************************/ - -void *acpi_os_acquire_object(struct acpi_memory_list *cache) -{ - acpi_status status; - void *object; - - ACPI_FUNCTION_NAME(os_acquire_object); - - if (!cache) { - return (NULL); - } - - status = acpi_ut_acquire_mutex(ACPI_MTX_CACHES); - if (ACPI_FAILURE(status)) { - return (NULL); - } - - ACPI_MEM_TRACKING(cache->requests++); - - /* Check the cache first */ - - if (cache->list_head) { - - /* There is an object available, use it */ - - object = cache->list_head; - cache->list_head = *(ACPI_CAST_INDIRECT_PTR(char, - &(((char *) - object)[cache-> - link_offset]))); - - cache->current_depth--; - - ACPI_MEM_TRACKING(cache->hits++); - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "Object %p from %s cache\n", object, - cache->list_name)); - - status = acpi_ut_release_mutex(ACPI_MTX_CACHES); - if (ACPI_FAILURE(status)) { - return (NULL); - } - - /* Clear (zero) the previously used Object */ - - ACPI_MEMSET(object, 0, cache->object_size); - } else { - /* The cache is empty, create a new object */ - - ACPI_MEM_TRACKING(cache->total_allocated++); - -#ifdef ACPI_DBG_TRACK_ALLOCATIONS - if ((cache->total_allocated - cache->total_freed) > - cache->max_occupied) { - cache->max_occupied = - cache->total_allocated - cache->total_freed; - } -#endif - - /* Avoid deadlock with ACPI_ALLOCATE_ZEROED */ - - status = acpi_ut_release_mutex(ACPI_MTX_CACHES); - if (ACPI_FAILURE(status)) { - return (NULL); - } - - object = ACPI_ALLOCATE_ZEROED(cache->object_size); - if (!object) { - return (NULL); - } - } - - return (object); -} -#endif /* ACPI_USE_LOCAL_CACHE */ diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index baa44192972..f261737636d 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -36,6 +36,7 @@ #include <linux/backlight.h> #include <linux/thermal.h> #include <linux/video_output.h> +#include <linux/sort.h> #include <asm/uaccess.h> #include <acpi/acpi_bus.h> @@ -481,6 +482,7 @@ acpi_video_device_lcd_set_level(struct acpi_video_device *device, int level) int status = AE_OK; union acpi_object arg0 = { ACPI_TYPE_INTEGER }; struct acpi_object_list args = { 1, &arg0 }; + int state; arg0.integer.value = level; @@ -489,6 +491,10 @@ acpi_video_device_lcd_set_level(struct acpi_video_device *device, int level) status = acpi_evaluate_object(device->dev->handle, "_BCM", &args, NULL); device->brightness->curr = level; + for (state = 2; state < device->brightness->count; state++) + if (level == device->brightness->levels[state]) + device->backlight->props.brightness = state - 2; + return status; } @@ -626,6 +632,16 @@ acpi_video_bus_DOS(struct acpi_video_bus *video, int bios_flag, int lcd_flag) } /* + * Simple comparison function used to sort backlight levels. + */ + +static int +acpi_video_cmp_level(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +/* * Arg: * device : video output device (LCD, CRT, ..) 
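
acpi_video_cmp_level() above is a plain ascending comparator for lib/sort(). The hunk that follows sorts the _BCL-derived level array with it but deliberately starts at index 2: per the ACPI spec the first two _BCL entries are the levels to use on AC power and on battery, not ordinary brightness steps, which is also why the _BCM hunk earlier records the backlight index as state - 2. A usage sketch with made-up values:

static void sort_levels_example(void)
{
	/* [0] = full-power level, [1] = battery level, rest = steps */
	int levels[] = { 100, 40, 20, 80, 60, 40 };

	sort(&levels[2], ARRAY_SIZE(levels) - 2, sizeof(levels[2]),
	     acpi_video_cmp_level, NULL);
	/* levels[] is now { 100, 40, 20, 40, 60, 80 } */
}
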
* @@ -676,6 +692,10 @@ acpi_video_init_brightness(struct acpi_video_device *device) count++; } + /* don't sort the first two brightness levels */ + sort(&br->levels[2], count - 2, sizeof(br->levels[2]), + acpi_video_cmp_level, NULL); + if (count < 2) goto out_free_levels; diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index f022eb6f563..50e3d2dbf3a 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -234,7 +234,7 @@ EXPORT_SYMBOL(acpi_video_display_switch_support); * To force that backlight or display output switching is processed by vendor * specific acpi drivers or video.ko driver. */ -int __init acpi_backlight(char *str) +static int __init acpi_backlight(char *str) { if (str == NULL || *str == '\0') return 1; @@ -250,7 +250,7 @@ int __init acpi_backlight(char *str) } __setup("acpi_backlight=", acpi_backlight); -int __init acpi_display_output(char *str) +static int __init acpi_display_output(char *str) { if (str == NULL || *str == '\0') return 1; diff --git a/drivers/acpi/sleep/wakeup.c b/drivers/acpi/wakeup.c index dea4c23df76..2d34806d45d 100644 --- a/drivers/acpi/sleep/wakeup.c +++ b/drivers/acpi/wakeup.c @@ -8,7 +8,6 @@ #include <acpi/acpi_drivers.h> #include <linux/kernel.h> #include <linux/types.h> -#include <acpi/acevents.h> #include "sleep.h" #define _COMPONENT ACPI_SYSTEM_COMPONENT @@ -28,8 +27,6 @@ void acpi_enable_wakeup_device_prep(u8 sleep_state) { struct list_head *node, *next; - ACPI_FUNCTION_TRACE("acpi_enable_wakeup_device_prep"); - spin_lock(&acpi_device_lock); list_for_each_safe(node, next, &acpi_wakeup_device_list) { struct acpi_device *dev = container_of(node, @@ -61,7 +58,6 @@ void acpi_enable_wakeup_device(u8 sleep_state) * Caution: this routine must be invoked when interrupt is disabled * Refer ACPI2.0: P212 */ - ACPI_FUNCTION_TRACE("acpi_enable_wakeup_device"); spin_lock(&acpi_device_lock); list_for_each_safe(node, next, &acpi_wakeup_device_list) { struct acpi_device *dev = @@ -103,8 +99,6 @@ void acpi_disable_wakeup_device(u8 sleep_state) { struct list_head *node, *next; - ACPI_FUNCTION_TRACE("acpi_disable_wakeup_device"); - spin_lock(&acpi_device_lock); list_for_each_safe(node, next, &acpi_wakeup_device_list) { struct acpi_device *dev = diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c index ef02e488d46..6273d98d00e 100644 --- a/drivers/ata/libata-acpi.c +++ b/drivers/ata/libata-acpi.c @@ -19,12 +19,6 @@ #include "libata.h" #include <acpi/acpi_bus.h> -#include <acpi/acnames.h> -#include <acpi/acnamesp.h> -#include <acpi/acparser.h> -#include <acpi/acexcep.h> -#include <acpi/acmacros.h> -#include <acpi/actypes.h> enum { ATA_ACPI_FILTER_SETXFER = 1 << 0, diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 175df54eb66..c507a9ac78f 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4556,7 +4556,7 @@ void ata_sg_clean(struct ata_queued_cmd *qc) struct scatterlist *sg = qc->sg; int dir = qc->dma_dir; - WARN_ON(sg == NULL); + WARN_ON_ONCE(sg == NULL); VPRINTK("unmapping %u sg elements\n", qc->n_elem); @@ -4776,7 +4776,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) struct ata_port *ap = qc->ap; unsigned int tag; - WARN_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ qc->flags = 0; tag = qc->tag; @@ -4791,8 +4791,8 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) struct ata_port *ap = qc->ap; struct ata_link *link = qc->dev->link; - WARN_ON(qc == NULL); /* ata_qc_from_tag 
_might_ return NULL */ - WARN_ON(!(qc->flags & ATA_QCFLAG_ACTIVE)); + WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); if (likely(qc->flags & ATA_QCFLAG_DMAMAP)) ata_sg_clean(qc); @@ -4878,7 +4878,7 @@ void ata_qc_complete(struct ata_queued_cmd *qc) struct ata_device *dev = qc->dev; struct ata_eh_info *ehi = &dev->link->eh_info; - WARN_ON(ap->pflags & ATA_PFLAG_FROZEN); + WARN_ON_ONCE(ap->pflags & ATA_PFLAG_FROZEN); if (unlikely(qc->err_mask)) qc->flags |= ATA_QCFLAG_FAILED; @@ -5000,16 +5000,16 @@ void ata_qc_issue(struct ata_queued_cmd *qc) * check is skipped for old EH because it reuses active qc to * request ATAPI sense. */ - WARN_ON(ap->ops->error_handler && ata_tag_valid(link->active_tag)); + WARN_ON_ONCE(ap->ops->error_handler && ata_tag_valid(link->active_tag)); if (ata_is_ncq(prot)) { - WARN_ON(link->sactive & (1 << qc->tag)); + WARN_ON_ONCE(link->sactive & (1 << qc->tag)); if (!link->sactive) ap->nr_active_links++; link->sactive |= 1 << qc->tag; } else { - WARN_ON(link->sactive); + WARN_ON_ONCE(link->sactive); ap->nr_active_links++; link->active_tag = qc->tag; diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index c59ad76c84b..0eae9b45355 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -578,7 +578,7 @@ void ata_sff_tf_load(struct ata_port *ap, const struct ata_taskfile *tf) } if (is_addr && (tf->flags & ATA_TFLAG_LBA48)) { - WARN_ON(!ioaddr->ctl_addr); + WARN_ON_ONCE(!ioaddr->ctl_addr); iowrite8(tf->hob_feature, ioaddr->feature_addr); iowrite8(tf->hob_nsect, ioaddr->nsect_addr); iowrite8(tf->hob_lbal, ioaddr->lbal_addr); @@ -651,7 +651,7 @@ void ata_sff_tf_read(struct ata_port *ap, struct ata_taskfile *tf) iowrite8(tf->ctl, ioaddr->ctl_addr); ap->last_ctl = tf->ctl; } else - WARN_ON(1); + WARN_ON_ONCE(1); } } EXPORT_SYMBOL_GPL(ata_sff_tf_read); @@ -891,7 +891,7 @@ static void ata_pio_sectors(struct ata_queued_cmd *qc) /* READ/WRITE MULTIPLE */ unsigned int nsect; - WARN_ON(qc->dev->multi_count == 0); + WARN_ON_ONCE(qc->dev->multi_count == 0); nsect = min((qc->nbytes - qc->curbytes) / qc->sect_size, qc->dev->multi_count); @@ -918,7 +918,7 @@ static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc) { /* send SCSI cdb */ DPRINTK("send cdb\n"); - WARN_ON(qc->dev->cdb_len < 12); + WARN_ON_ONCE(qc->dev->cdb_len < 12); ap->ops->sff_data_xfer(qc->dev, qc->cdb, qc->dev->cdb_len, 1); ata_sff_sync(ap); @@ -1014,7 +1014,7 @@ next_sg: } /* consumed can be larger than count only for the last transfer */ - WARN_ON(qc->cursg && count != consumed); + WARN_ON_ONCE(qc->cursg && count != consumed); if (bytes) goto next_sg; @@ -1172,13 +1172,13 @@ int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc, unsigned long flags = 0; int poll_next; - WARN_ON((qc->flags & ATA_QCFLAG_ACTIVE) == 0); + WARN_ON_ONCE((qc->flags & ATA_QCFLAG_ACTIVE) == 0); /* Make sure ata_sff_qc_issue() does not throw things * like DMA polling into the workqueue. Notice that * in_wq is not equivalent to (qc->tf.flags & ATA_TFLAG_POLLING). 
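
The libata conversions in these hunks swap WARN_ON() for WARN_ON_ONCE() in per-command fast paths such as the host state machine. Both evaluate the condition and emit a backtrace when it is true, but the _ONCE variant latches after the first report, so a misbehaving device cannot flood the log on every queued command. Illustration only, not from the patch:

static void warn_example(bool broken)
{
	WARN_ON(broken);	/* backtrace every time the condition holds */
	WARN_ON_ONCE(broken);	/* backtrace only on the first occurrence  */
}
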
*/ - WARN_ON(in_wq != ata_hsm_ok_in_wq(ap, qc)); + WARN_ON_ONCE(in_wq != ata_hsm_ok_in_wq(ap, qc)); fsm_start: DPRINTK("ata%u: protocol %d task_state %d (dev_stat 0x%X)\n", @@ -1387,7 +1387,7 @@ fsm_start: DPRINTK("ata%u: dev %u command complete, drv_stat 0x%x\n", ap->print_id, qc->dev->devno, status); - WARN_ON(qc->err_mask & (AC_ERR_DEV | AC_ERR_HSM)); + WARN_ON_ONCE(qc->err_mask & (AC_ERR_DEV | AC_ERR_HSM)); ap->hsm_task_state = HSM_ST_IDLE; @@ -1423,7 +1423,7 @@ void ata_pio_task(struct work_struct *work) int poll_next; fsm_start: - WARN_ON(ap->hsm_task_state == HSM_ST_IDLE); + WARN_ON_ONCE(ap->hsm_task_state == HSM_ST_IDLE); /* * This is purely heuristic. This is a fast path. @@ -1512,7 +1512,7 @@ unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc) break; case ATA_PROT_DMA: - WARN_ON(qc->tf.flags & ATA_TFLAG_POLLING); + WARN_ON_ONCE(qc->tf.flags & ATA_TFLAG_POLLING); ap->ops->sff_tf_load(ap, &qc->tf); /* load tf registers */ ap->ops->bmdma_setup(qc); /* set up bmdma */ @@ -1564,7 +1564,7 @@ unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc) break; case ATAPI_PROT_DMA: - WARN_ON(qc->tf.flags & ATA_TFLAG_POLLING); + WARN_ON_ONCE(qc->tf.flags & ATA_TFLAG_POLLING); ap->ops->sff_tf_load(ap, &qc->tf); /* load tf registers */ ap->ops->bmdma_setup(qc); /* set up bmdma */ @@ -1576,7 +1576,7 @@ unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc) break; default: - WARN_ON(1); + WARN_ON_ONCE(1); return AC_ERR_SYSTEM; } diff --git a/drivers/ata/pata_acpi.c b/drivers/ata/pata_acpi.c index e2e332d8ff9..8b77a9802df 100644 --- a/drivers/ata/pata_acpi.c +++ b/drivers/ata/pata_acpi.c @@ -13,12 +13,6 @@ #include <linux/device.h> #include <scsi/scsi_host.h> #include <acpi/acpi_bus.h> -#include <acpi/acnames.h> -#include <acpi/acnamesp.h> -#include <acpi/acparser.h> -#include <acpi/acexcep.h> -#include <acpi/acmacros.h> -#include <acpi/actypes.h> #include <linux/libata.h> #include <linux/ata.h> diff --git a/drivers/char/tpm/tpm_bios.c b/drivers/char/tpm/tpm_bios.c index 68f052b42ed..ed306eb1057 100644 --- a/drivers/char/tpm/tpm_bios.c +++ b/drivers/char/tpm/tpm_bios.c @@ -23,8 +23,6 @@ #include <linux/security.h> #include <linux/module.h> #include <acpi/acpi.h> -#include <acpi/actypes.h> -#include <acpi/actbl.h> #include "tpm.h" #define TCG_EVENT_NAME_LEN_MAX 255 diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 8d7cf3f3145..f1df59f59a3 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -15,12 +15,14 @@ #include <linux/tick.h> #define BREAK_FUZZ 4 /* 4 us */ +#define PRED_HISTORY_PCT 50 struct menu_device { int last_state_idx; unsigned int expected_us; unsigned int predicted_us; + unsigned int current_predicted_us; unsigned int last_measured_us; unsigned int elapsed_us; }; @@ -47,6 +49,12 @@ static int menu_select(struct cpuidle_device *dev) data->expected_us = (u32) ktime_to_ns(tick_nohz_get_sleep_length()) / 1000; + /* Recalculate predicted_us based on prediction_history_pct */ + data->predicted_us *= PRED_HISTORY_PCT; + data->predicted_us += (100 - PRED_HISTORY_PCT) * + data->current_predicted_us; + data->predicted_us /= 100; + /* find the deepest idle state that satisfies our constraints */ for (i = CPUIDLE_DRIVER_STATE_START + 1; i < dev->state_count; i++) { struct cpuidle_state *s = &dev->states[i]; @@ -97,7 +105,7 @@ static void menu_reflect(struct cpuidle_device *dev) measured_us = -1; /* Predict time until next break event */ - data->predicted_us = max(measured_us, data->last_measured_us); + 
data->current_predicted_us = max(measured_us, data->last_measured_us); if (last_idle_us + BREAK_FUZZ < data->expected_us - target->exit_latency) { diff --git a/drivers/dca/dca-core.c b/drivers/dca/dca-core.c index d883e1b8bb8..55433849bfa 100644 --- a/drivers/dca/dca-core.c +++ b/drivers/dca/dca-core.c @@ -270,6 +270,6 @@ static void __exit dca_exit(void) dca_sysfs_exit(); } -subsys_initcall(dca_init); +arch_initcall(dca_init); module_exit(dca_exit); diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 904e57558bb..e34b0642081 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -33,7 +33,6 @@ config INTEL_IOATDMA config INTEL_IOP_ADMA tristate "Intel IOP ADMA support" depends on ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX - select ASYNC_CORE select DMA_ENGINE help Enable support for the Intel(R) IOP Series RAID engines. @@ -59,7 +58,6 @@ config FSL_DMA config MV_XOR bool "Marvell XOR engine support" depends on PLAT_ORION - select ASYNC_CORE select DMA_ENGINE ---help--- Enable support for the Marvell XOR engine. diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 65799651737..403dbe78112 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -31,32 +31,18 @@ * * LOCKING: * - * The subsystem keeps two global lists, dma_device_list and dma_client_list. - * Both of these are protected by a mutex, dma_list_mutex. + * The subsystem keeps a global list of dma_device structs it is protected by a + * mutex, dma_list_mutex. + * + * A subsystem can get access to a channel by calling dmaengine_get() followed + * by dma_find_channel(), or if it has need for an exclusive channel it can call + * dma_request_channel(). Once a channel is allocated a reference is taken + * against its corresponding driver to disable removal. * * Each device has a channels list, which runs unlocked but is never modified * once the device is registered, it's just setup by the driver. * - * Each client is responsible for keeping track of the channels it uses. See - * the definition of dma_event_callback in dmaengine.h. - * - * Each device has a kref, which is initialized to 1 when the device is - * registered. A kref_get is done for each device registered. When the - * device is released, the corresponding kref_put is done in the release - * method. Every time one of the device's channels is allocated to a client, - * a kref_get occurs. When the channel is freed, the corresponding kref_put - * happens. The device's release function does a completion, so - * unregister_device does a remove event, device_unregister, a kref_put - * for the first reference, then waits on the completion for all other - * references to finish. - * - * Each channel has an open-coded implementation of Rusty Russell's "bigref," - * with a kref and a per_cpu local_t. A dma_chan_get is called when a client - * signals that it wants to use a channel, and dma_chan_put is called when - * a channel is removed or a client using it is unregistered. A client can - * take extra references per outstanding transaction, as is the case with - * the NET DMA client. The release function does a kref_put on the device. 
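
In the cpuidle menu governor hunk above, the raw per-wakeup estimate now lands in current_predicted_us, and predicted_us becomes a running blend of history and the newest estimate:

	predicted_us = (PRED_HISTORY_PCT * predicted_us
			+ (100 - PRED_HISTORY_PCT) * current_predicted_us) / 100

With PRED_HISTORY_PCT at 50 this is an exponentially weighted average, so one short sleep only pulls the prediction halfway: a 1000 us prediction followed by a 200 us estimate gives 600 us, then 400 us after a second 200 us estimate, rather than jumping straight to 200 us.
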
- * -ChrisL, DanW + * See Documentation/dmaengine.txt for more details */ #include <linux/init.h> @@ -70,54 +56,85 @@ #include <linux/rcupdate.h> #include <linux/mutex.h> #include <linux/jiffies.h> +#include <linux/rculist.h> +#include <linux/idr.h> static DEFINE_MUTEX(dma_list_mutex); static LIST_HEAD(dma_device_list); -static LIST_HEAD(dma_client_list); +static long dmaengine_ref_count; +static struct idr dma_idr; /* --- sysfs implementation --- */ +/** + * dev_to_dma_chan - convert a device pointer to the its sysfs container object + * @dev - device node + * + * Must be called under dma_list_mutex + */ +static struct dma_chan *dev_to_dma_chan(struct device *dev) +{ + struct dma_chan_dev *chan_dev; + + chan_dev = container_of(dev, typeof(*chan_dev), device); + return chan_dev->chan; +} + static ssize_t show_memcpy_count(struct device *dev, struct device_attribute *attr, char *buf) { - struct dma_chan *chan = to_dma_chan(dev); + struct dma_chan *chan; unsigned long count = 0; int i; + int err; - for_each_possible_cpu(i) - count += per_cpu_ptr(chan->local, i)->memcpy_count; + mutex_lock(&dma_list_mutex); + chan = dev_to_dma_chan(dev); + if (chan) { + for_each_possible_cpu(i) + count += per_cpu_ptr(chan->local, i)->memcpy_count; + err = sprintf(buf, "%lu\n", count); + } else + err = -ENODEV; + mutex_unlock(&dma_list_mutex); - return sprintf(buf, "%lu\n", count); + return err; } static ssize_t show_bytes_transferred(struct device *dev, struct device_attribute *attr, char *buf) { - struct dma_chan *chan = to_dma_chan(dev); + struct dma_chan *chan; unsigned long count = 0; int i; + int err; - for_each_possible_cpu(i) - count += per_cpu_ptr(chan->local, i)->bytes_transferred; + mutex_lock(&dma_list_mutex); + chan = dev_to_dma_chan(dev); + if (chan) { + for_each_possible_cpu(i) + count += per_cpu_ptr(chan->local, i)->bytes_transferred; + err = sprintf(buf, "%lu\n", count); + } else + err = -ENODEV; + mutex_unlock(&dma_list_mutex); - return sprintf(buf, "%lu\n", count); + return err; } static ssize_t show_in_use(struct device *dev, struct device_attribute *attr, char *buf) { - struct dma_chan *chan = to_dma_chan(dev); - int in_use = 0; - - if (unlikely(chan->slow_ref) && - atomic_read(&chan->refcount.refcount) > 1) - in_use = 1; - else { - if (local_read(&(per_cpu_ptr(chan->local, - get_cpu())->refcount)) > 0) - in_use = 1; - put_cpu(); - } + struct dma_chan *chan; + int err; - return sprintf(buf, "%d\n", in_use); + mutex_lock(&dma_list_mutex); + chan = dev_to_dma_chan(dev); + if (chan) + err = sprintf(buf, "%d\n", chan->client_count); + else + err = -ENODEV; + mutex_unlock(&dma_list_mutex); + + return err; } static struct device_attribute dma_attrs[] = { @@ -127,76 +144,110 @@ static struct device_attribute dma_attrs[] = { __ATTR_NULL }; -static void dma_async_device_cleanup(struct kref *kref); - -static void dma_dev_release(struct device *dev) +static void chan_dev_release(struct device *dev) { - struct dma_chan *chan = to_dma_chan(dev); - kref_put(&chan->device->refcount, dma_async_device_cleanup); + struct dma_chan_dev *chan_dev; + + chan_dev = container_of(dev, typeof(*chan_dev), device); + if (atomic_dec_and_test(chan_dev->idr_ref)) { + mutex_lock(&dma_list_mutex); + idr_remove(&dma_idr, chan_dev->dev_id); + mutex_unlock(&dma_list_mutex); + kfree(chan_dev->idr_ref); + } + kfree(chan_dev); } static struct class dma_devclass = { .name = "dma", .dev_attrs = dma_attrs, - .dev_release = dma_dev_release, + .dev_release = chan_dev_release, }; /* --- client and device registration --- */ -#define 
dma_chan_satisfies_mask(chan, mask) \ - __dma_chan_satisfies_mask((chan), &(mask)) +#define dma_device_satisfies_mask(device, mask) \ + __dma_device_satisfies_mask((device), &(mask)) static int -__dma_chan_satisfies_mask(struct dma_chan *chan, dma_cap_mask_t *want) +__dma_device_satisfies_mask(struct dma_device *device, dma_cap_mask_t *want) { dma_cap_mask_t has; - bitmap_and(has.bits, want->bits, chan->device->cap_mask.bits, + bitmap_and(has.bits, want->bits, device->cap_mask.bits, DMA_TX_TYPE_END); return bitmap_equal(want->bits, has.bits, DMA_TX_TYPE_END); } +static struct module *dma_chan_to_owner(struct dma_chan *chan) +{ + return chan->device->dev->driver->owner; +} + /** - * dma_client_chan_alloc - try to allocate channels to a client - * @client: &dma_client + * balance_ref_count - catch up the channel reference count + * @chan - channel to balance ->client_count versus dmaengine_ref_count * - * Called with dma_list_mutex held. + * balance_ref_count must be called under dma_list_mutex */ -static void dma_client_chan_alloc(struct dma_client *client) +static void balance_ref_count(struct dma_chan *chan) { - struct dma_device *device; - struct dma_chan *chan; - int desc; /* allocated descriptor count */ - enum dma_state_client ack; + struct module *owner = dma_chan_to_owner(chan); - /* Find a channel */ - list_for_each_entry(device, &dma_device_list, global_node) { - /* Does the client require a specific DMA controller? */ - if (client->slave && client->slave->dma_dev - && client->slave->dma_dev != device->dev) - continue; + while (chan->client_count < dmaengine_ref_count) { + __module_get(owner); + chan->client_count++; + } +} - list_for_each_entry(chan, &device->channels, device_node) { - if (!dma_chan_satisfies_mask(chan, client->cap_mask)) - continue; +/** + * dma_chan_get - try to grab a dma channel's parent driver module + * @chan - channel to grab + * + * Must be called under dma_list_mutex + */ +static int dma_chan_get(struct dma_chan *chan) +{ + int err = -ENODEV; + struct module *owner = dma_chan_to_owner(chan); + + if (chan->client_count) { + __module_get(owner); + err = 0; + } else if (try_module_get(owner)) + err = 0; + + if (err == 0) + chan->client_count++; + + /* allocate upon first client reference */ + if (chan->client_count == 1 && err == 0) { + int desc_cnt = chan->device->device_alloc_chan_resources(chan); + + if (desc_cnt < 0) { + err = desc_cnt; + chan->client_count = 0; + module_put(owner); + } else if (!dma_has_cap(DMA_PRIVATE, chan->device->cap_mask)) + balance_ref_count(chan); + } - desc = chan->device->device_alloc_chan_resources( - chan, client); - if (desc >= 0) { - ack = client->event_callback(client, - chan, - DMA_RESOURCE_AVAILABLE); + return err; +} - /* we are done once this client rejects - * an available resource - */ - if (ack == DMA_ACK) { - dma_chan_get(chan); - chan->client_count++; - } else if (ack == DMA_NAK) - return; - } - } - } +/** + * dma_chan_put - drop a reference to a dma channel's parent driver module + * @chan - channel to release + * + * Must be called under dma_list_mutex + */ +static void dma_chan_put(struct dma_chan *chan) +{ + if (!chan->client_count) + return; /* this channel failed alloc_chan_resources */ + chan->client_count--; + module_put(dma_chan_to_owner(chan)); + if (chan->client_count == 0) + chan->device->device_free_chan_resources(chan); } enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) @@ -218,138 +269,342 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) 
EXPORT_SYMBOL(dma_sync_wait); /** - * dma_chan_cleanup - release a DMA channel's resources - * @kref: kernel reference structure that contains the DMA channel device + * dma_cap_mask_all - enable iteration over all operation types + */ +static dma_cap_mask_t dma_cap_mask_all; + +/** + * dma_chan_tbl_ent - tracks channel allocations per core/operation + * @chan - associated channel for this entry + */ +struct dma_chan_tbl_ent { + struct dma_chan *chan; +}; + +/** + * channel_table - percpu lookup table for memory-to-memory offload providers */ -void dma_chan_cleanup(struct kref *kref) +static struct dma_chan_tbl_ent *channel_table[DMA_TX_TYPE_END]; + +static int __init dma_channel_table_init(void) { - struct dma_chan *chan = container_of(kref, struct dma_chan, refcount); - chan->device->device_free_chan_resources(chan); - kref_put(&chan->device->refcount, dma_async_device_cleanup); + enum dma_transaction_type cap; + int err = 0; + + bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); + + /* 'interrupt', 'private', and 'slave' are channel capabilities, + * but are not associated with an operation so they do not need + * an entry in the channel_table + */ + clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); + clear_bit(DMA_PRIVATE, dma_cap_mask_all.bits); + clear_bit(DMA_SLAVE, dma_cap_mask_all.bits); + + for_each_dma_cap_mask(cap, dma_cap_mask_all) { + channel_table[cap] = alloc_percpu(struct dma_chan_tbl_ent); + if (!channel_table[cap]) { + err = -ENOMEM; + break; + } + } + + if (err) { + pr_err("dmaengine: initialization failure\n"); + for_each_dma_cap_mask(cap, dma_cap_mask_all) + if (channel_table[cap]) + free_percpu(channel_table[cap]); + } + + return err; } -EXPORT_SYMBOL(dma_chan_cleanup); +arch_initcall(dma_channel_table_init); -static void dma_chan_free_rcu(struct rcu_head *rcu) +/** + * dma_find_channel - find a channel to carry out the operation + * @tx_type: transaction type + */ +struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type) { - struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu); - int bias = 0x7FFFFFFF; - int i; - for_each_possible_cpu(i) - bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount); - atomic_sub(bias, &chan->refcount.refcount); - kref_put(&chan->refcount, dma_chan_cleanup); + struct dma_chan *chan; + int cpu; + + WARN_ONCE(dmaengine_ref_count == 0, + "client called %s without a reference", __func__); + + cpu = get_cpu(); + chan = per_cpu_ptr(channel_table[tx_type], cpu)->chan; + put_cpu(); + + return chan; } +EXPORT_SYMBOL(dma_find_channel); -static void dma_chan_release(struct dma_chan *chan) +/** + * dma_issue_pending_all - flush all pending operations across all channels + */ +void dma_issue_pending_all(void) { - atomic_add(0x7FFFFFFF, &chan->refcount.refcount); - chan->slow_ref = 1; - call_rcu(&chan->rcu, dma_chan_free_rcu); + struct dma_device *device; + struct dma_chan *chan; + + WARN_ONCE(dmaengine_ref_count == 0, + "client called %s without a reference", __func__); + + rcu_read_lock(); + list_for_each_entry_rcu(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) + if (chan->client_count) + device->device_issue_pending(chan); + } + rcu_read_unlock(); } +EXPORT_SYMBOL(dma_issue_pending_all); /** - * dma_chans_notify_available - broadcast available channels to the clients + * nth_chan - returns the nth channel of the given capability + * @cap: capability to match + * @n: nth channel desired + * + * Defaults to 
returning the channel with the desired capability and the + * lowest reference count when 'n' cannot be satisfied. Must be called + * under dma_list_mutex. */ -static void dma_clients_notify_available(void) +static struct dma_chan *nth_chan(enum dma_transaction_type cap, int n) { - struct dma_client *client; + struct dma_device *device; + struct dma_chan *chan; + struct dma_chan *ret = NULL; + struct dma_chan *min = NULL; - mutex_lock(&dma_list_mutex); + list_for_each_entry(device, &dma_device_list, global_node) { + if (!dma_has_cap(cap, device->cap_mask) || + dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) { + if (!chan->client_count) + continue; + if (!min) + min = chan; + else if (chan->table_count < min->table_count) + min = chan; + + if (n-- == 0) { + ret = chan; + break; /* done */ + } + } + if (ret) + break; /* done */ + } - list_for_each_entry(client, &dma_client_list, global_node) - dma_client_chan_alloc(client); + if (!ret) + ret = min; - mutex_unlock(&dma_list_mutex); + if (ret) + ret->table_count++; + + return ret; } /** - * dma_chans_notify_available - tell the clients that a channel is going away - * @chan: channel on its way out + * dma_channel_rebalance - redistribute the available channels + * + * Optimize for cpu isolation (each cpu gets a dedicated channel for an + * operation type) in the SMP case, and operation isolation (avoid + * multi-tasking channels) in the non-SMP case. Must be called under + * dma_list_mutex. */ -static void dma_clients_notify_removed(struct dma_chan *chan) +static void dma_channel_rebalance(void) { - struct dma_client *client; - enum dma_state_client ack; + struct dma_chan *chan; + struct dma_device *device; + int cpu; + int cap; + int n; - mutex_lock(&dma_list_mutex); + /* undo the last distribution */ + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_possible_cpu(cpu) + per_cpu_ptr(channel_table[cap], cpu)->chan = NULL; + + list_for_each_entry(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) + chan->table_count = 0; + } - list_for_each_entry(client, &dma_client_list, global_node) { - ack = client->event_callback(client, chan, - DMA_RESOURCE_REMOVED); + /* don't populate the channel_table if no clients are available */ + if (!dmaengine_ref_count) + return; - /* client was holding resources for this channel so - * free it - */ - if (ack == DMA_ACK) { - dma_chan_put(chan); - chan->client_count--; + /* redistribute available channels */ + n = 0; + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_online_cpu(cpu) { + if (num_possible_cpus() > 1) + chan = nth_chan(cap, n++); + else + chan = nth_chan(cap, -1); + + per_cpu_ptr(channel_table[cap], cpu)->chan = chan; + } +} + +static struct dma_chan *private_candidate(dma_cap_mask_t *mask, struct dma_device *dev, + dma_filter_fn fn, void *fn_param) +{ + struct dma_chan *chan; + + if (!__dma_device_satisfies_mask(dev, mask)) { + pr_debug("%s: wrong capabilities\n", __func__); + return NULL; + } + /* devices with multiple channels need special handling as we need to + * ensure that all channels are either private or public. 
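+	 * A non-zero ->client_count on any channel means dmaengine_get()
+	 * already handed that channel to the general-purpose pool, so the
+	 * whole device is skipped for private allocation.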
+ */ + if (dev->chancnt > 1 && !dma_has_cap(DMA_PRIVATE, dev->cap_mask)) + list_for_each_entry(chan, &dev->channels, device_node) { + /* some channels are already publicly allocated */ + if (chan->client_count) + return NULL; } + + list_for_each_entry(chan, &dev->channels, device_node) { + if (chan->client_count) { + pr_debug("%s: %s busy\n", + __func__, dma_chan_name(chan)); + continue; + } + if (fn && !fn(chan, fn_param)) { + pr_debug("%s: %s filter said false\n", + __func__, dma_chan_name(chan)); + continue; + } + return chan; } - mutex_unlock(&dma_list_mutex); + return NULL; } /** - * dma_async_client_register - register a &dma_client - * @client: ptr to a client structure with valid 'event_callback' and 'cap_mask' + * dma_request_channel - try to allocate an exclusive channel + * @mask: capabilities that the channel must satisfy + * @fn: optional callback to disposition available channels + * @fn_param: opaque parameter to pass to dma_filter_fn */ -void dma_async_client_register(struct dma_client *client) +struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param) { - /* validate client data */ - BUG_ON(dma_has_cap(DMA_SLAVE, client->cap_mask) && - !client->slave); + struct dma_device *device, *_d; + struct dma_chan *chan = NULL; + int err; + /* Find a channel */ + mutex_lock(&dma_list_mutex); + list_for_each_entry_safe(device, _d, &dma_device_list, global_node) { + chan = private_candidate(mask, device, fn, fn_param); + if (chan) { + /* Found a suitable channel, try to grab, prep, and + * return it. We first set DMA_PRIVATE to disable + * balance_ref_count as this channel will not be + * published in the general-purpose allocator + */ + dma_cap_set(DMA_PRIVATE, device->cap_mask); + err = dma_chan_get(chan); + + if (err == -ENODEV) { + pr_debug("%s: %s module removed\n", __func__, + dma_chan_name(chan)); + list_del_rcu(&device->global_node); + } else if (err) + pr_err("dmaengine: failed to get %s: (%d)\n", + dma_chan_name(chan), err); + else + break; + chan = NULL; + } + } + mutex_unlock(&dma_list_mutex); + + pr_debug("%s: %s (%s)\n", __func__, chan ? "success" : "fail", + chan ? 
dma_chan_name(chan) : NULL); + + return chan; +} +EXPORT_SYMBOL_GPL(__dma_request_channel); + +void dma_release_channel(struct dma_chan *chan) +{ mutex_lock(&dma_list_mutex); - list_add_tail(&client->global_node, &dma_client_list); + WARN_ONCE(chan->client_count != 1, + "chan reference count %d != 1\n", chan->client_count); + dma_chan_put(chan); mutex_unlock(&dma_list_mutex); } -EXPORT_SYMBOL(dma_async_client_register); +EXPORT_SYMBOL_GPL(dma_release_channel); /** - * dma_async_client_unregister - unregister a client and free the &dma_client - * @client: &dma_client to free - * - * Force frees any allocated DMA channels, frees the &dma_client memory + * dmaengine_get - register interest in dma_channels */ -void dma_async_client_unregister(struct dma_client *client) +void dmaengine_get(void) { - struct dma_device *device; + struct dma_device *device, *_d; struct dma_chan *chan; - enum dma_state_client ack; - - if (!client) - return; + int err; mutex_lock(&dma_list_mutex); - /* free all channels the client is holding */ - list_for_each_entry(device, &dma_device_list, global_node) - list_for_each_entry(chan, &device->channels, device_node) { - ack = client->event_callback(client, chan, - DMA_RESOURCE_REMOVED); + dmaengine_ref_count++; - if (ack == DMA_ACK) { - dma_chan_put(chan); - chan->client_count--; - } + /* try to grab channels */ + list_for_each_entry_safe(device, _d, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) { + err = dma_chan_get(chan); + if (err == -ENODEV) { + /* module removed before we could use it */ + list_del_rcu(&device->global_node); + break; + } else if (err) + pr_err("dmaengine: failed to get %s: (%d)\n", + dma_chan_name(chan), err); } + } - list_del(&client->global_node); + /* if this is the first reference and there were channels + * waiting we need to rebalance to get those channels + * incorporated into the channel table + */ + if (dmaengine_ref_count == 1) + dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); } -EXPORT_SYMBOL(dma_async_client_unregister); +EXPORT_SYMBOL(dmaengine_get); /** - * dma_async_client_chan_request - send all available channels to the - * client that satisfy the capability mask - * @client - requester + * dmaengine_put - let dma drivers be removed when ref_count == 0 */ -void dma_async_client_chan_request(struct dma_client *client) +void dmaengine_put(void) { + struct dma_device *device; + struct dma_chan *chan; + mutex_lock(&dma_list_mutex); - dma_client_chan_alloc(client); + dmaengine_ref_count--; + BUG_ON(dmaengine_ref_count < 0); + /* drop channel references */ + list_for_each_entry(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) + dma_chan_put(chan); + } mutex_unlock(&dma_list_mutex); } -EXPORT_SYMBOL(dma_async_client_chan_request); +EXPORT_SYMBOL(dmaengine_put); /** * dma_async_device_register - registers DMA devices found @@ -357,9 +612,9 @@ EXPORT_SYMBOL(dma_async_client_chan_request); */ int dma_async_device_register(struct dma_device *device) { - static int id; int chancnt = 0, rc; struct dma_chan* chan; + atomic_t *idr_ref; if (!device) return -ENODEV; @@ -386,57 +641,83 @@ int dma_async_device_register(struct dma_device *device) BUG_ON(!device->device_issue_pending); BUG_ON(!device->dev); - init_completion(&device->done); - kref_init(&device->refcount); - + idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); 
+ if (!idr_ref) + return -ENOMEM; + atomic_set(idr_ref, 0); + idr_retry: + if (!idr_pre_get(&dma_idr, GFP_KERNEL)) + return -ENOMEM; mutex_lock(&dma_list_mutex); - device->dev_id = id++; + rc = idr_get_new(&dma_idr, NULL, &device->dev_id); mutex_unlock(&dma_list_mutex); + if (rc == -EAGAIN) + goto idr_retry; + else if (rc != 0) + return rc; /* represent channels in sysfs. Probably want devs too */ list_for_each_entry(chan, &device->channels, device_node) { chan->local = alloc_percpu(typeof(*chan->local)); if (chan->local == NULL) continue; + chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL); + if (chan->dev == NULL) { + free_percpu(chan->local); + continue; + } chan->chan_id = chancnt++; - chan->dev.class = &dma_devclass; - chan->dev.parent = device->dev; - dev_set_name(&chan->dev, "dma%dchan%d", + chan->dev->device.class = &dma_devclass; + chan->dev->device.parent = device->dev; + chan->dev->chan = chan; + chan->dev->idr_ref = idr_ref; + chan->dev->dev_id = device->dev_id; + atomic_inc(idr_ref); + dev_set_name(&chan->dev->device, "dma%dchan%d", device->dev_id, chan->chan_id); - rc = device_register(&chan->dev); + rc = device_register(&chan->dev->device); if (rc) { - chancnt--; free_percpu(chan->local); chan->local = NULL; goto err_out; } - - /* One for the channel, one of the class device */ - kref_get(&device->refcount); - kref_get(&device->refcount); - kref_init(&chan->refcount); chan->client_count = 0; - chan->slow_ref = 0; - INIT_RCU_HEAD(&chan->rcu); } + device->chancnt = chancnt; mutex_lock(&dma_list_mutex); - list_add_tail(&device->global_node, &dma_device_list); + /* take references on public channels */ + if (dmaengine_ref_count && !dma_has_cap(DMA_PRIVATE, device->cap_mask)) + list_for_each_entry(chan, &device->channels, device_node) { + /* if clients are already waiting for channels we need + * to take references on their behalf + */ + if (dma_chan_get(chan) == -ENODEV) { + /* note we can only get here for the first + * channel as the remaining channels are + * guaranteed to get a reference + */ + rc = -ENODEV; + mutex_unlock(&dma_list_mutex); + goto err_out; + } + } + list_add_tail_rcu(&device->global_node, &dma_device_list); + dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); - dma_clients_notify_available(); - return 0; err_out: list_for_each_entry(chan, &device->channels, device_node) { if (chan->local == NULL) continue; - kref_put(&device->refcount, dma_async_device_cleanup); - device_unregister(&chan->dev); - chancnt--; + mutex_lock(&dma_list_mutex); + chan->dev->chan = NULL; + mutex_unlock(&dma_list_mutex); + device_unregister(&chan->dev->device); free_percpu(chan->local); } return rc; @@ -444,37 +725,30 @@ err_out: EXPORT_SYMBOL(dma_async_device_register); /** - * dma_async_device_cleanup - function called when all references are released - * @kref: kernel reference object - */ -static void dma_async_device_cleanup(struct kref *kref) -{ - struct dma_device *device; - - device = container_of(kref, struct dma_device, refcount); - complete(&device->done); -} - -/** - * dma_async_device_unregister - unregisters DMA devices + * dma_async_device_unregister - unregister a DMA device * @device: &dma_device + * + * This routine is called by dma driver exit routines, dmaengine holds module + * references to prevent it being called while channels are in use. 
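+ *
+ * A minimal sketch of a driver exit path, assuming a hypothetical
+ * struct my_adma that embeds the registered struct dma_device:
+ *
+ *	static int my_adma_remove(struct platform_device *pdev)
+ *	{
+ *		struct my_adma *adma = platform_get_drvdata(pdev);
+ *
+ *		dma_async_device_unregister(&adma->dma);
+ *		dma_free_coherent(&pdev->dev, adma->pool_size,
+ *				  adma->desc_pool_virt, adma->desc_pool);
+ *		return 0;
+ *	}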
*/ void dma_async_device_unregister(struct dma_device *device) { struct dma_chan *chan; mutex_lock(&dma_list_mutex); - list_del(&device->global_node); + list_del_rcu(&device->global_node); + dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); list_for_each_entry(chan, &device->channels, device_node) { - dma_clients_notify_removed(chan); - device_unregister(&chan->dev); - dma_chan_release(chan); + WARN_ONCE(chan->client_count, + "%s called while %d clients hold a reference\n", + __func__, chan->client_count); + mutex_lock(&dma_list_mutex); + chan->dev->chan = NULL; + mutex_unlock(&dma_list_mutex); + device_unregister(&chan->dev->device); } - - kref_put(&device->refcount, dma_async_device_cleanup); - wait_for_completion(&device->done); } EXPORT_SYMBOL(dma_async_device_unregister); @@ -626,10 +900,96 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, } EXPORT_SYMBOL(dma_async_tx_descriptor_init); +/* dma_wait_for_async_tx - spin wait for a transaction to complete + * @tx: in-flight transaction to wait on + * + * This routine assumes that tx was obtained from a call to async_memcpy, + * async_xor, async_memset, etc which ensures that tx is "in-flight" (prepped + * and submitted). Walking the parent chain is only meant to cover for DMA + * drivers that do not implement the DMA_INTERRUPT capability and may race with + * the driver's descriptor cleanup routine. + */ +enum dma_status +dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) +{ + enum dma_status status; + struct dma_async_tx_descriptor *iter; + struct dma_async_tx_descriptor *parent; + + if (!tx) + return DMA_SUCCESS; + + WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for" + " %s\n", __func__, dma_chan_name(tx->chan)); + + /* poll through the dependency chain, return when tx is complete */ + do { + iter = tx; + + /* find the root of the unsubmitted dependency chain */ + do { + parent = iter->parent; + if (!parent) + break; + else + iter = parent; + } while (parent); + + /* there is a small window for ->parent == NULL and + * ->cookie == -EBUSY + */ + while (iter->cookie == -EBUSY) + cpu_relax(); + + status = dma_sync_wait(iter->chan, iter->cookie); + } while (status == DMA_IN_PROGRESS || (iter != tx)); + + return status; +} +EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); + +/* dma_run_dependencies - helper routine for dma drivers to process + * (start) dependent operations on their target channel + * @tx: transaction with dependencies + */ +void dma_run_dependencies(struct dma_async_tx_descriptor *tx) +{ + struct dma_async_tx_descriptor *dep = tx->next; + struct dma_async_tx_descriptor *dep_next; + struct dma_chan *chan; + + if (!dep) + return; + + chan = dep->chan; + + /* keep submitting up until a channel switch is detected + * in that case we will be called again as a result of + * processing the interrupt from async_tx_channel_switch + */ + for (; dep; dep = dep_next) { + spin_lock_bh(&dep->lock); + dep->parent = NULL; + dep_next = dep->next; + if (dep_next && dep_next->chan == chan) + dep->next = NULL; /* ->next will be submitted */ + else + dep_next = NULL; /* submit current dep and terminate */ + spin_unlock_bh(&dep->lock); + + dep->tx_submit(dep); + } + + chan->device->device_issue_pending(chan); +} +EXPORT_SYMBOL_GPL(dma_run_dependencies); + static int __init dma_bus_init(void) { + idr_init(&dma_idr); mutex_init(&dma_list_mutex); return class_register(&dma_devclass); } -subsys_initcall(dma_bus_init); +arch_initcall(dma_bus_init); + diff --git a/drivers/dma/dmatest.c 
b/drivers/dma/dmatest.c index ed9636bfb54..3603f1ea5b2 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -35,7 +35,7 @@ MODULE_PARM_DESC(threads_per_chan, static unsigned int max_channels; module_param(max_channels, uint, S_IRUGO); -MODULE_PARM_DESC(nr_channels, +MODULE_PARM_DESC(max_channels, "Maximum number of channels to use (default: all)"); /* @@ -71,7 +71,7 @@ struct dmatest_chan { /* * These are protected by dma_list_mutex since they're only used by - * the DMA client event callback + * the DMA filter function callback */ static LIST_HEAD(dmatest_channels); static unsigned int nr_channels; @@ -80,7 +80,7 @@ static bool dmatest_match_channel(struct dma_chan *chan) { if (test_channel[0] == '\0') return true; - return strcmp(dev_name(&chan->dev), test_channel) == 0; + return strcmp(dma_chan_name(chan), test_channel) == 0; } static bool dmatest_match_device(struct dma_device *device) @@ -215,7 +215,6 @@ static int dmatest_func(void *data) smp_rmb(); chan = thread->chan; - dma_chan_get(chan); while (!kthread_should_stop()) { total_tests++; @@ -293,7 +292,6 @@ static int dmatest_func(void *data) } ret = 0; - dma_chan_put(chan); kfree(thread->dstbuf); err_dstbuf: kfree(thread->srcbuf); @@ -319,21 +317,16 @@ static void dmatest_cleanup_channel(struct dmatest_chan *dtc) kfree(dtc); } -static enum dma_state_client dmatest_add_channel(struct dma_chan *chan) +static int dmatest_add_channel(struct dma_chan *chan) { struct dmatest_chan *dtc; struct dmatest_thread *thread; unsigned int i; - /* Have we already been told about this channel? */ - list_for_each_entry(dtc, &dmatest_channels, node) - if (dtc->chan == chan) - return DMA_DUP; - dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL); if (!dtc) { - pr_warning("dmatest: No memory for %s\n", dev_name(&chan->dev)); - return DMA_NAK; + pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan)); + return -ENOMEM; } dtc->chan = chan; @@ -343,16 +336,16 @@ static enum dma_state_client dmatest_add_channel(struct dma_chan *chan) thread = kzalloc(sizeof(struct dmatest_thread), GFP_KERNEL); if (!thread) { pr_warning("dmatest: No memory for %s-test%u\n", - dev_name(&chan->dev), i); + dma_chan_name(chan), i); break; } thread->chan = dtc->chan; smp_wmb(); thread->task = kthread_run(dmatest_func, thread, "%s-test%u", - dev_name(&chan->dev), i); + dma_chan_name(chan), i); if (IS_ERR(thread->task)) { pr_warning("dmatest: Failed to run thread %s-test%u\n", - dev_name(&chan->dev), i); + dma_chan_name(chan), i); kfree(thread); break; } @@ -362,86 +355,62 @@ static enum dma_state_client dmatest_add_channel(struct dma_chan *chan) list_add_tail(&thread->node, &dtc->threads); } - pr_info("dmatest: Started %u threads using %s\n", i, dev_name(&chan->dev)); + pr_info("dmatest: Started %u threads using %s\n", i, dma_chan_name(chan)); list_add_tail(&dtc->node, &dmatest_channels); nr_channels++; - return DMA_ACK; -} - -static enum dma_state_client dmatest_remove_channel(struct dma_chan *chan) -{ - struct dmatest_chan *dtc, *_dtc; - - list_for_each_entry_safe(dtc, _dtc, &dmatest_channels, node) { - if (dtc->chan == chan) { - list_del(&dtc->node); - dmatest_cleanup_channel(dtc); - pr_debug("dmatest: lost channel %s\n", - dev_name(&chan->dev)); - return DMA_ACK; - } - } - - return DMA_DUP; + return 0; } -/* - * Start testing threads as new channels are assigned to us, and kill - * them when the channels go away. 
- * - * When we unregister the client, all channels are removed so this - * will also take care of cleaning things up when the module is - * unloaded. - */ -static enum dma_state_client -dmatest_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state) +static bool filter(struct dma_chan *chan, void *param) { - enum dma_state_client ack = DMA_NAK; - - switch (state) { - case DMA_RESOURCE_AVAILABLE: - if (!dmatest_match_channel(chan) - || !dmatest_match_device(chan->device)) - ack = DMA_DUP; - else if (max_channels && nr_channels >= max_channels) - ack = DMA_NAK; - else - ack = dmatest_add_channel(chan); - break; - - case DMA_RESOURCE_REMOVED: - ack = dmatest_remove_channel(chan); - break; - - default: - pr_info("dmatest: Unhandled event %u (%s)\n", - state, dev_name(&chan->dev)); - break; - } - - return ack; + if (!dmatest_match_channel(chan) || !dmatest_match_device(chan->device)) + return false; + else + return true; } -static struct dma_client dmatest_client = { - .event_callback = dmatest_event, -}; - static int __init dmatest_init(void) { - dma_cap_set(DMA_MEMCPY, dmatest_client.cap_mask); - dma_async_client_register(&dmatest_client); - dma_async_client_chan_request(&dmatest_client); + dma_cap_mask_t mask; + struct dma_chan *chan; + int err = 0; + + dma_cap_zero(mask); + dma_cap_set(DMA_MEMCPY, mask); + for (;;) { + chan = dma_request_channel(mask, filter, NULL); + if (chan) { + err = dmatest_add_channel(chan); + if (err == 0) + continue; + else { + dma_release_channel(chan); + break; /* add_channel failed, punt */ + } + } else + break; /* no more channels available */ + if (max_channels && nr_channels >= max_channels) + break; /* we have all we need */ + } - return 0; + return err; } -module_init(dmatest_init); +/* when compiled-in wait for drivers to load first */ +late_initcall(dmatest_init); static void __exit dmatest_exit(void) { - dma_async_client_unregister(&dmatest_client); + struct dmatest_chan *dtc, *_dtc; + + list_for_each_entry_safe(dtc, _dtc, &dmatest_channels, node) { + list_del(&dtc->node); + dmatest_cleanup_channel(dtc); + pr_debug("dmatest: dropped channel %s\n", + dma_chan_name(dtc->chan)); + dma_release_channel(dtc->chan); + } } module_exit(dmatest_exit); diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index 0778d99aea7..6b702cc46b3 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -70,6 +70,15 @@ * the controller, though. 
*/ +static struct device *chan2dev(struct dma_chan *chan) +{ + return &chan->dev->device; +} +static struct device *chan2parent(struct dma_chan *chan) +{ + return chan->dev->device.parent; +} + static struct dw_desc *dwc_first_active(struct dw_dma_chan *dwc) { return list_entry(dwc->active_list.next, struct dw_desc, desc_node); @@ -93,12 +102,12 @@ static struct dw_desc *dwc_desc_get(struct dw_dma_chan *dwc) ret = desc; break; } - dev_dbg(&dwc->chan.dev, "desc %p not ACKed\n", desc); + dev_dbg(chan2dev(&dwc->chan), "desc %p not ACKed\n", desc); i++; } spin_unlock_bh(&dwc->lock); - dev_vdbg(&dwc->chan.dev, "scanned %u descriptors on freelist\n", i); + dev_vdbg(chan2dev(&dwc->chan), "scanned %u descriptors on freelist\n", i); return ret; } @@ -108,10 +117,10 @@ static void dwc_sync_desc_for_cpu(struct dw_dma_chan *dwc, struct dw_desc *desc) struct dw_desc *child; list_for_each_entry(child, &desc->txd.tx_list, desc_node) - dma_sync_single_for_cpu(dwc->chan.dev.parent, + dma_sync_single_for_cpu(chan2parent(&dwc->chan), child->txd.phys, sizeof(child->lli), DMA_TO_DEVICE); - dma_sync_single_for_cpu(dwc->chan.dev.parent, + dma_sync_single_for_cpu(chan2parent(&dwc->chan), desc->txd.phys, sizeof(desc->lli), DMA_TO_DEVICE); } @@ -129,11 +138,11 @@ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc) spin_lock_bh(&dwc->lock); list_for_each_entry(child, &desc->txd.tx_list, desc_node) - dev_vdbg(&dwc->chan.dev, + dev_vdbg(chan2dev(&dwc->chan), "moving child desc %p to freelist\n", child); list_splice_init(&desc->txd.tx_list, &dwc->free_list); - dev_vdbg(&dwc->chan.dev, "moving desc %p to freelist\n", desc); + dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc); list_add(&desc->desc_node, &dwc->free_list); spin_unlock_bh(&dwc->lock); } @@ -163,9 +172,9 @@ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first) /* ASSERT: channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), "BUG: Attempted to start non-idle channel\n"); - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), " SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n", channel_readl(dwc, SAR), channel_readl(dwc, DAR), @@ -193,7 +202,7 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc) void *param; struct dma_async_tx_descriptor *txd = &desc->txd; - dev_vdbg(&dwc->chan.dev, "descriptor %u complete\n", txd->cookie); + dev_vdbg(chan2dev(&dwc->chan), "descriptor %u complete\n", txd->cookie); dwc->completed = txd->cookie; callback = txd->callback; @@ -208,11 +217,11 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc) * mapped before they were submitted... */ if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) - dma_unmap_page(dwc->chan.dev.parent, desc->lli.dar, desc->len, - DMA_FROM_DEVICE); + dma_unmap_page(chan2parent(&dwc->chan), desc->lli.dar, + desc->len, DMA_FROM_DEVICE); if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) - dma_unmap_page(dwc->chan.dev.parent, desc->lli.sar, desc->len, - DMA_TO_DEVICE); + dma_unmap_page(chan2parent(&dwc->chan), desc->lli.sar, + desc->len, DMA_TO_DEVICE); /* * The API requires that no submissions are done from a @@ -228,7 +237,7 @@ static void dwc_complete_all(struct dw_dma *dw, struct dw_dma_chan *dwc) LIST_HEAD(list); if (dma_readl(dw, CH_EN) & dwc->mask) { - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), "BUG: XFER bit set, but channel not idle!\n"); /* Try to continue after resetting the channel... 
*/ @@ -273,7 +282,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc) return; } - dev_vdbg(&dwc->chan.dev, "scan_descriptors: llp=0x%x\n", llp); + dev_vdbg(chan2dev(&dwc->chan), "scan_descriptors: llp=0x%x\n", llp); list_for_each_entry_safe(desc, _desc, &dwc->active_list, desc_node) { if (desc->lli.llp == llp) @@ -292,7 +301,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc) dwc_descriptor_complete(dwc, desc); } - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), "BUG: All descriptors done, but channel not idle!\n"); /* Try to continue after resetting the channel... */ @@ -308,7 +317,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc) static void dwc_dump_lli(struct dw_dma_chan *dwc, struct dw_lli *lli) { - dev_printk(KERN_CRIT, &dwc->chan.dev, + dev_printk(KERN_CRIT, chan2dev(&dwc->chan), " desc: s0x%x d0x%x l0x%x c0x%x:%x\n", lli->sar, lli->dar, lli->llp, lli->ctlhi, lli->ctllo); @@ -342,9 +351,9 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc) * controller flagged an error instead of scribbling over * random memory locations. */ - dev_printk(KERN_CRIT, &dwc->chan.dev, + dev_printk(KERN_CRIT, chan2dev(&dwc->chan), "Bad descriptor submitted for DMA!\n"); - dev_printk(KERN_CRIT, &dwc->chan.dev, + dev_printk(KERN_CRIT, chan2dev(&dwc->chan), " cookie: %d\n", bad_desc->txd.cookie); dwc_dump_lli(dwc, &bad_desc->lli); list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) @@ -442,12 +451,12 @@ static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx) * for DMA. But this is hard to do in a race-free manner. */ if (list_empty(&dwc->active_list)) { - dev_vdbg(&tx->chan->dev, "tx_submit: started %u\n", + dev_vdbg(chan2dev(tx->chan), "tx_submit: started %u\n", desc->txd.cookie); dwc_dostart(dwc, desc); list_add_tail(&desc->desc_node, &dwc->active_list); } else { - dev_vdbg(&tx->chan->dev, "tx_submit: queued %u\n", + dev_vdbg(chan2dev(tx->chan), "tx_submit: queued %u\n", desc->txd.cookie); list_add_tail(&desc->desc_node, &dwc->queue); @@ -472,11 +481,11 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, unsigned int dst_width; u32 ctllo; - dev_vdbg(&chan->dev, "prep_dma_memcpy d0x%x s0x%x l0x%zx f0x%lx\n", + dev_vdbg(chan2dev(chan), "prep_dma_memcpy d0x%x s0x%x l0x%zx f0x%lx\n", dest, src, len, flags); if (unlikely(!len)) { - dev_dbg(&chan->dev, "prep_dma_memcpy: length is zero!\n"); + dev_dbg(chan2dev(chan), "prep_dma_memcpy: length is zero!\n"); return NULL; } @@ -516,7 +525,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, first = desc; } else { prev->lli.llp = desc->txd.phys; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); list_add_tail(&desc->desc_node, @@ -531,7 +540,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, prev->lli.ctllo |= DWC_CTLL_INT_EN; prev->lli.llp = 0; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -562,15 +571,15 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, struct scatterlist *sg; size_t total_len = 0; - dev_vdbg(&chan->dev, "prep_dma_slave\n"); + dev_vdbg(chan2dev(chan), "prep_dma_slave\n"); if (unlikely(!dws || !sg_len)) return NULL; - reg_width = dws->slave.reg_width; + reg_width = dws->reg_width; prev = first = NULL; - 
sg_len = dma_map_sg(chan->dev.parent, sgl, sg_len, direction); + sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction); switch (direction) { case DMA_TO_DEVICE: @@ -579,7 +588,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, | DWC_CTLL_DST_FIX | DWC_CTLL_SRC_INC | DWC_CTLL_FC_M2P); - reg = dws->slave.tx_reg; + reg = dws->tx_reg; for_each_sg(sgl, sg, sg_len, i) { struct dw_desc *desc; u32 len; @@ -587,7 +596,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, desc = dwc_desc_get(dwc); if (!desc) { - dev_err(&chan->dev, + dev_err(chan2dev(chan), "not enough descriptors available\n"); goto err_desc_get; } @@ -607,7 +616,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, first = desc; } else { prev->lli.llp = desc->txd.phys; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -625,7 +634,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, | DWC_CTLL_SRC_FIX | DWC_CTLL_FC_P2M); - reg = dws->slave.rx_reg; + reg = dws->rx_reg; for_each_sg(sgl, sg, sg_len, i) { struct dw_desc *desc; u32 len; @@ -633,7 +642,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, desc = dwc_desc_get(dwc); if (!desc) { - dev_err(&chan->dev, + dev_err(chan2dev(chan), "not enough descriptors available\n"); goto err_desc_get; } @@ -653,7 +662,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, first = desc; } else { prev->lli.llp = desc->txd.phys; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -673,7 +682,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, prev->lli.ctllo |= DWC_CTLL_INT_EN; prev->lli.llp = 0; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -758,29 +767,21 @@ static void dwc_issue_pending(struct dma_chan *chan) spin_unlock_bh(&dwc->lock); } -static int dwc_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int dwc_alloc_chan_resources(struct dma_chan *chan) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); struct dw_dma *dw = to_dw_dma(chan->device); struct dw_desc *desc; - struct dma_slave *slave; struct dw_dma_slave *dws; int i; u32 cfghi; u32 cfglo; - dev_vdbg(&chan->dev, "alloc_chan_resources\n"); - - /* Channels doing slave DMA can only handle one client. */ - if (dwc->dws || client->slave) { - if (chan->client_count) - return -EBUSY; - } + dev_vdbg(chan2dev(chan), "alloc_chan_resources\n"); /* ASSERT: channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { - dev_dbg(&chan->dev, "DMA channel not idle?\n"); + dev_dbg(chan2dev(chan), "DMA channel not idle?\n"); return -EIO; } @@ -789,23 +790,17 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, cfghi = DWC_CFGH_FIFO_MODE; cfglo = 0; - slave = client->slave; - if (slave) { + dws = dwc->dws; + if (dws) { /* * We need controller-specific data to set up slave * transfers. 
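		 * When no dw_dma_slave configuration is attached to the
		 * channel, the reset defaults assigned above
		 * (DWC_CFGH_FIFO_MODE, 0) are written to CFG_HI/CFG_LO
		 * unchanged.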
*/ - BUG_ON(!slave->dma_dev || slave->dma_dev != dw->dma.dev); - - dws = container_of(slave, struct dw_dma_slave, slave); + BUG_ON(!dws->dma_dev || dws->dma_dev != dw->dma.dev); - dwc->dws = dws; cfghi = dws->cfg_hi; cfglo = dws->cfg_lo; - } else { - dwc->dws = NULL; } - channel_writel(dwc, CFG_LO, cfglo); channel_writel(dwc, CFG_HI, cfghi); @@ -822,7 +817,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, desc = kzalloc(sizeof(struct dw_desc), GFP_KERNEL); if (!desc) { - dev_info(&chan->dev, + dev_info(chan2dev(chan), "only allocated %d descriptors\n", i); spin_lock_bh(&dwc->lock); break; @@ -832,7 +827,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, desc->txd.tx_submit = dwc_tx_submit; desc->txd.flags = DMA_CTRL_ACK; INIT_LIST_HEAD(&desc->txd.tx_list); - desc->txd.phys = dma_map_single(chan->dev.parent, &desc->lli, + desc->txd.phys = dma_map_single(chan2parent(chan), &desc->lli, sizeof(desc->lli), DMA_TO_DEVICE); dwc_desc_put(dwc, desc); @@ -847,7 +842,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, spin_unlock_bh(&dwc->lock); - dev_dbg(&chan->dev, + dev_dbg(chan2dev(chan), "alloc_chan_resources allocated %d descriptors\n", i); return i; @@ -860,7 +855,7 @@ static void dwc_free_chan_resources(struct dma_chan *chan) struct dw_desc *desc, *_desc; LIST_HEAD(list); - dev_dbg(&chan->dev, "free_chan_resources (descs allocated=%u)\n", + dev_dbg(chan2dev(chan), "free_chan_resources (descs allocated=%u)\n", dwc->descs_allocated); /* ASSERT: channel is idle */ @@ -881,13 +876,13 @@ static void dwc_free_chan_resources(struct dma_chan *chan) spin_unlock_bh(&dwc->lock); list_for_each_entry_safe(desc, _desc, &list, desc_node) { - dev_vdbg(&chan->dev, " freeing descriptor %p\n", desc); - dma_unmap_single(chan->dev.parent, desc->txd.phys, + dev_vdbg(chan2dev(chan), " freeing descriptor %p\n", desc); + dma_unmap_single(chan2parent(chan), desc->txd.phys, sizeof(desc->lli), DMA_TO_DEVICE); kfree(desc); } - dev_vdbg(&chan->dev, "free_chan_resources done\n"); + dev_vdbg(chan2dev(chan), "free_chan_resources done\n"); } /*----------------------------------------------------------------------*/ diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 0b95dcce447..ca70a21afc6 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -366,8 +366,7 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor( * * Return - The number of descriptors allocated. 
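 *
 * Allocation is triggered by dma_chan_get() taking the first client
 * reference on the channel rather than by an explicit dma_client.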
*/ -static int fsl_dma_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int fsl_dma_alloc_chan_resources(struct dma_chan *chan) { struct fsl_dma_chan *fsl_chan = to_fsl_chan(chan); @@ -823,7 +822,7 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev, */ WARN_ON(fdev->feature != new_fsl_chan->feature); - new_fsl_chan->dev = &new_fsl_chan->common.dev; + new_fsl_chan->dev = &new_fsl_chan->common.dev->device; new_fsl_chan->reg_base = ioremap(new_fsl_chan->reg.start, new_fsl_chan->reg.end - new_fsl_chan->reg.start + 1); diff --git a/drivers/dma/ioat.c b/drivers/dma/ioat.c index 9b16a3af9a0..4105d6575b6 100644 --- a/drivers/dma/ioat.c +++ b/drivers/dma/ioat.c @@ -75,60 +75,10 @@ static int ioat_dca_enabled = 1; module_param(ioat_dca_enabled, int, 0644); MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)"); -static int ioat_setup_functionality(struct pci_dev *pdev, void __iomem *iobase) -{ - struct ioat_device *device = pci_get_drvdata(pdev); - u8 version; - int err = 0; - - version = readb(iobase + IOAT_VER_OFFSET); - switch (version) { - case IOAT_VER_1_2: - device->dma = ioat_dma_probe(pdev, iobase); - if (device->dma && ioat_dca_enabled) - device->dca = ioat_dca_init(pdev, iobase); - break; - case IOAT_VER_2_0: - device->dma = ioat_dma_probe(pdev, iobase); - if (device->dma && ioat_dca_enabled) - device->dca = ioat2_dca_init(pdev, iobase); - break; - case IOAT_VER_3_0: - device->dma = ioat_dma_probe(pdev, iobase); - if (device->dma && ioat_dca_enabled) - device->dca = ioat3_dca_init(pdev, iobase); - break; - default: - err = -ENODEV; - break; - } - if (!device->dma) - err = -ENODEV; - return err; -} - -static void ioat_shutdown_functionality(struct pci_dev *pdev) -{ - struct ioat_device *device = pci_get_drvdata(pdev); - - dev_err(&pdev->dev, "Removing dma and dca services\n"); - if (device->dca) { - unregister_dca_provider(device->dca); - free_dca_provider(device->dca); - device->dca = NULL; - } - - if (device->dma) { - ioat_dma_remove(device->dma); - device->dma = NULL; - } -} - static struct pci_driver ioat_pci_driver = { .name = "ioatdma", .id_table = ioat_pci_tbl, .probe = ioat_probe, - .shutdown = ioat_shutdown_functionality, .remove = __devexit_p(ioat_remove), }; @@ -179,7 +129,29 @@ static int __devinit ioat_probe(struct pci_dev *pdev, pci_set_master(pdev); - err = ioat_setup_functionality(pdev, iobase); + switch (readb(iobase + IOAT_VER_OFFSET)) { + case IOAT_VER_1_2: + device->dma = ioat_dma_probe(pdev, iobase); + if (device->dma && ioat_dca_enabled) + device->dca = ioat_dca_init(pdev, iobase); + break; + case IOAT_VER_2_0: + device->dma = ioat_dma_probe(pdev, iobase); + if (device->dma && ioat_dca_enabled) + device->dca = ioat2_dca_init(pdev, iobase); + break; + case IOAT_VER_3_0: + device->dma = ioat_dma_probe(pdev, iobase); + if (device->dma && ioat_dca_enabled) + device->dca = ioat3_dca_init(pdev, iobase); + break; + default: + err = -ENODEV; + break; + } + if (!device->dma) + err = -ENODEV; + if (err) goto err_version; @@ -198,17 +170,21 @@ err_enable_device: return err; } -/* - * It is unsafe to remove this module: if removed while a requested - * dma is outstanding, esp. from tcp, it is possible to hang while - * waiting for something that will never finish. However, if you're - * feeling lucky, this usually works just fine. 
- */ static void __devexit ioat_remove(struct pci_dev *pdev) { struct ioat_device *device = pci_get_drvdata(pdev); - ioat_shutdown_functionality(pdev); + dev_err(&pdev->dev, "Removing dma and dca services\n"); + if (device->dca) { + unregister_dca_provider(device->dca); + free_dca_provider(device->dca); + device->dca = NULL; + } + + if (device->dma) { + ioat_dma_remove(device->dma); + device->dma = NULL; + } kfree(device); } diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index 6607fdd00b1..b3759c4b653 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -734,8 +734,7 @@ static void ioat2_dma_massage_chan_desc(struct ioat_dma_chan *ioat_chan) * ioat_dma_alloc_chan_resources - returns the number of allocated descriptors * @chan: the channel to be filled out */ -static int ioat_dma_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) { struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); struct ioat_desc_sw *desc; @@ -1341,12 +1340,11 @@ static void ioat_dma_start_null_desc(struct ioat_dma_chan *ioat_chan) */ #define IOAT_TEST_SIZE 2000 -DECLARE_COMPLETION(test_completion); static void ioat_dma_test_callback(void *dma_async_param) { - printk(KERN_ERR "ioatdma: ioat_dma_test_callback(%p)\n", - dma_async_param); - complete(&test_completion); + struct completion *cmp = dma_async_param; + + complete(cmp); } /** @@ -1363,6 +1361,7 @@ static int ioat_dma_self_test(struct ioatdma_device *device) dma_addr_t dma_dest, dma_src; dma_cookie_t cookie; int err = 0; + struct completion cmp; src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL); if (!src) @@ -1381,7 +1380,7 @@ static int ioat_dma_self_test(struct ioatdma_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (device->common.device_alloc_chan_resources(dma_chan, NULL) < 1) { + if (device->common.device_alloc_chan_resources(dma_chan) < 1) { dev_err(&device->pdev->dev, "selftest cannot allocate chan resource\n"); err = -ENODEV; @@ -1402,8 +1401,9 @@ static int ioat_dma_self_test(struct ioatdma_device *device) } async_tx_ack(tx); + init_completion(&cmp); tx->callback = ioat_dma_test_callback; - tx->callback_param = (void *)0x8086; + tx->callback_param = &cmp; cookie = tx->tx_submit(tx); if (cookie < 0) { dev_err(&device->pdev->dev, @@ -1413,7 +1413,7 @@ static int ioat_dma_self_test(struct ioatdma_device *device) } device->common.device_issue_pending(dma_chan); - wait_for_completion_timeout(&test_completion, msecs_to_jiffies(3000)); + wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); if (device->common.device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index 6be31726220..ea5440dd10d 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -24,7 +24,6 @@ #include <linux/init.h> #include <linux/module.h> -#include <linux/async_tx.h> #include <linux/delay.h> #include <linux/dma-mapping.h> #include <linux/spinlock.h> @@ -116,7 +115,7 @@ iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, } /* run dependent operations */ - async_tx_run_dependencies(&desc->async_tx); + dma_run_dependencies(&desc->async_tx); return cookie; } @@ -270,8 +269,6 @@ static void __iop_adma_slot_cleanup(struct iop_adma_chan *iop_chan) break; } - BUG_ON(!seen_current); - if (cookie > 0) { iop_chan->completed_cookie = cookie; pr_debug("\tcompleted cookie %d\n", cookie); @@ -471,8 +468,7 @@ static void 
iop_chan_start_null_xor(struct iop_adma_chan *iop_chan); * greater than 2x the number slots needed to satisfy a device->max_xor * request. * */ -static int iop_adma_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int iop_adma_alloc_chan_resources(struct dma_chan *chan) { char *hw_desc; int idx; @@ -866,7 +862,7 @@ static int __devinit iop_adma_memcpy_self_test(struct iop_adma_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (iop_adma_alloc_chan_resources(dma_chan, NULL) < 1) { + if (iop_adma_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } @@ -964,7 +960,7 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (iop_adma_alloc_chan_resources(dma_chan, NULL) < 1) { + if (iop_adma_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } @@ -1115,26 +1111,13 @@ static int __devexit iop_adma_remove(struct platform_device *dev) struct iop_adma_device *device = platform_get_drvdata(dev); struct dma_chan *chan, *_chan; struct iop_adma_chan *iop_chan; - int i; struct iop_adma_platform_data *plat_data = dev->dev.platform_data; dma_async_device_unregister(&device->common); - for (i = 0; i < 3; i++) { - unsigned int irq; - irq = platform_get_irq(dev, i); - free_irq(irq, device); - } - dma_free_coherent(&dev->dev, plat_data->pool_size, device->dma_desc_pool_virt, device->dma_desc_pool); - do { - struct resource *res; - res = platform_get_resource(dev, IORESOURCE_MEM, 0); - release_mem_region(res->start, res->end - res->start); - } while (0); - list_for_each_entry_safe(chan, _chan, &device->common.channels, device_node) { iop_chan = to_iop_adma_chan(chan); @@ -1255,7 +1238,6 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) spin_lock_init(&iop_chan->lock); INIT_LIST_HEAD(&iop_chan->chain); INIT_LIST_HEAD(&iop_chan->all_slots); - INIT_RCU_HEAD(&iop_chan->common.rcu); iop_chan->common.device = dma_dev; list_add_tail(&iop_chan->common.device_node, &dma_dev->channels); @@ -1431,16 +1413,12 @@ static int __init iop_adma_init (void) return platform_driver_register(&iop_adma_driver); } -/* it's currently unsafe to unload this module */ -#if 0 static void __exit iop_adma_exit (void) { platform_driver_unregister(&iop_adma_driver); return; } module_exit(iop_adma_exit); -#endif - module_init(iop_adma_init); MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index bcda1742641..d35cbd1ff0b 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -18,7 +18,6 @@ #include <linux/init.h> #include <linux/module.h> -#include <linux/async_tx.h> #include <linux/delay.h> #include <linux/dma-mapping.h> #include <linux/spinlock.h> @@ -340,7 +339,7 @@ mv_xor_run_tx_complete_actions(struct mv_xor_desc_slot *desc, } /* run dependent operations */ - async_tx_run_dependencies(&desc->async_tx); + dma_run_dependencies(&desc->async_tx); return cookie; } @@ -607,8 +606,7 @@ submit_done: } /* returns the number of allocated descriptors */ -static int mv_xor_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int mv_xor_alloc_chan_resources(struct dma_chan *chan) { char *hw_desc; int idx; @@ -958,7 +956,7 @@ static int __devinit mv_xor_memcpy_self_test(struct mv_xor_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (mv_xor_alloc_chan_resources(dma_chan, NULL) < 1) { + if 
(mv_xor_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } @@ -1053,7 +1051,7 @@ mv_xor_xor_self_test(struct mv_xor_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (mv_xor_alloc_chan_resources(dma_chan, NULL) < 1) { + if (mv_xor_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } @@ -1221,7 +1219,6 @@ static int __devinit mv_xor_probe(struct platform_device *pdev) INIT_LIST_HEAD(&mv_chan->chain); INIT_LIST_HEAD(&mv_chan->completed_slots); INIT_LIST_HEAD(&mv_chan->all_slots); - INIT_RCU_HEAD(&mv_chan->common.rcu); mv_chan->common.device = dma_dev; list_add_tail(&mv_chan->common.device_node, &dma_dev->channels); diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index 2f9e941968d..d8f295bdad7 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -18,12 +18,6 @@ #include <linux/dmi.h> #include <acpi/acpi_bus.h> -#include <acpi/acnames.h> -#include <acpi/acnamesp.h> -#include <acpi/acparser.h> -#include <acpi/acexcep.h> -#include <acpi/acmacros.h> -#include <acpi/actypes.h> #define REGS_PER_GTF 7 struct taskfile_array { diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index e7fb7d2fcbf..a4a1ae21463 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -63,6 +63,12 @@ config LEDS_WRAP help This option enables support for the PCEngines WRAP programmable LEDs. +config LEDS_ALIX2 + tristate "LED Support for ALIX.2 and ALIX.3 series" + depends on LEDS_CLASS && X86 && EXPERIMENTAL + help + This option enables support for the PCEngines ALIX.2 and ALIX.3 LEDs. + config LEDS_H1940 tristate "LED Support for iPAQ H1940 device" depends on LEDS_CLASS && ARCH_H1940 @@ -77,7 +83,7 @@ config LEDS_COBALT_QUBE config LEDS_COBALT_RAQ bool "LED Support for the Cobalt Raq series" - depends on LEDS_CLASS && MIPS_COBALT + depends on LEDS_CLASS=y && MIPS_COBALT select LEDS_TRIGGERS help This option enables support for the Cobalt Raq series LEDs. @@ -158,6 +164,13 @@ config LEDS_PCA955X LED driver chips accessed via the I2C bus. Supported devices include PCA9550, PCA9551, PCA9552, and PCA9553. +config LEDS_WM8350 + tristate "LED Support for WM8350 AudioPlus PMIC" + depends on LEDS_CLASS && MFD_WM8350 + help + This option enables support for LEDs driven by the Wolfson + Microelectronics WM8350 AudioPlus PMIC. 
+ config LEDS_DA903X tristate "LED Support for DA9030/DA9034 PMIC" depends on LEDS_CLASS && PMIC_DA903X diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index e1967a29850..bc247cb02e8 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_LEDS_S3C24XX) += leds-s3c24xx.o obj-$(CONFIG_LEDS_AMS_DELTA) += leds-ams-delta.o obj-$(CONFIG_LEDS_NET48XX) += leds-net48xx.o obj-$(CONFIG_LEDS_WRAP) += leds-wrap.o +obj-$(CONFIG_LEDS_ALIX2) += leds-alix2.o obj-$(CONFIG_LEDS_H1940) += leds-h1940.o obj-$(CONFIG_LEDS_COBALT_QUBE) += leds-cobalt-qube.o obj-$(CONFIG_LEDS_COBALT_RAQ) += leds-cobalt-raq.o @@ -23,6 +24,7 @@ obj-$(CONFIG_LEDS_FSG) += leds-fsg.o obj-$(CONFIG_LEDS_PCA955X) += leds-pca955x.o obj-$(CONFIG_LEDS_DA903X) += leds-da903x.o obj-$(CONFIG_LEDS_HP_DISK) += leds-hp-disk.o +obj-$(CONFIG_LEDS_WM8350) += leds-wm8350.o # LED Triggers obj-$(CONFIG_LEDS_TRIGGER_TIMER) += ledtrig-timer.o diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 6c4a326176d..52f82e3ea13 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -91,9 +91,29 @@ void led_classdev_resume(struct led_classdev *led_cdev) } EXPORT_SYMBOL_GPL(led_classdev_resume); +static int led_suspend(struct device *dev, pm_message_t state) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + + if (led_cdev->flags & LED_CORE_SUSPENDRESUME) + led_classdev_suspend(led_cdev); + + return 0; +} + +static int led_resume(struct device *dev) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + + if (led_cdev->flags & LED_CORE_SUSPENDRESUME) + led_classdev_resume(led_cdev); + + return 0; +} + /** * led_classdev_register - register a new object of led_classdev class. - * @dev: The device to register. + * @parent: The device to register. * @led_cdev: the led_classdev structure for this device. 
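 *
 * A minimal sketch of a caller that relies on the class-level
 * suspend/resume handling added above, assuming a hypothetical
 * my_led_set() brightness handler:
 *
 *	static struct led_classdev my_led = {
 *		.name		= "my:green:status",
 *		.brightness_set	= my_led_set,
 *		.flags		= LED_CORE_SUSPENDRESUME,
 *	};
 *
 *	retval = led_classdev_register(&pdev->dev, &my_led);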
*/ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev) @@ -174,6 +194,8 @@ static int __init leds_init(void) leds_class = class_create(THIS_MODULE, "leds"); if (IS_ERR(leds_class)) return PTR_ERR(leds_class); + leds_class->suspend = led_suspend; + leds_class->resume = led_resume; return 0; } diff --git a/drivers/leds/leds-alix2.c b/drivers/leds/leds-alix2.c new file mode 100644 index 00000000000..ddbd7730dfc --- /dev/null +++ b/drivers/leds/leds-alix2.c @@ -0,0 +1,181 @@ +/* + * LEDs driver for PCEngines ALIX.2 and ALIX.3 + * + * Copyright (C) 2008 Constantin Baranov <const@mimas.ru> + */ + +#include <linux/err.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/leds.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/string.h> + +static int force = 0; +module_param(force, bool, 0444); +MODULE_PARM_DESC(force, "Assume system has ALIX.2 style LEDs"); + +struct alix_led { + struct led_classdev cdev; + unsigned short port; + unsigned int on_value; + unsigned int off_value; +}; + +static void alix_led_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct alix_led *led_dev = + container_of(led_cdev, struct alix_led, cdev); + + if (brightness) + outl(led_dev->on_value, led_dev->port); + else + outl(led_dev->off_value, led_dev->port); +} + +static struct alix_led alix_leds[] = { + { + .cdev = { + .name = "alix:1", + .brightness_set = alix_led_set, + }, + .port = 0x6100, + .on_value = 1 << 22, + .off_value = 1 << 6, + }, + { + .cdev = { + .name = "alix:2", + .brightness_set = alix_led_set, + }, + .port = 0x6180, + .on_value = 1 << 25, + .off_value = 1 << 9, + }, + { + .cdev = { + .name = "alix:3", + .brightness_set = alix_led_set, + }, + .port = 0x6180, + .on_value = 1 << 27, + .off_value = 1 << 11, + }, +}; + +static int __init alix_led_probe(struct platform_device *pdev) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(alix_leds); i++) { + alix_leds[i].cdev.flags |= LED_CORE_SUSPENDRESUME; + ret = led_classdev_register(&pdev->dev, &alix_leds[i].cdev); + if (ret < 0) + goto fail; + } + return 0; + +fail: + while (--i >= 0) + led_classdev_unregister(&alix_leds[i].cdev); + return ret; +} + +static int alix_led_remove(struct platform_device *pdev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(alix_leds); i++) + led_classdev_unregister(&alix_leds[i].cdev); + return 0; +} + +static struct platform_driver alix_led_driver = { + .remove = alix_led_remove, + .driver = { + .name = KBUILD_MODNAME, + .owner = THIS_MODULE, + }, +}; + +static int __init alix_present(void) +{ + const unsigned long bios_phys = 0x000f0000; + const size_t bios_len = 0x00010000; + const char alix_sig[] = "PC Engines ALIX."; + const size_t alix_sig_len = sizeof(alix_sig) - 1; + + const char *bios_virt; + const char *scan_end; + const char *p; + int ret = 0; + + if (force) { + printk(KERN_NOTICE "%s: forced to skip BIOS test, " + "assume system has ALIX.2 style LEDs\n", + KBUILD_MODNAME); + ret = 1; + goto out; + } + + bios_virt = phys_to_virt(bios_phys); + scan_end = bios_virt + bios_len - (alix_sig_len + 2); + for (p = bios_virt; p < scan_end; p++) { + const char *tail; + + if (memcmp(p, alix_sig, alix_sig_len) != 0) { + continue; + } + + tail = p + alix_sig_len; + if ((tail[0] == '2' || tail[0] == '3') && tail[1] == '\0') { + printk(KERN_INFO + "%s: system is recognized as \"%s\"\n", + KBUILD_MODNAME, p); + ret = 1; + break; + } + } + +out: + return ret; +} + +static struct platform_device *pdev; + +static int __init 
alix_led_init(void) +{ + int ret; + + if (!alix_present()) { + ret = -ENODEV; + goto out; + } + + pdev = platform_device_register_simple(KBUILD_MODNAME, -1, NULL, 0); + if (!IS_ERR(pdev)) { + ret = platform_driver_probe(&alix_led_driver, alix_led_probe); + if (ret) + platform_device_unregister(pdev); + } else + ret = PTR_ERR(pdev); + +out: + return ret; +} + +static void __exit alix_led_exit(void) +{ + platform_device_unregister(pdev); + platform_driver_unregister(&alix_led_driver); +} + +module_init(alix_led_init); +module_exit(alix_led_exit); + +MODULE_AUTHOR("Constantin Baranov <const@mimas.ru>"); +MODULE_DESCRIPTION("PCEngines ALIX.2 and ALIX.3 LED driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/leds/leds-ams-delta.c b/drivers/leds/leds-ams-delta.c index 1bd590bb3a6..446050759b4 100644 --- a/drivers/leds/leds-ams-delta.c +++ b/drivers/leds/leds-ams-delta.c @@ -79,37 +79,12 @@ static struct ams_delta_led ams_delta_leds[] = { }, }; -#ifdef CONFIG_PM -static int ams_delta_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) - led_classdev_suspend(&ams_delta_leds[i].cdev); - - return 0; -} - -static int ams_delta_led_resume(struct platform_device *dev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) - led_classdev_resume(&ams_delta_leds[i].cdev); - - return 0; -} -#else -#define ams_delta_led_suspend NULL -#define ams_delta_led_resume NULL -#endif - static int ams_delta_led_probe(struct platform_device *pdev) { int i, ret; for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) { + ams_delta_leds[i].cdev.flags |= LED_CORE_SUSPENDRESUME; ret = led_classdev_register(&pdev->dev, &ams_delta_leds[i].cdev); if (ret < 0) @@ -127,7 +102,7 @@ static int ams_delta_led_remove(struct platform_device *pdev) { int i; - for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i--) + for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) led_classdev_unregister(&ams_delta_leds[i].cdev); return 0; @@ -136,8 +111,6 @@ static int ams_delta_led_remove(struct platform_device *pdev) static struct platform_driver ams_delta_led_driver = { .probe = ams_delta_led_probe, .remove = ams_delta_led_remove, - .suspend = ams_delta_led_suspend, - .resume = ams_delta_led_resume, .driver = { .name = "ams-delta-led", .owner = THIS_MODULE, @@ -151,7 +124,7 @@ static int __init ams_delta_led_init(void) static void __exit ams_delta_led_exit(void) { - return platform_driver_unregister(&ams_delta_led_driver); + platform_driver_unregister(&ams_delta_led_driver); } module_init(ams_delta_led_init); diff --git a/drivers/leds/leds-clevo-mail.c b/drivers/leds/leds-clevo-mail.c index eb3415e88f4..1813c84ea5f 100644 --- a/drivers/leds/leds-clevo-mail.c +++ b/drivers/leds/leds-clevo-mail.c @@ -142,6 +142,7 @@ static struct led_classdev clevo_mail_led = { .name = "clevo::mail", .brightness_set = clevo_mail_led_set, .blink_set = clevo_mail_led_blink, + .flags = LED_CORE_SUSPENDRESUME, }; static int __init clevo_mail_led_probe(struct platform_device *pdev) @@ -155,29 +156,9 @@ static int clevo_mail_led_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM -static int clevo_mail_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - led_classdev_suspend(&clevo_mail_led); - return 0; -} - -static int clevo_mail_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&clevo_mail_led); - return 0; -} -#else -#define clevo_mail_led_suspend NULL -#define clevo_mail_led_resume NULL -#endif - static struct platform_driver clevo_mail_led_driver 
= { .probe = clevo_mail_led_probe, .remove = clevo_mail_led_remove, - .suspend = clevo_mail_led_suspend, - .resume = clevo_mail_led_resume, .driver = { .name = KBUILD_MODNAME, .owner = THIS_MODULE, diff --git a/drivers/leds/leds-fsg.c b/drivers/leds/leds-fsg.c index 34935155c1c..5f7c9c5c09b 100644 --- a/drivers/leds/leds-fsg.c +++ b/drivers/leds/leds-fsg.c @@ -99,64 +99,43 @@ static void fsg_led_ring_set(struct led_classdev *led_cdev, } - static struct led_classdev fsg_wlan_led = { .name = "fsg:blue:wlan", .brightness_set = fsg_led_wlan_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_wan_led = { .name = "fsg:blue:wan", .brightness_set = fsg_led_wan_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_sata_led = { .name = "fsg:blue:sata", .brightness_set = fsg_led_sata_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_usb_led = { .name = "fsg:blue:usb", .brightness_set = fsg_led_usb_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_sync_led = { .name = "fsg:blue:sync", .brightness_set = fsg_led_sync_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_ring_led = { .name = "fsg:blue:ring", .brightness_set = fsg_led_ring_set, + .flags = LED_CORE_SUSPENDRESUME, }; - -#ifdef CONFIG_PM -static int fsg_led_suspend(struct platform_device *dev, pm_message_t state) -{ - led_classdev_suspend(&fsg_wlan_led); - led_classdev_suspend(&fsg_wan_led); - led_classdev_suspend(&fsg_sata_led); - led_classdev_suspend(&fsg_usb_led); - led_classdev_suspend(&fsg_sync_led); - led_classdev_suspend(&fsg_ring_led); - return 0; -} - -static int fsg_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&fsg_wlan_led); - led_classdev_resume(&fsg_wan_led); - led_classdev_resume(&fsg_sata_led); - led_classdev_resume(&fsg_usb_led); - led_classdev_resume(&fsg_sync_led); - led_classdev_resume(&fsg_ring_led); - return 0; -} -#endif - - static int fsg_led_probe(struct platform_device *pdev) { int ret; @@ -232,10 +211,6 @@ static int fsg_led_remove(struct platform_device *pdev) static struct platform_driver fsg_led_driver = { .probe = fsg_led_probe, .remove = fsg_led_remove, -#ifdef CONFIG_PM - .suspend = fsg_led_suspend, - .resume = fsg_led_resume, -#endif .driver = { .name = "fsg-led", }, diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index b13bd2950e9..2e3df08b649 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -105,6 +105,7 @@ static int gpio_led_probe(struct platform_device *pdev) } led_dat->cdev.brightness_set = gpio_led_set; led_dat->cdev.brightness = LED_OFF; + led_dat->cdev.flags |= LED_CORE_SUSPENDRESUME; gpio_direction_output(led_dat->gpio, led_dat->active_low); @@ -154,44 +155,9 @@ static int __devexit gpio_led_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM -static int gpio_led_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct gpio_led_platform_data *pdata = pdev->dev.platform_data; - struct gpio_led_data *leds_data; - int i; - - leds_data = platform_get_drvdata(pdev); - - for (i = 0; i < pdata->num_leds; i++) - led_classdev_suspend(&leds_data[i].cdev); - - return 0; -} - -static int gpio_led_resume(struct platform_device *pdev) -{ - struct gpio_led_platform_data *pdata = pdev->dev.platform_data; - struct gpio_led_data *leds_data; - int i; - - leds_data = platform_get_drvdata(pdev); - - for (i = 0; i < pdata->num_leds; i++) - led_classdev_resume(&leds_data[i].cdev); - - return 0; -} -#else -#define 
gpio_led_suspend NULL -#define gpio_led_resume NULL -#endif - static struct platform_driver gpio_led_driver = { .probe = gpio_led_probe, .remove = __devexit_p(gpio_led_remove), - .suspend = gpio_led_suspend, - .resume = gpio_led_resume, .driver = { .name = "leds-gpio", .owner = THIS_MODULE, diff --git a/drivers/leds/leds-hp-disk.c b/drivers/leds/leds-hp-disk.c index 44fa757d825..d786adc8c5e 100644 --- a/drivers/leds/leds-hp-disk.c +++ b/drivers/leds/leds-hp-disk.c @@ -68,25 +68,9 @@ static struct led_classdev hpled_led = { .name = "hp:red:hddprotection", .default_trigger = "heartbeat", .brightness_set = hpled_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int hpled_suspend(struct acpi_device *dev, pm_message_t state) -{ - led_classdev_suspend(&hpled_led); - return 0; -} - -static int hpled_resume(struct acpi_device *dev) -{ - led_classdev_resume(&hpled_led); - return 0; -} -#else -#define hpled_suspend NULL -#define hpled_resume NULL -#endif - static int hpled_add(struct acpi_device *device) { int ret; @@ -121,8 +105,6 @@ static struct acpi_driver leds_hp_driver = { .ops = { .add = hpled_add, .remove = hpled_remove, - .suspend = hpled_suspend, - .resume = hpled_resume, } }; diff --git a/drivers/leds/leds-hp6xx.c b/drivers/leds/leds-hp6xx.c index e8fb1baf8a5..e4ce1fd4633 100644 --- a/drivers/leds/leds-hp6xx.c +++ b/drivers/leds/leds-hp6xx.c @@ -45,30 +45,16 @@ static struct led_classdev hp6xx_red_led = { .name = "hp6xx:red", .default_trigger = "hp6xx-charge", .brightness_set = hp6xxled_red_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev hp6xx_green_led = { .name = "hp6xx:green", .default_trigger = "ide-disk", .brightness_set = hp6xxled_green_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int hp6xxled_suspend(struct platform_device *dev, pm_message_t state) -{ - led_classdev_suspend(&hp6xx_red_led); - led_classdev_suspend(&hp6xx_green_led); - return 0; -} - -static int hp6xxled_resume(struct platform_device *dev) -{ - led_classdev_resume(&hp6xx_red_led); - led_classdev_resume(&hp6xx_green_led); - return 0; -} -#endif - static int hp6xxled_probe(struct platform_device *pdev) { int ret; @@ -98,10 +84,6 @@ MODULE_ALIAS("platform:hp6xx-led"); static struct platform_driver hp6xxled_driver = { .probe = hp6xxled_probe, .remove = hp6xxled_remove, -#ifdef CONFIG_PM - .suspend = hp6xxled_suspend, - .resume = hp6xxled_resume, -#endif .driver = { .name = "hp6xx-led", .owner = THIS_MODULE, diff --git a/drivers/leds/leds-net48xx.c b/drivers/leds/leds-net48xx.c index 054360473c9..93987a12da4 100644 --- a/drivers/leds/leds-net48xx.c +++ b/drivers/leds/leds-net48xx.c @@ -33,26 +33,9 @@ static void net48xx_error_led_set(struct led_classdev *led_cdev, static struct led_classdev net48xx_error_led = { .name = "net48xx::error", .brightness_set = net48xx_error_led_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int net48xx_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - led_classdev_suspend(&net48xx_error_led); - return 0; -} - -static int net48xx_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&net48xx_error_led); - return 0; -} -#else -#define net48xx_led_suspend NULL -#define net48xx_led_resume NULL -#endif - static int net48xx_led_probe(struct platform_device *pdev) { return led_classdev_register(&pdev->dev, &net48xx_error_led); @@ -67,8 +50,6 @@ static int net48xx_led_remove(struct platform_device *pdev) static struct platform_driver net48xx_led_driver = { .probe = net48xx_led_probe, 
.remove = net48xx_led_remove, - .suspend = net48xx_led_suspend, - .resume = net48xx_led_resume, .driver = { .name = DRVNAME, .owner = THIS_MODULE, diff --git a/drivers/leds/leds-pca9532.c b/drivers/leds/leds-pca9532.c index 4064d4f6b33..76ec7498e2d 100644 --- a/drivers/leds/leds-pca9532.c +++ b/drivers/leds/leds-pca9532.c @@ -16,6 +16,7 @@ #include <linux/leds.h> #include <linux/input.h> #include <linux/mutex.h> +#include <linux/workqueue.h> #include <linux/leds-pca9532.h> static const unsigned short normal_i2c[] = { /*0x60,*/ I2C_CLIENT_END}; @@ -34,6 +35,7 @@ struct pca9532_data { struct pca9532_led leds[16]; struct mutex update_lock; struct input_dev *idev; + struct work_struct work; u8 pwm[2]; u8 psc[2]; }; @@ -63,7 +65,7 @@ static struct i2c_driver pca9532_driver = { * as a compromise we average one pwm to the values requested by all * leds that are not ON/OFF. * */ -static int pca9532_setpwm(struct i2c_client *client, int pwm, int blink, +static int pca9532_calcpwm(struct i2c_client *client, int pwm, int blink, enum led_brightness value) { int a = 0, b = 0, i = 0; @@ -84,11 +86,17 @@ static int pca9532_setpwm(struct i2c_client *client, int pwm, int blink, b = b/a; if (b > 0xFF) return -EINVAL; - mutex_lock(&data->update_lock); data->pwm[pwm] = b; + data->psc[pwm] = blink; + return 0; +} + +static int pca9532_setpwm(struct i2c_client *client, int pwm) +{ + struct pca9532_data *data = i2c_get_clientdata(client); + mutex_lock(&data->update_lock); i2c_smbus_write_byte_data(client, PCA9532_REG_PWM(pwm), data->pwm[pwm]); - data->psc[pwm] = blink; i2c_smbus_write_byte_data(client, PCA9532_REG_PSC(pwm), data->psc[pwm]); mutex_unlock(&data->update_lock); @@ -124,11 +132,11 @@ static void pca9532_set_brightness(struct led_classdev *led_cdev, led->state = PCA9532_ON; else { led->state = PCA9532_PWM0; /* Thecus: hardcode one pwm */ - err = pca9532_setpwm(led->client, 0, 0, value); + err = pca9532_calcpwm(led->client, 0, 0, value); if (err) return; /* XXX: led api doesn't allow error code? 
*/ } - pca9532_setled(led); + schedule_work(&led->work); } static int pca9532_set_blink(struct led_classdev *led_cdev, @@ -137,6 +145,7 @@ static int pca9532_set_blink(struct led_classdev *led_cdev, struct pca9532_led *led = ldev_to_led(led_cdev); struct i2c_client *client = led->client; int psc; + int err = 0; if (*delay_on == 0 && *delay_off == 0) { /* led subsystem ask us for a blink rate */ @@ -148,11 +157,15 @@ static int pca9532_set_blink(struct led_classdev *led_cdev, /* Thecus specific: only use PSC/PWM 0 */ psc = (*delay_on * 152-1)/1000; - return pca9532_setpwm(client, 0, psc, led_cdev->brightness); + err = pca9532_calcpwm(client, 0, psc, led_cdev->brightness); + if (err) + return err; + schedule_work(&led->work); + return 0; } -int pca9532_event(struct input_dev *dev, unsigned int type, unsigned int code, - int value) +static int pca9532_event(struct input_dev *dev, unsigned int type, + unsigned int code, int value) { struct pca9532_data *data = input_get_drvdata(dev); @@ -165,13 +178,28 @@ int pca9532_event(struct input_dev *dev, unsigned int type, unsigned int code, else data->pwm[1] = 0; - dev_info(&dev->dev, "setting beep to %d \n", data->pwm[1]); + schedule_work(&data->work); + + return 0; +} + +static void pca9532_input_work(struct work_struct *work) +{ + struct pca9532_data *data; + data = container_of(work, struct pca9532_data, work); mutex_lock(&data->update_lock); i2c_smbus_write_byte_data(data->client, PCA9532_REG_PWM(1), data->pwm[1]); mutex_unlock(&data->update_lock); +} - return 0; +static void pca9532_led_work(struct work_struct *work) +{ + struct pca9532_led *led; + led = container_of(work, struct pca9532_led, work); + if (led->state == PCA9532_PWM0) + pca9532_setpwm(led->client, 0); + pca9532_setled(led); } static int pca9532_configure(struct i2c_client *client, @@ -204,8 +232,9 @@ static int pca9532_configure(struct i2c_client *client, led->ldev.brightness = LED_OFF; led->ldev.brightness_set = pca9532_set_brightness; led->ldev.blink_set = pca9532_set_blink; - if (led_classdev_register(&client->dev, - &led->ldev) < 0) { + INIT_WORK(&led->work, pca9532_led_work); + err = led_classdev_register(&client->dev, &led->ldev); + if (err < 0) { dev_err(&client->dev, "couldn't register LED %s\n", led->name); @@ -233,9 +262,11 @@ static int pca9532_configure(struct i2c_client *client, BIT_MASK(SND_TONE); data->idev->event = pca9532_event; input_set_drvdata(data->idev, data); + INIT_WORK(&data->work, pca9532_input_work); err = input_register_device(data->idev); if (err) { input_free_device(data->idev); + cancel_work_sync(&data->work); data->idev = NULL; goto exit; } @@ -252,18 +283,19 @@ exit: break; case PCA9532_TYPE_LED: led_classdev_unregister(&data->leds[i].ldev); + cancel_work_sync(&data->leds[i].work); break; case PCA9532_TYPE_N2100_BEEP: if (data->idev != NULL) { input_unregister_device(data->idev); input_free_device(data->idev); + cancel_work_sync(&data->work); data->idev = NULL; } break; } return err; - } static int pca9532_probe(struct i2c_client *client, @@ -271,12 +303,16 @@ static int pca9532_probe(struct i2c_client *client, { struct pca9532_data *data = i2c_get_clientdata(client); struct pca9532_platform_data *pca9532_pdata = client->dev.platform_data; + int err; + + if (!pca9532_pdata) + return -EIO; if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA)) return -EIO; - data = kzalloc(sizeof(struct pca9532_data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -285,12 +321,13 @@ static int 
pca9532_probe(struct i2c_client *client, data->client = client; mutex_init(&data->update_lock); - if (pca9532_pdata == NULL) - return -EIO; - - pca9532_configure(client, data, pca9532_pdata); - return 0; + err = pca9532_configure(client, data, pca9532_pdata); + if (err) { + kfree(data); + i2c_set_clientdata(client, NULL); + } + return err; } static int pca9532_remove(struct i2c_client *client) @@ -303,11 +340,13 @@ static int pca9532_remove(struct i2c_client *client) break; case PCA9532_TYPE_LED: led_classdev_unregister(&data->leds[i].ldev); + cancel_work_sync(&data->leds[i].work); break; case PCA9532_TYPE_N2100_BEEP: if (data->idev != NULL) { input_unregister_device(data->idev); input_free_device(data->idev); + cancel_work_sync(&data->work); data->idev = NULL; } break; diff --git a/drivers/leds/leds-s3c24xx.c b/drivers/leds/leds-s3c24xx.c index 25a07f2643a..4d81131542a 100644 --- a/drivers/leds/leds-s3c24xx.c +++ b/drivers/leds/leds-s3c24xx.c @@ -82,6 +82,7 @@ static int s3c24xx_led_probe(struct platform_device *dev) led->cdev.brightness_set = s3c24xx_led_set; led->cdev.default_trigger = pdata->def_trigger; led->cdev.name = pdata->name; + led->cdev.flags |= LED_CORE_SUSPENDRESUME; led->pdata = pdata; @@ -111,33 +112,9 @@ static int s3c24xx_led_probe(struct platform_device *dev) return ret; } - -#ifdef CONFIG_PM -static int s3c24xx_led_suspend(struct platform_device *dev, pm_message_t state) -{ - struct s3c24xx_gpio_led *led = pdev_to_gpio(dev); - - led_classdev_suspend(&led->cdev); - return 0; -} - -static int s3c24xx_led_resume(struct platform_device *dev) -{ - struct s3c24xx_gpio_led *led = pdev_to_gpio(dev); - - led_classdev_resume(&led->cdev); - return 0; -} -#else -#define s3c24xx_led_suspend NULL -#define s3c24xx_led_resume NULL -#endif - static struct platform_driver s3c24xx_led_driver = { .probe = s3c24xx_led_probe, .remove = s3c24xx_led_remove, - .suspend = s3c24xx_led_suspend, - .resume = s3c24xx_led_resume, .driver = { .name = "s3c24xx_led", .owner = THIS_MODULE, diff --git a/drivers/leds/leds-wm8350.c b/drivers/leds/leds-wm8350.c new file mode 100644 index 00000000000..38c6bcb07e6 --- /dev/null +++ b/drivers/leds/leds-wm8350.c @@ -0,0 +1,311 @@ +/* + * LED driver for WM8350 driven LEDS. + * + * Copyright(C) 2007, 2008 Wolfson Microelectronics PLC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/leds.h> +#include <linux/err.h> +#include <linux/mfd/wm8350/pmic.h> +#include <linux/regulator/consumer.h> + +/* Microamps */ +static const int isink_cur[] = { + 4, + 5, + 6, + 7, + 8, + 10, + 11, + 14, + 16, + 19, + 23, + 27, + 32, + 39, + 46, + 54, + 65, + 77, + 92, + 109, + 130, + 154, + 183, + 218, + 259, + 308, + 367, + 436, + 518, + 616, + 733, + 872, + 1037, + 1233, + 1466, + 1744, + 2073, + 2466, + 2933, + 3487, + 4147, + 4932, + 5865, + 6975, + 8294, + 9864, + 11730, + 13949, + 16589, + 19728, + 23460, + 27899, + 33178, + 39455, + 46920, + 55798, + 66355, + 78910, + 93840, + 111596, + 132710, + 157820, + 187681, + 223191 +}; + +#define to_wm8350_led(led_cdev) \ + container_of(led_cdev, struct wm8350_led, cdev) + +static void wm8350_led_enable(struct wm8350_led *led) +{ + int ret; + + if (led->enabled) + return; + + ret = regulator_enable(led->isink); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to enable ISINK: %d\n", ret); + return; + } + + ret = regulator_enable(led->dcdc); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to enable DCDC: %d\n", ret); + regulator_disable(led->isink); + return; + } + + led->enabled = 1; +} + +static void wm8350_led_disable(struct wm8350_led *led) +{ + int ret; + + if (!led->enabled) + return; + + ret = regulator_disable(led->dcdc); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to disable DCDC: %d\n", ret); + return; + } + + ret = regulator_disable(led->isink); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to disable ISINK: %d\n", ret); + regulator_enable(led->dcdc); + return; + } + + led->enabled = 0; +} + +static void led_work(struct work_struct *work) +{ + struct wm8350_led *led = container_of(work, struct wm8350_led, work); + int ret; + int uA; + unsigned long flags; + + mutex_lock(&led->mutex); + + spin_lock_irqsave(&led->value_lock, flags); + + if (led->value == LED_OFF) { + spin_unlock_irqrestore(&led->value_lock, flags); + wm8350_led_disable(led); + goto out; + } + + /* This scales linearly into the index of valid current + * settings which results in a linear scaling of perceived + * brightness due to the non-linear current settings provided + * by the hardware. 
+ */ + uA = (led->max_uA_index * led->value) / LED_FULL; + spin_unlock_irqrestore(&led->value_lock, flags); + BUG_ON(uA >= ARRAY_SIZE(isink_cur)); + + ret = regulator_set_current_limit(led->isink, isink_cur[uA], + isink_cur[uA]); + if (ret != 0) + dev_err(led->cdev.dev, "Failed to set %duA: %d\n", + isink_cur[uA], ret); + + wm8350_led_enable(led); + +out: + mutex_unlock(&led->mutex); +} + +static void wm8350_led_set(struct led_classdev *led_cdev, + enum led_brightness value) +{ + struct wm8350_led *led = to_wm8350_led(led_cdev); + unsigned long flags; + + spin_lock_irqsave(&led->value_lock, flags); + led->value = value; + schedule_work(&led->work); + spin_unlock_irqrestore(&led->value_lock, flags); +} + +static void wm8350_led_shutdown(struct platform_device *pdev) +{ + struct wm8350_led *led = platform_get_drvdata(pdev); + + mutex_lock(&led->mutex); + led->value = LED_OFF; + wm8350_led_disable(led); + mutex_unlock(&led->mutex); +} + +static int wm8350_led_probe(struct platform_device *pdev) +{ + struct regulator *isink, *dcdc; + struct wm8350_led *led; + struct wm8350_led_platform_data *pdata = pdev->dev.platform_data; + int ret, i; + + if (pdata == NULL) { + dev_err(&pdev->dev, "no platform data\n"); + return -ENODEV; + } + + if (pdata->max_uA < isink_cur[0]) { + dev_err(&pdev->dev, "Invalid maximum current %duA\n", + pdata->max_uA); + return -EINVAL; + } + + isink = regulator_get(&pdev->dev, "led_isink"); + if (IS_ERR(isink)) { + printk(KERN_ERR "%s: cant get ISINK\n", __func__); + return PTR_ERR(isink); + } + + dcdc = regulator_get(&pdev->dev, "led_vcc"); + if (IS_ERR(dcdc)) { + printk(KERN_ERR "%s: cant get DCDC\n", __func__); + ret = PTR_ERR(dcdc); + goto err_isink; + } + + led = kzalloc(sizeof(*led), GFP_KERNEL); + if (led == NULL) { + ret = -ENOMEM; + goto err_dcdc; + } + + led->cdev.brightness_set = wm8350_led_set; + led->cdev.default_trigger = pdata->default_trigger; + led->cdev.name = pdata->name; + led->cdev.flags |= LED_CORE_SUSPENDRESUME; + led->enabled = regulator_is_enabled(isink); + led->isink = isink; + led->dcdc = dcdc; + + for (i = 0; i < ARRAY_SIZE(isink_cur) - 1; i++) + if (isink_cur[i] >= pdata->max_uA) + break; + led->max_uA_index = i; + if (pdata->max_uA != isink_cur[i]) + dev_warn(&pdev->dev, + "Maximum current %duA is not directly supported," + " check platform data\n", + pdata->max_uA); + + spin_lock_init(&led->value_lock); + mutex_init(&led->mutex); + INIT_WORK(&led->work, led_work); + led->value = LED_OFF; + platform_set_drvdata(pdev, led); + + ret = led_classdev_register(&pdev->dev, &led->cdev); + if (ret < 0) + goto err_led; + + return 0; + + err_led: + kfree(led); + err_dcdc: + regulator_put(dcdc); + err_isink: + regulator_put(isink); + return ret; +} + +static int wm8350_led_remove(struct platform_device *pdev) +{ + struct wm8350_led *led = platform_get_drvdata(pdev); + + led_classdev_unregister(&led->cdev); + flush_scheduled_work(); + wm8350_led_disable(led); + regulator_put(led->dcdc); + regulator_put(led->isink); + kfree(led); + return 0; +} + +static struct platform_driver wm8350_led_driver = { + .driver = { + .name = "wm8350-led", + .owner = THIS_MODULE, + }, + .probe = wm8350_led_probe, + .remove = wm8350_led_remove, + .shutdown = wm8350_led_shutdown, +}; + +static int __devinit wm8350_led_init(void) +{ + return platform_driver_register(&wm8350_led_driver); +} +module_init(wm8350_led_init); + +static void wm8350_led_exit(void) +{ + platform_driver_unregister(&wm8350_led_driver); +} +module_exit(wm8350_led_exit); + +MODULE_AUTHOR("Mark Brown"); 
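/*
 * A worked example, not part of the patch, of the value -> isink_cur[]
 * mapping that led_work() above performs.  It assumes the platform data
 * permits the whole current table, so max_uA_index == 63, and uses the
 * LED core's LED_FULL == 255 (the helper name below is hypothetical and
 * only mirrors the calculation already done in led_work()):
 *
 *   value = 255 -> index = (63 * 255) / 255 = 63 -> isink_cur[63] = 223191 uA
 *   value = 128 -> index = (63 * 128) / 255 = 31 -> isink_cur[31] =    872 uA
 *
 * Each table entry is roughly a 19% increase over the previous one, so a
 * linear step through the indices changes the sink current exponentially,
 * which is what yields an approximately linear change in perceived
 * brightness.
 */
#include <linux/leds.h>

static int wm8350_brightness_to_index(int max_uA_index,
				      enum led_brightness value)
{
	return (max_uA_index * value) / LED_FULL;
}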
+MODULE_DESCRIPTION("WM8350 LED driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:wm8350-led"); diff --git a/drivers/leds/leds-wrap.c b/drivers/leds/leds-wrap.c index 2f3aa87f2a1..2982c86ac4c 100644 --- a/drivers/leds/leds-wrap.c +++ b/drivers/leds/leds-wrap.c @@ -56,40 +56,21 @@ static struct led_classdev wrap_power_led = { .name = "wrap::power", .brightness_set = wrap_power_led_set, .default_trigger = "default-on", + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev wrap_error_led = { .name = "wrap::error", .brightness_set = wrap_error_led_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev wrap_extra_led = { .name = "wrap::extra", .brightness_set = wrap_extra_led_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int wrap_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - led_classdev_suspend(&wrap_power_led); - led_classdev_suspend(&wrap_error_led); - led_classdev_suspend(&wrap_extra_led); - return 0; -} - -static int wrap_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&wrap_power_led); - led_classdev_resume(&wrap_error_led); - led_classdev_resume(&wrap_extra_led); - return 0; -} -#else -#define wrap_led_suspend NULL -#define wrap_led_resume NULL -#endif - static int wrap_led_probe(struct platform_device *pdev) { int ret; @@ -127,8 +108,6 @@ static int wrap_led_remove(struct platform_device *pdev) static struct platform_driver wrap_led_driver = { .probe = wrap_led_probe, .remove = wrap_led_remove, - .suspend = wrap_led_suspend, - .resume = wrap_led_resume, .driver = { .name = DRVNAME, .owner = THIS_MODULE, diff --git a/drivers/leds/ledtrig-timer.c b/drivers/leds/ledtrig-timer.c index db681962d7b..3d6531396dd 100644 --- a/drivers/leds/ledtrig-timer.c +++ b/drivers/leds/ledtrig-timer.c @@ -199,6 +199,7 @@ err_out: static void timer_trig_deactivate(struct led_classdev *led_cdev) { struct timer_trig_data *timer_data = led_cdev->trigger_data; + unsigned long on = 0, off = 0; if (timer_data) { device_remove_file(led_cdev->dev, &dev_attr_delay_on); @@ -206,6 +207,10 @@ static void timer_trig_deactivate(struct led_classdev *led_cdev) del_timer_sync(&timer_data->timer); kfree(timer_data); } + + /* If there is hardware support for blinking, stop it */ + if (led_cdev->blink_set) + led_cdev->blink_set(led_cdev, &on, &off); } static struct led_trigger timer_led_trigger = { diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c index 3a273ccef3f..f92595c8f16 100644 --- a/drivers/mfd/wm8350-core.c +++ b/drivers/mfd/wm8350-core.c @@ -1453,6 +1453,9 @@ void wm8350_device_exit(struct wm8350 *wm8350) { int i; + for (i = 0; i < ARRAY_SIZE(wm8350->pmic.led); i++) + platform_device_unregister(wm8350->pmic.led[i].pdev); + for (i = 0; i < ARRAY_SIZE(wm8350->pmic.pdev); i++) platform_device_unregister(wm8350->pmic.pdev[i]); diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 3949a1c7345..419c378bd24 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -120,7 +120,7 @@ config TIFM_CORE cards are supported via 'MMC/SD Card support: TI Flash Media MMC/SD Interface support (MMC_TIFM_SD)'. - To compile this driver as a module, choose M here: the module will + To compile this driver as a module, choose M here: the module will be called tifm_core. config TIFM_7XX1 @@ -133,100 +133,9 @@ config TIFM_7XX1 To make actual use of the device, you will have to select some flash card format drivers, as outlined in the TIFM_CORE Help. 
- To compile this driver as a module, choose M here: the module will + To compile this driver as a module, choose M here: the module will be called tifm_7xx1. -config ACER_WMI - tristate "Acer WMI Laptop Extras (EXPERIMENTAL)" - depends on X86 - depends on EXPERIMENTAL - depends on ACPI - depends on LEDS_CLASS - depends on NEW_LEDS - depends on BACKLIGHT_CLASS_DEVICE - depends on SERIO_I8042 - depends on RFKILL - select ACPI_WMI - ---help--- - This is a driver for newer Acer (and Wistron) laptops. It adds - wireless radio and bluetooth control, and on some laptops, - exposes the mail LED and LCD backlight. - - For more information about this driver see - <file:Documentation/laptops/acer-wmi.txt> - - If you have an ACPI-WMI compatible Acer/ Wistron laptop, say Y or M - here. - -config ASUS_LAPTOP - tristate "Asus Laptop Extras (EXPERIMENTAL)" - depends on X86 - depends on ACPI - depends on EXPERIMENTAL && !ACPI_ASUS - depends on LEDS_CLASS - depends on NEW_LEDS - depends on BACKLIGHT_CLASS_DEVICE - ---help--- - This is the new Linux driver for Asus laptops. It may also support some - MEDION, JVC or VICTOR laptops. It makes all the extra buttons generate - standard ACPI events that go through /proc/acpi/events. It also adds - support for video output switching, LCD backlight control, Bluetooth and - Wlan control, and most importantly, allows you to blink those fancy LEDs. - - For more information and a userspace daemon for handling the extra - buttons see <http://acpi4asus.sf.net/>. - - If you have an ACPI-compatible ASUS laptop, say Y or M here. - -config FUJITSU_LAPTOP - tristate "Fujitsu Laptop Extras" - depends on X86 - depends on ACPI - depends on INPUT - depends on BACKLIGHT_CLASS_DEVICE - ---help--- - This is a driver for laptops built by Fujitsu: - - * P2xxx/P5xxx/S6xxx/S7xxx series Lifebooks - * Possibly other Fujitsu laptop models - * Tested with S6410 and S7020 - - It adds support for LCD brightness control and some hotkeys. - - If you have a Fujitsu laptop, say Y or M here. - -config FUJITSU_LAPTOP_DEBUG - bool "Verbose debug mode for Fujitsu Laptop Extras" - depends on FUJITSU_LAPTOP - default n - ---help--- - Enables extra debug output from the fujitsu extras driver, at the - expense of a slight increase in driver size. - - If you are not sure, say N here. - -config TC1100_WMI - tristate "HP Compaq TC1100 Tablet WMI Extras (EXPERIMENTAL)" - depends on X86 && !X86_64 - depends on EXPERIMENTAL - depends on ACPI - select ACPI_WMI - ---help--- - This is a driver for the WMI extensions (wireless and bluetooth power - control) of the HP Compaq TC1100 tablet. - -config HP_WMI - tristate "HP WMI extras" - depends on ACPI_WMI - depends on INPUT - depends on RFKILL - help - Say Y here if you want to support WMI-based hotkeys on HP laptops and - to read data from WMI such as docking or ambient light sensor state. - - To compile this driver as a module, choose M here: the module will - be called hp-wmi. - config ICS932S401 tristate "Integrated Circuits ICS932S401" depends on I2C && EXPERIMENTAL @@ -237,170 +146,6 @@ config ICS932S401 This driver can also be built as a module. If so, the module will be called ics932s401. -config MSI_LAPTOP - tristate "MSI Laptop Extras" - depends on X86 - depends on ACPI - depends on BACKLIGHT_CLASS_DEVICE - ---help--- - This is a driver for laptops built by MSI (MICRO-STAR - INTERNATIONAL): - - MSI MegaBook S270 (MS-1013) - Cytron/TCM/Medion/Tchibo MD96100/SAM2000 - - It adds support for Bluetooth, WLAN and LCD brightness control. 
- - More information about this driver is available at - <http://0pointer.de/lennart/tchibo.html>. - - If you have an MSI S270 laptop, say Y or M here. - -config PANASONIC_LAPTOP - tristate "Panasonic Laptop Extras" - depends on X86 && INPUT && ACPI - depends on BACKLIGHT_CLASS_DEVICE - ---help--- - This driver adds support for access to backlight control and hotkeys - on Panasonic Let's Note laptops. - - If you have a Panasonic Let's note laptop (such as the R1(N variant), - R2, R3, R5, T2, W2 and Y2 series), say Y. - -config COMPAL_LAPTOP - tristate "Compal Laptop Extras" - depends on X86 - depends on ACPI - depends on BACKLIGHT_CLASS_DEVICE - ---help--- - This is a driver for laptops built by Compal: - - Compal FL90/IFL90 - Compal FL91/IFL91 - Compal FL92/JFL92 - Compal FT00/IFT00 - - It adds support for Bluetooth, WLAN and LCD brightness control. - - If you have an Compal FL9x/IFL9x/FT00 laptop, say Y or M here. - -config SONY_LAPTOP - tristate "Sony Laptop Extras" - depends on X86 && ACPI - select BACKLIGHT_CLASS_DEVICE - depends on INPUT - ---help--- - This mini-driver drives the SNC and SPIC devices present in the ACPI - BIOS of the Sony Vaio laptops. - - It gives access to some extra laptop functionalities like Bluetooth, - screen brightness control, Fn keys and allows powering on/off some - devices. - - Read <file:Documentation/laptops/sony-laptop.txt> for more information. - -config SONYPI_COMPAT - bool "Sonypi compatibility" - depends on SONY_LAPTOP - ---help--- - Build the sonypi driver compatibility code into the sony-laptop driver. - -config THINKPAD_ACPI - tristate "ThinkPad ACPI Laptop Extras" - depends on X86 && ACPI - select BACKLIGHT_LCD_SUPPORT - select BACKLIGHT_CLASS_DEVICE - select HWMON - select NVRAM - select INPUT - select NEW_LEDS - select LEDS_CLASS - select NET - select RFKILL - ---help--- - This is a driver for the IBM and Lenovo ThinkPad laptops. It adds - support for Fn-Fx key combinations, Bluetooth control, video - output switching, ThinkLight control, UltraBay eject and more. - For more information about this driver see - <file:Documentation/laptops/thinkpad-acpi.txt> and - <http://ibm-acpi.sf.net/> . - - This driver was formerly known as ibm-acpi. - - If you have an IBM or Lenovo ThinkPad laptop, say Y or M here. - -config THINKPAD_ACPI_DEBUG - bool "Verbose debug mode" - depends on THINKPAD_ACPI - default n - ---help--- - Enables extra debugging information, at the expense of a slightly - increase in driver size. - - If you are not sure, say N here. - -config THINKPAD_ACPI_DOCK - bool "Legacy Docking Station Support" - depends on THINKPAD_ACPI - depends on ACPI_DOCK=n - default n - ---help--- - Allows the thinkpad_acpi driver to handle docking station events. - This support was made obsolete by the generic ACPI docking station - support (CONFIG_ACPI_DOCK). It will allow locking and removing the - laptop from the docking station, but will not properly connect PCI - devices. - - If you are not sure, say N here. - -config THINKPAD_ACPI_BAY - bool "Legacy Removable Bay Support" - depends on THINKPAD_ACPI - default y - ---help--- - Allows the thinkpad_acpi driver to handle removable bays. It will - electrically disable the device in the bay, and also generate - notifications when the bay lever is ejected or inserted. - - If you are not sure, say Y here. 
- -config THINKPAD_ACPI_VIDEO - bool "Video output control support" - depends on THINKPAD_ACPI - default y - ---help--- - Allows the thinkpad_acpi driver to provide an interface to control - the various video output ports. - - This feature often won't work well, depending on ThinkPad model, - display state, video output devices in use, whether there is a X - server running, phase of the moon, and the current mood of - Schroedinger's cat. If you can use X.org's RandR to control - your ThinkPad's video output ports instead of this feature, - don't think twice: do it and say N here to save some memory. - - If you are not sure, say Y here. - -config THINKPAD_ACPI_HOTKEY_POLL - bool "Support NVRAM polling for hot keys" - depends on THINKPAD_ACPI - default y - ---help--- - Some thinkpad models benefit from NVRAM polling to detect a few of - the hot key press events. If you know your ThinkPad model does not - need to do NVRAM polling to support any of the hot keys you use, - unselecting this option will save about 1kB of memory. - - ThinkPads T40 and newer, R52 and newer, and X31 and newer are - unlikely to need NVRAM polling in their latest BIOS versions. - - NVRAM polling can detect at most the following keys: ThinkPad/Access - IBM, Zoom, Switch Display (fn+F7), ThinkLight, Volume up/down/mute, - Brightness up/down, Display Expand (fn+F8), Hibernate (fn+F12). - - If you are not sure, say Y here. The driver enables polling only if - it is strictly necessary to do so. - config ATMEL_SSC tristate "Device driver for Atmel SSC peripheral" depends on AVR32 || ARCH_AT91 @@ -413,31 +158,6 @@ config ATMEL_SSC If unsure, say N. -config INTEL_MENLOW - tristate "Thermal Management driver for Intel menlow platform" - depends on ACPI_THERMAL - select THERMAL - depends on X86 - ---help--- - ACPI thermal management enhancement driver on - Intel Menlow platform. - - If unsure, say N. - -config EEEPC_LAPTOP - tristate "Eee PC Hotkey Driver (EXPERIMENTAL)" - depends on X86 - depends on ACPI - depends on BACKLIGHT_CLASS_DEVICE - depends on HWMON - depends on EXPERIMENTAL - depends on RFKILL - ---help--- - This driver supports the Fn-Fx keys on Eee PC laptops. - It also adds the ability to switch camera/wlan on/off. - - If you have an Eee PC laptop, say Y or M here. - config ENCLOSURE_SERVICES tristate "Enclosure Services" default n diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 5de863a0e39..9cf8ae6e4b3 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -1,33 +1,20 @@ # # Makefile for misc devices that really don't fit anywhere else. 
# -obj- := misc.o # Dummy rule to force built-in.o to be made obj-$(CONFIG_IBM_ASM) += ibmasm/ obj-$(CONFIG_HDPU_FEATURES) += hdpuftrs/ -obj-$(CONFIG_ASUS_LAPTOP) += asus-laptop.o -obj-$(CONFIG_EEEPC_LAPTOP) += eeepc-laptop.o -obj-$(CONFIG_MSI_LAPTOP) += msi-laptop.o -obj-$(CONFIG_COMPAL_LAPTOP) += compal-laptop.o -obj-$(CONFIG_ACER_WMI) += acer-wmi.o obj-$(CONFIG_ATMEL_PWM) += atmel_pwm.o obj-$(CONFIG_ATMEL_SSC) += atmel-ssc.o obj-$(CONFIG_ATMEL_TCLIB) += atmel_tclib.o -obj-$(CONFIG_HP_WMI) += hp-wmi.o obj-$(CONFIG_ICS932S401) += ics932s401.o -obj-$(CONFIG_TC1100_WMI) += tc1100-wmi.o obj-$(CONFIG_LKDTM) += lkdtm.o obj-$(CONFIG_TIFM_CORE) += tifm_core.o obj-$(CONFIG_DELL_LAPTOP) += dell-laptop.o obj-$(CONFIG_TIFM_7XX1) += tifm_7xx1.o obj-$(CONFIG_PHANTOM) += phantom.o obj-$(CONFIG_SGI_IOC4) += ioc4.o -obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o -obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o -obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o -obj-$(CONFIG_PANASONIC_LAPTOP) += panasonic-laptop.o obj-$(CONFIG_EEPROM_93CX6) += eeprom_93cx6.o -obj-$(CONFIG_INTEL_MENLOW) += intel_menlow.o obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o obj-$(CONFIG_KGDB_TESTS) += kgdbts.o obj-$(CONFIG_SGI_XP) += sgi-xp/ diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 1e97916914a..76bfe16c09b 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -55,7 +55,6 @@ enum atmel_mci_state { struct atmel_mci_dma { #ifdef CONFIG_MMC_ATMELMCI_DMA - struct dma_client client; struct dma_chan *chan; struct dma_async_tx_descriptor *data_desc; #endif @@ -593,10 +592,8 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data) /* If we don't have a channel, we can't do DMA */ chan = host->dma.chan; - if (chan) { - dma_chan_get(chan); + if (chan) host->data_chan = chan; - } if (!chan) return -ENODEV; @@ -1443,60 +1440,6 @@ static irqreturn_t atmci_detect_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef CONFIG_MMC_ATMELMCI_DMA - -static inline struct atmel_mci * -dma_client_to_atmel_mci(struct dma_client *client) -{ - return container_of(client, struct atmel_mci, dma.client); -} - -static enum dma_state_client atmci_dma_event(struct dma_client *client, - struct dma_chan *chan, enum dma_state state) -{ - struct atmel_mci *host; - enum dma_state_client ret = DMA_NAK; - - host = dma_client_to_atmel_mci(client); - - switch (state) { - case DMA_RESOURCE_AVAILABLE: - spin_lock_bh(&host->lock); - if (!host->dma.chan) { - host->dma.chan = chan; - ret = DMA_ACK; - } - spin_unlock_bh(&host->lock); - - if (ret == DMA_ACK) - dev_info(&host->pdev->dev, - "Using %s for DMA transfers\n", - chan->dev.bus_id); - break; - - case DMA_RESOURCE_REMOVED: - spin_lock_bh(&host->lock); - if (host->dma.chan == chan) { - host->dma.chan = NULL; - ret = DMA_ACK; - } - spin_unlock_bh(&host->lock); - - if (ret == DMA_ACK) - dev_info(&host->pdev->dev, - "Lost %s, falling back to PIO\n", - chan->dev.bus_id); - break; - - default: - break; - } - - - return ret; -} -#endif /* CONFIG_MMC_ATMELMCI_DMA */ - static int __init atmci_init_slot(struct atmel_mci *host, struct mci_slot_pdata *slot_data, unsigned int id, u32 sdc_reg) @@ -1600,6 +1543,18 @@ static void __exit atmci_cleanup_slot(struct atmel_mci_slot *slot, mmc_free_host(slot->mmc); } +#ifdef CONFIG_MMC_ATMELMCI_DMA +static bool filter(struct dma_chan *chan, void *slave) +{ + struct dw_dma_slave *dws = slave; + + if (dws->dma_dev == chan->device->dev) + return true; + else + return false; +} +#endif + static int __init 
atmci_probe(struct platform_device *pdev) { struct mci_platform_data *pdata; @@ -1652,22 +1607,20 @@ static int __init atmci_probe(struct platform_device *pdev) goto err_request_irq; #ifdef CONFIG_MMC_ATMELMCI_DMA - if (pdata->dma_slave) { - struct dma_slave *slave = pdata->dma_slave; + if (pdata->dma_slave.dma_dev) { + struct dw_dma_slave *dws = &pdata->dma_slave; + dma_cap_mask_t mask; - slave->tx_reg = regs->start + MCI_TDR; - slave->rx_reg = regs->start + MCI_RDR; + dws->tx_reg = regs->start + MCI_TDR; + dws->rx_reg = regs->start + MCI_RDR; /* Try to grab a DMA channel */ - host->dma.client.event_callback = atmci_dma_event; - dma_cap_set(DMA_SLAVE, host->dma.client.cap_mask); - host->dma.client.slave = slave; - - dma_async_client_register(&host->dma.client); - dma_async_client_chan_request(&host->dma.client); - } else { - dev_notice(&pdev->dev, "DMA not available, using PIO\n"); + dma_cap_zero(mask); + dma_cap_set(DMA_SLAVE, mask); + host->dma.chan = dma_request_channel(mask, filter, dws); } + if (!host->dma.chan) + dev_notice(&pdev->dev, "DMA not available, using PIO\n"); #endif /* CONFIG_MMC_ATMELMCI_DMA */ platform_set_drvdata(pdev, host); @@ -1699,8 +1652,8 @@ static int __init atmci_probe(struct platform_device *pdev) err_init_slot: #ifdef CONFIG_MMC_ATMELMCI_DMA - if (pdata->dma_slave) - dma_async_client_unregister(&host->dma.client); + if (host->dma.chan) + dma_release_channel(host->dma.chan); #endif free_irq(irq, host); err_request_irq: @@ -1731,8 +1684,8 @@ static int __exit atmci_remove(struct platform_device *pdev) clk_disable(host->mck); #ifdef CONFIG_MMC_ATMELMCI_DMA - if (host->dma.client.slave) - dma_async_client_unregister(&host->dma.client); + if (host->dma.chan) + dma_release_channel(host->dma.chan); #endif free_irq(platform_get_irq(pdev, 0), host); @@ -1761,7 +1714,7 @@ static void __exit atmci_exit(void) platform_driver_unregister(&atmci_driver); } -module_init(atmci_init); +late_initcall(atmci_init); /* try to load after dma driver when built-in */ module_exit(atmci_exit); MODULE_DESCRIPTION("Atmel Multimedia Card Interface driver"); diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index a90d50c2c3e..7d04fb9ddca 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -45,6 +45,14 @@ config MTD_PARTITIONS devices. Partitioning on NFTL 'devices' is a different - that's the 'normal' form of partitioning used on a block device. +config MTD_TESTS + tristate "MTD tests support" + depends on m + help + This option includes various MTD tests into compilation. The tests + should normally be compiled as kernel modules. The modules perform + various checks and verifications when loaded. 
+ config MTD_REDBOOT_PARTS tristate "RedBoot partition table parsing" depends on MTD_PARTITIONS @@ -316,6 +324,8 @@ source "drivers/mtd/nand/Kconfig" source "drivers/mtd/onenand/Kconfig" +source "drivers/mtd/lpddr/Kconfig" + source "drivers/mtd/ubi/Kconfig" endif # MTD diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile index 4b77335715f..4521b1ecce4 100644 --- a/drivers/mtd/Makefile +++ b/drivers/mtd/Makefile @@ -29,6 +29,6 @@ obj-$(CONFIG_MTD_OOPS) += mtdoops.o nftl-objs := nftlcore.o nftlmount.o inftl-objs := inftlcore.o inftlmount.o -obj-y += chips/ maps/ devices/ nand/ onenand/ +obj-y += chips/ lpddr/ maps/ devices/ nand/ onenand/ tests/ obj-$(CONFIG_MTD_UBI) += ubi/ diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c index c93a8be5d5f..f5ab6fa1057 100644 --- a/drivers/mtd/chips/cfi_cmdset_0001.c +++ b/drivers/mtd/chips/cfi_cmdset_0001.c @@ -58,8 +58,8 @@ static int cfi_intelext_write_buffers(struct mtd_info *, loff_t, size_t, size_t static int cfi_intelext_writev(struct mtd_info *, const struct kvec *, unsigned long, loff_t, size_t *); static int cfi_intelext_erase_varsize(struct mtd_info *, struct erase_info *); static void cfi_intelext_sync (struct mtd_info *); -static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, size_t len); -static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, size_t len); +static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); #ifdef CONFIG_MTD_OTP static int cfi_intelext_read_fact_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *); static int cfi_intelext_read_user_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *); @@ -558,8 +558,8 @@ static struct mtd_info *cfi_intelext_setup(struct mtd_info *mtd) } for (i=0; i<mtd->numeraseregions;i++){ - printk(KERN_DEBUG "erase region %d: offset=0x%x,size=0x%x,blocks=%d\n", - i,mtd->eraseregions[i].offset, + printk(KERN_DEBUG "erase region %d: offset=0x%llx,size=0x%x,blocks=%d\n", + i,(unsigned long long)mtd->eraseregions[i].offset, mtd->eraseregions[i].erasesize, mtd->eraseregions[i].numblocks); } @@ -2058,7 +2058,7 @@ out: put_chip(map, chip, adr); return ret; } -static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; @@ -2082,7 +2082,7 @@ static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, size_t len) return ret; } -static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index d74ec46aa03..94bb61e1904 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -71,8 +71,8 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr static void put_chip(struct map_info *map, struct flchip *chip, unsigned long adr); #include "fwh_lock.h" -static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, size_t len); -static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, size_t len); +static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); static struct mtd_chip_driver cfi_amdstd_chipdrv = { .probe = NULL, /* Not usable directly */ @@ -322,6 +322,14 @@ static struct 
cfi_fixup fixup_table[] = { }; +static void cfi_fixup_major_minor(struct cfi_private *cfi, + struct cfi_pri_amdstd *extp) +{ + if (cfi->mfr == CFI_MFR_SAMSUNG && cfi->id == 0x257e && + extp->MajorVersion == '0') + extp->MajorVersion = '1'; +} + struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary) { struct cfi_private *cfi = map->fldrv_priv; @@ -363,6 +371,8 @@ struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary) return NULL; } + cfi_fixup_major_minor(cfi, extp); + if (extp->MajorVersion != '1' || (extp->MinorVersion < '0' || extp->MinorVersion > '4')) { printk(KERN_ERR " Unknown Amd/Fujitsu Extended Query " @@ -1774,12 +1784,12 @@ out_unlock: return ret; } -static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { return cfi_varsize_frob(mtd, do_atmel_lock, ofs, len, NULL); } -static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { return cfi_varsize_frob(mtd, do_atmel_unlock, ofs, len, NULL); } diff --git a/drivers/mtd/chips/cfi_cmdset_0020.c b/drivers/mtd/chips/cfi_cmdset_0020.c index d4714dd9f7a..6c740f346f9 100644 --- a/drivers/mtd/chips/cfi_cmdset_0020.c +++ b/drivers/mtd/chips/cfi_cmdset_0020.c @@ -42,8 +42,8 @@ static int cfi_staa_writev(struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen); static int cfi_staa_erase_varsize(struct mtd_info *, struct erase_info *); static void cfi_staa_sync (struct mtd_info *); -static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, size_t len); -static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, size_t len); +static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); static int cfi_staa_suspend (struct mtd_info *); static void cfi_staa_resume (struct mtd_info *); @@ -221,8 +221,8 @@ static struct mtd_info *cfi_staa_setup(struct map_info *map) } for (i=0; i<mtd->numeraseregions;i++){ - printk(KERN_DEBUG "%d: offset=0x%x,size=0x%x,blocks=%d\n", - i,mtd->eraseregions[i].offset, + printk(KERN_DEBUG "%d: offset=0x%llx,size=0x%x,blocks=%d\n", + i, (unsigned long long)mtd->eraseregions[i].offset, mtd->eraseregions[i].erasesize, mtd->eraseregions[i].numblocks); } @@ -964,7 +964,7 @@ static int cfi_staa_erase_varsize(struct mtd_info *mtd, adr += regions[i].erasesize; len -= regions[i].erasesize; - if (adr % (1<< cfi->chipshift) == ((regions[i].offset + (regions[i].erasesize * regions[i].numblocks)) %( 1<< cfi->chipshift))) + if (adr % (1<< cfi->chipshift) == (((unsigned long)regions[i].offset + (regions[i].erasesize * regions[i].numblocks)) %( 1<< cfi->chipshift))) i++; if (adr >> cfi->chipshift) { @@ -1135,7 +1135,7 @@ retry: spin_unlock_bh(chip->mutex); return 0; } -static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -1284,7 +1284,7 @@ retry: spin_unlock_bh(chip->mutex); return 0; } -static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; diff --git a/drivers/mtd/chips/fwh_lock.h b/drivers/mtd/chips/fwh_lock.h index ab44f2b996f..57e0e4e921f 100644 --- 
a/drivers/mtd/chips/fwh_lock.h +++ b/drivers/mtd/chips/fwh_lock.h @@ -77,7 +77,7 @@ static int fwh_xxlock_oneblock(struct map_info *map, struct flchip *chip, } -static int fwh_lock_varsize(struct mtd_info *mtd, loff_t ofs, size_t len) +static int fwh_lock_varsize(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; @@ -88,7 +88,7 @@ static int fwh_lock_varsize(struct mtd_info *mtd, loff_t ofs, size_t len) } -static int fwh_unlock_varsize(struct mtd_info *mtd, loff_t ofs, size_t len) +static int fwh_unlock_varsize(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; diff --git a/drivers/mtd/devices/lart.c b/drivers/mtd/devices/lart.c index f4bda4cee49..578de1c67bf 100644 --- a/drivers/mtd/devices/lart.c +++ b/drivers/mtd/devices/lart.c @@ -619,7 +619,7 @@ static struct mtd_partition lart_partitions[] = { }; #endif -int __init lart_flash_init (void) +static int __init lart_flash_init (void) { int result; memset (&mtd,0,sizeof (mtd)); @@ -690,7 +690,7 @@ int __init lart_flash_init (void) return (result); } -void __exit lart_flash_exit (void) +static void __exit lart_flash_exit (void) { #ifndef HAVE_PARTITIONS del_mtd_device (&mtd); @@ -705,5 +705,3 @@ module_exit (lart_flash_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Abraham vd Merwe <abraham@2d3d.co.za>"); MODULE_DESCRIPTION("MTD driver for Intel 28F160F3 on LART board"); - - diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 5733f064384..7c3fc766dcf 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -20,6 +20,7 @@ #include <linux/device.h> #include <linux/interrupt.h> #include <linux/mutex.h> +#include <linux/math64.h> #include <linux/mtd/mtd.h> #include <linux/mtd/partitions.h> @@ -169,9 +170,9 @@ static int wait_till_ready(struct m25p *flash) */ static int erase_chip(struct m25p *flash) { - DEBUG(MTD_DEBUG_LEVEL3, "%s: %s %dKiB\n", - dev_name(&flash->spi->dev), __func__, - flash->mtd.size / 1024); + DEBUG(MTD_DEBUG_LEVEL3, "%s: %s %lldKiB\n", + dev_name(&flash->spi->dev), __func__, + (long long)(flash->mtd.size >> 10)); /* Wait until finished previous write command. 
*/ if (wait_till_ready(flash)) @@ -232,18 +233,18 @@ static int m25p80_erase(struct mtd_info *mtd, struct erase_info *instr) { struct m25p *flash = mtd_to_m25p(mtd); u32 addr,len; + uint32_t rem; - DEBUG(MTD_DEBUG_LEVEL2, "%s: %s %s 0x%08x, len %d\n", - dev_name(&flash->spi->dev), __func__, "at", - (u32)instr->addr, instr->len); + DEBUG(MTD_DEBUG_LEVEL2, "%s: %s %s 0x%llx, len %lld\n", + dev_name(&flash->spi->dev), __func__, "at", + (long long)instr->addr, (long long)instr->len); /* sanity checks */ if (instr->addr + instr->len > flash->mtd.size) return -EINVAL; - if ((instr->addr % mtd->erasesize) != 0 - || (instr->len % mtd->erasesize) != 0) { + div_u64_rem(instr->len, mtd->erasesize, &rem); + if (rem) return -EINVAL; - } addr = instr->addr; len = instr->len; @@ -677,24 +678,24 @@ static int __devinit m25p_probe(struct spi_device *spi) flash->mtd.erasesize = info->sector_size; } - dev_info(&spi->dev, "%s (%d Kbytes)\n", info->name, - flash->mtd.size / 1024); + dev_info(&spi->dev, "%s (%lld Kbytes)\n", info->name, + (long long)flash->mtd.size >> 10); DEBUG(MTD_DEBUG_LEVEL2, - "mtd .name = %s, .size = 0x%.8x (%uMiB) " + "mtd .name = %s, .size = 0x%llx (%lldMiB) " ".erasesize = 0x%.8x (%uKiB) .numeraseregions = %d\n", flash->mtd.name, - flash->mtd.size, flash->mtd.size / (1024*1024), + (long long)flash->mtd.size, (long long)(flash->mtd.size >> 20), flash->mtd.erasesize, flash->mtd.erasesize / 1024, flash->mtd.numeraseregions); if (flash->mtd.numeraseregions) for (i = 0; i < flash->mtd.numeraseregions; i++) DEBUG(MTD_DEBUG_LEVEL2, - "mtd.eraseregions[%d] = { .offset = 0x%.8x, " + "mtd.eraseregions[%d] = { .offset = 0x%llx, " ".erasesize = 0x%.8x (%uKiB), " ".numblocks = %d }\n", - i, flash->mtd.eraseregions[i].offset, + i, (long long)flash->mtd.eraseregions[i].offset, flash->mtd.eraseregions[i].erasesize, flash->mtd.eraseregions[i].erasesize / 1024, flash->mtd.eraseregions[i].numblocks); @@ -722,12 +723,12 @@ static int __devinit m25p_probe(struct spi_device *spi) if (nr_parts > 0) { for (i = 0; i < nr_parts; i++) { DEBUG(MTD_DEBUG_LEVEL2, "partitions[%d] = " - "{.name = %s, .offset = 0x%.8x, " - ".size = 0x%.8x (%uKiB) }\n", + "{.name = %s, .offset = 0x%llx, " + ".size = 0x%llx (%lldKiB) }\n", i, parts[i].name, - parts[i].offset, - parts[i].size, - parts[i].size / 1024); + (long long)parts[i].offset, + (long long)parts[i].size, + (long long)(parts[i].size >> 10)); } flash->partitioned = 1; return add_mtd_partitions(&flash->mtd, parts, nr_parts); diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c index 65126cd668f..d44f741ae22 100644 --- a/drivers/mtd/devices/mtd_dataflash.c +++ b/drivers/mtd/devices/mtd_dataflash.c @@ -16,6 +16,7 @@ #include <linux/device.h> #include <linux/mutex.h> #include <linux/err.h> +#include <linux/math64.h> #include <linux/spi/spi.h> #include <linux/spi/flash.h> @@ -152,15 +153,20 @@ static int dataflash_erase(struct mtd_info *mtd, struct erase_info *instr) struct spi_message msg; unsigned blocksize = priv->page_size << 3; uint8_t *command; + uint32_t rem; - DEBUG(MTD_DEBUG_LEVEL2, "%s: erase addr=0x%x len 0x%x\n", - dev_name(&spi->dev), - instr->addr, instr->len); + DEBUG(MTD_DEBUG_LEVEL2, "%s: erase addr=0x%llx len 0x%llx\n", + dev_name(&spi->dev), (long long)instr->addr, + (long long)instr->len); /* Sanity checks */ - if ((instr->addr + instr->len) > mtd->size - || (instr->len % priv->page_size) != 0 - || (instr->addr % priv->page_size) != 0) + if (instr->addr + instr->len > mtd->size) + return -EINVAL; + 
div_u64_rem(instr->len, priv->page_size, &rem); + if (rem) + return -EINVAL; + div_u64_rem(instr->addr, priv->page_size, &rem); + if (rem) return -EINVAL; spi_message_init(&msg); @@ -178,7 +184,7 @@ static int dataflash_erase(struct mtd_info *mtd, struct erase_info *instr) /* Calculate flash page address; use block erase (for speed) if * we're at a block boundary and need to erase the whole block. */ - pageaddr = instr->addr / priv->page_size; + pageaddr = div_u64(instr->len, priv->page_size); do_block = (pageaddr & 0x7) == 0 && instr->len >= blocksize; pageaddr = pageaddr << priv->page_offset; @@ -667,8 +673,8 @@ add_dataflash_otp(struct spi_device *spi, char *name, if (revision >= 'c') otp_tag = otp_setup(device, revision); - dev_info(&spi->dev, "%s (%d KBytes) pagesize %d bytes%s\n", - name, DIV_ROUND_UP(device->size, 1024), + dev_info(&spi->dev, "%s (%lld KBytes) pagesize %d bytes%s\n", + name, (long long)((device->size + 1023) >> 10), pagesize, otp_tag); dev_set_drvdata(&spi->dev, priv); diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index 9bf581c4f74..a790c062af1 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -109,25 +109,25 @@ module_param(shuffle_freq, int, 0); /* Each memory region corresponds to a minor device */ typedef struct partition_t { struct mtd_blktrans_dev mbd; - u_int32_t state; - u_int32_t *VirtualBlockMap; - u_int32_t *VirtualPageMap; - u_int32_t FreeTotal; + uint32_t state; + uint32_t *VirtualBlockMap; + uint32_t *VirtualPageMap; + uint32_t FreeTotal; struct eun_info_t { - u_int32_t Offset; - u_int32_t EraseCount; - u_int32_t Free; - u_int32_t Deleted; + uint32_t Offset; + uint32_t EraseCount; + uint32_t Free; + uint32_t Deleted; } *EUNInfo; struct xfer_info_t { - u_int32_t Offset; - u_int32_t EraseCount; - u_int16_t state; + uint32_t Offset; + uint32_t EraseCount; + uint16_t state; } *XferInfo; - u_int16_t bam_index; - u_int32_t *bam_cache; - u_int16_t DataUnits; - u_int32_t BlocksPerUnit; + uint16_t bam_index; + uint32_t *bam_cache; + uint16_t DataUnits; + uint32_t BlocksPerUnit; erase_unit_header_t header; } partition_t; @@ -199,8 +199,8 @@ static int scan_header(partition_t *part) static int build_maps(partition_t *part) { erase_unit_header_t header; - u_int16_t xvalid, xtrans, i; - u_int blocks, j; + uint16_t xvalid, xtrans, i; + unsigned blocks, j; int hdr_ok, ret = -1; ssize_t retval; loff_t offset; @@ -269,14 +269,14 @@ static int build_maps(partition_t *part) /* Set up virtual page map */ blocks = le32_to_cpu(header.FormattedSize) >> header.BlockSize; - part->VirtualBlockMap = vmalloc(blocks * sizeof(u_int32_t)); + part->VirtualBlockMap = vmalloc(blocks * sizeof(uint32_t)); if (!part->VirtualBlockMap) goto out_XferInfo; - memset(part->VirtualBlockMap, 0xff, blocks * sizeof(u_int32_t)); + memset(part->VirtualBlockMap, 0xff, blocks * sizeof(uint32_t)); part->BlocksPerUnit = (1 << header.EraseUnitSize) >> header.BlockSize; - part->bam_cache = kmalloc(part->BlocksPerUnit * sizeof(u_int32_t), + part->bam_cache = kmalloc(part->BlocksPerUnit * sizeof(uint32_t), GFP_KERNEL); if (!part->bam_cache) goto out_VirtualBlockMap; @@ -290,7 +290,7 @@ static int build_maps(partition_t *part) offset = part->EUNInfo[i].Offset + le32_to_cpu(header.BAMOffset); ret = part->mbd.mtd->read(part->mbd.mtd, offset, - part->BlocksPerUnit * sizeof(u_int32_t), &retval, + part->BlocksPerUnit * sizeof(uint32_t), &retval, (unsigned char *)part->bam_cache); if (ret) @@ -332,7 +332,7 @@ out: ======================================================================*/ static int 
erase_xfer(partition_t *part, - u_int16_t xfernum) + uint16_t xfernum) { int ret; struct xfer_info_t *xfer; @@ -408,7 +408,7 @@ static int prepare_xfer(partition_t *part, int i) erase_unit_header_t header; struct xfer_info_t *xfer; int nbam, ret; - u_int32_t ctl; + uint32_t ctl; ssize_t retlen; loff_t offset; @@ -430,15 +430,15 @@ static int prepare_xfer(partition_t *part, int i) } /* Write the BAM stub */ - nbam = (part->BlocksPerUnit * sizeof(u_int32_t) + + nbam = (part->BlocksPerUnit * sizeof(uint32_t) + le32_to_cpu(part->header.BAMOffset) + SECTOR_SIZE - 1) / SECTOR_SIZE; offset = xfer->Offset + le32_to_cpu(part->header.BAMOffset); ctl = cpu_to_le32(BLOCK_CONTROL); - for (i = 0; i < nbam; i++, offset += sizeof(u_int32_t)) { + for (i = 0; i < nbam; i++, offset += sizeof(uint32_t)) { - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(u_int32_t), + ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, (u_char *)&ctl); if (ret) @@ -461,18 +461,18 @@ static int prepare_xfer(partition_t *part, int i) ======================================================================*/ -static int copy_erase_unit(partition_t *part, u_int16_t srcunit, - u_int16_t xferunit) +static int copy_erase_unit(partition_t *part, uint16_t srcunit, + uint16_t xferunit) { u_char buf[SECTOR_SIZE]; struct eun_info_t *eun; struct xfer_info_t *xfer; - u_int32_t src, dest, free, i; - u_int16_t unit; + uint32_t src, dest, free, i; + uint16_t unit; int ret; ssize_t retlen; loff_t offset; - u_int16_t srcunitswap = cpu_to_le16(srcunit); + uint16_t srcunitswap = cpu_to_le16(srcunit); eun = &part->EUNInfo[srcunit]; xfer = &part->XferInfo[xferunit]; @@ -486,7 +486,7 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, offset = eun->Offset + le32_to_cpu(part->header.BAMOffset); ret = part->mbd.mtd->read(part->mbd.mtd, offset, - part->BlocksPerUnit * sizeof(u_int32_t), + part->BlocksPerUnit * sizeof(uint32_t), &retlen, (u_char *) (part->bam_cache)); /* mark the cache bad, in case we get an error later */ @@ -503,7 +503,7 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, offset = xfer->Offset + 20; /* Bad! */ unit = cpu_to_le16(0x7fff); - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(u_int16_t), + ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint16_t), &retlen, (u_char *) &unit); if (ret) { @@ -560,7 +560,7 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, /* All clear? 
Then update the LogicalEUN again */ - ret = part->mbd.mtd->write(part->mbd.mtd, xfer->Offset + 20, sizeof(u_int16_t), + ret = part->mbd.mtd->write(part->mbd.mtd, xfer->Offset + 20, sizeof(uint16_t), &retlen, (u_char *)&srcunitswap); if (ret) { @@ -605,8 +605,8 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, static int reclaim_block(partition_t *part) { - u_int16_t i, eun, xfer; - u_int32_t best; + uint16_t i, eun, xfer; + uint32_t best; int queued, ret; DEBUG(0, "ftl_cs: reclaiming space...\n"); @@ -723,10 +723,10 @@ static void dump_lists(partition_t *part) } #endif -static u_int32_t find_free(partition_t *part) +static uint32_t find_free(partition_t *part) { - u_int16_t stop, eun; - u_int32_t blk; + uint16_t stop, eun; + uint32_t blk; size_t retlen; int ret; @@ -749,7 +749,7 @@ static u_int32_t find_free(partition_t *part) ret = part->mbd.mtd->read(part->mbd.mtd, part->EUNInfo[eun].Offset + le32_to_cpu(part->header.BAMOffset), - part->BlocksPerUnit * sizeof(u_int32_t), + part->BlocksPerUnit * sizeof(uint32_t), &retlen, (u_char *) (part->bam_cache)); if (ret) { @@ -786,7 +786,7 @@ static u_int32_t find_free(partition_t *part) static int ftl_read(partition_t *part, caddr_t buffer, u_long sector, u_long nblocks) { - u_int32_t log_addr, bsize; + uint32_t log_addr, bsize; u_long i; int ret; size_t offset, retlen; @@ -829,14 +829,14 @@ static int ftl_read(partition_t *part, caddr_t buffer, ======================================================================*/ -static int set_bam_entry(partition_t *part, u_int32_t log_addr, - u_int32_t virt_addr) +static int set_bam_entry(partition_t *part, uint32_t log_addr, + uint32_t virt_addr) { - u_int32_t bsize, blk, le_virt_addr; + uint32_t bsize, blk, le_virt_addr; #ifdef PSYCHO_DEBUG - u_int32_t old_addr; + uint32_t old_addr; #endif - u_int16_t eun; + uint16_t eun; int ret; size_t retlen, offset; @@ -845,11 +845,11 @@ static int set_bam_entry(partition_t *part, u_int32_t log_addr, bsize = 1 << part->header.EraseUnitSize; eun = log_addr / bsize; blk = (log_addr % bsize) / SECTOR_SIZE; - offset = (part->EUNInfo[eun].Offset + blk * sizeof(u_int32_t) + + offset = (part->EUNInfo[eun].Offset + blk * sizeof(uint32_t) + le32_to_cpu(part->header.BAMOffset)); #ifdef PSYCHO_DEBUG - ret = part->mbd.mtd->read(part->mbd.mtd, offset, sizeof(u_int32_t), + ret = part->mbd.mtd->read(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, (u_char *)&old_addr); if (ret) { printk(KERN_WARNING"ftl: Error reading old_addr in set_bam_entry: %d\n",ret); @@ -886,7 +886,7 @@ static int set_bam_entry(partition_t *part, u_int32_t log_addr, #endif part->bam_cache[blk] = le_virt_addr; } - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(u_int32_t), + ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, (u_char *)&le_virt_addr); if (ret) { @@ -900,7 +900,7 @@ static int set_bam_entry(partition_t *part, u_int32_t log_addr, static int ftl_write(partition_t *part, caddr_t buffer, u_long sector, u_long nblocks) { - u_int32_t bsize, log_addr, virt_addr, old_addr, blk; + uint32_t bsize, log_addr, virt_addr, old_addr, blk; u_long i; int ret; size_t retlen, offset; diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index 50ce13887f6..73f05227dc8 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -50,7 +50,7 @@ static void inftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) struct INFTLrecord *inftl; unsigned long temp; - if (mtd->type != MTD_NANDFLASH) + if (mtd->type != MTD_NANDFLASH || mtd->size > 
UINT_MAX) return; /* OK, this is moderately ugly. But probably safe. Alternatives? */ if (memcmp(mtd->name, "DiskOnChip", 10)) diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c index 9113628ed1e..f751dd97c54 100644 --- a/drivers/mtd/inftlmount.c +++ b/drivers/mtd/inftlmount.c @@ -63,7 +63,7 @@ static int find_boot_record(struct INFTLrecord *inftl) * otherwise. */ inftl->EraseSize = inftl->mbd.mtd->erasesize; - inftl->nb_blocks = inftl->mbd.mtd->size / inftl->EraseSize; + inftl->nb_blocks = (u32)inftl->mbd.mtd->size / inftl->EraseSize; inftl->MediaUnit = BLOCK_NIL; @@ -187,7 +187,7 @@ static int find_boot_record(struct INFTLrecord *inftl) mh->BlockMultiplierBits); inftl->EraseSize = inftl->mbd.mtd->erasesize << mh->BlockMultiplierBits; - inftl->nb_blocks = inftl->mbd.mtd->size / inftl->EraseSize; + inftl->nb_blocks = (u32)inftl->mbd.mtd->size / inftl->EraseSize; block >>= mh->BlockMultiplierBits; } diff --git a/drivers/mtd/lpddr/Kconfig b/drivers/mtd/lpddr/Kconfig new file mode 100644 index 00000000000..acd4ea9b227 --- /dev/null +++ b/drivers/mtd/lpddr/Kconfig @@ -0,0 +1,22 @@ +# drivers/mtd/chips/Kconfig + +menu "LPDDR flash memory drivers" + depends on MTD!=n + +config MTD_LPDDR + tristate "Support for LPDDR flash chips" + select MTD_QINFO_PROBE + help + This option enables support of LPDDR (Low power double data rate) + flash chips. Synonymous with Mobile-DDR. It is a new standard for + DDR memories, intended for battery-operated systems. + +config MTD_QINFO_PROBE + tristate "Detect flash chips by QINFO probe" + help + Device Information for LPDDR chips is offered through the Overlay + Window QINFO interface, permits software to be used for entire + families of devices. This serves similar purpose of CFI on legacy + Flash products +endmenu + diff --git a/drivers/mtd/lpddr/Makefile b/drivers/mtd/lpddr/Makefile new file mode 100644 index 00000000000..da48e46b581 --- /dev/null +++ b/drivers/mtd/lpddr/Makefile @@ -0,0 +1,6 @@ +# +# linux/drivers/mtd/lpddr/Makefile +# + +obj-$(CONFIG_MTD_QINFO_PROBE) += qinfo_probe.o +obj-$(CONFIG_MTD_LPDDR) += lpddr_cmds.o diff --git a/drivers/mtd/lpddr/lpddr_cmds.c b/drivers/mtd/lpddr/lpddr_cmds.c new file mode 100644 index 00000000000..e22ca49583e --- /dev/null +++ b/drivers/mtd/lpddr/lpddr_cmds.c @@ -0,0 +1,796 @@ +/* + * LPDDR flash memory device operations. This module provides read, write, + * erase, lock/unlock support for LPDDR flash memories + * (C) 2008 Korolev Alexey <akorolev@infradead.org> + * (C) 2008 Vasiliy Leonenko <vasiliy.leonenko@gmail.com> + * Many thanks to Roman Borisov for intial enabling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * TODO: + * Implement VPP management + * Implement XIP support + * Implement OTP support + */ +#include <linux/mtd/pfow.h> +#include <linux/mtd/qinfo.h> + +static int lpddr_read(struct mtd_info *mtd, loff_t adr, size_t len, + size_t *retlen, u_char *buf); +static int lpddr_write_buffers(struct mtd_info *mtd, loff_t to, + size_t len, size_t *retlen, const u_char *buf); +static int lpddr_writev(struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen); +static int lpddr_erase(struct mtd_info *mtd, struct erase_info *instr); +static int lpddr_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int lpddr_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int lpddr_point(struct mtd_info *mtd, loff_t adr, size_t len, + size_t *retlen, void **mtdbuf, resource_size_t *phys); +static void lpddr_unpoint(struct mtd_info *mtd, loff_t adr, size_t len); +static int get_chip(struct map_info *map, struct flchip *chip, int mode); +static int chip_ready(struct map_info *map, struct flchip *chip, int mode); +static void put_chip(struct map_info *map, struct flchip *chip); + +struct mtd_info *lpddr_cmdset(struct map_info *map) +{ + struct lpddr_private *lpddr = map->fldrv_priv; + struct flchip_shared *shared; + struct flchip *chip; + struct mtd_info *mtd; + int numchips; + int i, j; + + mtd = kzalloc(sizeof(*mtd), GFP_KERNEL); + if (!mtd) { + printk(KERN_ERR "Failed to allocate memory for MTD device\n"); + return NULL; + } + mtd->priv = map; + mtd->type = MTD_NORFLASH; + + /* Fill in the default mtd operations */ + mtd->read = lpddr_read; + mtd->type = MTD_NORFLASH; + mtd->flags = MTD_CAP_NORFLASH; + mtd->flags &= ~MTD_BIT_WRITEABLE; + mtd->erase = lpddr_erase; + mtd->write = lpddr_write_buffers; + mtd->writev = lpddr_writev; + mtd->read_oob = NULL; + mtd->write_oob = NULL; + mtd->sync = NULL; + mtd->lock = lpddr_lock; + mtd->unlock = lpddr_unlock; + mtd->suspend = NULL; + mtd->resume = NULL; + if (map_is_linear(map)) { + mtd->point = lpddr_point; + mtd->unpoint = lpddr_unpoint; + } + mtd->block_isbad = NULL; + mtd->block_markbad = NULL; + mtd->size = 1 << lpddr->qinfo->DevSizeShift; + mtd->erasesize = 1 << lpddr->qinfo->UniformBlockSizeShift; + mtd->writesize = 1 << lpddr->qinfo->BufSizeShift; + + shared = kmalloc(sizeof(struct flchip_shared) * lpddr->numchips, + GFP_KERNEL); + if (!shared) { + kfree(lpddr); + kfree(mtd); + return NULL; + } + + chip = &lpddr->chips[0]; + numchips = lpddr->numchips / lpddr->qinfo->HWPartsNum; + for (i = 0; i < numchips; i++) { + shared[i].writing = shared[i].erasing = NULL; + spin_lock_init(&shared[i].lock); + for (j = 0; j < lpddr->qinfo->HWPartsNum; j++) { + *chip = lpddr->chips[i]; + chip->start += j << lpddr->chipshift; + chip->oldstate = chip->state = FL_READY; + chip->priv = &shared[i]; + /* those should be reset too since + they create memory references. 
*/ + init_waitqueue_head(&chip->wq); + spin_lock_init(&chip->_spinlock); + chip->mutex = &chip->_spinlock; + chip++; + } + } + + return mtd; +} +EXPORT_SYMBOL(lpddr_cmdset); + +static int wait_for_ready(struct map_info *map, struct flchip *chip, + unsigned int chip_op_time) +{ + unsigned int timeo, reset_timeo, sleep_time; + unsigned int dsr; + flstate_t chip_state = chip->state; + int ret = 0; + + /* set our timeout to 8 times the expected delay */ + timeo = chip_op_time * 8; + if (!timeo) + timeo = 500000; + reset_timeo = timeo; + sleep_time = chip_op_time / 2; + + for (;;) { + dsr = CMDVAL(map_read(map, map->pfow_base + PFOW_DSR)); + if (dsr & DSR_READY_STATUS) + break; + if (!timeo) { + printk(KERN_ERR "%s: Flash timeout error state %d \n", + map->name, chip_state); + ret = -ETIME; + break; + } + + /* OK Still waiting. Drop the lock, wait a while and retry. */ + spin_unlock(chip->mutex); + if (sleep_time >= 1000000/HZ) { + /* + * Half of the normal delay still remaining + * can be performed with a sleeping delay instead + * of busy waiting. + */ + msleep(sleep_time/1000); + timeo -= sleep_time; + sleep_time = 1000000/HZ; + } else { + udelay(1); + cond_resched(); + timeo--; + } + spin_lock(chip->mutex); + + while (chip->state != chip_state) { + /* Someone's suspended the operation: sleep */ + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&chip->wq, &wait); + spin_unlock(chip->mutex); + schedule(); + remove_wait_queue(&chip->wq, &wait); + spin_lock(chip->mutex); + } + if (chip->erase_suspended || chip->write_suspended) { + /* Suspend has occured while sleep: reset timeout */ + timeo = reset_timeo; + chip->erase_suspended = chip->write_suspended = 0; + } + } + /* check status for errors */ + if (dsr & DSR_ERR) { + /* Clear DSR*/ + map_write(map, CMD(~(DSR_ERR)), map->pfow_base + PFOW_DSR); + printk(KERN_WARNING"%s: Bad status on wait: 0x%x \n", + map->name, dsr); + print_drs_error(dsr); + ret = -EIO; + } + chip->state = FL_READY; + return ret; +} + +static int get_chip(struct map_info *map, struct flchip *chip, int mode) +{ + int ret; + DECLARE_WAITQUEUE(wait, current); + + retry: + if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING) + && chip->state != FL_SYNCING) { + /* + * OK. We have possibility for contension on the write/erase + * operations which are global to the real chip and not per + * partition. So let's fight it over in the partition which + * currently has authority on the operation. + * + * The rules are as follows: + * + * - any write operation must own shared->writing. + * + * - any erase operation must own _both_ shared->writing and + * shared->erasing. + * + * - contension arbitration is handled in the owner's context. + * + * The 'shared' struct can be read and/or written only when + * its lock is taken. + */ + struct flchip_shared *shared = chip->priv; + struct flchip *contender; + spin_lock(&shared->lock); + contender = shared->writing; + if (contender && contender != chip) { + /* + * The engine to perform desired operation on this + * partition is already in use by someone else. + * Let's fight over it in the context of the chip + * currently using it. If it is possible to suspend, + * that other partition will do just that, otherwise + * it'll happily send us to sleep. In any case, when + * get_chip returns success we're clear to go ahead. 
+ */ + ret = spin_trylock(contender->mutex); + spin_unlock(&shared->lock); + if (!ret) + goto retry; + spin_unlock(chip->mutex); + ret = chip_ready(map, contender, mode); + spin_lock(chip->mutex); + + if (ret == -EAGAIN) { + spin_unlock(contender->mutex); + goto retry; + } + if (ret) { + spin_unlock(contender->mutex); + return ret; + } + spin_lock(&shared->lock); + + /* We should not own chip if it is already in FL_SYNCING + * state. Put contender and retry. */ + if (chip->state == FL_SYNCING) { + put_chip(map, contender); + spin_unlock(contender->mutex); + goto retry; + } + spin_unlock(contender->mutex); + } + + /* Check if we have suspended erase on this chip. + Must sleep in such a case. */ + if (mode == FL_ERASING && shared->erasing + && shared->erasing->oldstate == FL_ERASING) { + spin_unlock(&shared->lock); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&chip->wq, &wait); + spin_unlock(chip->mutex); + schedule(); + remove_wait_queue(&chip->wq, &wait); + spin_lock(chip->mutex); + goto retry; + } + + /* We now own it */ + shared->writing = chip; + if (mode == FL_ERASING) + shared->erasing = chip; + spin_unlock(&shared->lock); + } + + ret = chip_ready(map, chip, mode); + if (ret == -EAGAIN) + goto retry; + + return ret; +} + +static int chip_ready(struct map_info *map, struct flchip *chip, int mode) +{ + struct lpddr_private *lpddr = map->fldrv_priv; + int ret = 0; + DECLARE_WAITQUEUE(wait, current); + + /* Prevent setting state FL_SYNCING for chip in suspended state. */ + if (FL_SYNCING == mode && FL_READY != chip->oldstate) + goto sleep; + + switch (chip->state) { + case FL_READY: + case FL_JEDEC_QUERY: + return 0; + + case FL_ERASING: + if (!lpddr->qinfo->SuspEraseSupp || + !(mode == FL_READY || mode == FL_POINT)) + goto sleep; + + map_write(map, CMD(LPDDR_SUSPEND), + map->pfow_base + PFOW_PROGRAM_ERASE_SUSPEND); + chip->oldstate = FL_ERASING; + chip->state = FL_ERASE_SUSPENDING; + ret = wait_for_ready(map, chip, 0); + if (ret) { + /* Oops. something got wrong. */ + /* Resume and pretend we weren't here. */ + map_write(map, CMD(LPDDR_RESUME), + map->pfow_base + PFOW_COMMAND_CODE); + map_write(map, CMD(LPDDR_START_EXECUTION), + map->pfow_base + PFOW_COMMAND_EXECUTE); + chip->state = FL_ERASING; + chip->oldstate = FL_READY; + printk(KERN_ERR "%s: suspend operation failed." + "State may be wrong \n", map->name); + return -EIO; + } + chip->erase_suspended = 1; + chip->state = FL_READY; + return 0; + /* Erase suspend */ + case FL_POINT: + /* Only if there's no operation suspended... 
*/ + if (mode == FL_READY && chip->oldstate == FL_READY) + return 0; + + default: +sleep: + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&chip->wq, &wait); + spin_unlock(chip->mutex); + schedule(); + remove_wait_queue(&chip->wq, &wait); + spin_lock(chip->mutex); + return -EAGAIN; + } +} + +static void put_chip(struct map_info *map, struct flchip *chip) +{ + if (chip->priv) { + struct flchip_shared *shared = chip->priv; + spin_lock(&shared->lock); + if (shared->writing == chip && chip->oldstate == FL_READY) { + /* We own the ability to write, but we're done */ + shared->writing = shared->erasing; + if (shared->writing && shared->writing != chip) { + /* give back the ownership */ + struct flchip *loaner = shared->writing; + spin_lock(loaner->mutex); + spin_unlock(&shared->lock); + spin_unlock(chip->mutex); + put_chip(map, loaner); + spin_lock(chip->mutex); + spin_unlock(loaner->mutex); + wake_up(&chip->wq); + return; + } + shared->erasing = NULL; + shared->writing = NULL; + } else if (shared->erasing == chip && shared->writing != chip) { + /* + * We own the ability to erase without the ability + * to write, which means the erase was suspended + * and some other partition is currently writing. + * Don't let the switch below mess things up since + * we don't have ownership to resume anything. + */ + spin_unlock(&shared->lock); + wake_up(&chip->wq); + return; + } + spin_unlock(&shared->lock); + } + + switch (chip->oldstate) { + case FL_ERASING: + chip->state = chip->oldstate; + map_write(map, CMD(LPDDR_RESUME), + map->pfow_base + PFOW_COMMAND_CODE); + map_write(map, CMD(LPDDR_START_EXECUTION), + map->pfow_base + PFOW_COMMAND_EXECUTE); + chip->oldstate = FL_READY; + chip->state = FL_ERASING; + break; + case FL_READY: + break; + default: + printk(KERN_ERR "%s: put_chip() called with oldstate %d!\n", + map->name, chip->oldstate); + } + wake_up(&chip->wq); +} + +int do_write_buffer(struct map_info *map, struct flchip *chip, + unsigned long adr, const struct kvec **pvec, + unsigned long *pvec_seek, int len) +{ + struct lpddr_private *lpddr = map->fldrv_priv; + map_word datum; + int ret, wbufsize, word_gap, words; + const struct kvec *vec; + unsigned long vec_seek; + unsigned long prog_buf_ofs; + + wbufsize = 1 << lpddr->qinfo->BufSizeShift; + + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_WRITING); + if (ret) { + spin_unlock(chip->mutex); + return ret; + } + /* Figure out the number of words to write */ + word_gap = (-adr & (map_bankwidth(map)-1)); + words = (len - word_gap + map_bankwidth(map) - 1) / map_bankwidth(map); + if (!word_gap) { + words--; + } else { + word_gap = map_bankwidth(map) - word_gap; + adr -= word_gap; + datum = map_word_ff(map); + } + /* Write data */ + /* Get the program buffer offset from PFOW register data first*/ + prog_buf_ofs = map->pfow_base + CMDVAL(map_read(map, + map->pfow_base + PFOW_PROGRAM_BUFFER_OFFSET)); + vec = *pvec; + vec_seek = *pvec_seek; + do { + int n = map_bankwidth(map) - word_gap; + + if (n > vec->iov_len - vec_seek) + n = vec->iov_len - vec_seek; + if (n > len) + n = len; + + if (!word_gap && (len < map_bankwidth(map))) + datum = map_word_ff(map); + + datum = map_word_load_partial(map, datum, + vec->iov_base + vec_seek, word_gap, n); + + len -= n; + word_gap += n; + if (!len || word_gap == map_bankwidth(map)) { + map_write(map, datum, prog_buf_ofs); + prog_buf_ofs += map_bankwidth(map); + word_gap = 0; + } + + vec_seek += n; + if (vec_seek == vec->iov_len) { + vec++; + vec_seek = 0; + } + } while (len); + *pvec = vec; + 
*pvec_seek = vec_seek; + + /* GO GO GO */ + send_pfow_command(map, LPDDR_BUFF_PROGRAM, adr, wbufsize, NULL); + chip->state = FL_WRITING; + ret = wait_for_ready(map, chip, (1<<lpddr->qinfo->ProgBufferTime)); + if (ret) { + printk(KERN_WARNING"%s Buffer program error: %d at %lx; \n", + map->name, ret, adr); + goto out; + } + + out: put_chip(map, chip); + spin_unlock(chip->mutex); + return ret; +} + +int do_erase_oneblock(struct mtd_info *mtd, loff_t adr) +{ + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + struct flchip *chip = &lpddr->chips[chipnum]; + int ret; + + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_ERASING); + if (ret) { + spin_unlock(chip->mutex); + return ret; + } + send_pfow_command(map, LPDDR_BLOCK_ERASE, adr, 0, NULL); + chip->state = FL_ERASING; + ret = wait_for_ready(map, chip, (1<<lpddr->qinfo->BlockEraseTime)*1000); + if (ret) { + printk(KERN_WARNING"%s Erase block error %d at : %llx\n", + map->name, ret, adr); + goto out; + } + out: put_chip(map, chip); + spin_unlock(chip->mutex); + return ret; +} + +static int lpddr_read(struct mtd_info *mtd, loff_t adr, size_t len, + size_t *retlen, u_char *buf) +{ + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + struct flchip *chip = &lpddr->chips[chipnum]; + int ret = 0; + + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_READY); + if (ret) { + spin_unlock(chip->mutex); + return ret; + } + + map_copy_from(map, buf, adr, len); + *retlen = len; + + put_chip(map, chip); + spin_unlock(chip->mutex); + return ret; +} + +static int lpddr_point(struct mtd_info *mtd, loff_t adr, size_t len, + size_t *retlen, void **mtdbuf, resource_size_t *phys) +{ + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + unsigned long ofs, last_end = 0; + struct flchip *chip = &lpddr->chips[chipnum]; + int ret = 0; + + if (!map->virt || (adr + len > mtd->size)) + return -EINVAL; + + /* ofs: offset within the first chip that the first read should start */ + ofs = adr - (chipnum << lpddr->chipshift); + + *mtdbuf = (void *)map->virt + chip->start + ofs; + *retlen = 0; + + while (len) { + unsigned long thislen; + + if (chipnum >= lpddr->numchips) + break; + + /* We cannot point across chips that are virtually disjoint */ + if (!last_end) + last_end = chip->start; + else if (chip->start != last_end) + break; + + if ((len + ofs - 1) >> lpddr->chipshift) + thislen = (1<<lpddr->chipshift) - ofs; + else + thislen = len; + /* get the chip */ + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_POINT); + spin_unlock(chip->mutex); + if (ret) + break; + + chip->state = FL_POINT; + chip->ref_point_counter++; + *retlen += thislen; + len -= thislen; + + ofs = 0; + last_end += 1 << lpddr->chipshift; + chipnum++; + chip = &lpddr->chips[chipnum]; + } + return 0; +} + +static void lpddr_unpoint (struct mtd_info *mtd, loff_t adr, size_t len) +{ + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + unsigned long ofs; + + /* ofs: offset within the first chip that the first read should start */ + ofs = adr - (chipnum << lpddr->chipshift); + + while (len) { + unsigned long thislen; + struct flchip *chip; + + chip = &lpddr->chips[chipnum]; + if (chipnum >= lpddr->numchips) + break; + + if ((len + ofs - 1) >> lpddr->chipshift) + thislen = (1<<lpddr->chipshift) - ofs; 
+ else + thislen = len; + + spin_lock(chip->mutex); + if (chip->state == FL_POINT) { + chip->ref_point_counter--; + if (chip->ref_point_counter == 0) + chip->state = FL_READY; + } else + printk(KERN_WARNING "%s: Warning: unpoint called on non" + "pointed region\n", map->name); + + put_chip(map, chip); + spin_unlock(chip->mutex); + + len -= thislen; + ofs = 0; + chipnum++; + } +} + +static int lpddr_write_buffers(struct mtd_info *mtd, loff_t to, size_t len, + size_t *retlen, const u_char *buf) +{ + struct kvec vec; + + vec.iov_base = (void *) buf; + vec.iov_len = len; + + return lpddr_writev(mtd, &vec, 1, to, retlen); +} + + +static int lpddr_writev(struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen) +{ + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int ret = 0; + int chipnum; + unsigned long ofs, vec_seek, i; + int wbufsize = 1 << lpddr->qinfo->BufSizeShift; + + size_t len = 0; + + for (i = 0; i < count; i++) + len += vecs[i].iov_len; + + *retlen = 0; + if (!len) + return 0; + + chipnum = to >> lpddr->chipshift; + + ofs = to; + vec_seek = 0; + + do { + /* We must not cross write block boundaries */ + int size = wbufsize - (ofs & (wbufsize-1)); + + if (size > len) + size = len; + + ret = do_write_buffer(map, &lpddr->chips[chipnum], + ofs, &vecs, &vec_seek, size); + if (ret) + return ret; + + ofs += size; + (*retlen) += size; + len -= size; + + /* Be nice and reschedule with the chip in a usable + * state for other processes */ + cond_resched(); + + } while (len); + + return 0; +} + +static int lpddr_erase(struct mtd_info *mtd, struct erase_info *instr) +{ + unsigned long ofs, len; + int ret; + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int size = 1 << lpddr->qinfo->UniformBlockSizeShift; + + ofs = instr->addr; + len = instr->len; + + if (ofs > mtd->size || (len + ofs) > mtd->size) + return -EINVAL; + + while (len > 0) { + ret = do_erase_oneblock(mtd, ofs); + if (ret) + return ret; + ofs += size; + len -= size; + } + instr->state = MTD_ERASE_DONE; + mtd_erase_callback(instr); + + return 0; +} + +#define DO_XXLOCK_LOCK 1 +#define DO_XXLOCK_UNLOCK 2 +int do_xxlock(struct mtd_info *mtd, loff_t adr, uint32_t len, int thunk) +{ + int ret = 0; + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + struct flchip *chip = &lpddr->chips[chipnum]; + + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_LOCKING); + if (ret) { + spin_unlock(chip->mutex); + return ret; + } + + if (thunk == DO_XXLOCK_LOCK) { + send_pfow_command(map, LPDDR_LOCK_BLOCK, adr, adr + len, NULL); + chip->state = FL_LOCKING; + } else if (thunk == DO_XXLOCK_UNLOCK) { + send_pfow_command(map, LPDDR_UNLOCK_BLOCK, adr, adr + len, NULL); + chip->state = FL_UNLOCKING; + } else + BUG(); + + ret = wait_for_ready(map, chip, 1); + if (ret) { + printk(KERN_ERR "%s: block unlock error status %d \n", + map->name, ret); + goto out; + } +out: put_chip(map, chip); + spin_unlock(chip->mutex); + return ret; +} + +static int lpddr_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +{ + return do_xxlock(mtd, ofs, len, DO_XXLOCK_LOCK); +} + +static int lpddr_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +{ + return do_xxlock(mtd, ofs, len, DO_XXLOCK_UNLOCK); +} + +int word_program(struct map_info *map, loff_t adr, uint32_t curval) +{ + int ret; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + struct 
flchip *chip = &lpddr->chips[chipnum]; + + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_WRITING); + if (ret) { + spin_unlock(chip->mutex); + return ret; + } + + send_pfow_command(map, LPDDR_WORD_PROGRAM, adr, 0x00, (map_word *)&curval); + + ret = wait_for_ready(map, chip, (1<<lpddr->qinfo->SingleWordProgTime)); + if (ret) { + printk(KERN_WARNING"%s word_program error at: %llx; val: %x\n", + map->name, adr, curval); + goto out; + } + +out: put_chip(map, chip); + spin_unlock(chip->mutex); + return ret; +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alexey Korolev <akorolev@infradead.org>"); +MODULE_DESCRIPTION("MTD driver for LPDDR flash chips"); diff --git a/drivers/mtd/lpddr/qinfo_probe.c b/drivers/mtd/lpddr/qinfo_probe.c new file mode 100644 index 00000000000..79bf40f48b7 --- /dev/null +++ b/drivers/mtd/lpddr/qinfo_probe.c @@ -0,0 +1,255 @@ +/* + * Probing flash chips with QINFO records. + * (C) 2008 Korolev Alexey <akorolev@infradead.org> + * (C) 2008 Vasiliy Leonenko <vasiliy.leonenko@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/interrupt.h> + +#include <linux/mtd/xip.h> +#include <linux/mtd/map.h> +#include <linux/mtd/pfow.h> +#include <linux/mtd/qinfo.h> + +static int lpddr_chip_setup(struct map_info *map, struct lpddr_private *lpddr); +struct mtd_info *lpddr_probe(struct map_info *map); +static struct lpddr_private *lpddr_probe_chip(struct map_info *map); +static int lpddr_pfow_present(struct map_info *map, + struct lpddr_private *lpddr); + +static struct qinfo_query_info qinfo_array[] = { + /* General device info */ + {0, 0, "DevSizeShift", "Device size 2^n bytes"}, + {0, 3, "BufSizeShift", "Program buffer size 2^n bytes"}, + /* Erase block information */ + {1, 1, "TotalBlocksNum", "Total number of blocks"}, + {1, 2, "UniformBlockSizeShift", "Uniform block size 2^n bytes"}, + /* Partition information */ + {2, 1, "HWPartsNum", "Number of hardware partitions"}, + /* Optional features */ + {5, 1, "SuspEraseSupp", "Suspend erase supported"}, + /* Operation typical time */ + {10, 0, "SingleWordProgTime", "Single word program 2^n u-sec"}, + {10, 1, "ProgBufferTime", "Program buffer write 2^n u-sec"}, + {10, 2, "BlockEraseTime", "Block erase 2^n m-sec"}, + {10, 3, "FullChipEraseTime", "Full chip erase 2^n m-sec"}, +}; + +static long lpddr_get_qinforec_pos(struct map_info *map, char *id_str) +{ + int qinfo_lines = sizeof(qinfo_array)/sizeof(struct qinfo_query_info); + int i; + int bankwidth = map_bankwidth(map) * 8; + int major, minor; + + for (i = 0; i < qinfo_lines; i++) { + if (strcmp(id_str, qinfo_array[i].id_str) == 0) { + major = qinfo_array[i].major & ((1 << bankwidth) - 1); + minor = qinfo_array[i].minor & ((1 
<< bankwidth) - 1); + return minor | (major << bankwidth); + } + } + printk(KERN_ERR"%s qinfo id string is wrong! \n", map->name); + BUG(); + return -1; +} + +static uint16_t lpddr_info_query(struct map_info *map, char *id_str) +{ + unsigned int dsr, val; + int bits_per_chip = map_bankwidth(map) * 8; + unsigned long adr = lpddr_get_qinforec_pos(map, id_str); + int attempts = 20; + + /* Write a request for the PFOW record */ + map_write(map, CMD(LPDDR_INFO_QUERY), + map->pfow_base + PFOW_COMMAND_CODE); + map_write(map, CMD(adr & ((1 << bits_per_chip) - 1)), + map->pfow_base + PFOW_COMMAND_ADDRESS_L); + map_write(map, CMD(adr >> bits_per_chip), + map->pfow_base + PFOW_COMMAND_ADDRESS_H); + map_write(map, CMD(LPDDR_START_EXECUTION), + map->pfow_base + PFOW_COMMAND_EXECUTE); + + while ((attempts--) > 0) { + dsr = CMDVAL(map_read(map, map->pfow_base + PFOW_DSR)); + if (dsr & DSR_READY_STATUS) + break; + udelay(10); + } + + val = CMDVAL(map_read(map, map->pfow_base + PFOW_COMMAND_DATA)); + return val; +} + +static int lpddr_pfow_present(struct map_info *map, struct lpddr_private *lpddr) +{ + map_word pfow_val[4]; + + /* Check identification string */ + pfow_val[0] = map_read(map, map->pfow_base + PFOW_QUERY_STRING_P); + pfow_val[1] = map_read(map, map->pfow_base + PFOW_QUERY_STRING_F); + pfow_val[2] = map_read(map, map->pfow_base + PFOW_QUERY_STRING_O); + pfow_val[3] = map_read(map, map->pfow_base + PFOW_QUERY_STRING_W); + + if (!map_word_equal(map, CMD('P'), pfow_val[0])) + goto out; + + if (!map_word_equal(map, CMD('F'), pfow_val[1])) + goto out; + + if (!map_word_equal(map, CMD('O'), pfow_val[2])) + goto out; + + if (!map_word_equal(map, CMD('W'), pfow_val[3])) + goto out; + + return 1; /* "PFOW" is found */ +out: + printk(KERN_WARNING"%s: PFOW string at 0x%lx is not found \n", + map->name, map->pfow_base); + return 0; +} + +static int lpddr_chip_setup(struct map_info *map, struct lpddr_private *lpddr) +{ + + lpddr->qinfo = kmalloc(sizeof(struct qinfo_chip), GFP_KERNEL); + if (!lpddr->qinfo) { + printk(KERN_WARNING "%s: no memory for LPDDR qinfo structure\n", + map->name); + return 0; + } + memset(lpddr->qinfo, 0, sizeof(struct qinfo_chip)); + + /* Get the ManuID */ + lpddr->ManufactId = CMDVAL(map_read(map, map->pfow_base + PFOW_MANUFACTURER_ID)); + /* Get the DeviceID */ + lpddr->DevId = CMDVAL(map_read(map, map->pfow_base + PFOW_DEVICE_ID)); + /* read parameters from chip qinfo table */ + lpddr->qinfo->DevSizeShift = lpddr_info_query(map, "DevSizeShift"); + lpddr->qinfo->TotalBlocksNum = lpddr_info_query(map, "TotalBlocksNum"); + lpddr->qinfo->BufSizeShift = lpddr_info_query(map, "BufSizeShift"); + lpddr->qinfo->HWPartsNum = lpddr_info_query(map, "HWPartsNum"); + lpddr->qinfo->UniformBlockSizeShift = + lpddr_info_query(map, "UniformBlockSizeShift"); + lpddr->qinfo->SuspEraseSupp = lpddr_info_query(map, "SuspEraseSupp"); + lpddr->qinfo->SingleWordProgTime = + lpddr_info_query(map, "SingleWordProgTime"); + lpddr->qinfo->ProgBufferTime = lpddr_info_query(map, "ProgBufferTime"); + lpddr->qinfo->BlockEraseTime = lpddr_info_query(map, "BlockEraseTime"); + return 1; +} +static struct lpddr_private *lpddr_probe_chip(struct map_info *map) +{ + struct lpddr_private lpddr; + struct lpddr_private *retlpddr; + int numvirtchips; + + + if ((map->pfow_base + 0x1000) >= map->size) { + printk(KERN_NOTICE"%s Probe at base (0x%08lx) past the end of" + "the map(0x%08lx)\n", map->name, + (unsigned long)map->pfow_base, map->size - 1); + return NULL; + } + memset(&lpddr, 0, sizeof(struct lpddr_private)); + if 
(!lpddr_pfow_present(map, &lpddr)) + return NULL; + + if (!lpddr_chip_setup(map, &lpddr)) + return NULL; + + /* Ok so we found a chip */ + lpddr.chipshift = lpddr.qinfo->DevSizeShift; + lpddr.numchips = 1; + + numvirtchips = lpddr.numchips * lpddr.qinfo->HWPartsNum; + retlpddr = kmalloc(sizeof(struct lpddr_private) + + numvirtchips * sizeof(struct flchip), GFP_KERNEL); + if (!retlpddr) + return NULL; + + memset(retlpddr, 0, sizeof(struct lpddr_private) + + numvirtchips * sizeof(struct flchip)); + memcpy(retlpddr, &lpddr, sizeof(struct lpddr_private)); + + retlpddr->numchips = numvirtchips; + retlpddr->chipshift = retlpddr->qinfo->DevSizeShift - + __ffs(retlpddr->qinfo->HWPartsNum); + + return retlpddr; +} + +struct mtd_info *lpddr_probe(struct map_info *map) +{ + struct mtd_info *mtd = NULL; + struct lpddr_private *lpddr; + + /* First probe the map to see if we havecan open PFOW here */ + lpddr = lpddr_probe_chip(map); + if (!lpddr) + return NULL; + + map->fldrv_priv = lpddr; + mtd = lpddr_cmdset(map); + if (mtd) { + if (mtd->size > map->size) { + printk(KERN_WARNING "Reducing visibility of %ldKiB chip" + "to %ldKiB\n", (unsigned long)mtd->size >> 10, + (unsigned long)map->size >> 10); + mtd->size = map->size; + } + return mtd; + } + + kfree(lpddr->qinfo); + kfree(lpddr); + map->fldrv_priv = NULL; + return NULL; +} + +static struct mtd_chip_driver lpddr_chipdrv = { + .probe = lpddr_probe, + .name = "qinfo_probe", + .module = THIS_MODULE +}; + +static int __init lpddr_probe_init(void) +{ + register_mtd_chip_driver(&lpddr_chipdrv); + return 0; +} + +static void __exit lpddr_probe_exit(void) +{ + unregister_mtd_chip_driver(&lpddr_chipdrv); +} + +module_init(lpddr_probe_init); +module_exit(lpddr_probe_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vasiliy Leonenko <vasiliy.leonenko@gmail.com>"); +MODULE_DESCRIPTION("Driver to probe qinfo flash chips"); + diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index 5ea16936216..0225cbbf22d 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -10,8 +10,8 @@ config MTD_COMPLEX_MAPPINGS paged mappings of flash chips. config MTD_PHYSMAP - tristate "CFI Flash device in physical memory map" - depends on MTD_CFI || MTD_JEDECPROBE || MTD_ROM + tristate "Flash device in physical memory map" + depends on MTD_CFI || MTD_JEDECPROBE || MTD_ROM || MTD_LPDDR help This provides a 'mapping' driver which allows the NOR Flash and ROM driver code to communicate with chips which are mapped @@ -23,9 +23,20 @@ config MTD_PHYSMAP To compile this driver as a module, choose M here: the module will be called physmap. +config MTD_PHYSMAP_COMPAT + bool "Physmap compat support" + depends on MTD_PHYSMAP + default n + help + Setup a simple mapping via the Kconfig options. Normally the + physmap configuration options are done via your board's + resource file. + + If unsure, say N here. 
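The MTD_PHYSMAP_COMPAT help text above assumes the flash window is normally described in the board's resource file rather than through Kconfig. As a rough illustration of what such a board file looks like (a sketch only: the base address, window length, bank width and partition layout below are invented, and it relies on the physmap_flash_data/platform_device fields as used elsewhere in this patch, including the new pfow_base member for LPDDR/qinfo windows):

/*
 * Hypothetical board code registering a "physmap-flash" window from
 * platform resources instead of the MTD_PHYSMAP_* Kconfig options.
 * All addresses, sizes and partition names here are made up.
 */
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/mtd/partitions.h>
#include <linux/mtd/physmap.h>

static struct mtd_partition board_flash_parts[] = {
	{ .name = "bootloader", .offset = 0,          .size = 0x00040000 },
	{ .name = "rootfs",     .offset = 0x00040000, .size = MTDPART_SIZ_FULL },
};

static struct physmap_flash_data board_flash_data = {
	.width    = 2,				/* bank width in octets */
	.parts    = board_flash_parts,
	.nr_parts = ARRAY_SIZE(board_flash_parts),
	/* .pfow_base = ...;  only needed for LPDDR chips probed via qinfo_probe */
};

static struct resource board_flash_resource = {
	.start = 0x08000000,			/* physical window start */
	.end   = 0x08000000 + 0x00800000 - 1,	/* 8 MiB window */
	.flags = IORESOURCE_MEM,
};

static struct platform_device board_flash_device = {
	.name          = "physmap-flash",
	.id            = 0,
	.dev           = { .platform_data = &board_flash_data },
	.resource      = &board_flash_resource,
	.num_resources = 1,
};

/* registered from the board init hook, e.g.
 * platform_device_register(&board_flash_device);
 */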
+ config MTD_PHYSMAP_START hex "Physical start address of flash mapping" - depends on MTD_PHYSMAP + depends on MTD_PHYSMAP_COMPAT default "0x8000000" help This is the physical memory location at which the flash chips @@ -37,7 +48,7 @@ config MTD_PHYSMAP_START config MTD_PHYSMAP_LEN hex "Physical length of flash mapping" - depends on MTD_PHYSMAP + depends on MTD_PHYSMAP_COMPAT default "0" help This is the total length of the mapping of the flash chips on @@ -51,7 +62,7 @@ config MTD_PHYSMAP_LEN config MTD_PHYSMAP_BANKWIDTH int "Bank width in octets" - depends on MTD_PHYSMAP + depends on MTD_PHYSMAP_COMPAT default "2" help This is the total width of the data bus of the flash devices diff --git a/drivers/mtd/maps/alchemy-flash.c b/drivers/mtd/maps/alchemy-flash.c index 82811bcb043..845ad4f2a54 100644 --- a/drivers/mtd/maps/alchemy-flash.c +++ b/drivers/mtd/maps/alchemy-flash.c @@ -111,7 +111,7 @@ static struct mtd_partition alchemy_partitions[] = { static struct mtd_info *mymtd; -int __init alchemy_mtd_init(void) +static int __init alchemy_mtd_init(void) { struct mtd_partition *parts; int nb_parts = 0; diff --git a/drivers/mtd/maps/amd76xrom.c b/drivers/mtd/maps/amd76xrom.c index d1eec7d3243..237733d094c 100644 --- a/drivers/mtd/maps/amd76xrom.c +++ b/drivers/mtd/maps/amd76xrom.c @@ -232,8 +232,8 @@ static int __devinit amd76xrom_init_one (struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/cfi_flagadm.c b/drivers/mtd/maps/cfi_flagadm.c index 0ecc3f6d735..b4ed8161191 100644 --- a/drivers/mtd/maps/cfi_flagadm.c +++ b/drivers/mtd/maps/cfi_flagadm.c @@ -88,7 +88,7 @@ struct mtd_partition flagadm_parts[] = { static struct mtd_info *mymtd; -int __init init_flagadm(void) +static int __init init_flagadm(void) { printk(KERN_NOTICE "FlagaDM flash device: %x at %x\n", FLASH_SIZE, FLASH_PHYS_ADDR); diff --git a/drivers/mtd/maps/ck804xrom.c b/drivers/mtd/maps/ck804xrom.c index 1a6feb4474d..5f7a245ed13 100644 --- a/drivers/mtd/maps/ck804xrom.c +++ b/drivers/mtd/maps/ck804xrom.c @@ -263,8 +263,8 @@ static int __devinit ck804xrom_init_one (struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). 
fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/dbox2-flash.c b/drivers/mtd/maps/dbox2-flash.c index e115667bf1d..cfacfa6f45d 100644 --- a/drivers/mtd/maps/dbox2-flash.c +++ b/drivers/mtd/maps/dbox2-flash.c @@ -69,7 +69,7 @@ struct map_info dbox2_flash_map = { .phys = WINDOW_ADDR, }; -int __init init_dbox2_flash(void) +static int __init init_dbox2_flash(void) { printk(KERN_NOTICE "D-Box 2 flash driver (size->0x%X mem->0x%X)\n", WINDOW_SIZE, WINDOW_ADDR); dbox2_flash_map.virt = ioremap(WINDOW_ADDR, WINDOW_SIZE); diff --git a/drivers/mtd/maps/edb7312.c b/drivers/mtd/maps/edb7312.c index 9433738c166..be9e90b4458 100644 --- a/drivers/mtd/maps/edb7312.c +++ b/drivers/mtd/maps/edb7312.c @@ -71,7 +71,7 @@ static const char *probes[] = { "RedBoot", "cmdlinepart", NULL }; static int mtd_parts_nb = 0; static struct mtd_partition *mtd_parts = 0; -int __init init_edb7312nor(void) +static int __init init_edb7312nor(void) { static const char *rom_probe_types[] = PROBETYPES; const char **type; diff --git a/drivers/mtd/maps/esb2rom.c b/drivers/mtd/maps/esb2rom.c index bbbcdd4c8d1..11a2f57df9c 100644 --- a/drivers/mtd/maps/esb2rom.c +++ b/drivers/mtd/maps/esb2rom.c @@ -324,8 +324,8 @@ static int __devinit esb2rom_init_one(struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/fortunet.c b/drivers/mtd/maps/fortunet.c index a8e3fde4cbd..1e43124d498 100644 --- a/drivers/mtd/maps/fortunet.c +++ b/drivers/mtd/maps/fortunet.c @@ -181,7 +181,7 @@ __setup("MTD_Partition=", MTD_New_Partition); /* Backwards-spelling-compatibility */ __setup("MTD_Partion=", MTD_New_Partition); -int __init init_fortunet(void) +static int __init init_fortunet(void) { int ix,iy; for(iy=ix=0;ix<MAX_NUM_REGIONS;ix++) diff --git a/drivers/mtd/maps/h720x-flash.c b/drivers/mtd/maps/h720x-flash.c index 3b959fad1c4..72c724fa8c2 100644 --- a/drivers/mtd/maps/h720x-flash.c +++ b/drivers/mtd/maps/h720x-flash.c @@ -65,7 +65,7 @@ static const char *probes[] = { "cmdlinepart", NULL }; /* * Initialize FLASH support */ -int __init h720x_mtd_init(void) +static int __init h720x_mtd_init(void) { char *part_type = NULL; diff --git a/drivers/mtd/maps/ichxrom.c b/drivers/mtd/maps/ichxrom.c index aeb6c916e23..c32bc28920b 100644 --- a/drivers/mtd/maps/ichxrom.c +++ b/drivers/mtd/maps/ichxrom.c @@ -258,8 +258,8 @@ static int __devinit ichxrom_init_one (struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). 
fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/impa7.c b/drivers/mtd/maps/impa7.c index 2682ab51a36..998a27da97f 100644 --- a/drivers/mtd/maps/impa7.c +++ b/drivers/mtd/maps/impa7.c @@ -70,7 +70,7 @@ static struct mtd_partition *mtd_parts[NUM_FLASHBANKS]; static const char *probes[] = { "cmdlinepart", NULL }; -int __init init_impa7(void) +static int __init init_impa7(void) { static const char *rom_probe_types[] = PROBETYPES; const char **type; diff --git a/drivers/mtd/maps/ipaq-flash.c b/drivers/mtd/maps/ipaq-flash.c index ed58f6a77bd..748c85f635f 100644 --- a/drivers/mtd/maps/ipaq-flash.c +++ b/drivers/mtd/maps/ipaq-flash.c @@ -202,7 +202,7 @@ static const char *part_probes[] = { "cmdlinepart", "RedBoot", NULL }; static int __init h1900_special_case(void); -int __init ipaq_mtd_init(void) +static int __init ipaq_mtd_init(void) { struct mtd_partition *parts = NULL; int nb_parts = 0; diff --git a/drivers/mtd/maps/mbx860.c b/drivers/mtd/maps/mbx860.c index 706f67394b0..0eb5a7c8538 100644 --- a/drivers/mtd/maps/mbx860.c +++ b/drivers/mtd/maps/mbx860.c @@ -55,7 +55,7 @@ struct map_info mbx_map = { .bankwidth = 4, }; -int __init init_mbx(void) +static int __init init_mbx(void) { printk(KERN_NOTICE "Motorola MBX flash device: 0x%x at 0x%x\n", WINDOW_SIZE*4, WINDOW_ADDR); mbx_map.virt = ioremap(WINDOW_ADDR, WINDOW_SIZE * 4); diff --git a/drivers/mtd/maps/nettel.c b/drivers/mtd/maps/nettel.c index 965e6c6d6ab..a97133eb9d7 100644 --- a/drivers/mtd/maps/nettel.c +++ b/drivers/mtd/maps/nettel.c @@ -226,7 +226,7 @@ static int __init nettel_init(void) if ((amd_mtd = do_map_probe("jedec_probe", &nettel_amd_map))) { printk(KERN_NOTICE "SNAPGEAR: AMD flash device size = %dK\n", - amd_mtd->size>>10); + (int)(amd_mtd->size>>10)); amd_mtd->owner = THIS_MODULE; @@ -357,13 +357,12 @@ static int __init nettel_init(void) *intel1par = 0; } - printk(KERN_NOTICE "SNAPGEAR: Intel flash device size = %dK\n", - (intel_mtd->size >> 10)); + printk(KERN_NOTICE "SNAPGEAR: Intel flash device size = %lldKiB\n", + (unsigned long long)(intel_mtd->size >> 10)); intel_mtd->owner = THIS_MODULE; - num_intel_partitions = sizeof(nettel_intel_partitions) / - sizeof(nettel_intel_partitions[0]); + num_intel_partitions = ARRAY_SIZE(nettel_intel_partitions); if (intelboot) { /* diff --git a/drivers/mtd/maps/octagon-5066.c b/drivers/mtd/maps/octagon-5066.c index 43e04c1d22a..2b2e4509321 100644 --- a/drivers/mtd/maps/octagon-5066.c +++ b/drivers/mtd/maps/octagon-5066.c @@ -184,7 +184,7 @@ void cleanup_oct5066(void) release_region(PAGE_IO, 1); } -int __init init_oct5066(void) +static int __init init_oct5066(void) { int i; int ret = 0; diff --git a/drivers/mtd/maps/physmap.c b/drivers/mtd/maps/physmap.c index 1db16e549e3..87743661d48 100644 --- a/drivers/mtd/maps/physmap.c +++ b/drivers/mtd/maps/physmap.c @@ -29,7 +29,6 @@ struct physmap_flash_info { struct map_info map[MAX_RESOURCES]; #ifdef CONFIG_MTD_PARTITIONS int nr_parts; - struct mtd_partition *parts; #endif }; @@ -56,14 +55,10 @@ static int physmap_flash_remove(struct platform_device *dev) for (i = 0; i < MAX_RESOURCES; i++) { if (info->mtd[i] != NULL) { #ifdef CONFIG_MTD_PARTITIONS - if (info->nr_parts) { + if (info->nr_parts || physmap_data->nr_parts) del_mtd_partitions(info->mtd[i]); - kfree(info->parts); - } else if (physmap_data->nr_parts) { - del_mtd_partitions(info->mtd[i]); - } else { + else del_mtd_device(info->mtd[i]); - } #else del_mtd_device(info->mtd[i]); 
#endif @@ -73,7 +68,12 @@ static int physmap_flash_remove(struct platform_device *dev) return 0; } -static const char *rom_probe_types[] = { "cfi_probe", "jedec_probe", "map_rom", NULL }; +static const char *rom_probe_types[] = { + "cfi_probe", + "jedec_probe", + "qinfo_probe", + "map_rom", + NULL }; #ifdef CONFIG_MTD_PARTITIONS static const char *part_probe_types[] = { "cmdlinepart", "RedBoot", NULL }; #endif @@ -86,6 +86,9 @@ static int physmap_flash_probe(struct platform_device *dev) int err = 0; int i; int devices_found = 0; +#ifdef CONFIG_MTD_PARTITIONS + struct mtd_partition *parts; +#endif physmap_data = dev->dev.platform_data; if (physmap_data == NULL) @@ -119,6 +122,7 @@ static int physmap_flash_probe(struct platform_device *dev) info->map[i].size = dev->resource[i].end - dev->resource[i].start + 1; info->map[i].bankwidth = physmap_data->width; info->map[i].set_vpp = physmap_data->set_vpp; + info->map[i].pfow_base = physmap_data->pfow_base; info->map[i].virt = devm_ioremap(&dev->dev, info->map[i].phys, info->map[i].size); @@ -163,9 +167,10 @@ static int physmap_flash_probe(struct platform_device *dev) goto err_out; #ifdef CONFIG_MTD_PARTITIONS - err = parse_mtd_partitions(info->cmtd, part_probe_types, &info->parts, 0); + err = parse_mtd_partitions(info->cmtd, part_probe_types, &parts, 0); if (err > 0) { - add_mtd_partitions(info->cmtd, info->parts, err); + add_mtd_partitions(info->cmtd, parts, err); + kfree(parts); return 0; } @@ -251,14 +256,7 @@ static struct platform_driver physmap_flash_driver = { }; -#ifdef CONFIG_MTD_PHYSMAP_LEN -#if CONFIG_MTD_PHYSMAP_LEN != 0 -#warning using PHYSMAP compat code -#define PHYSMAP_COMPAT -#endif -#endif - -#ifdef PHYSMAP_COMPAT +#ifdef CONFIG_MTD_PHYSMAP_COMPAT static struct physmap_flash_data physmap_flash_data = { .width = CONFIG_MTD_PHYSMAP_BANKWIDTH, }; @@ -302,7 +300,7 @@ static int __init physmap_init(void) int err; err = platform_driver_register(&physmap_flash_driver); -#ifdef PHYSMAP_COMPAT +#ifdef CONFIG_MTD_PHYSMAP_COMPAT if (err == 0) platform_device_register(&physmap_flash); #endif @@ -312,7 +310,7 @@ static int __init physmap_init(void) static void __exit physmap_exit(void) { -#ifdef PHYSMAP_COMPAT +#ifdef CONFIG_MTD_PHYSMAP_COMPAT platform_device_unregister(&physmap_flash); #endif platform_driver_unregister(&physmap_flash_driver); @@ -326,8 +324,7 @@ MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>"); MODULE_DESCRIPTION("Generic configurable MTD map driver"); /* legacy platform drivers can't hotplug or coldplg */ -#ifndef PHYSMAP_COMPAT +#ifndef CONFIG_MTD_PHYSMAP_COMPAT /* work with hotplug and coldplug */ MODULE_ALIAS("platform:physmap-flash"); #endif - diff --git a/drivers/mtd/maps/pmcmsp-flash.c b/drivers/mtd/maps/pmcmsp-flash.c index f43ba2815cb..4768bd5459d 100644 --- a/drivers/mtd/maps/pmcmsp-flash.c +++ b/drivers/mtd/maps/pmcmsp-flash.c @@ -48,7 +48,7 @@ static int fcnt; #define DEBUG_MARKER printk(KERN_NOTICE "%s[%d]\n", __func__, __LINE__) -int __init init_msp_flash(void) +static int __init init_msp_flash(void) { int i, j; int offset, coff; diff --git a/drivers/mtd/maps/redwood.c b/drivers/mtd/maps/redwood.c index de002eb1a7f..933c0b63b01 100644 --- a/drivers/mtd/maps/redwood.c +++ b/drivers/mtd/maps/redwood.c @@ -122,7 +122,7 @@ struct map_info redwood_flash_map = { static struct mtd_info *redwood_mtd; -int __init init_redwood_flash(void) +static int __init init_redwood_flash(void) { int err; diff --git a/drivers/mtd/maps/rpxlite.c b/drivers/mtd/maps/rpxlite.c index 14d90edb443..3e3ef53d4fd 100644 --- 
a/drivers/mtd/maps/rpxlite.c +++ b/drivers/mtd/maps/rpxlite.c @@ -23,7 +23,7 @@ static struct map_info rpxlite_map = { .phys = WINDOW_ADDR, }; -int __init init_rpxlite(void) +static int __init init_rpxlite(void) { printk(KERN_NOTICE "RPX Lite or CLLF flash device: %x at %x\n", WINDOW_SIZE*4, WINDOW_ADDR); rpxlite_map.virt = ioremap(WINDOW_ADDR, WINDOW_SIZE * 4); diff --git a/drivers/mtd/maps/sbc8240.c b/drivers/mtd/maps/sbc8240.c index 6e1e99cd2b5..d5374cdcb16 100644 --- a/drivers/mtd/maps/sbc8240.c +++ b/drivers/mtd/maps/sbc8240.c @@ -136,7 +136,7 @@ static struct mtd_part_def sbc8240_part_banks[NUM_FLASH_BANKS]; #endif /* CONFIG_MTD_PARTITIONS */ -int __init init_sbc8240_mtd (void) +static int __init init_sbc8240_mtd (void) { static struct _cjs { u_long addr; diff --git a/drivers/mtd/maps/scb2_flash.c b/drivers/mtd/maps/scb2_flash.c index 21169e6d646..7e329f09a54 100644 --- a/drivers/mtd/maps/scb2_flash.c +++ b/drivers/mtd/maps/scb2_flash.c @@ -118,7 +118,8 @@ scb2_fixup_mtd(struct mtd_info *mtd) struct mtd_erase_region_info *region = &mtd->eraseregions[i]; if (region->numblocks * region->erasesize > mtd->size) { - region->numblocks = (mtd->size / region->erasesize); + region->numblocks = ((unsigned long)mtd->size / + region->erasesize); done = 1; } else { region->numblocks = 0; @@ -187,8 +188,9 @@ scb2_flash_probe(struct pci_dev *dev, const struct pci_device_id *ent) return -ENODEV; } - printk(KERN_NOTICE MODNAME ": chip size 0x%x at offset 0x%x\n", - scb2_mtd->size, SCB2_WINDOW - scb2_mtd->size); + printk(KERN_NOTICE MODNAME ": chip size 0x%llx at offset 0x%llx\n", + (unsigned long long)scb2_mtd->size, + (unsigned long long)(SCB2_WINDOW - scb2_mtd->size)); add_mtd_device(scb2_mtd); diff --git a/drivers/mtd/maps/sharpsl-flash.c b/drivers/mtd/maps/sharpsl-flash.c index 026eab02818..b392f096c70 100644 --- a/drivers/mtd/maps/sharpsl-flash.c +++ b/drivers/mtd/maps/sharpsl-flash.c @@ -47,7 +47,7 @@ static struct mtd_partition sharpsl_partitions[1] = { } }; -int __init init_sharpsl(void) +static int __init init_sharpsl(void) { struct mtd_partition *parts; int nb_parts = 0; diff --git a/drivers/mtd/maps/tqm8xxl.c b/drivers/mtd/maps/tqm8xxl.c index a5d3d8531fa..60146984f4b 100644 --- a/drivers/mtd/maps/tqm8xxl.c +++ b/drivers/mtd/maps/tqm8xxl.c @@ -109,7 +109,7 @@ static struct mtd_partition tqm8xxl_fs_partitions[] = { }; #endif -int __init init_tqm_mtd(void) +static int __init init_tqm_mtd(void) { int idx = 0, ret = 0; unsigned long flash_addr, flash_size, mtd_size = 0; diff --git a/drivers/mtd/maps/uclinux.c b/drivers/mtd/maps/uclinux.c index 0dc645f8152..81756e39771 100644 --- a/drivers/mtd/maps/uclinux.c +++ b/drivers/mtd/maps/uclinux.c @@ -51,7 +51,7 @@ int uclinux_point(struct mtd_info *mtd, loff_t from, size_t len, /****************************************************************************/ -int __init uclinux_mtd_init(void) +static int __init uclinux_mtd_init(void) { struct mtd_info *mtd; struct map_info *mapp; @@ -94,7 +94,7 @@ int __init uclinux_mtd_init(void) /****************************************************************************/ -void __exit uclinux_mtd_cleanup(void) +static void __exit uclinux_mtd_cleanup(void) { if (uclinux_ram_mtdinfo) { del_mtd_partitions(uclinux_ram_mtdinfo); diff --git a/drivers/mtd/maps/vmax301.c b/drivers/mtd/maps/vmax301.c index 5a0c9a353b0..6d452dcdfe3 100644 --- a/drivers/mtd/maps/vmax301.c +++ b/drivers/mtd/maps/vmax301.c @@ -146,7 +146,7 @@ static void __exit cleanup_vmax301(void) iounmap((void *)vmax_map[0].map_priv_1 - WINDOW_START); } -int 
__init init_vmax301(void) +static int __init init_vmax301(void) { int i; unsigned long iomapadr; diff --git a/drivers/mtd/maps/wr_sbc82xx_flash.c b/drivers/mtd/maps/wr_sbc82xx_flash.c index 413b0cf9bbd..933a2b6598b 100644 --- a/drivers/mtd/maps/wr_sbc82xx_flash.c +++ b/drivers/mtd/maps/wr_sbc82xx_flash.c @@ -74,7 +74,7 @@ do { \ } \ } while (0); -int __init init_sbc82xx_flash(void) +static int __init init_sbc82xx_flash(void) { volatile memctl_cpm2_t *mc = &cpm2_immr->im_memctl; int bigflash; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index bcffeda2df3..e9ec59e9a56 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -450,16 +450,20 @@ static int mtd_ioctl(struct inode *inode, struct file *file, if (!erase) ret = -ENOMEM; else { + struct erase_info_user einfo; + wait_queue_head_t waitq; DECLARE_WAITQUEUE(wait, current); init_waitqueue_head(&waitq); - if (copy_from_user(&erase->addr, argp, + if (copy_from_user(&einfo, argp, sizeof(struct erase_info_user))) { kfree(erase); return -EFAULT; } + erase->addr = einfo.start; + erase->len = einfo.length; erase->mtd = mtd; erase->callback = mtdchar_erase_callback; erase->priv = (unsigned long)&waitq; diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 1a05cf37851..3dbb1b38db6 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -197,7 +197,7 @@ concat_writev(struct mtd_info *mtd, const struct kvec *vecs, continue; } - size = min(total_len, (size_t)(subdev->size - to)); + size = min_t(uint64_t, total_len, subdev->size - to); wsize = size; /* store for future use */ entry_high = entry_low; @@ -385,7 +385,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr) struct mtd_concat *concat = CONCAT(mtd); struct mtd_info *subdev; int i, err; - u_int32_t length, offset = 0; + uint64_t length, offset = 0; struct erase_info *erase; if (!(mtd->flags & MTD_WRITEABLE)) @@ -518,7 +518,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr) return 0; } -static int concat_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int concat_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_concat *concat = CONCAT(mtd); int i, err = -EINVAL; @@ -528,7 +528,7 @@ static int concat_lock(struct mtd_info *mtd, loff_t ofs, size_t len) for (i = 0; i < concat->num_subdev; i++) { struct mtd_info *subdev = concat->subdev[i]; - size_t size; + uint64_t size; if (ofs >= subdev->size) { size = 0; @@ -556,7 +556,7 @@ static int concat_lock(struct mtd_info *mtd, loff_t ofs, size_t len) return err; } -static int concat_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int concat_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_concat *concat = CONCAT(mtd); int i, err = 0; @@ -566,7 +566,7 @@ static int concat_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) for (i = 0; i < concat->num_subdev; i++) { struct mtd_info *subdev = concat->subdev[i]; - size_t size; + uint64_t size; if (ofs >= subdev->size) { size = 0; @@ -696,7 +696,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c int i; size_t size; struct mtd_concat *concat; - u_int32_t max_erasesize, curr_erasesize; + uint32_t max_erasesize, curr_erasesize; int num_erase_region; printk(KERN_NOTICE "Concatenating MTD devices:\n"); @@ -842,12 +842,14 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd.erasesize = curr_erasesize; concat->mtd.numeraseregions = 0; } else { + uint64_t tmp64; + /* * erase block 
size varies across the subdevices: allocate * space to store the data describing the variable erase regions */ struct mtd_erase_region_info *erase_region_p; - u_int32_t begin, position; + uint64_t begin, position; concat->mtd.erasesize = max_erasesize; concat->mtd.numeraseregions = num_erase_region; @@ -879,8 +881,9 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c erase_region_p->offset = begin; erase_region_p->erasesize = curr_erasesize; - erase_region_p->numblocks = - (position - begin) / curr_erasesize; + tmp64 = position - begin; + do_div(tmp64, curr_erasesize); + erase_region_p->numblocks = tmp64; begin = position; curr_erasesize = subdev[i]->erasesize; @@ -897,9 +900,9 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c erase_region_p->offset = begin; erase_region_p->erasesize = curr_erasesize; - erase_region_p->numblocks = - (position - - begin) / curr_erasesize; + tmp64 = position - begin; + do_div(tmp64, curr_erasesize); + erase_region_p->numblocks = tmp64; begin = position; curr_erasesize = @@ -909,14 +912,16 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c } position += subdev[i]->eraseregions[j]. - numblocks * curr_erasesize; + numblocks * (uint64_t)curr_erasesize; } } } /* Now write the final entry */ erase_region_p->offset = begin; erase_region_p->erasesize = curr_erasesize; - erase_region_p->numblocks = (position - begin) / curr_erasesize; + tmp64 = position - begin; + do_div(tmp64, curr_erasesize); + erase_region_p->numblocks = tmp64; } return &concat->mtd; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index a9d24694982..76fe0a1e7a5 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -57,6 +57,19 @@ int add_mtd_device(struct mtd_info *mtd) mtd->index = i; mtd->usecount = 0; + if (is_power_of_2(mtd->erasesize)) + mtd->erasesize_shift = ffs(mtd->erasesize) - 1; + else + mtd->erasesize_shift = 0; + + if (is_power_of_2(mtd->writesize)) + mtd->writesize_shift = ffs(mtd->writesize) - 1; + else + mtd->writesize_shift = 0; + + mtd->erasesize_mask = (1 << mtd->erasesize_shift) - 1; + mtd->writesize_mask = (1 << mtd->writesize_shift) - 1; + /* Some chips always power up locked. 
Unlock them now */ if ((mtd->flags & MTD_WRITEABLE) && (mtd->flags & MTD_POWERUP_LOCK) && mtd->unlock) { @@ -344,7 +357,8 @@ static inline int mtd_proc_info (char *buf, int i) if (!this) return 0; - return sprintf(buf, "mtd%d: %8.8x %8.8x \"%s\"\n", i, this->size, + return sprintf(buf, "mtd%d: %8.8llx %8.8x \"%s\"\n", i, + (unsigned long long)this->size, this->erasesize, this->name); } diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index aebb3b27edb..1a6b3beabe8 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -80,9 +80,9 @@ static int mtdoops_erase_block(struct mtd_info *mtd, int offset) if (ret) { set_current_state(TASK_RUNNING); remove_wait_queue(&wait_q, &wait); - printk (KERN_WARNING "mtdoops: erase of region [0x%x, 0x%x] " + printk (KERN_WARNING "mtdoops: erase of region [0x%llx, 0x%llx] " "on \"%s\" failed\n", - erase.addr, erase.len, mtd->name); + (unsigned long long)erase.addr, (unsigned long long)erase.len, mtd->name); return ret; } @@ -289,7 +289,10 @@ static void mtdoops_notify_add(struct mtd_info *mtd) } cxt->mtd = mtd; - cxt->oops_pages = mtd->size / OOPS_PAGE_SIZE; + if (mtd->size > INT_MAX) + cxt->oops_pages = INT_MAX / OOPS_PAGE_SIZE; + else + cxt->oops_pages = (int)mtd->size / OOPS_PAGE_SIZE; find_next_position(cxt); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 3728913fa5f..144e6b613a7 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -26,7 +26,7 @@ static LIST_HEAD(mtd_partitions); struct mtd_part { struct mtd_info mtd; struct mtd_info *master; - u_int32_t offset; + uint64_t offset; int index; struct list_head list; int registered; @@ -235,7 +235,7 @@ void mtd_erase_callback(struct erase_info *instr) } EXPORT_SYMBOL_GPL(mtd_erase_callback); -static int part_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int part_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_part *part = PART(mtd); if ((len + ofs) > mtd->size) @@ -243,7 +243,7 @@ static int part_lock(struct mtd_info *mtd, loff_t ofs, size_t len) return part->master->lock(part->master, ofs + part->offset, len); } -static int part_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int part_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_part *part = PART(mtd); if ((len + ofs) > mtd->size) @@ -317,7 +317,7 @@ EXPORT_SYMBOL(del_mtd_partitions); static struct mtd_part *add_one_partition(struct mtd_info *master, const struct mtd_partition *part, int partno, - u_int32_t cur_offset) + uint64_t cur_offset) { struct mtd_part *slave; @@ -395,19 +395,19 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->offset = cur_offset; if (slave->offset == MTDPART_OFS_NXTBLK) { slave->offset = cur_offset; - if ((cur_offset % master->erasesize) != 0) { + if (mtd_mod_by_eb(cur_offset, master) != 0) { /* Round up to next erasesize */ - slave->offset = ((cur_offset / master->erasesize) + 1) * master->erasesize; + slave->offset = (mtd_div_by_eb(cur_offset, master) + 1) * master->erasesize; printk(KERN_NOTICE "Moving partition %d: " - "0x%08x -> 0x%08x\n", partno, - cur_offset, slave->offset); + "0x%012llx -> 0x%012llx\n", partno, + (unsigned long long)cur_offset, (unsigned long long)slave->offset); } } if (slave->mtd.size == MTDPART_SIZ_FULL) slave->mtd.size = master->size - slave->offset; - printk(KERN_NOTICE "0x%08x-0x%08x : \"%s\"\n", slave->offset, - slave->offset + slave->mtd.size, slave->mtd.name); + printk(KERN_NOTICE "0x%012llx-0x%012llx : \"%s\"\n", (unsigned long long)slave->offset, + 
(unsigned long long)(slave->offset + slave->mtd.size), slave->mtd.name); /* let's do some sanity checks */ if (slave->offset >= master->size) { @@ -420,13 +420,13 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, } if (slave->offset + slave->mtd.size > master->size) { slave->mtd.size = master->size - slave->offset; - printk(KERN_WARNING"mtd: partition \"%s\" extends beyond the end of device \"%s\" -- size truncated to %#x\n", - part->name, master->name, slave->mtd.size); + printk(KERN_WARNING"mtd: partition \"%s\" extends beyond the end of device \"%s\" -- size truncated to %#llx\n", + part->name, master->name, (unsigned long long)slave->mtd.size); } if (master->numeraseregions > 1) { /* Deal with variable erase size stuff */ int i, max = master->numeraseregions; - u32 end = slave->offset + slave->mtd.size; + u64 end = slave->offset + slave->mtd.size; struct mtd_erase_region_info *regions = master->eraseregions; /* Find the first erase regions which is part of this @@ -449,7 +449,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, } if ((slave->mtd.flags & MTD_WRITEABLE) && - (slave->offset % slave->mtd.erasesize)) { + mtd_mod_by_eb(slave->offset, &slave->mtd)) { /* Doesn't start on a boundary of major erase size */ /* FIXME: Let it be writable if it is on a boundary of * _minor_ erase size though */ @@ -458,7 +458,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, part->name); } if ((slave->mtd.flags & MTD_WRITEABLE) && - (slave->mtd.size % slave->mtd.erasesize)) { + mtd_mod_by_eb(slave->mtd.size, &slave->mtd)) { slave->mtd.flags &= ~MTD_WRITEABLE; printk(KERN_WARNING"mtd: partition \"%s\" doesn't end on an erase block -- force read-only\n", part->name); @@ -466,7 +466,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->mtd.ecclayout = master->ecclayout; if (master->block_isbad) { - uint32_t offs = 0; + uint64_t offs = 0; while (offs < slave->mtd.size) { if (master->block_isbad(master, @@ -501,7 +501,7 @@ int add_mtd_partitions(struct mtd_info *master, int nbparts) { struct mtd_part *slave; - u_int32_t cur_offset = 0; + uint64_t cur_offset = 0; int i; printk(KERN_NOTICE "Creating %d MTD partitions on \"%s\":\n", nbparts, master->name); diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index f8ae0400c49..8b12e6e109d 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -163,6 +163,13 @@ config MTD_NAND_S3C2410_HWECC incorrect ECC generation, and if using these, the default of software ECC is preferable. 
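
The mtdpart.c hunks above replace raw '%' and '/' on what is now a 64-bit partition offset with mtd_mod_by_eb() and mtd_div_by_eb(). The helpers themselves are not part of this excerpt; a minimal sketch of how they can be built on the erasesize_shift/erasesize_mask fields set up in mtdcore.c (an assumption about their shape, not a verbatim copy of the in-tree definitions) looks like:

        #include <linux/mtd/mtd.h>
        #include <asm/div64.h>

        /* Divide a 64-bit device offset by the erase block size.  When the
         * erase size is a power of two the precomputed shift avoids a slow
         * 64-bit division on 32-bit machines. */
        static inline uint32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd)
        {
                if (mtd->erasesize_shift)
                        return sz >> mtd->erasesize_shift;
                do_div(sz, mtd->erasesize);     /* do_div() leaves the quotient in sz */
                return sz;
        }

        /* Remainder of a 64-bit offset within an erase block. */
        static inline uint32_t mtd_mod_by_eb(uint64_t sz, struct mtd_info *mtd)
        {
                if (mtd->erasesize_shift)
                        return sz & mtd->erasesize_mask;
                return do_div(sz, mtd->erasesize);      /* do_div() returns the remainder */
        }

Either way the 64-bit value never reaches a bare '/' or '%', which on 32-bit builds would need libgcc helpers such as __udivdi3 that the kernel does not link in.
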
+config MTD_NAND_NDFC + tristate "NDFC NanD Flash Controller" + depends on 4xx + select MTD_NAND_ECC_SMC + help + NDFC Nand Flash Controllers are integrated in IBM/AMCC's 4xx SoCs + config MTD_NAND_S3C2410_CLKSTOP bool "S3C2410 NAND IDLE clock stop" depends on MTD_NAND_S3C2410 diff --git a/drivers/mtd/nand/alauda.c b/drivers/mtd/nand/alauda.c index 96238039485..6d9649159a1 100644 --- a/drivers/mtd/nand/alauda.c +++ b/drivers/mtd/nand/alauda.c @@ -676,11 +676,11 @@ static int alauda_probe(struct usb_interface *interface, goto error; al->write_out = usb_sndbulkpipe(al->dev, - ep_wr->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); + usb_endpoint_num(ep_wr)); al->bulk_in = usb_rcvbulkpipe(al->dev, - ep_in->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); + usb_endpoint_num(ep_in)); al->bulk_out = usb_sndbulkpipe(al->dev, - ep_out->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); + usb_endpoint_num(ep_out)); /* second device is identical up to now */ memcpy(al+1, al, sizeof(*al)); diff --git a/drivers/mtd/nand/cafe_nand.c b/drivers/mtd/nand/cafe_nand.c index b8064bf3aee..22a6b2e50e9 100644 --- a/drivers/mtd/nand/cafe_nand.c +++ b/drivers/mtd/nand/cafe_nand.c @@ -90,7 +90,7 @@ static int timing[3]; module_param_array(timing, int, &numtimings, 0644); #ifdef CONFIG_MTD_PARTITIONS -static const char *part_probes[] = { "RedBoot", NULL }; +static const char *part_probes[] = { "cmdlinepart", "RedBoot", NULL }; #endif /* Hrm. Why isn't this already conditional on something in the struct device? */ @@ -805,10 +805,13 @@ static int __devinit cafe_nand_probe(struct pci_dev *pdev, add_mtd_device(mtd); #ifdef CONFIG_MTD_PARTITIONS +#ifdef CONFIG_MTD_CMDLINE_PARTS + mtd->name = "cafe_nand"; +#endif nr_parts = parse_mtd_partitions(mtd, part_probes, &parts, 0); if (nr_parts > 0) { cafe->parts = parts; - dev_info(&cafe->pdev->dev, "%d RedBoot partitions found\n", nr_parts); + dev_info(&cafe->pdev->dev, "%d partitions found\n", nr_parts); add_mtd_partitions(mtd, parts, nr_parts); } #endif diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c index 4aa5bd6158d..65929db2944 100644 --- a/drivers/mtd/nand/fsl_elbc_nand.c +++ b/drivers/mtd/nand/fsl_elbc_nand.c @@ -777,7 +777,9 @@ static int fsl_elbc_chip_init(struct fsl_elbc_mtd *priv) /* Fill in fsl_elbc_mtd structure */ priv->mtd.priv = chip; priv->mtd.owner = THIS_MODULE; - priv->fmr = 0; /* rest filled in later */ + + /* Set the ECCM according to the settings in bootloader.*/ + priv->fmr = in_be32(&lbc->fmr) & FMR_ECCM; /* fill in nand_chip structure */ /* set up function call table */ diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0a9c9cd33f9..0c3afccde8a 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2014,13 +2014,14 @@ static int nand_erase(struct mtd_info *mtd, struct erase_info *instr) int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, int allowbbt) { - int page, len, status, pages_per_block, ret, chipnr; + int page, status, pages_per_block, ret, chipnr; struct nand_chip *chip = mtd->priv; - int rewrite_bbt[NAND_MAX_CHIPS]={0}; + loff_t rewrite_bbt[NAND_MAX_CHIPS]={0}; unsigned int bbt_masked_page = 0xffffffff; + loff_t len; - DEBUG(MTD_DEBUG_LEVEL3, "nand_erase: start = 0x%08x, len = %i\n", - (unsigned int)instr->addr, (unsigned int)instr->len); + DEBUG(MTD_DEBUG_LEVEL3, "nand_erase: start = 0x%012llx, len = %llu\n", + (unsigned long long)instr->addr, (unsigned long long)instr->len); /* Start address must align on block boundary */ if (instr->addr & ((1 << 
chip->phys_erase_shift) - 1)) { @@ -2116,7 +2117,8 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, DEBUG(MTD_DEBUG_LEVEL0, "nand_erase: " "Failed erase, page 0x%08x\n", page); instr->state = MTD_ERASE_FAILED; - instr->fail_addr = (page << chip->page_shift); + instr->fail_addr = + ((loff_t)page << chip->page_shift); goto erase_exit; } @@ -2126,7 +2128,8 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, */ if (bbt_masked_page != 0xffffffff && (page & BBT_PAGE_MASK) == bbt_masked_page) - rewrite_bbt[chipnr] = (page << chip->page_shift); + rewrite_bbt[chipnr] = + ((loff_t)page << chip->page_shift); /* Increment page address and decrement length */ len -= (1 << chip->phys_erase_shift); @@ -2173,7 +2176,7 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, continue; /* update the BBT for chip */ DEBUG(MTD_DEBUG_LEVEL0, "nand_erase_nand: nand_update_bbt " - "(%d:0x%0x 0x%0x)\n", chipnr, rewrite_bbt[chipnr], + "(%d:0x%0llx 0x%0x)\n", chipnr, rewrite_bbt[chipnr], chip->bbt_td->pages[chipnr]); nand_update_bbt(mtd, rewrite_bbt[chipnr]); } @@ -2365,7 +2368,7 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd, if (!mtd->name) mtd->name = type->name; - chip->chipsize = type->chipsize << 20; + chip->chipsize = (uint64_t)type->chipsize << 20; /* Newer devices have all the information in additional id bytes */ if (!type->pagesize) { @@ -2423,7 +2426,10 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd, chip->bbt_erase_shift = chip->phys_erase_shift = ffs(mtd->erasesize) - 1; - chip->chip_shift = ffs(chip->chipsize) - 1; + if (chip->chipsize & 0xffffffff) + chip->chip_shift = ffs((unsigned)chip->chipsize) - 1; + else + chip->chip_shift = ffs((unsigned)(chip->chipsize >> 32)) + 32 - 1; /* Set the bad block position */ chip->badblockpos = mtd->writesize > 512 ? @@ -2517,7 +2523,6 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips) /** * nand_scan_tail - [NAND Interface] Scan for the NAND device * @mtd: MTD device structure - * @maxchips: Number of chips to scan for * * This is the second phase of the normal nand_scan() function. It * fills out all the uninitialized function pointers with the defaults diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index 0b1c48595f1..55c23e5cd21 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -171,16 +171,16 @@ static int read_bbt(struct mtd_info *mtd, uint8_t *buf, int page, int num, if (tmp == msk) continue; if (reserved_block_code && (tmp == reserved_block_code)) { - printk(KERN_DEBUG "nand_read_bbt: Reserved block at 0x%08x\n", - ((offs << 2) + (act >> 1)) << this->bbt_erase_shift); + printk(KERN_DEBUG "nand_read_bbt: Reserved block at 0x%012llx\n", + (loff_t)((offs << 2) + (act >> 1)) << this->bbt_erase_shift); this->bbt[offs + (act >> 3)] |= 0x2 << (act & 0x06); mtd->ecc_stats.bbtblocks++; continue; } /* Leave it for now, if its matured we can move this * message to MTD_DEBUG_LEVEL0 */ - printk(KERN_DEBUG "nand_read_bbt: Bad block at 0x%08x\n", - ((offs << 2) + (act >> 1)) << this->bbt_erase_shift); + printk(KERN_DEBUG "nand_read_bbt: Bad block at 0x%012llx\n", + (loff_t)((offs << 2) + (act >> 1)) << this->bbt_erase_shift); /* Factory marked bad or worn out ? 
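
With chipsize widened to 64 bits, ffs() alone can no longer recover chip_shift, since it only examines a 32-bit value; the hunk above therefore picks whichever half of the value is non-zero (chipsize is always a power of two, so exactly one half is). A small illustration with a made-up 4 GiB chip:

        uint64_t chipsize = 1ULL << 32;         /* 4 GiB device */
        int chip_shift;

        if (chipsize & 0xffffffff)
                chip_shift = ffs((unsigned)chipsize) - 1;
        else
                /* low word is zero: take the shift from the high word */
                chip_shift = ffs((unsigned)(chipsize >> 32)) + 32 - 1;

        /* ffs(1) == 1, so chip_shift == 32 and (1ULL << chip_shift) == chipsize */
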
*/ if (tmp == 0) this->bbt[offs + (act >> 3)] |= 0x3 << (act & 0x06); @@ -284,7 +284,7 @@ static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf, /* Read the primary version, if available */ if (td->options & NAND_BBT_VERSION) { - scan_read_raw(mtd, buf, td->pages[0] << this->page_shift, + scan_read_raw(mtd, buf, (loff_t)td->pages[0] << this->page_shift, mtd->writesize); td->version[0] = buf[mtd->writesize + td->veroffs]; printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n", @@ -293,7 +293,7 @@ static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf, /* Read the mirror version, if available */ if (md && (md->options & NAND_BBT_VERSION)) { - scan_read_raw(mtd, buf, md->pages[0] << this->page_shift, + scan_read_raw(mtd, buf, (loff_t)md->pages[0] << this->page_shift, mtd->writesize); md->version[0] = buf[mtd->writesize + md->veroffs]; printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n", @@ -411,7 +411,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, numblocks = this->chipsize >> (this->bbt_erase_shift - 1); startblock = chip * numblocks; numblocks += startblock; - from = startblock << (this->bbt_erase_shift - 1); + from = (loff_t)startblock << (this->bbt_erase_shift - 1); } for (i = startblock; i < numblocks;) { @@ -428,8 +428,8 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, if (ret) { this->bbt[i >> 3] |= 0x03 << (i & 0x6); - printk(KERN_WARNING "Bad eraseblock %d at 0x%08x\n", - i >> 1, (unsigned int)from); + printk(KERN_WARNING "Bad eraseblock %d at 0x%012llx\n", + i >> 1, (unsigned long long)from); mtd->ecc_stats.badblocks++; } @@ -495,7 +495,7 @@ static int search_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr for (block = 0; block < td->maxblocks; block++) { int actblock = startblock + dir * block; - loff_t offs = actblock << this->bbt_erase_shift; + loff_t offs = (loff_t)actblock << this->bbt_erase_shift; /* Read first page */ scan_read_raw(mtd, buf, offs, mtd->writesize); @@ -719,7 +719,7 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf, memset(&einfo, 0, sizeof(einfo)); einfo.mtd = mtd; - einfo.addr = (unsigned long)to; + einfo.addr = to; einfo.len = 1 << this->bbt_erase_shift; res = nand_erase_nand(mtd, &einfo, 1); if (res < 0) @@ -729,8 +729,8 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf, if (res < 0) goto outerr; - printk(KERN_DEBUG "Bad block table written to 0x%08x, version " - "0x%02X\n", (unsigned int)to, td->version[chip]); + printk(KERN_DEBUG "Bad block table written to 0x%012llx, version " + "0x%02X\n", (unsigned long long)to, td->version[chip]); /* Mark it as used */ td->pages[chip] = page; @@ -910,7 +910,7 @@ static void mark_bbt_region(struct mtd_info *mtd, struct nand_bbt_descr *td) newval = oldval | (0x2 << (block & 0x06)); this->bbt[(block >> 3)] = newval; if ((oldval != newval) && td->reserved_block_code) - nand_update_bbt(mtd, block << (this->bbt_erase_shift - 1)); + nand_update_bbt(mtd, (loff_t)block << (this->bbt_erase_shift - 1)); continue; } update = 0; @@ -931,7 +931,7 @@ static void mark_bbt_region(struct mtd_info *mtd, struct nand_bbt_descr *td) new ones have been marked, then we need to update the stored bbts. This should only happen once. 
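
Most of the nand_bbt.c changes in this area are one repeated fix: a 32-bit block or page number shifted into a byte offset overflows as soon as the result passes 2 GiB, so the value is widened to loff_t before the shift. A small illustration with made-up numbers:

        int block = 40000;
        int bbt_erase_shift = 17;                       /* 128 KiB erase blocks */

        loff_t wrong = block << bbt_erase_shift;        /* shift happens in 32-bit int and overflows */
        loff_t right = (loff_t)block << bbt_erase_shift; /* widen first: 0x138800000, about 5 GiB */
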
*/ if (update && td->reserved_block_code) - nand_update_bbt(mtd, (block - 2) << (this->bbt_erase_shift - 1)); + nand_update_bbt(mtd, (loff_t)(block - 2) << (this->bbt_erase_shift - 1)); } } @@ -1027,7 +1027,6 @@ int nand_update_bbt(struct mtd_info *mtd, loff_t offs) if (!this->bbt || !td) return -EINVAL; - len = mtd->size >> (this->bbt_erase_shift + 2); /* Allocate a temporary buffer for one eraseblock incl. oob */ len = (1 << this->bbt_erase_shift); len += (len >> this->page_shift) * mtd->oobsize; diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index ae7c57781a6..cd0711b83ac 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -38,6 +38,9 @@ #include <linux/delay.h> #include <linux/list.h> #include <linux/random.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/pagemap.h> /* Default simulator parameters values */ #if !defined(CONFIG_NANDSIM_FIRST_ID_BYTE) || \ @@ -100,6 +103,7 @@ static unsigned int bitflips = 0; static char *gravepages = NULL; static unsigned int rptwear = 0; static unsigned int overridesize = 0; +static char *cache_file = NULL; module_param(first_id_byte, uint, 0400); module_param(second_id_byte, uint, 0400); @@ -122,12 +126,13 @@ module_param(bitflips, uint, 0400); module_param(gravepages, charp, 0400); module_param(rptwear, uint, 0400); module_param(overridesize, uint, 0400); +module_param(cache_file, charp, 0400); MODULE_PARM_DESC(first_id_byte, "The first byte returned by NAND Flash 'read ID' command (manufacturer ID)"); MODULE_PARM_DESC(second_id_byte, "The second byte returned by NAND Flash 'read ID' command (chip ID)"); MODULE_PARM_DESC(third_id_byte, "The third byte returned by NAND Flash 'read ID' command"); MODULE_PARM_DESC(fourth_id_byte, "The fourth byte returned by NAND Flash 'read ID' command"); -MODULE_PARM_DESC(access_delay, "Initial page access delay (microiseconds)"); +MODULE_PARM_DESC(access_delay, "Initial page access delay (microseconds)"); MODULE_PARM_DESC(programm_delay, "Page programm delay (microseconds"); MODULE_PARM_DESC(erase_delay, "Sector erase delay (milliseconds)"); MODULE_PARM_DESC(output_cycle, "Word output (from flash) time (nanodeconds)"); @@ -153,6 +158,7 @@ MODULE_PARM_DESC(rptwear, "Number of erases inbetween reporting wear, if MODULE_PARM_DESC(overridesize, "Specifies the NAND Flash size overriding the ID bytes. " "The size is specified in erase blocks and as the exponent of a power of two" " e.g. 5 means a size of 32 erase blocks"); +MODULE_PARM_DESC(cache_file, "File to use to cache nand pages instead of memory"); /* The largest possible page size */ #define NS_LARGEST_PAGE_SIZE 2048 @@ -266,6 +272,9 @@ MODULE_PARM_DESC(overridesize, "Specifies the NAND Flash size overriding the I */ #define NS_MAX_PREVSTATES 1 +/* Maximum page cache pages needed to read or write a NAND page to the cache_file */ +#define NS_MAX_HELD_PAGES 16 + /* * A union to represent flash memory contents and flash buffer. 
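
The new cache_file parameter lets nandsim back the simulated flash with an ordinary file instead of vmalloc'ed kernel memory, which is what makes large simulated devices practical. A hypothetical invocation (the path is made up, and the usual geometry parameters can be added alongside it):

        modprobe nandsim cache_file=/var/tmp/nandsim.img
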
*/ @@ -295,6 +304,9 @@ struct nandsim { /* The simulated NAND flash pages array */ union ns_mem *pages; + /* Slab allocator for nand pages */ + struct kmem_cache *nand_pages_slab; + /* Internal buffer of page + OOB size bytes */ union ns_mem buf; @@ -335,6 +347,13 @@ struct nandsim { int ale; /* address Latch Enable */ int wp; /* write Protect */ } lines; + + /* Fields needed when using a cache file */ + struct file *cfile; /* Open file */ + unsigned char *pages_written; /* Which pages have been written */ + void *file_buf; + struct page *held_pages[NS_MAX_HELD_PAGES]; + int held_cnt; }; /* @@ -420,25 +439,69 @@ static struct mtd_info *nsmtd; static u_char ns_verify_buf[NS_LARGEST_PAGE_SIZE]; /* - * Allocate array of page pointers and initialize the array to NULL - * pointers. + * Allocate array of page pointers, create slab allocation for an array + * and initialize the array by NULL pointers. * * RETURNS: 0 if success, -ENOMEM if memory alloc fails. */ static int alloc_device(struct nandsim *ns) { - int i; + struct file *cfile; + int i, err; + + if (cache_file) { + cfile = filp_open(cache_file, O_CREAT | O_RDWR | O_LARGEFILE, 0600); + if (IS_ERR(cfile)) + return PTR_ERR(cfile); + if (!cfile->f_op || (!cfile->f_op->read && !cfile->f_op->aio_read)) { + NS_ERR("alloc_device: cache file not readable\n"); + err = -EINVAL; + goto err_close; + } + if (!cfile->f_op->write && !cfile->f_op->aio_write) { + NS_ERR("alloc_device: cache file not writeable\n"); + err = -EINVAL; + goto err_close; + } + ns->pages_written = vmalloc(ns->geom.pgnum); + if (!ns->pages_written) { + NS_ERR("alloc_device: unable to allocate pages written array\n"); + err = -ENOMEM; + goto err_close; + } + ns->file_buf = kmalloc(ns->geom.pgszoob, GFP_KERNEL); + if (!ns->file_buf) { + NS_ERR("alloc_device: unable to allocate file buf\n"); + err = -ENOMEM; + goto err_free; + } + ns->cfile = cfile; + memset(ns->pages_written, 0, ns->geom.pgnum); + return 0; + } ns->pages = vmalloc(ns->geom.pgnum * sizeof(union ns_mem)); if (!ns->pages) { - NS_ERR("alloc_map: unable to allocate page array\n"); + NS_ERR("alloc_device: unable to allocate page array\n"); return -ENOMEM; } for (i = 0; i < ns->geom.pgnum; i++) { ns->pages[i].byte = NULL; } + ns->nand_pages_slab = kmem_cache_create("nandsim", + ns->geom.pgszoob, 0, 0, NULL); + if (!ns->nand_pages_slab) { + NS_ERR("cache_create: unable to create kmem_cache\n"); + return -ENOMEM; + } return 0; + +err_free: + vfree(ns->pages_written); +err_close: + filp_close(cfile, NULL); + return err; } /* @@ -448,11 +511,20 @@ static void free_device(struct nandsim *ns) { int i; + if (ns->cfile) { + kfree(ns->file_buf); + vfree(ns->pages_written); + filp_close(ns->cfile, NULL); + return; + } + if (ns->pages) { for (i = 0; i < ns->geom.pgnum; i++) { if (ns->pages[i].byte) - kfree(ns->pages[i].byte); + kmem_cache_free(ns->nand_pages_slab, + ns->pages[i].byte); } + kmem_cache_destroy(ns->nand_pages_slab); vfree(ns->pages); } } @@ -464,7 +536,7 @@ static char *get_partition_name(int i) return kstrdup(buf, GFP_KERNEL); } -static u_int64_t divide(u_int64_t n, u_int32_t d) +static uint64_t divide(uint64_t n, uint32_t d) { do_div(n, d); return n; @@ -480,8 +552,8 @@ static int init_nandsim(struct mtd_info *mtd) struct nand_chip *chip = (struct nand_chip *)mtd->priv; struct nandsim *ns = (struct nandsim *)(chip->priv); int i, ret = 0; - u_int64_t remains; - u_int64_t next_offset; + uint64_t remains; + uint64_t next_offset; if (NS_IS_INITIALIZED(ns)) { NS_ERR("init_nandsim: nandsim is already initialized\n"); @@ 
-548,7 +620,7 @@ static int init_nandsim(struct mtd_info *mtd) remains = ns->geom.totsz; next_offset = 0; for (i = 0; i < parts_num; ++i) { - u_int64_t part_sz = (u_int64_t)parts[i] * ns->geom.secsz; + uint64_t part_sz = (uint64_t)parts[i] * ns->geom.secsz; if (!part_sz || part_sz > remains) { NS_ERR("bad partition size.\n"); @@ -1211,6 +1283,97 @@ static int find_operation(struct nandsim *ns, uint32_t flag) return -1; } +static void put_pages(struct nandsim *ns) +{ + int i; + + for (i = 0; i < ns->held_cnt; i++) + page_cache_release(ns->held_pages[i]); +} + +/* Get page cache pages in advance to provide NOFS memory allocation */ +static int get_pages(struct nandsim *ns, struct file *file, size_t count, loff_t pos) +{ + pgoff_t index, start_index, end_index; + struct page *page; + struct address_space *mapping = file->f_mapping; + + start_index = pos >> PAGE_CACHE_SHIFT; + end_index = (pos + count - 1) >> PAGE_CACHE_SHIFT; + if (end_index - start_index + 1 > NS_MAX_HELD_PAGES) + return -EINVAL; + ns->held_cnt = 0; + for (index = start_index; index <= end_index; index++) { + page = find_get_page(mapping, index); + if (page == NULL) { + page = find_or_create_page(mapping, index, GFP_NOFS); + if (page == NULL) { + write_inode_now(mapping->host, 1); + page = find_or_create_page(mapping, index, GFP_NOFS); + } + if (page == NULL) { + put_pages(ns); + return -ENOMEM; + } + unlock_page(page); + } + ns->held_pages[ns->held_cnt++] = page; + } + return 0; +} + +static int set_memalloc(void) +{ + if (current->flags & PF_MEMALLOC) + return 0; + current->flags |= PF_MEMALLOC; + return 1; +} + +static void clear_memalloc(int memalloc) +{ + if (memalloc) + current->flags &= ~PF_MEMALLOC; +} + +static ssize_t read_file(struct nandsim *ns, struct file *file, void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + ssize_t tx; + int err, memalloc; + + err = get_pages(ns, file, count, *pos); + if (err) + return err; + old_fs = get_fs(); + set_fs(get_ds()); + memalloc = set_memalloc(); + tx = vfs_read(file, (char __user *)buf, count, pos); + clear_memalloc(memalloc); + set_fs(old_fs); + put_pages(ns); + return tx; +} + +static ssize_t write_file(struct nandsim *ns, struct file *file, void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + ssize_t tx; + int err, memalloc; + + err = get_pages(ns, file, count, *pos); + if (err) + return err; + old_fs = get_fs(); + set_fs(get_ds()); + memalloc = set_memalloc(); + tx = vfs_write(file, (char __user *)buf, count, pos); + clear_memalloc(memalloc); + set_fs(old_fs); + put_pages(ns); + return tx; +} + /* * Returns a pointer to the current page. 
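
read_file() and write_file() above follow the usual pattern of this era for doing file I/O on behalf of the kernel itself: widen the address limit so vfs_read()/vfs_write() accept a kernel buffer, and wrap the call in PF_MEMALLOC after pre-pinning the page cache pages so the I/O cannot recurse into writeback while a flash filesystem is holding locks. Stripped of the pinning and flag handling, the core of the helpers is roughly this (a sketch with a made-up name, not a drop-in replacement):

        static ssize_t ns_write_kernel_buf(struct file *file, const void *buf,
                                           size_t count, loff_t *pos)
        {
                mm_segment_t old_fs = get_fs();
                ssize_t ret;

                set_fs(get_ds());       /* allow a kernel pointer where __user is expected */
                ret = vfs_write(file, (const char __user *)buf, count, pos);
                set_fs(old_fs);
                return ret;
        }
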
*/ @@ -1227,6 +1390,38 @@ static inline u_char *NS_PAGE_BYTE_OFF(struct nandsim *ns) return NS_GET_PAGE(ns)->byte + ns->regs.column + ns->regs.off; } +int do_read_error(struct nandsim *ns, int num) +{ + unsigned int page_no = ns->regs.row; + + if (read_error(page_no)) { + int i; + memset(ns->buf.byte, 0xFF, num); + for (i = 0; i < num; ++i) + ns->buf.byte[i] = random32(); + NS_WARN("simulating read error in page %u\n", page_no); + return 1; + } + return 0; +} + +void do_bit_flips(struct nandsim *ns, int num) +{ + if (bitflips && random32() < (1 << 22)) { + int flips = 1; + if (bitflips > 1) + flips = (random32() % (int) bitflips) + 1; + while (flips--) { + int pos = random32() % (num * 8); + ns->buf.byte[pos / 8] ^= (1 << (pos % 8)); + NS_WARN("read_page: flipping bit %d in page %d " + "reading from %d ecc: corrected=%u failed=%u\n", + pos, ns->regs.row, ns->regs.column + ns->regs.off, + nsmtd->ecc_stats.corrected, nsmtd->ecc_stats.failed); + } + } +} + /* * Fill the NAND buffer with data read from the specified page. */ @@ -1234,36 +1429,40 @@ static void read_page(struct nandsim *ns, int num) { union ns_mem *mypage; + if (ns->cfile) { + if (!ns->pages_written[ns->regs.row]) { + NS_DBG("read_page: page %d not written\n", ns->regs.row); + memset(ns->buf.byte, 0xFF, num); + } else { + loff_t pos; + ssize_t tx; + + NS_DBG("read_page: page %d written, reading from %d\n", + ns->regs.row, ns->regs.column + ns->regs.off); + if (do_read_error(ns, num)) + return; + pos = (loff_t)ns->regs.row * ns->geom.pgszoob + ns->regs.column + ns->regs.off; + tx = read_file(ns, ns->cfile, ns->buf.byte, num, &pos); + if (tx != num) { + NS_ERR("read_page: read error for page %d ret %ld\n", ns->regs.row, (long)tx); + return; + } + do_bit_flips(ns, num); + } + return; + } + mypage = NS_GET_PAGE(ns); if (mypage->byte == NULL) { NS_DBG("read_page: page %d not allocated\n", ns->regs.row); memset(ns->buf.byte, 0xFF, num); } else { - unsigned int page_no = ns->regs.row; NS_DBG("read_page: page %d allocated, reading from %d\n", ns->regs.row, ns->regs.column + ns->regs.off); - if (read_error(page_no)) { - int i; - memset(ns->buf.byte, 0xFF, num); - for (i = 0; i < num; ++i) - ns->buf.byte[i] = random32(); - NS_WARN("simulating read error in page %u\n", page_no); + if (do_read_error(ns, num)) return; - } memcpy(ns->buf.byte, NS_PAGE_BYTE_OFF(ns), num); - if (bitflips && random32() < (1 << 22)) { - int flips = 1; - if (bitflips > 1) - flips = (random32() % (int) bitflips) + 1; - while (flips--) { - int pos = random32() % (num * 8); - ns->buf.byte[pos / 8] ^= (1 << (pos % 8)); - NS_WARN("read_page: flipping bit %d in page %d " - "reading from %d ecc: corrected=%u failed=%u\n", - pos, ns->regs.row, ns->regs.column + ns->regs.off, - nsmtd->ecc_stats.corrected, nsmtd->ecc_stats.failed); - } - } + do_bit_flips(ns, num); } } @@ -1275,11 +1474,20 @@ static void erase_sector(struct nandsim *ns) union ns_mem *mypage; int i; + if (ns->cfile) { + for (i = 0; i < ns->geom.pgsec; i++) + if (ns->pages_written[ns->regs.row + i]) { + NS_DBG("erase_sector: freeing page %d\n", ns->regs.row + i); + ns->pages_written[ns->regs.row + i] = 0; + } + return; + } + mypage = NS_GET_PAGE(ns); for (i = 0; i < ns->geom.pgsec; i++) { if (mypage->byte != NULL) { NS_DBG("erase_sector: freeing page %d\n", ns->regs.row+i); - kfree(mypage->byte); + kmem_cache_free(ns->nand_pages_slab, mypage->byte); mypage->byte = NULL; } mypage++; @@ -1295,16 +1503,57 @@ static int prog_page(struct nandsim *ns, int num) union ns_mem *mypage; u_char *pg_off; + if (ns->cfile) 
{ + loff_t off, pos; + ssize_t tx; + int all; + + NS_DBG("prog_page: writing page %d\n", ns->regs.row); + pg_off = ns->file_buf + ns->regs.column + ns->regs.off; + off = (loff_t)ns->regs.row * ns->geom.pgszoob + ns->regs.column + ns->regs.off; + if (!ns->pages_written[ns->regs.row]) { + all = 1; + memset(ns->file_buf, 0xff, ns->geom.pgszoob); + } else { + all = 0; + pos = off; + tx = read_file(ns, ns->cfile, pg_off, num, &pos); + if (tx != num) { + NS_ERR("prog_page: read error for page %d ret %ld\n", ns->regs.row, (long)tx); + return -1; + } + } + for (i = 0; i < num; i++) + pg_off[i] &= ns->buf.byte[i]; + if (all) { + pos = (loff_t)ns->regs.row * ns->geom.pgszoob; + tx = write_file(ns, ns->cfile, ns->file_buf, ns->geom.pgszoob, &pos); + if (tx != ns->geom.pgszoob) { + NS_ERR("prog_page: write error for page %d ret %ld\n", ns->regs.row, (long)tx); + return -1; + } + ns->pages_written[ns->regs.row] = 1; + } else { + pos = off; + tx = write_file(ns, ns->cfile, pg_off, num, &pos); + if (tx != num) { + NS_ERR("prog_page: write error for page %d ret %ld\n", ns->regs.row, (long)tx); + return -1; + } + } + return 0; + } + mypage = NS_GET_PAGE(ns); if (mypage->byte == NULL) { NS_DBG("prog_page: allocating page %d\n", ns->regs.row); /* * We allocate memory with GFP_NOFS because a flash FS may * utilize this. If it is holding an FS lock, then gets here, - * then kmalloc runs writeback which goes to the FS again - * and deadlocks. This was seen in practice. + * then kernel memory alloc runs writeback which goes to the FS + * again and deadlocks. This was seen in practice. */ - mypage->byte = kmalloc(ns->geom.pgszoob, GFP_NOFS); + mypage->byte = kmem_cache_alloc(ns->nand_pages_slab, GFP_NOFS); if (mypage->byte == NULL) { NS_ERR("prog_page: error allocating memory for page %d\n", ns->regs.row); return -1; @@ -1736,13 +1985,17 @@ static void ns_nand_write_byte(struct mtd_info *mtd, u_char byte) /* Check if chip is expecting command */ if (NS_STATE(ns->nxstate) != STATE_UNKNOWN && !(ns->nxstate & STATE_CMD_MASK)) { - /* - * We are in situation when something else (not command) - * was expected but command was input. In this case ignore - * previous command(s)/state(s) and accept the last one. - */ - NS_WARN("write_byte: command (%#x) wasn't expected, expected state is %s, " - "ignore previous states\n", (uint)byte, get_state_name(ns->nxstate)); + /* Do not warn if only 2 id bytes are read */ + if (!(ns->regs.command == NAND_CMD_READID && + NS_STATE(ns->state) == STATE_DATAOUT_ID && ns->regs.count == 2)) { + /* + * We are in situation when something else (not command) + * was expected but command was input. In this case ignore + * previous command(s)/state(s) and accept the last one. 
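
The pg_off[i] &= ns->buf.byte[i] loop in prog_page() models real NAND behaviour: a program operation can only clear bits, never set them, and only an erase returns a page to all 0xff. In miniature, with made-up values:

        unsigned char cell = 0xff;      /* erased state */

        cell &= 0xf0;                   /* first program  -> 0xf0 */
        cell &= 0xcc;                   /* second program -> 0xc0, not 0xcc */

This is also why the file-backed path has to read the previously written page back and AND the new data into it rather than simply overwrite it.
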
+ */ + NS_WARN("write_byte: command (%#x) wasn't expected, expected state is %s, " + "ignore previous states\n", (uint)byte, get_state_name(ns->nxstate)); + } switch_to_ready_state(ns, NS_STATUS_FAILED(ns)); } @@ -2044,7 +2297,7 @@ static int __init ns_init_module(void) } if (overridesize) { - u_int64_t new_size = (u_int64_t)nsmtd->erasesize << overridesize; + uint64_t new_size = (uint64_t)nsmtd->erasesize << overridesize; if (new_size >> overridesize != nsmtd->erasesize) { NS_ERR("overridesize is too big\n"); goto err_exit; diff --git a/drivers/mtd/nand/ndfc.c b/drivers/mtd/nand/ndfc.c index 955959eb02d..582cf80f555 100644 --- a/drivers/mtd/nand/ndfc.c +++ b/drivers/mtd/nand/ndfc.c @@ -2,12 +2,20 @@ * drivers/mtd/ndfc.c * * Overview: - * Platform independend driver for NDFC (NanD Flash Controller) + * Platform independent driver for NDFC (NanD Flash Controller) * integrated into EP440 cores * + * Ported to an OF platform driver by Sean MacLennan + * + * The NDFC supports multiple chips, but this driver only supports a + * single chip since I do not have access to any boards with + * multiple chips. + * * Author: Thomas Gleixner * * Copyright 2006 IBM + * Copyright 2008 PIKA Technologies + * Sean MacLennan <smaclennan@pikatech.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -21,27 +29,20 @@ #include <linux/mtd/partitions.h> #include <linux/mtd/ndfc.h> #include <linux/mtd/mtd.h> -#include <linux/platform_device.h> - +#include <linux/of_platform.h> #include <asm/io.h> -#ifdef CONFIG_40x -#include <asm/ibm405.h> -#else -#include <asm/ibm44x.h> -#endif - -struct ndfc_nand_mtd { - struct mtd_info mtd; - struct nand_chip chip; - struct platform_nand_chip *pl_chip; -}; -static struct ndfc_nand_mtd ndfc_mtd[NDFC_MAX_BANKS]; struct ndfc_controller { - void __iomem *ndfcbase; - struct nand_hw_control ndfc_control; - atomic_t childs_active; + struct of_device *ofdev; + void __iomem *ndfcbase; + struct mtd_info mtd; + struct nand_chip chip; + int chip_select; + struct nand_hw_control ndfc_control; +#ifdef CONFIG_MTD_PARTITIONS + struct mtd_partition *parts; +#endif }; static struct ndfc_controller ndfc_ctrl; @@ -50,17 +51,14 @@ static void ndfc_select_chip(struct mtd_info *mtd, int chip) { uint32_t ccr; struct ndfc_controller *ndfc = &ndfc_ctrl; - struct nand_chip *nandchip = mtd->priv; - struct ndfc_nand_mtd *nandmtd = nandchip->priv; - struct platform_nand_chip *pchip = nandmtd->pl_chip; - ccr = __raw_readl(ndfc->ndfcbase + NDFC_CCR); + ccr = in_be32(ndfc->ndfcbase + NDFC_CCR); if (chip >= 0) { ccr &= ~NDFC_CCR_BS_MASK; - ccr |= NDFC_CCR_BS(chip + pchip->chip_offset); + ccr |= NDFC_CCR_BS(chip + ndfc->chip_select); } else ccr |= NDFC_CCR_RESET_CE; - __raw_writel(ccr, ndfc->ndfcbase + NDFC_CCR); + out_be32(ndfc->ndfcbase + NDFC_CCR, ccr); } static void ndfc_hwcontrol(struct mtd_info *mtd, int cmd, unsigned int ctrl) @@ -80,7 +78,7 @@ static int ndfc_ready(struct mtd_info *mtd) { struct ndfc_controller *ndfc = &ndfc_ctrl; - return __raw_readl(ndfc->ndfcbase + NDFC_STAT) & NDFC_STAT_IS_READY; + return in_be32(ndfc->ndfcbase + NDFC_STAT) & NDFC_STAT_IS_READY; } static void ndfc_enable_hwecc(struct mtd_info *mtd, int mode) @@ -88,9 +86,9 @@ static void ndfc_enable_hwecc(struct mtd_info *mtd, int mode) uint32_t ccr; struct ndfc_controller *ndfc = &ndfc_ctrl; - ccr = __raw_readl(ndfc->ndfcbase + NDFC_CCR); + ccr = in_be32(ndfc->ndfcbase + NDFC_CCR); ccr |= NDFC_CCR_RESET_ECC; - __raw_writel(ccr, 
ndfc->ndfcbase + NDFC_CCR); + out_be32(ndfc->ndfcbase + NDFC_CCR, ccr); wmb(); } @@ -102,9 +100,10 @@ static int ndfc_calculate_ecc(struct mtd_info *mtd, uint8_t *p = (uint8_t *)&ecc; wmb(); - ecc = __raw_readl(ndfc->ndfcbase + NDFC_ECC); - ecc_code[0] = p[1]; - ecc_code[1] = p[2]; + ecc = in_be32(ndfc->ndfcbase + NDFC_ECC); + /* The NDFC uses Smart Media (SMC) bytes order */ + ecc_code[0] = p[2]; + ecc_code[1] = p[1]; ecc_code[2] = p[3]; return 0; @@ -123,7 +122,7 @@ static void ndfc_read_buf(struct mtd_info *mtd, uint8_t *buf, int len) uint32_t *p = (uint32_t *) buf; for(;len > 0; len -= 4) - *p++ = __raw_readl(ndfc->ndfcbase + NDFC_DATA); + *p++ = in_be32(ndfc->ndfcbase + NDFC_DATA); } static void ndfc_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len) @@ -132,7 +131,7 @@ static void ndfc_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len) uint32_t *p = (uint32_t *) buf; for(;len > 0; len -= 4) - __raw_writel(*p++, ndfc->ndfcbase + NDFC_DATA); + out_be32(ndfc->ndfcbase + NDFC_DATA, *p++); } static int ndfc_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len) @@ -141,7 +140,7 @@ static int ndfc_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len) uint32_t *p = (uint32_t *) buf; for(;len > 0; len -= 4) - if (*p++ != __raw_readl(ndfc->ndfcbase + NDFC_DATA)) + if (*p++ != in_be32(ndfc->ndfcbase + NDFC_DATA)) return -EFAULT; return 0; } @@ -149,10 +148,19 @@ static int ndfc_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len) /* * Initialize chip structure */ -static void ndfc_chip_init(struct ndfc_nand_mtd *mtd) +static int ndfc_chip_init(struct ndfc_controller *ndfc, + struct device_node *node) { - struct ndfc_controller *ndfc = &ndfc_ctrl; - struct nand_chip *chip = &mtd->chip; +#ifdef CONFIG_MTD_PARTITIONS +#ifdef CONFIG_MTD_CMDLINE_PARTS + static const char *part_types[] = { "cmdlinepart", NULL }; +#else + static const char *part_types[] = { NULL }; +#endif +#endif + struct device_node *flash_np; + struct nand_chip *chip = &ndfc->chip; + int ret; chip->IO_ADDR_R = ndfc->ndfcbase + NDFC_DATA; chip->IO_ADDR_W = ndfc->ndfcbase + NDFC_DATA; @@ -160,8 +168,6 @@ static void ndfc_chip_init(struct ndfc_nand_mtd *mtd) chip->dev_ready = ndfc_ready; chip->select_chip = ndfc_select_chip; chip->chip_delay = 50; - chip->priv = mtd; - chip->options = mtd->pl_chip->options; chip->controller = &ndfc->ndfc_control; chip->read_buf = ndfc_read_buf; chip->write_buf = ndfc_write_buf; @@ -172,143 +178,136 @@ static void ndfc_chip_init(struct ndfc_nand_mtd *mtd) chip->ecc.mode = NAND_ECC_HW; chip->ecc.size = 256; chip->ecc.bytes = 3; - chip->ecclayout = chip->ecc.layout = mtd->pl_chip->ecclayout; - mtd->mtd.priv = chip; - mtd->mtd.owner = THIS_MODULE; -} - -static int ndfc_chip_probe(struct platform_device *pdev) -{ - struct platform_nand_chip *nc = pdev->dev.platform_data; - struct ndfc_chip_settings *settings = nc->priv; - struct ndfc_controller *ndfc = &ndfc_ctrl; - struct ndfc_nand_mtd *nandmtd; - - if (nc->chip_offset >= NDFC_MAX_BANKS || nc->nr_chips > NDFC_MAX_BANKS) - return -EINVAL; - - /* Set the bank settings */ - __raw_writel(settings->bank_settings, - ndfc->ndfcbase + NDFC_BCFG0 + (nc->chip_offset << 2)); - nandmtd = &ndfc_mtd[pdev->id]; - if (nandmtd->pl_chip) - return -EBUSY; + ndfc->mtd.priv = chip; + ndfc->mtd.owner = THIS_MODULE; - nandmtd->pl_chip = nc; - ndfc_chip_init(nandmtd); - - /* Scan for chips */ - if (nand_scan(&nandmtd->mtd, nc->nr_chips)) { - nandmtd->pl_chip = NULL; + flash_np = of_get_next_child(node, NULL); + if (!flash_np) return 
-ENODEV; + + ndfc->mtd.name = kasprintf(GFP_KERNEL, "%s.%s", + ndfc->ofdev->dev.bus_id, flash_np->name); + if (!ndfc->mtd.name) { + ret = -ENOMEM; + goto err; } -#ifdef CONFIG_MTD_PARTITIONS - printk("Number of partitions %d\n", nc->nr_partitions); - if (nc->nr_partitions) { - /* Add the full device, so complete dumps can be made */ - add_mtd_device(&nandmtd->mtd); - add_mtd_partitions(&nandmtd->mtd, nc->partitions, - nc->nr_partitions); + ret = nand_scan(&ndfc->mtd, 1); + if (ret) + goto err; - } else -#else - add_mtd_device(&nandmtd->mtd); +#ifdef CONFIG_MTD_PARTITIONS + ret = parse_mtd_partitions(&ndfc->mtd, part_types, &ndfc->parts, 0); + if (ret < 0) + goto err; + +#ifdef CONFIG_MTD_OF_PARTS + if (ret == 0) { + ret = of_mtd_parse_partitions(&ndfc->ofdev->dev, flash_np, + &ndfc->parts); + if (ret < 0) + goto err; + } #endif - atomic_inc(&ndfc->childs_active); - return 0; -} + if (ret > 0) + ret = add_mtd_partitions(&ndfc->mtd, ndfc->parts, ret); + else +#endif + ret = add_mtd_device(&ndfc->mtd); -static int ndfc_chip_remove(struct platform_device *pdev) -{ - return 0; +err: + of_node_put(flash_np); + if (ret) + kfree(ndfc->mtd.name); + return ret; } -static int ndfc_nand_probe(struct platform_device *pdev) +static int __devinit ndfc_probe(struct of_device *ofdev, + const struct of_device_id *match) { - struct platform_nand_ctrl *nc = pdev->dev.platform_data; - struct ndfc_controller_settings *settings = nc->priv; - struct resource *res = pdev->resource; struct ndfc_controller *ndfc = &ndfc_ctrl; - unsigned long long phys = settings->ndfc_erpn | res->start; + const u32 *reg; + u32 ccr; + int err, len; -#ifndef CONFIG_PHYS_64BIT - ndfc->ndfcbase = ioremap((phys_addr_t)phys, res->end - res->start + 1); -#else - ndfc->ndfcbase = ioremap64(phys, res->end - res->start + 1); -#endif + spin_lock_init(&ndfc->ndfc_control.lock); + init_waitqueue_head(&ndfc->ndfc_control.wq); + ndfc->ofdev = ofdev; + dev_set_drvdata(&ofdev->dev, ndfc); + + /* Read the reg property to get the chip select */ + reg = of_get_property(ofdev->node, "reg", &len); + if (reg == NULL || len != 12) { + dev_err(&ofdev->dev, "unable read reg property (%d)\n", len); + return -ENOENT; + } + ndfc->chip_select = reg[0]; + + ndfc->ndfcbase = of_iomap(ofdev->node, 0); if (!ndfc->ndfcbase) { - printk(KERN_ERR "NDFC: ioremap failed\n"); + dev_err(&ofdev->dev, "failed to get memory\n"); return -EIO; } - __raw_writel(settings->ccr_settings, ndfc->ndfcbase + NDFC_CCR); + ccr = NDFC_CCR_BS(ndfc->chip_select); - spin_lock_init(&ndfc->ndfc_control.lock); - init_waitqueue_head(&ndfc->ndfc_control.wq); + /* It is ok if ccr does not exist - just default to 0 */ + reg = of_get_property(ofdev->node, "ccr", NULL); + if (reg) + ccr |= *reg; - platform_set_drvdata(pdev, ndfc); + out_be32(ndfc->ndfcbase + NDFC_CCR, ccr); - printk("NDFC NAND Driver initialized. 
Chip-Rev: 0x%08x\n", - __raw_readl(ndfc->ndfcbase + NDFC_REVID)); + /* Set the bank settings if given */ + reg = of_get_property(ofdev->node, "bank-settings", NULL); + if (reg) { + int offset = NDFC_BCFG0 + (ndfc->chip_select << 2); + out_be32(ndfc->ndfcbase + offset, *reg); + } + + err = ndfc_chip_init(ndfc, ofdev->node); + if (err) { + iounmap(ndfc->ndfcbase); + return err; + } return 0; } -static int ndfc_nand_remove(struct platform_device *pdev) +static int __devexit ndfc_remove(struct of_device *ofdev) { - struct ndfc_controller *ndfc = platform_get_drvdata(pdev); + struct ndfc_controller *ndfc = dev_get_drvdata(&ofdev->dev); - if (atomic_read(&ndfc->childs_active)) - return -EBUSY; + nand_release(&ndfc->mtd); - if (ndfc) { - platform_set_drvdata(pdev, NULL); - iounmap(ndfc_ctrl.ndfcbase); - ndfc_ctrl.ndfcbase = NULL; - } return 0; } -/* driver device registration */ - -static struct platform_driver ndfc_chip_driver = { - .probe = ndfc_chip_probe, - .remove = ndfc_chip_remove, - .driver = { - .name = "ndfc-chip", - .owner = THIS_MODULE, - }, +static const struct of_device_id ndfc_match[] = { + { .compatible = "ibm,ndfc", }, + {} }; +MODULE_DEVICE_TABLE(of, ndfc_match); -static struct platform_driver ndfc_nand_driver = { - .probe = ndfc_nand_probe, - .remove = ndfc_nand_remove, - .driver = { - .name = "ndfc-nand", - .owner = THIS_MODULE, +static struct of_platform_driver ndfc_driver = { + .driver = { + .name = "ndfc", }, + .match_table = ndfc_match, + .probe = ndfc_probe, + .remove = __devexit_p(ndfc_remove), }; static int __init ndfc_nand_init(void) { - int ret; - - spin_lock_init(&ndfc_ctrl.ndfc_control.lock); - init_waitqueue_head(&ndfc_ctrl.ndfc_control.wq); - - ret = platform_driver_register(&ndfc_nand_driver); - if (!ret) - ret = platform_driver_register(&ndfc_chip_driver); - return ret; + return of_register_platform_driver(&ndfc_driver); } static void __exit ndfc_nand_exit(void) { - platform_driver_unregister(&ndfc_chip_driver); - platform_driver_unregister(&ndfc_nand_driver); + of_unregister_platform_driver(&ndfc_driver); } module_init(ndfc_nand_init); @@ -316,6 +315,4 @@ module_exit(ndfc_nand_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Thomas Gleixner <tglx@linutronix.de>"); -MODULE_DESCRIPTION("Platform driver for NDFC"); -MODULE_ALIAS("platform:ndfc-chip"); -MODULE_ALIAS("platform:ndfc-nand"); +MODULE_DESCRIPTION("OF Platform driver for NDFC"); diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index fc414449561..cc55cbc2b30 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -298,7 +298,7 @@ static struct pxa3xx_nand_flash *builtin_flash_types[] = { #define NDTR1_tAR(c) (min((c), 15) << 0) /* convert nano-seconds to nand flash controller clock cycles */ -#define ns2cycle(ns, clk) (int)(((ns) * (clk / 1000000) / 1000) + 1) +#define ns2cycle(ns, clk) (int)(((ns) * (clk / 1000000) / 1000) - 1) static void pxa3xx_nand_set_timing(struct pxa3xx_nand_info *info, const struct pxa3xx_nand_timing *t) @@ -368,14 +368,14 @@ static int prepare_read_prog_cmd(struct pxa3xx_nand_info *info, /* large block, 2 cycles for column address * row address starts from 3rd cycle */ - info->ndcb1 |= (page_addr << 16) | (column & 0xffff); + info->ndcb1 |= page_addr << 16; if (info->row_addr_cycles == 3) info->ndcb2 = (page_addr >> 16) & 0xff; } else /* small block, 1 cycles for column address * row address starts from 2nd cycle */ - info->ndcb1 = (page_addr << 8) | (column & 0xff); + info->ndcb1 = page_addr << 8; if (cmd == cmdset->program) 
info->ndcb0 |= NDCB0_CMD_TYPE(1) | NDCB0_AUTO_RS; diff --git a/drivers/mtd/nand/sharpsl.c b/drivers/mtd/nand/sharpsl.c index 30a518e211b..54ec7542a7b 100644 --- a/drivers/mtd/nand/sharpsl.c +++ b/drivers/mtd/nand/sharpsl.c @@ -2,6 +2,7 @@ * drivers/mtd/nand/sharpsl.c * * Copyright (C) 2004 Richard Purdie + * Copyright (C) 2008 Dmitry Baryshkov * * Based on Sharp's NAND driver sharp_sl.c * @@ -19,22 +20,31 @@ #include <linux/mtd/nand.h> #include <linux/mtd/nand_ecc.h> #include <linux/mtd/partitions.h> +#include <linux/mtd/sharpsl.h> #include <linux/interrupt.h> +#include <linux/platform_device.h> + #include <asm/io.h> #include <mach/hardware.h> #include <asm/mach-types.h> -static void __iomem *sharpsl_io_base; -static int sharpsl_phys_base = 0x0C000000; +struct sharpsl_nand { + struct mtd_info mtd; + struct nand_chip chip; + + void __iomem *io; +}; + +#define mtd_to_sharpsl(_mtd) container_of(_mtd, struct sharpsl_nand, mtd) /* register offset */ -#define ECCLPLB sharpsl_io_base+0x00 /* line parity 7 - 0 bit */ -#define ECCLPUB sharpsl_io_base+0x04 /* line parity 15 - 8 bit */ -#define ECCCP sharpsl_io_base+0x08 /* column parity 5 - 0 bit */ -#define ECCCNTR sharpsl_io_base+0x0C /* ECC byte counter */ -#define ECCCLRR sharpsl_io_base+0x10 /* cleare ECC */ -#define FLASHIO sharpsl_io_base+0x14 /* Flash I/O */ -#define FLASHCTL sharpsl_io_base+0x18 /* Flash Control */ +#define ECCLPLB 0x00 /* line parity 7 - 0 bit */ +#define ECCLPUB 0x04 /* line parity 15 - 8 bit */ +#define ECCCP 0x08 /* column parity 5 - 0 bit */ +#define ECCCNTR 0x0C /* ECC byte counter */ +#define ECCCLRR 0x10 /* cleare ECC */ +#define FLASHIO 0x14 /* Flash I/O */ +#define FLASHCTL 0x18 /* Flash Control */ /* Flash control bit */ #define FLRYBY (1 << 5) @@ -45,35 +55,6 @@ static int sharpsl_phys_base = 0x0C000000; #define FLCE0 (1 << 0) /* - * MTD structure for SharpSL - */ -static struct mtd_info *sharpsl_mtd = NULL; - -/* - * Define partitions for flash device - */ -#define DEFAULT_NUM_PARTITIONS 3 - -static int nr_partitions; -static struct mtd_partition sharpsl_nand_default_partition_info[] = { - { - .name = "System Area", - .offset = 0, - .size = 7 * 1024 * 1024, - }, - { - .name = "Root Filesystem", - .offset = 7 * 1024 * 1024, - .size = 30 * 1024 * 1024, - }, - { - .name = "Home Filesystem", - .offset = MTDPART_OFS_APPEND, - .size = MTDPART_SIZ_FULL, - }, -}; - -/* * hardware specific access to control-lines * ctrl: * NAND_CNE: bit 0 -> ! 
bit 0 & 4 @@ -84,6 +65,7 @@ static struct mtd_partition sharpsl_nand_default_partition_info[] = { static void sharpsl_nand_hwcontrol(struct mtd_info *mtd, int cmd, unsigned int ctrl) { + struct sharpsl_nand *sharpsl = mtd_to_sharpsl(mtd); struct nand_chip *chip = mtd->priv; if (ctrl & NAND_CTRL_CHANGE) { @@ -93,103 +75,97 @@ static void sharpsl_nand_hwcontrol(struct mtd_info *mtd, int cmd, bits ^= 0x11; - writeb((readb(FLASHCTL) & ~0x17) | bits, FLASHCTL); + writeb((readb(sharpsl->io + FLASHCTL) & ~0x17) | bits, sharpsl->io + FLASHCTL); } if (cmd != NAND_CMD_NONE) writeb(cmd, chip->IO_ADDR_W); } -static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; - -static struct nand_bbt_descr sharpsl_bbt = { - .options = 0, - .offs = 4, - .len = 2, - .pattern = scan_ff_pattern -}; - -static struct nand_bbt_descr sharpsl_akita_bbt = { - .options = 0, - .offs = 4, - .len = 1, - .pattern = scan_ff_pattern -}; - -static struct nand_ecclayout akita_oobinfo = { - .eccbytes = 24, - .eccpos = { - 0x5, 0x1, 0x2, 0x3, 0x6, 0x7, 0x15, 0x11, - 0x12, 0x13, 0x16, 0x17, 0x25, 0x21, 0x22, 0x23, - 0x26, 0x27, 0x35, 0x31, 0x32, 0x33, 0x36, 0x37}, - .oobfree = {{0x08, 0x09}} -}; - static int sharpsl_nand_dev_ready(struct mtd_info *mtd) { - return !((readb(FLASHCTL) & FLRYBY) == 0); + struct sharpsl_nand *sharpsl = mtd_to_sharpsl(mtd); + return !((readb(sharpsl->io + FLASHCTL) & FLRYBY) == 0); } static void sharpsl_nand_enable_hwecc(struct mtd_info *mtd, int mode) { - writeb(0, ECCCLRR); + struct sharpsl_nand *sharpsl = mtd_to_sharpsl(mtd); + writeb(0, sharpsl->io + ECCCLRR); } static int sharpsl_nand_calculate_ecc(struct mtd_info *mtd, const u_char * dat, u_char * ecc_code) { - ecc_code[0] = ~readb(ECCLPUB); - ecc_code[1] = ~readb(ECCLPLB); - ecc_code[2] = (~readb(ECCCP) << 2) | 0x03; - return readb(ECCCNTR) != 0; + struct sharpsl_nand *sharpsl = mtd_to_sharpsl(mtd); + ecc_code[0] = ~readb(sharpsl->io + ECCLPUB); + ecc_code[1] = ~readb(sharpsl->io + ECCLPLB); + ecc_code[2] = (~readb(sharpsl->io + ECCCP) << 2) | 0x03; + return readb(sharpsl->io + ECCCNTR) != 0; } #ifdef CONFIG_MTD_PARTITIONS -const char *part_probes[] = { "cmdlinepart", NULL }; +static const char *part_probes[] = { "cmdlinepart", NULL }; #endif /* * Main initialization routine */ -static int __init sharpsl_nand_init(void) +static int __devinit sharpsl_nand_probe(struct platform_device *pdev) { struct nand_chip *this; +#ifdef CONFIG_MTD_PARTITIONS struct mtd_partition *sharpsl_partition_info; + int nr_partitions; +#endif + struct resource *r; int err = 0; + struct sharpsl_nand *sharpsl; + struct sharpsl_nand_platform_data *data = pdev->dev.platform_data; + + if (!data) { + dev_err(&pdev->dev, "no platform data!\n"); + return -EINVAL; + } /* Allocate memory for MTD device structure and private data */ - sharpsl_mtd = kmalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL); - if (!sharpsl_mtd) { + sharpsl = kzalloc(sizeof(struct sharpsl_nand), GFP_KERNEL); + if (!sharpsl) { printk("Unable to allocate SharpSL NAND MTD device structure.\n"); return -ENOMEM; } + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!r) { + dev_err(&pdev->dev, "no io memory resource defined!\n"); + err = -ENODEV; + goto err_get_res; + } + /* map physical address */ - sharpsl_io_base = ioremap(sharpsl_phys_base, 0x1000); - if (!sharpsl_io_base) { + sharpsl->io = ioremap(r->start, resource_size(r)); + if (!sharpsl->io) { printk("ioremap to access Sharp SL NAND chip failed\n"); - kfree(sharpsl_mtd); - return -EIO; + err = -EIO; + goto err_ioremap; } /* Get pointer 
to private data */ - this = (struct nand_chip *)(&sharpsl_mtd[1]); - - /* Initialize structures */ - memset(sharpsl_mtd, 0, sizeof(struct mtd_info)); - memset(this, 0, sizeof(struct nand_chip)); + this = (struct nand_chip *)(&sharpsl->chip); /* Link the private data with the MTD structure */ - sharpsl_mtd->priv = this; - sharpsl_mtd->owner = THIS_MODULE; + sharpsl->mtd.priv = this; + sharpsl->mtd.owner = THIS_MODULE; + + platform_set_drvdata(pdev, sharpsl); /* * PXA initialize */ - writeb(readb(FLASHCTL) | FLWP, FLASHCTL); + writeb(readb(sharpsl->io + FLASHCTL) | FLWP, sharpsl->io + FLASHCTL); /* Set address of NAND IO lines */ - this->IO_ADDR_R = FLASHIO; - this->IO_ADDR_W = FLASHIO; + this->IO_ADDR_R = sharpsl->io + FLASHIO; + this->IO_ADDR_W = sharpsl->io + FLASHIO; /* Set address of hardware control function */ this->cmd_ctrl = sharpsl_nand_hwcontrol; this->dev_ready = sharpsl_nand_dev_ready; @@ -199,68 +175,89 @@ static int __init sharpsl_nand_init(void) this->ecc.mode = NAND_ECC_HW; this->ecc.size = 256; this->ecc.bytes = 3; - this->badblock_pattern = &sharpsl_bbt; - if (machine_is_akita() || machine_is_borzoi()) { - this->badblock_pattern = &sharpsl_akita_bbt; - this->ecc.layout = &akita_oobinfo; - } + this->badblock_pattern = data->badblock_pattern; + this->ecc.layout = data->ecc_layout; this->ecc.hwctl = sharpsl_nand_enable_hwecc; this->ecc.calculate = sharpsl_nand_calculate_ecc; this->ecc.correct = nand_correct_data; /* Scan to find existence of the device */ - err = nand_scan(sharpsl_mtd, 1); - if (err) { - iounmap(sharpsl_io_base); - kfree(sharpsl_mtd); - return err; - } + err = nand_scan(&sharpsl->mtd, 1); + if (err) + goto err_scan; /* Register the partitions */ - sharpsl_mtd->name = "sharpsl-nand"; - nr_partitions = parse_mtd_partitions(sharpsl_mtd, part_probes, &sharpsl_partition_info, 0); - + sharpsl->mtd.name = "sharpsl-nand"; +#ifdef CONFIG_MTD_PARTITIONS + nr_partitions = parse_mtd_partitions(&sharpsl->mtd, part_probes, &sharpsl_partition_info, 0); if (nr_partitions <= 0) { - nr_partitions = DEFAULT_NUM_PARTITIONS; - sharpsl_partition_info = sharpsl_nand_default_partition_info; - if (machine_is_poodle()) { - sharpsl_partition_info[1].size = 22 * 1024 * 1024; - } else if (machine_is_corgi() || machine_is_shepherd()) { - sharpsl_partition_info[1].size = 25 * 1024 * 1024; - } else if (machine_is_husky()) { - sharpsl_partition_info[1].size = 53 * 1024 * 1024; - } else if (machine_is_spitz()) { - sharpsl_partition_info[1].size = 5 * 1024 * 1024; - } else if (machine_is_akita()) { - sharpsl_partition_info[1].size = 58 * 1024 * 1024; - } else if (machine_is_borzoi()) { - sharpsl_partition_info[1].size = 32 * 1024 * 1024; - } + nr_partitions = data->nr_partitions; + sharpsl_partition_info = data->partitions; } - add_mtd_partitions(sharpsl_mtd, sharpsl_partition_info, nr_partitions); + if (nr_partitions > 0) + err = add_mtd_partitions(&sharpsl->mtd, sharpsl_partition_info, nr_partitions); + else +#endif + err = add_mtd_device(&sharpsl->mtd); + if (err) + goto err_add; /* Return happy */ return 0; -} -module_init(sharpsl_nand_init); +err_add: + nand_release(&sharpsl->mtd); + +err_scan: + platform_set_drvdata(pdev, NULL); + iounmap(sharpsl->io); +err_ioremap: +err_get_res: + kfree(sharpsl); + return err; +} /* * Clean up routine */ -static void __exit sharpsl_nand_cleanup(void) +static int __devexit sharpsl_nand_remove(struct platform_device *pdev) { + struct sharpsl_nand *sharpsl = platform_get_drvdata(pdev); + /* Release resources, unregister device */ - 
nand_release(sharpsl_mtd); + nand_release(&sharpsl->mtd); - iounmap(sharpsl_io_base); + platform_set_drvdata(pdev, NULL); + + iounmap(sharpsl->io); /* Free the MTD device structure */ - kfree(sharpsl_mtd); + kfree(sharpsl); + + return 0; +} + +static struct platform_driver sharpsl_nand_driver = { + .driver = { + .name = "sharpsl-nand", + .owner = THIS_MODULE, + }, + .probe = sharpsl_nand_probe, + .remove = __devexit_p(sharpsl_nand_remove), +}; + +static int __init sharpsl_nand_init(void) +{ + return platform_driver_register(&sharpsl_nand_driver); } +module_init(sharpsl_nand_init); -module_exit(sharpsl_nand_cleanup); +static void __exit sharpsl_nand_exit(void) +{ + platform_driver_unregister(&sharpsl_nand_driver); +} +module_exit(sharpsl_nand_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Richard Purdie <rpurdie@rpsys.net>"); diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index 320b929abe7..d1c4546513f 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -39,7 +39,7 @@ static void nftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) struct NFTLrecord *nftl; unsigned long temp; - if (mtd->type != MTD_NANDFLASH) + if (mtd->type != MTD_NANDFLASH || mtd->size > UINT_MAX) return; /* OK, this is moderately ugly. But probably safe. Alternatives? */ if (memcmp(mtd->name, "DiskOnChip", 10)) diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c index ccc4f209fbb..8b22b1836e9 100644 --- a/drivers/mtd/nftlmount.c +++ b/drivers/mtd/nftlmount.c @@ -51,7 +51,7 @@ static int find_boot_record(struct NFTLrecord *nftl) the mtd device accordingly. We could even get rid of nftl->EraseSize if there were any point in doing so. */ nftl->EraseSize = nftl->mbd.mtd->erasesize; - nftl->nb_blocks = nftl->mbd.mtd->size / nftl->EraseSize; + nftl->nb_blocks = (u32)nftl->mbd.mtd->size / nftl->EraseSize; nftl->MediaUnit = BLOCK_NIL; nftl->SpareMediaUnit = BLOCK_NIL; @@ -168,7 +168,7 @@ device is already correct. 
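
The nftl hunks here (and the rfd_ftl and ssfdc hunks below) all apply the same rule for legacy translation layers that keep 32-bit offsets internally: refuse to bind to an MTD larger than 4 GiB up front, after which the narrowing casts on mtd->size are safe. Condensed into one illustrative helper (the name is made up):

        /* Returns the number of erase blocks, or -EINVAL if the device is
         * too large for a translation layer with 32-bit internal offsets. */
        static int legacy_ftl_block_count(struct mtd_info *mtd)
        {
                if (mtd->size > UINT_MAX)
                        return -EINVAL;

                return (u32)mtd->size / mtd->erasesize;
        }
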
printk(KERN_NOTICE "WARNING: Support for NFTL with UnitSizeFactor 0x%02x is experimental\n", mh->UnitSizeFactor); nftl->EraseSize = nftl->mbd.mtd->erasesize << (0xff - mh->UnitSizeFactor); - nftl->nb_blocks = nftl->mbd.mtd->size / nftl->EraseSize; + nftl->nb_blocks = (u32)nftl->mbd.mtd->size / nftl->EraseSize; } #endif nftl->nb_boot_blocks = le16_to_cpu(mh->FirstPhysicalEUN); diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 90ed319f26e..529af271db1 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -1772,7 +1772,7 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) int len; int ret = 0; - DEBUG(MTD_DEBUG_LEVEL3, "onenand_erase: start = 0x%08x, len = %i\n", (unsigned int) instr->addr, (unsigned int) instr->len); + DEBUG(MTD_DEBUG_LEVEL3, "onenand_erase: start = 0x%012llx, len = %llu\n", (unsigned long long) instr->addr, (unsigned long long) instr->len); block_size = (1 << this->erase_shift); @@ -1810,7 +1810,7 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) /* Check if we have a bad block, we do not erase bad blocks */ if (onenand_block_isbad_nolock(mtd, addr, 0)) { - printk (KERN_WARNING "onenand_erase: attempt to erase a bad block at addr 0x%08x\n", (unsigned int) addr); + printk (KERN_WARNING "onenand_erase: attempt to erase a bad block at addr 0x%012llx\n", (unsigned long long) addr); instr->state = MTD_ERASE_FAILED; goto erase_exit; } @@ -2029,7 +2029,7 @@ static int onenand_do_lock_cmd(struct mtd_info *mtd, loff_t ofs, size_t len, int * * Lock one or more blocks */ -static int onenand_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int onenand_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; @@ -2047,7 +2047,7 @@ static int onenand_lock(struct mtd_info *mtd, loff_t ofs, size_t len) * * Unlock one or more blocks */ -static int onenand_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int onenand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c index e538c0a72ab..d2aa9c46530 100644 --- a/drivers/mtd/rfd_ftl.c +++ b/drivers/mtd/rfd_ftl.c @@ -21,8 +21,6 @@ #include <asm/types.h> -#define const_cpu_to_le16 __constant_cpu_to_le16 - static int block_size = 0; module_param(block_size, int, 0); MODULE_PARM_DESC(block_size, "Block size to use by RFD, defaults to erase unit size"); @@ -156,7 +154,7 @@ static int scan_header(struct partition *part) size_t retlen; sectors_per_block = part->block_size / SECTOR_SIZE; - part->total_blocks = part->mbd.mtd->size / part->block_size; + part->total_blocks = (u32)part->mbd.mtd->size / part->block_size; if (part->total_blocks < 2) return -ENOENT; @@ -276,16 +274,17 @@ static void erase_callback(struct erase_info *erase) part = (struct partition*)erase->priv; - i = erase->addr / part->block_size; - if (i >= part->total_blocks || part->blocks[i].offset != erase->addr) { - printk(KERN_ERR PREFIX "erase callback for unknown offset %x " - "on '%s'\n", erase->addr, part->mbd.mtd->name); + i = (u32)erase->addr / part->block_size; + if (i >= part->total_blocks || part->blocks[i].offset != erase->addr || + erase->addr > UINT_MAX) { + printk(KERN_ERR PREFIX "erase callback for unknown offset %llx " + "on '%s'\n", (unsigned long long)erase->addr, part->mbd.mtd->name); return; } if (erase->state != MTD_ERASE_DONE) { - printk(KERN_WARNING PREFIX "erase failed at 0x%x on '%s', " - "state %d\n", erase->addr, + 
printk(KERN_WARNING PREFIX "erase failed at 0x%llx on '%s', " + "state %d\n", (unsigned long long)erase->addr, part->mbd.mtd->name, erase->state); part->blocks[i].state = BLOCK_FAILED; @@ -297,7 +296,7 @@ static void erase_callback(struct erase_info *erase) return; } - magic = const_cpu_to_le16(RFD_MAGIC); + magic = cpu_to_le16(RFD_MAGIC); part->blocks[i].state = BLOCK_ERASED; part->blocks[i].free_sectors = part->data_sectors_per_block; @@ -345,9 +344,9 @@ static int erase_block(struct partition *part, int block) rc = part->mbd.mtd->erase(part->mbd.mtd, erase); if (rc) { - printk(KERN_ERR PREFIX "erase of region %x,%x on '%s' " - "failed\n", erase->addr, erase->len, - part->mbd.mtd->name); + printk(KERN_ERR PREFIX "erase of region %llx,%llx on '%s' " + "failed\n", (unsigned long long)erase->addr, + (unsigned long long)erase->len, part->mbd.mtd->name); kfree(erase); } @@ -587,7 +586,7 @@ static int mark_sector_deleted(struct partition *part, u_long old_addr) int block, offset, rc; u_long addr; size_t retlen; - u16 del = const_cpu_to_le16(SECTOR_DELETED); + u16 del = cpu_to_le16(SECTOR_DELETED); block = old_addr / part->block_size; offset = (old_addr % part->block_size) / SECTOR_SIZE - @@ -763,7 +762,7 @@ static void rfd_ftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) { struct partition *part; - if (mtd->type != MTD_NORFLASH) + if (mtd->type != MTD_NORFLASH || mtd->size > UINT_MAX) return; part = kzalloc(sizeof(struct partition), GFP_KERNEL); diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index 33a5d6ed6f1..3f67e00d98e 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -294,7 +294,8 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) int cis_sector; /* Check for small page NAND flash */ - if (mtd->type != MTD_NANDFLASH || mtd->oobsize != OOB_SIZE) + if (mtd->type != MTD_NANDFLASH || mtd->oobsize != OOB_SIZE || + mtd->size > UINT_MAX) return; /* Check for SSDFC format by reading CIS/IDI sector */ @@ -316,7 +317,7 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) ssfdc->cis_block = cis_sector / (mtd->erasesize >> SECTOR_SHIFT); ssfdc->erase_size = mtd->erasesize; - ssfdc->map_len = mtd->size / mtd->erasesize; + ssfdc->map_len = (u32)mtd->size / mtd->erasesize; DEBUG(MTD_DEBUG_LEVEL1, "SSFDC_RO: cis_block=%d,erase_size=%d,map_len=%d,n_zones=%d\n", @@ -327,7 +328,7 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) ssfdc->heads = 16; ssfdc->sectors = 32; get_chs(mtd->size, NULL, &ssfdc->heads, &ssfdc->sectors); - ssfdc->cylinders = (unsigned short)((mtd->size >> SECTOR_SHIFT) / + ssfdc->cylinders = (unsigned short)(((u32)mtd->size >> SECTOR_SHIFT) / ((long)ssfdc->sectors * (long)ssfdc->heads)); DEBUG(MTD_DEBUG_LEVEL1, "SSFDC_RO: using C:%d H:%d S:%d == %ld sects\n", diff --git a/drivers/mtd/tests/Makefile b/drivers/mtd/tests/Makefile new file mode 100644 index 00000000000..c1d50133500 --- /dev/null +++ b/drivers/mtd/tests/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_MTD_TESTS) += mtd_oobtest.o +obj-$(CONFIG_MTD_TESTS) += mtd_pagetest.o +obj-$(CONFIG_MTD_TESTS) += mtd_readtest.o +obj-$(CONFIG_MTD_TESTS) += mtd_speedtest.o +obj-$(CONFIG_MTD_TESTS) += mtd_stresstest.o +obj-$(CONFIG_MTD_TESTS) += mtd_subpagetest.o +obj-$(CONFIG_MTD_TESTS) += mtd_torturetest.o diff --git a/drivers/mtd/tests/mtd_oobtest.c b/drivers/mtd/tests/mtd_oobtest.c new file mode 100644 index 00000000000..afbc3f8126d --- /dev/null +++ b/drivers/mtd/tests/mtd_oobtest.c @@ -0,0 +1,742 @@ +/* + * Copyright (C) 
2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Test OOB read and write on MTD device. + * + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + */ + +#include <asm/div64.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_oobtest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static struct mtd_info *mtd; +static unsigned char *readbuf; +static unsigned char *writebuf; +static unsigned char *bbt; + +static int ebcnt; +static int pgcnt; +static int errcnt; +static int use_offset; +static int use_len; +static int use_len_max; +static int vary_offset; +static unsigned long next = 1; + +static inline unsigned int simple_rand(void) +{ + next = next * 1103515245 + 12345; + return (unsigned int)((next / 65536) % 32768); +} + +static inline void simple_srand(unsigned long seed) +{ + next = seed; +} + +static void set_random_data(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; ++i) + buf[i] = simple_rand(); +} + +static int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (err) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (ei.state == MTD_ERASE_FAILED) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +static int erase_whole_device(void) +{ + int err; + unsigned int i; + + printk(PRINT_PREF "erasing whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = erase_eraseblock(i); + if (err) + return err; + cond_resched(); + } + printk(PRINT_PREF "erased %u eraseblocks\n", i); + return 0; +} + +static void do_vary_offset(void) +{ + use_len -= 1; + if (use_len < 1) { + use_offset += 1; + if (use_offset >= use_len_max) + use_offset = 0; + use_len = use_len_max - use_offset; + } +} + +static int write_eraseblock(int ebnum) +{ + int i; + struct mtd_oob_ops ops; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + for (i = 0; i < pgcnt; ++i, addr += mtd->writesize) { + set_random_data(writebuf, use_len); + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = use_len; + ops.oobretlen = 0; + ops.ooboffs = use_offset; + ops.datbuf = 0; + ops.oobbuf = writebuf; + err = mtd->write_oob(mtd, addr, &ops); + if (err || ops.oobretlen != use_len) { + printk(PRINT_PREF "error: writeoob failed at %#llx\n", + (long long)addr); + printk(PRINT_PREF "error: use_len %d, use_offset %d\n", + use_len, use_offset); + errcnt += 1; + return err ? 
err : -1; + } + if (vary_offset) + do_vary_offset(); + } + + return err; +} + +static int write_whole_device(void) +{ + int err; + unsigned int i; + + printk(PRINT_PREF "writing OOBs of whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock(i); + if (err) + return err; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + return 0; +} + +static int verify_eraseblock(int ebnum) +{ + int i; + struct mtd_oob_ops ops; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + for (i = 0; i < pgcnt; ++i, addr += mtd->writesize) { + set_random_data(writebuf, use_len); + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = use_len; + ops.oobretlen = 0; + ops.ooboffs = use_offset; + ops.datbuf = 0; + ops.oobbuf = readbuf; + err = mtd->read_oob(mtd, addr, &ops); + if (err || ops.oobretlen != use_len) { + printk(PRINT_PREF "error: readoob failed at %#llx\n", + (long long)addr); + errcnt += 1; + return err ? err : -1; + } + if (memcmp(readbuf, writebuf, use_len)) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too many errors\n"); + return -1; + } + } + if (use_offset != 0 || use_len < mtd->ecclayout->oobavail) { + int k; + + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = readbuf; + err = mtd->read_oob(mtd, addr, &ops); + if (err || ops.oobretlen != mtd->ecclayout->oobavail) { + printk(PRINT_PREF "error: readoob failed at " + "%#llx\n", (long long)addr); + errcnt += 1; + return err ? err : -1; + } + if (memcmp(readbuf + use_offset, writebuf, use_len)) { + printk(PRINT_PREF "error: verify failed at " + "%#llx\n", (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too many " + "errors\n"); + return -1; + } + } + for (k = 0; k < use_offset; ++k) + if (readbuf[k] != 0xff) { + printk(PRINT_PREF "error: verify 0xff " + "failed at %#llx\n", + (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too " + "many errors\n"); + return -1; + } + } + for (k = use_offset + use_len; + k < mtd->ecclayout->oobavail; ++k) + if (readbuf[k] != 0xff) { + printk(PRINT_PREF "error: verify 0xff " + "failed at %#llx\n", + (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too " + "many errors\n"); + return -1; + } + } + } + if (vary_offset) + do_vary_offset(); + } + return err; +} + +static int verify_eraseblock_in_one_go(int ebnum) +{ + struct mtd_oob_ops ops; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + size_t len = mtd->ecclayout->oobavail * pgcnt; + + set_random_data(writebuf, len); + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = len; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = readbuf; + err = mtd->read_oob(mtd, addr, &ops); + if (err || ops.oobretlen != len) { + printk(PRINT_PREF "error: readoob failed at %#llx\n", + (long long)addr); + errcnt += 1; + return err ? 
err : -1; + } + if (memcmp(readbuf, writebuf, len)) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too many errors\n"); + return -1; + } + } + + return err; +} + +static int verify_all_eraseblocks(void) +{ + int err; + unsigned int i; + + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock(i); + if (err) + return err; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + return 0; +} + +static int is_block_bad(int ebnum) +{ + int ret; + loff_t addr = ebnum * mtd->erasesize; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + return 0; +} + +static int __init mtd_oobtest_init(void) +{ + int err = 0; + unsigned int i; + uint64_t tmp; + struct mtd_oob_ops ops; + loff_t addr = 0, addr0; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->type != MTD_NANDFLASH) { + printk(PRINT_PREF "this test requires NAND flash\n"); + goto out; + } + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, count of eraseblocks %u, pages per " + "eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + mtd->writesize, ebcnt, pgcnt, mtd->oobsize); + + err = -ENOMEM; + mtd->erasesize = mtd->erasesize; + readbuf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!readbuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + writebuf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!writebuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + use_offset = 0; + use_len = mtd->ecclayout->oobavail; + use_len_max = mtd->ecclayout->oobavail; + vary_offset = 0; + + /* First test: write all OOB, read it back and verify */ + printk(PRINT_PREF "test 1 of 5\n"); + + err = erase_whole_device(); + if (err) + goto out; + + simple_srand(1); + err = write_whole_device(); + if (err) + goto out; + + simple_srand(1); + err = verify_all_eraseblocks(); + if (err) + goto out; + + /* + * Second test: write all OOB, a block at a time, read it back and + * verify. 
+ */ + printk(PRINT_PREF "test 2 of 5\n"); + + err = erase_whole_device(); + if (err) + goto out; + + simple_srand(3); + err = write_whole_device(); + if (err) + goto out; + + /* Check all eraseblocks */ + simple_srand(3); + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock_in_one_go(i); + if (err) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + + /* + * Third test: write OOB at varying offsets and lengths, read it back + * and verify. + */ + printk(PRINT_PREF "test 3 of 5\n"); + + err = erase_whole_device(); + if (err) + goto out; + + /* Write all eraseblocks */ + use_offset = 0; + use_len = mtd->ecclayout->oobavail; + use_len_max = mtd->ecclayout->oobavail; + vary_offset = 1; + simple_srand(5); + printk(PRINT_PREF "writing OOBs of whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock(i); + if (err) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + + /* Check all eraseblocks */ + use_offset = 0; + use_len = mtd->ecclayout->oobavail; + use_len_max = mtd->ecclayout->oobavail; + vary_offset = 1; + simple_srand(5); + err = verify_all_eraseblocks(); + if (err) + goto out; + + use_offset = 0; + use_len = mtd->ecclayout->oobavail; + use_len_max = mtd->ecclayout->oobavail; + vary_offset = 0; + + /* Fourth test: try to write off end of device */ + printk(PRINT_PREF "test 4 of 5\n"); + + err = erase_whole_device(); + if (err) + goto out; + + addr0 = 0; + for (i = 0; bbt[i] && i < ebcnt; ++i) + addr0 += mtd->erasesize; + + /* Attempt to write off end of OOB */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = 1; + ops.oobretlen = 0; + ops.ooboffs = mtd->ecclayout->oobavail; + ops.datbuf = 0; + ops.oobbuf = writebuf; + printk(PRINT_PREF "attempting to start write past end of OOB\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->write_oob(mtd, addr0, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: can write past end of OOB\n"); + errcnt += 1; + } + + /* Attempt to read off end of OOB */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = 1; + ops.oobretlen = 0; + ops.ooboffs = mtd->ecclayout->oobavail; + ops.datbuf = 0; + ops.oobbuf = readbuf; + printk(PRINT_PREF "attempting to start read past end of OOB\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->read_oob(mtd, addr0, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: can read past end of OOB\n"); + errcnt += 1; + } + + if (bbt[ebcnt - 1]) + printk(PRINT_PREF "skipping end of device tests because last " + "block is bad\n"); + else { + /* Attempt to write off end of device */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail + 1; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = writebuf; + printk(PRINT_PREF "attempting to write past end of device\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->write_oob(mtd, mtd->size - mtd->writesize, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: wrote past 
end of device\n"); + errcnt += 1; + } + + /* Attempt to read off end of device */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail + 1; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = readbuf; + printk(PRINT_PREF "attempting to read past end of device\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->read_oob(mtd, mtd->size - mtd->writesize, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: read past end of device\n"); + errcnt += 1; + } + + err = erase_eraseblock(ebcnt - 1); + if (err) + goto out; + + /* Attempt to write off end of device */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail; + ops.oobretlen = 0; + ops.ooboffs = 1; + ops.datbuf = 0; + ops.oobbuf = writebuf; + printk(PRINT_PREF "attempting to write past end of device\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->write_oob(mtd, mtd->size - mtd->writesize, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: wrote past end of device\n"); + errcnt += 1; + } + + /* Attempt to read off end of device */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail; + ops.oobretlen = 0; + ops.ooboffs = 1; + ops.datbuf = 0; + ops.oobbuf = readbuf; + printk(PRINT_PREF "attempting to read past end of device\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->read_oob(mtd, mtd->size - mtd->writesize, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: read past end of device\n"); + errcnt += 1; + } + } + + /* Fifth test: write / read across block boundaries */ + printk(PRINT_PREF "test 5 of 5\n"); + + /* Erase all eraseblocks */ + err = erase_whole_device(); + if (err) + goto out; + + /* Write all eraseblocks */ + simple_srand(11); + printk(PRINT_PREF "writing OOBs of whole device\n"); + for (i = 0; i < ebcnt - 1; ++i) { + int cnt = 2; + int pg; + size_t sz = mtd->ecclayout->oobavail; + if (bbt[i] || bbt[i + 1]) + continue; + addr = (i + 1) * mtd->erasesize - mtd->writesize; + for (pg = 0; pg < cnt; ++pg) { + set_random_data(writebuf, sz); + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = sz; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = writebuf; + err = mtd->write_oob(mtd, addr, &ops); + if (err) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock " + "%u\n", i); + cond_resched(); + addr += mtd->writesize; + } + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + + /* Check all eraseblocks */ + simple_srand(11); + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt - 1; ++i) { + if (bbt[i] || bbt[i + 1]) + continue; + set_random_data(writebuf, mtd->ecclayout->oobavail * 2); + addr = (i + 1) * mtd->erasesize - mtd->writesize; + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail * 2; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = readbuf; + err = mtd->read_oob(mtd, addr, &ops); + if (err) + goto out; + if (memcmp(readbuf, writebuf, mtd->ecclayout->oobavail * 2)) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too many errors\n"); + goto out; 
+ } + } + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + + printk(PRINT_PREF "finished with %d errors\n", errcnt); +out: + kfree(bbt); + kfree(writebuf); + kfree(readbuf); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_oobtest_init); + +static void __exit mtd_oobtest_exit(void) +{ + return; +} +module_exit(mtd_oobtest_exit); + +MODULE_DESCRIPTION("Out-of-band test module"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_pagetest.c b/drivers/mtd/tests/mtd_pagetest.c new file mode 100644 index 00000000000..9648818b9e2 --- /dev/null +++ b/drivers/mtd/tests/mtd_pagetest.c @@ -0,0 +1,632 @@ +/* + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Test page read and write on MTD device. + * + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + */ + +#include <asm/div64.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_pagetest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static struct mtd_info *mtd; +static unsigned char *twopages; +static unsigned char *writebuf; +static unsigned char *boundary; +static unsigned char *bbt; + +static int pgsize; +static int bufsize; +static int ebcnt; +static int pgcnt; +static int errcnt; +static unsigned long next = 1; + +static inline unsigned int simple_rand(void) +{ + next = next * 1103515245 + 12345; + return (unsigned int)((next / 65536) % 32768); +} + +static inline void simple_srand(unsigned long seed) +{ + next = seed; +} + +static void set_random_data(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; ++i) + buf[i] = simple_rand(); +} + +static int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (err) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (ei.state == MTD_ERASE_FAILED) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +static int write_eraseblock(int ebnum) +{ + int err = 0; + size_t written = 0; + loff_t addr = ebnum * mtd->erasesize; + + set_random_data(writebuf, mtd->erasesize); + cond_resched(); + err = mtd->write(mtd, addr, mtd->erasesize, &written, writebuf); + if (err || written != mtd->erasesize) + printk(PRINT_PREF "error: write 
failed at %#llx\n", + (long long)addr); + + return err; +} + +static int verify_eraseblock(int ebnum) +{ + uint32_t j; + size_t read = 0; + int err = 0, i; + loff_t addr0, addrn; + loff_t addr = ebnum * mtd->erasesize; + + addr0 = 0; + for (i = 0; bbt[i] && i < ebcnt; ++i) + addr0 += mtd->erasesize; + + addrn = mtd->size; + for (i = 0; bbt[ebcnt - i - 1] && i < ebcnt; ++i) + addrn -= mtd->erasesize; + + set_random_data(writebuf, mtd->erasesize); + for (j = 0; j < pgcnt - 1; ++j, addr += pgsize) { + /* Do a read to set the internal dataRAMs to different data */ + err = mtd->read(mtd, addr0, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr0); + return err; + } + err = mtd->read(mtd, addrn - bufsize, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)(addrn - bufsize)); + return err; + } + memset(twopages, 0, bufsize); + read = 0; + err = mtd->read(mtd, addr, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + break; + } + if (memcmp(twopages, writebuf + (j * pgsize), bufsize)) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + } + } + /* Check boundary between eraseblocks */ + if (addr <= addrn - pgsize - pgsize && !bbt[ebnum + 1]) { + unsigned long oldnext = next; + /* Do a read to set the internal dataRAMs to different data */ + err = mtd->read(mtd, addr0, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr0); + return err; + } + err = mtd->read(mtd, addrn - bufsize, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)(addrn - bufsize)); + return err; + } + memset(twopages, 0, bufsize); + read = 0; + err = mtd->read(mtd, addr, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + return err; + } + memcpy(boundary, writebuf + mtd->erasesize - pgsize, pgsize); + set_random_data(boundary + pgsize, pgsize); + if (memcmp(twopages, boundary, bufsize)) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + } + next = oldnext; + } + return err; +} + +static int crosstest(void) +{ + size_t read = 0; + int err = 0, i; + loff_t addr, addr0, addrn; + unsigned char *pp1, *pp2, *pp3, *pp4; + + printk(PRINT_PREF "crosstest\n"); + pp1 = kmalloc(pgsize * 4, GFP_KERNEL); + if (!pp1) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + pp2 = pp1 + pgsize; + pp3 = pp2 + pgsize; + pp4 = pp3 + pgsize; + memset(pp1, 0, pgsize * 4); + + addr0 = 0; + for (i = 0; bbt[i] && i < ebcnt; ++i) + addr0 += mtd->erasesize; + + addrn = mtd->size; + for (i = 0; bbt[ebcnt - i - 1] && i < ebcnt; ++i) + addrn -= mtd->erasesize; + + /* Read 2nd-to-last page to pp1 */ + read = 0; + addr = addrn - pgsize - pgsize; + err = mtd->read(mtd, addr, pgsize, &read, pp1); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + kfree(pp1); + return err; + } + + /* Read 3rd-to-last page to 
pp1 */ + read = 0; + addr = addrn - pgsize - pgsize - pgsize; + err = mtd->read(mtd, addr, pgsize, &read, pp1); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + kfree(pp1); + return err; + } + + /* Read first page to pp2 */ + read = 0; + addr = addr0; + printk(PRINT_PREF "reading page at %#llx\n", (long long)addr); + err = mtd->read(mtd, addr, pgsize, &read, pp2); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + kfree(pp1); + return err; + } + + /* Read last page to pp3 */ + read = 0; + addr = addrn - pgsize; + printk(PRINT_PREF "reading page at %#llx\n", (long long)addr); + err = mtd->read(mtd, addr, pgsize, &read, pp3); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + kfree(pp1); + return err; + } + + /* Read first page again to pp4 */ + read = 0; + addr = addr0; + printk(PRINT_PREF "reading page at %#llx\n", (long long)addr); + err = mtd->read(mtd, addr, pgsize, &read, pp4); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + kfree(pp1); + return err; + } + + /* pp2 and pp4 should be the same */ + printk(PRINT_PREF "verifying pages read at %#llx match\n", + (long long)addr0); + if (memcmp(pp2, pp4, pgsize)) { + printk(PRINT_PREF "verify failed!\n"); + errcnt += 1; + } else if (!err) + printk(PRINT_PREF "crosstest ok\n"); + kfree(pp1); + return err; +} + +static int erasecrosstest(void) +{ + size_t read = 0, written = 0; + int err = 0, i, ebnum, ok = 1, ebnum2; + loff_t addr0; + char *readbuf = twopages; + + printk(PRINT_PREF "erasecrosstest\n"); + + ebnum = 0; + addr0 = 0; + for (i = 0; bbt[i] && i < ebcnt; ++i) { + addr0 += mtd->erasesize; + ebnum += 1; + } + + ebnum2 = ebcnt - 1; + while (ebnum2 && bbt[ebnum2]) + ebnum2 -= 1; + + printk(PRINT_PREF "erasing block %d\n", ebnum); + err = erase_eraseblock(ebnum); + if (err) + return err; + + printk(PRINT_PREF "writing 1st page of block %d\n", ebnum); + set_random_data(writebuf, pgsize); + strcpy(writebuf, "There is no data like this!"); + err = mtd->write(mtd, addr0, pgsize, &written, writebuf); + if (err || written != pgsize) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr0); + return err ? err : -1; + } + + printk(PRINT_PREF "reading 1st page of block %d\n", ebnum); + memset(readbuf, 0, pgsize); + err = mtd->read(mtd, addr0, pgsize, &read, readbuf); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr0); + return err ? err : -1; + } + + printk(PRINT_PREF "verifying 1st page of block %d\n", ebnum); + if (memcmp(writebuf, readbuf, pgsize)) { + printk(PRINT_PREF "verify failed!\n"); + errcnt += 1; + ok = 0; + return err; + } + + printk(PRINT_PREF "erasing block %d\n", ebnum); + err = erase_eraseblock(ebnum); + if (err) + return err; + + printk(PRINT_PREF "writing 1st page of block %d\n", ebnum); + set_random_data(writebuf, pgsize); + strcpy(writebuf, "There is no data like this!"); + err = mtd->write(mtd, addr0, pgsize, &written, writebuf); + if (err || written != pgsize) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr0); + return err ? 
err : -1; + } + + printk(PRINT_PREF "erasing block %d\n", ebnum2); + err = erase_eraseblock(ebnum2); + if (err) + return err; + + printk(PRINT_PREF "reading 1st page of block %d\n", ebnum); + memset(readbuf, 0, pgsize); + err = mtd->read(mtd, addr0, pgsize, &read, readbuf); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr0); + return err ? err : -1; + } + + printk(PRINT_PREF "verifying 1st page of block %d\n", ebnum); + if (memcmp(writebuf, readbuf, pgsize)) { + printk(PRINT_PREF "verify failed!\n"); + errcnt += 1; + ok = 0; + } + + if (ok && !err) + printk(PRINT_PREF "erasecrosstest ok\n"); + return err; +} + +static int erasetest(void) +{ + size_t read = 0, written = 0; + int err = 0, i, ebnum, ok = 1; + loff_t addr0; + + printk(PRINT_PREF "erasetest\n"); + + ebnum = 0; + addr0 = 0; + for (i = 0; bbt[i] && i < ebcnt; ++i) { + addr0 += mtd->erasesize; + ebnum += 1; + } + + printk(PRINT_PREF "erasing block %d\n", ebnum); + err = erase_eraseblock(ebnum); + if (err) + return err; + + printk(PRINT_PREF "writing 1st page of block %d\n", ebnum); + set_random_data(writebuf, pgsize); + err = mtd->write(mtd, addr0, pgsize, &written, writebuf); + if (err || written != pgsize) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr0); + return err ? err : -1; + } + + printk(PRINT_PREF "erasing block %d\n", ebnum); + err = erase_eraseblock(ebnum); + if (err) + return err; + + printk(PRINT_PREF "reading 1st page of block %d\n", ebnum); + err = mtd->read(mtd, addr0, pgsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr0); + return err ? err : -1; + } + + printk(PRINT_PREF "verifying 1st page of block %d is all 0xff\n", + ebnum); + for (i = 0; i < pgsize; ++i) + if (twopages[i] != 0xff) { + printk(PRINT_PREF "verifying all 0xff failed at %d\n", + i); + errcnt += 1; + ok = 0; + break; + } + + if (ok && !err) + printk(PRINT_PREF "erasetest ok\n"); + + return err; +} + +static int is_block_bad(int ebnum) +{ + loff_t addr = ebnum * mtd->erasesize; + int ret; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 
1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + return 0; +} + +static int __init mtd_pagetest_init(void) +{ + int err = 0; + uint64_t tmp; + uint32_t i; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->type != MTD_NANDFLASH) { + printk(PRINT_PREF "this test requires NAND flash\n"); + goto out; + } + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, count of eraseblocks %u, pages per " + "eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + pgsize, ebcnt, pgcnt, mtd->oobsize); + + err = -ENOMEM; + bufsize = pgsize * 2; + writebuf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!writebuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + twopages = kmalloc(bufsize, GFP_KERNEL); + if (!twopages) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + boundary = kmalloc(bufsize, GFP_KERNEL); + if (!boundary) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + /* Erase all eraseblocks */ + printk(PRINT_PREF "erasing whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = erase_eraseblock(i); + if (err) + goto out; + cond_resched(); + } + printk(PRINT_PREF "erased %u eraseblocks\n", i); + + /* Write all eraseblocks */ + simple_srand(1); + printk(PRINT_PREF "writing whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock(i); + if (err) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + + /* Check all eraseblocks */ + simple_srand(1); + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock(i); + if (err) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + + err = crosstest(); + if (err) + goto out; + + err = erasecrosstest(); + if (err) + goto out; + + err = erasetest(); + if (err) + goto out; + + printk(PRINT_PREF "finished with %d errors\n", errcnt); +out: + + kfree(bbt); + kfree(boundary); + kfree(twopages); + kfree(writebuf); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_pagetest_init); + +static void __exit mtd_pagetest_exit(void) +{ + return; +} +module_exit(mtd_pagetest_exit); + +MODULE_DESCRIPTION("NAND page test"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_readtest.c b/drivers/mtd/tests/mtd_readtest.c new file mode 100644 index 00000000000..645e77fdc63 --- /dev/null +++ b/drivers/mtd/tests/mtd_readtest.c @@ -0,0 +1,253 @@ +/* + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General 
Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Check MTD device read. + * + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_readtest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static struct mtd_info *mtd; +static unsigned char *iobuf; +static unsigned char *iobuf1; +static unsigned char *bbt; + +static int pgsize; +static int ebcnt; +static int pgcnt; + +static int read_eraseblock_by_page(int ebnum) +{ + size_t read = 0; + int i, ret, err = 0; + loff_t addr = ebnum * mtd->erasesize; + void *buf = iobuf; + void *oobbuf = iobuf1; + + for (i = 0; i < pgcnt; i++) { + memset(buf, 0 , pgcnt); + ret = mtd->read(mtd, addr, pgsize, &read, buf); + if (ret == -EUCLEAN) + ret = 0; + if (ret || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + if (!err) + err = ret; + if (!err) + err = -EINVAL; + } + if (mtd->oobsize) { + struct mtd_oob_ops ops; + + ops.mode = MTD_OOB_PLACE; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->oobsize; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = oobbuf; + ret = mtd->read_oob(mtd, addr, &ops); + if (ret || ops.oobretlen != mtd->oobsize) { + printk(PRINT_PREF "error: read oob failed at " + "%#llx\n", (long long)addr); + if (!err) + err = ret; + if (!err) + err = -EINVAL; + } + oobbuf += mtd->oobsize; + } + addr += pgsize; + buf += pgsize; + } + + return err; +} + +static void dump_eraseblock(int ebnum) +{ + int i, j, n; + char line[128]; + int pg, oob; + + printk(PRINT_PREF "dumping eraseblock %d\n", ebnum); + n = mtd->erasesize; + for (i = 0; i < n;) { + char *p = line; + + p += sprintf(p, "%05x: ", i); + for (j = 0; j < 32 && i < n; j++, i++) + p += sprintf(p, "%02x", (unsigned int)iobuf[i]); + printk(KERN_CRIT "%s\n", line); + cond_resched(); + } + if (!mtd->oobsize) + return; + printk(PRINT_PREF "dumping oob from eraseblock %d\n", ebnum); + n = mtd->oobsize; + for (pg = 0, i = 0; pg < pgcnt; pg++) + for (oob = 0; oob < n;) { + char *p = line; + + p += sprintf(p, "%05x: ", i); + for (j = 0; j < 32 && oob < n; j++, oob++, i++) + p += sprintf(p, "%02x", + (unsigned int)iobuf1[i]); + printk(KERN_CRIT "%s\n", line); + cond_resched(); + } +} + +static int is_block_bad(int ebnum) +{ + loff_t addr = ebnum * mtd->erasesize; + int ret; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 
1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + return 0; +} + +static int __init mtd_readtest_init(void) +{ + uint64_t tmp; + int err, i; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: Cannot get MTD device\n"); + return err; + } + + if (mtd->writesize == 1) { + printk(PRINT_PREF "not NAND flash, assume page size is 512 " + "bytes.\n"); + pgsize = 512; + } else + pgsize = mtd->writesize; + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, count of eraseblocks %u, pages per " + "eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + pgsize, ebcnt, pgcnt, mtd->oobsize); + + err = -ENOMEM; + iobuf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!iobuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + iobuf1 = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!iobuf1) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + /* Read all eraseblocks 1 page at a time */ + printk(PRINT_PREF "testing page read\n"); + for (i = 0; i < ebcnt; ++i) { + int ret; + + if (bbt[i]) + continue; + ret = read_eraseblock_by_page(i); + if (ret) { + dump_eraseblock(i); + if (!err) + err = ret; + } + cond_resched(); + } + + if (err) + printk(PRINT_PREF "finished with errors\n"); + else + printk(PRINT_PREF "finished\n"); + +out: + + kfree(iobuf); + kfree(iobuf1); + kfree(bbt); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_readtest_init); + +static void __exit mtd_readtest_exit(void) +{ + return; +} +module_exit(mtd_readtest_exit); + +MODULE_DESCRIPTION("Read test module"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_speedtest.c b/drivers/mtd/tests/mtd_speedtest.c new file mode 100644 index 00000000000..141363a7e80 --- /dev/null +++ b/drivers/mtd/tests/mtd_speedtest.c @@ -0,0 +1,502 @@ +/* + * Copyright (C) 2007 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Test read and write speed of a MTD device. 
+ * + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_speedtest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static struct mtd_info *mtd; +static unsigned char *iobuf; +static unsigned char *bbt; + +static int pgsize; +static int ebcnt; +static int pgcnt; +static int goodebcnt; +static struct timeval start, finish; +static unsigned long next = 1; + +static inline unsigned int simple_rand(void) +{ + next = next * 1103515245 + 12345; + return (unsigned int)((next / 65536) % 32768); +} + +static inline void simple_srand(unsigned long seed) +{ + next = seed; +} + +static void set_random_data(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; ++i) + buf[i] = simple_rand(); +} + +static int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (err) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (ei.state == MTD_ERASE_FAILED) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +static int erase_whole_device(void) +{ + int err; + unsigned int i; + + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = erase_eraseblock(i); + if (err) + return err; + cond_resched(); + } + return 0; +} + +static int write_eraseblock(int ebnum) +{ + size_t written = 0; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + err = mtd->write(mtd, addr, mtd->erasesize, &written, iobuf); + if (err || written != mtd->erasesize) { + printk(PRINT_PREF "error: write failed at %#llx\n", addr); + if (!err) + err = -EINVAL; + } + + return err; +} + +static int write_eraseblock_by_page(int ebnum) +{ + size_t written = 0; + int i, err = 0; + loff_t addr = ebnum * mtd->erasesize; + void *buf = iobuf; + + for (i = 0; i < pgcnt; i++) { + err = mtd->write(mtd, addr, pgsize, &written, buf); + if (err || written != pgsize) { + printk(PRINT_PREF "error: write failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + break; + } + addr += pgsize; + buf += pgsize; + } + + return err; +} + +static int write_eraseblock_by_2pages(int ebnum) +{ + size_t written = 0, sz = pgsize * 2; + int i, n = pgcnt / 2, err = 0; + loff_t addr = ebnum * mtd->erasesize; + void *buf = iobuf; + + for (i = 0; i < n; i++) { + err = mtd->write(mtd, addr, sz, &written, buf); + if (err || written != sz) { + printk(PRINT_PREF "error: write failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + return err; + } + addr += sz; + buf += sz; + } + if (pgcnt % 2) { + err = mtd->write(mtd, addr, pgsize, &written, buf); + if (err || written != pgsize) { + printk(PRINT_PREF "error: write failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + } + } + + return err; +} + +static int read_eraseblock(int ebnum) +{ + size_t read = 0; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + err = mtd->read(mtd, addr, mtd->erasesize, &read, iobuf); + /* Ignore corrected ECC errors */ + if (err == -EUCLEAN) + err = 0; + if (err || read != mtd->erasesize) { + printk(PRINT_PREF "error: read failed at %#llx\n", addr); + if (!err) + err = -EINVAL; + } + + return err; +} + 
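All of the mtd_*test modules in this patch generate their test data with the same tiny linear congruential generator rather than the kernel's random pool. Re-seeding it with the same value reproduces the byte stream exactly, which is how the verify passes recompute what was written without having to buffer it. Below is a minimal userspace sketch of that pattern; it is illustrative only and not part of the patch, and the helper names simply mirror the ones used in the modules above.

#include <stdio.h>
#include <string.h>

static unsigned long next = 1;

/* Same recurrence as in the mtd_*test modules: values in [0, 32767] */
static unsigned int simple_rand(void)
{
        next = next * 1103515245 + 12345;
        return (unsigned int)((next / 65536) % 32768);
}

static void simple_srand(unsigned long seed)
{
        next = seed;
}

/* Fill a buffer from the generator; only the low byte of each value is kept */
static void set_random_data(unsigned char *buf, size_t len)
{
        size_t i;

        for (i = 0; i < len; ++i)
                buf[i] = simple_rand();
}

int main(void)
{
        unsigned char a[16], b[16];

        simple_srand(1);
        set_random_data(a, sizeof(a));
        simple_srand(1);                /* same seed ...           */
        set_random_data(b, sizeof(b));  /* ... regenerates the data */
        printf("streams %s\n", memcmp(a, b, sizeof(a)) ? "differ" : "match");
        return 0;
}

Determinism is the point of this design: in mtd_oobtest, for example, the write pass and the verify pass both start from simple_srand(1) (or 3, 5, 11 for the later tests), so the verifier regenerates the expected bytes on the fly and compares them against what read_oob returns.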
+static int read_eraseblock_by_page(int ebnum) +{ + size_t read = 0; + int i, err = 0; + loff_t addr = ebnum * mtd->erasesize; + void *buf = iobuf; + + for (i = 0; i < pgcnt; i++) { + err = mtd->read(mtd, addr, pgsize, &read, buf); + /* Ignore corrected ECC errors */ + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + break; + } + addr += pgsize; + buf += pgsize; + } + + return err; +} + +static int read_eraseblock_by_2pages(int ebnum) +{ + size_t read = 0, sz = pgsize * 2; + int i, n = pgcnt / 2, err = 0; + loff_t addr = ebnum * mtd->erasesize; + void *buf = iobuf; + + for (i = 0; i < n; i++) { + err = mtd->read(mtd, addr, sz, &read, buf); + /* Ignore corrected ECC errors */ + if (err == -EUCLEAN) + err = 0; + if (err || read != sz) { + printk(PRINT_PREF "error: read failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + return err; + } + addr += sz; + buf += sz; + } + if (pgcnt % 2) { + err = mtd->read(mtd, addr, pgsize, &read, buf); + /* Ignore corrected ECC errors */ + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + } + } + + return err; +} + +static int is_block_bad(int ebnum) +{ + loff_t addr = ebnum * mtd->erasesize; + int ret; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static inline void start_timing(void) +{ + do_gettimeofday(&start); +} + +static inline void stop_timing(void) +{ + do_gettimeofday(&finish); +} + +static long calc_speed(void) +{ + long ms, k, speed; + + ms = (finish.tv_sec - start.tv_sec) * 1000 + + (finish.tv_usec - start.tv_usec) / 1000; + k = goodebcnt * mtd->erasesize / 1024; + speed = (k * 1000) / ms; + return speed; +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 
1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + goodebcnt = ebcnt - bad; + return 0; +} + +static int __init mtd_speedtest_init(void) +{ + int err, i; + long speed; + uint64_t tmp; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->writesize == 1) { + printk(PRINT_PREF "not NAND flash, assume page size is 512 " + "bytes.\n"); + pgsize = 512; + } else + pgsize = mtd->writesize; + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, count of eraseblocks %u, pages per " + "eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + pgsize, ebcnt, pgcnt, mtd->oobsize); + + err = -ENOMEM; + iobuf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!iobuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + + simple_srand(1); + set_random_data(iobuf, mtd->erasesize); + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + err = erase_whole_device(); + if (err) + goto out; + + /* Write all eraseblocks, 1 eraseblock at a time */ + printk(PRINT_PREF "testing eraseblock write speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "eraseblock write speed is %ld KiB/s\n", speed); + + /* Read all eraseblocks, 1 eraseblock at a time */ + printk(PRINT_PREF "testing eraseblock read speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = read_eraseblock(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "eraseblock read speed is %ld KiB/s\n", speed); + + err = erase_whole_device(); + if (err) + goto out; + + /* Write all eraseblocks, 1 page at a time */ + printk(PRINT_PREF "testing page write speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock_by_page(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "page write speed is %ld KiB/s\n", speed); + + /* Read all eraseblocks, 1 page at a time */ + printk(PRINT_PREF "testing page read speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = read_eraseblock_by_page(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "page read speed is %ld KiB/s\n", speed); + + err = erase_whole_device(); + if (err) + goto out; + + /* Write all eraseblocks, 2 pages at a time */ + printk(PRINT_PREF "testing 2 page write speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock_by_2pages(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "2 page write speed is %ld KiB/s\n", speed); + + /* Read all eraseblocks, 2 pages at a time */ + printk(PRINT_PREF "testing 2 page read speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = 
read_eraseblock_by_2pages(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "2 page read speed is %ld KiB/s\n", speed); + + /* Erase all eraseblocks */ + printk(PRINT_PREF "Testing erase speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = erase_eraseblock(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "erase speed is %ld KiB/s\n", speed); + + printk(PRINT_PREF "finished\n"); +out: + kfree(iobuf); + kfree(bbt); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_speedtest_init); + +static void __exit mtd_speedtest_exit(void) +{ + return; +} +module_exit(mtd_speedtest_exit); + +MODULE_DESCRIPTION("Speed test module"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_stresstest.c b/drivers/mtd/tests/mtd_stresstest.c new file mode 100644 index 00000000000..63920476b57 --- /dev/null +++ b/drivers/mtd/tests/mtd_stresstest.c @@ -0,0 +1,330 @@ +/* + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Test random reads, writes and erases on MTD device. 
+ * + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> + +#define PRINT_PREF KERN_INFO "mtd_stresstest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static int count = 10000; +module_param(count, int, S_IRUGO); +MODULE_PARM_DESC(count, "Number of operations to do (default is 10000)"); + +static struct mtd_info *mtd; +static unsigned char *writebuf; +static unsigned char *readbuf; +static unsigned char *bbt; +static int *offsets; + +static int pgsize; +static int bufsize; +static int ebcnt; +static int pgcnt; +static unsigned long next = 1; + +static inline unsigned int simple_rand(void) +{ + next = next * 1103515245 + 12345; + return (unsigned int)((next / 65536) % 32768); +} + +static inline void simple_srand(unsigned long seed) +{ + next = seed; +} + +static int rand_eb(void) +{ + int eb; + +again: + if (ebcnt < 32768) + eb = simple_rand(); + else + eb = (simple_rand() << 15) | simple_rand(); + /* Read or write up 2 eraseblocks at a time - hence 'ebcnt - 1' */ + eb %= (ebcnt - 1); + if (bbt[eb]) + goto again; + return eb; +} + +static int rand_offs(void) +{ + int offs; + + if (bufsize < 32768) + offs = simple_rand(); + else + offs = (simple_rand() << 15) | simple_rand(); + offs %= bufsize; + return offs; +} + +static int rand_len(int offs) +{ + int len; + + if (bufsize < 32768) + len = simple_rand(); + else + len = (simple_rand() << 15) | simple_rand(); + len %= (bufsize - offs); + return len; +} + +static int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (unlikely(err)) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (unlikely(ei.state == MTD_ERASE_FAILED)) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +static int is_block_bad(int ebnum) +{ + loff_t addr = ebnum * mtd->erasesize; + int ret; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static int do_read(void) +{ + size_t read = 0; + int eb = rand_eb(); + int offs = rand_offs(); + int len = rand_len(offs), err; + loff_t addr; + + if (bbt[eb + 1]) { + if (offs >= mtd->erasesize) + offs -= mtd->erasesize; + if (offs + len > mtd->erasesize) + len = mtd->erasesize - offs; + } + addr = eb * mtd->erasesize + offs; + err = mtd->read(mtd, addr, len, &read, readbuf); + if (err == -EUCLEAN) + err = 0; + if (unlikely(err || read != len)) { + printk(PRINT_PREF "error: read failed at 0x%llx\n", + (long long)addr); + if (!err) + err = -EINVAL; + return err; + } + return 0; +} + +static int do_write(void) +{ + int eb = rand_eb(), offs, err, len; + size_t written = 0; + loff_t addr; + + offs = offsets[eb]; + if (offs >= mtd->erasesize) { + err = erase_eraseblock(eb); + if (err) + return err; + offs = offsets[eb] = 0; + } + len = rand_len(offs); + len = ((len + pgsize - 1) / pgsize) * pgsize; + if (offs + len > mtd->erasesize) { + if (bbt[eb + 1]) + len = mtd->erasesize - offs; + else { + err = erase_eraseblock(eb + 1); + if (err) + return err; + offsets[eb + 1] = 0; + } + } + addr = eb * mtd->erasesize + offs; 
+ err = mtd->write(mtd, addr, len, &written, writebuf); + if (unlikely(err || written != len)) { + printk(PRINT_PREF "error: write failed at 0x%llx\n", + (long long)addr); + if (!err) + err = -EINVAL; + return err; + } + offs += len; + while (offs > mtd->erasesize) { + offsets[eb++] = mtd->erasesize; + offs -= mtd->erasesize; + } + offsets[eb] = offs; + return 0; +} + +static int do_operation(void) +{ + if (simple_rand() & 1) + return do_read(); + else + return do_write(); +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + return 0; +} + +static int __init mtd_stresstest_init(void) +{ + int err; + int i, op; + uint64_t tmp; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->writesize == 1) { + printk(PRINT_PREF "not NAND flash, assume page size is 512 " + "bytes.\n"); + pgsize = 512; + } else + pgsize = mtd->writesize; + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, count of eraseblocks %u, pages per " + "eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + pgsize, ebcnt, pgcnt, mtd->oobsize); + + /* Read or write up 2 eraseblocks at a time */ + bufsize = mtd->erasesize * 2; + + err = -ENOMEM; + readbuf = vmalloc(bufsize); + writebuf = vmalloc(bufsize); + offsets = kmalloc(ebcnt * sizeof(int), GFP_KERNEL); + if (!readbuf || !writebuf || !offsets) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + for (i = 0; i < ebcnt; i++) + offsets[i] = mtd->erasesize; + simple_srand(current->pid); + for (i = 0; i < bufsize; i++) + writebuf[i] = simple_rand(); + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + /* Do operations */ + printk(PRINT_PREF "doing operations\n"); + for (op = 0; op < count; op++) { + if ((op & 1023) == 0) + printk(PRINT_PREF "%d operations done\n", op); + err = do_operation(); + if (err) + goto out; + cond_resched(); + } + printk(PRINT_PREF "finished, %d operations done\n", op); + +out: + kfree(offsets); + kfree(bbt); + vfree(writebuf); + vfree(readbuf); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_stresstest_init); + +static void __exit mtd_stresstest_exit(void) +{ + return; +} +module_exit(mtd_stresstest_exit); + +MODULE_DESCRIPTION("Stress test module"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_subpagetest.c b/drivers/mtd/tests/mtd_subpagetest.c new file mode 100644 index 00000000000..5b889724268 --- /dev/null +++ b/drivers/mtd/tests/mtd_subpagetest.c @@ -0,0 +1,525 @@ +/* + * Copyright (C) 2006-2007 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * 
under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Test sub-page read and write on MTD device. + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_subpagetest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static struct mtd_info *mtd; +static unsigned char *writebuf; +static unsigned char *readbuf; +static unsigned char *bbt; + +static int subpgsize; +static int bufsize; +static int ebcnt; +static int pgcnt; +static int errcnt; +static unsigned long next = 1; + +static inline unsigned int simple_rand(void) +{ + next = next * 1103515245 + 12345; + return (unsigned int)((next / 65536) % 32768); +} + +static inline void simple_srand(unsigned long seed) +{ + next = seed; +} + +static void set_random_data(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; ++i) + buf[i] = simple_rand(); +} + +static inline void clear_data(unsigned char *buf, size_t len) +{ + memset(buf, 0, len); +} + +static int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (err) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (ei.state == MTD_ERASE_FAILED) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +static int erase_whole_device(void) +{ + int err; + unsigned int i; + + printk(PRINT_PREF "erasing whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = erase_eraseblock(i); + if (err) + return err; + cond_resched(); + } + printk(PRINT_PREF "erased %u eraseblocks\n", i); + return 0; +} + +static int write_eraseblock(int ebnum) +{ + size_t written = 0; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + set_random_data(writebuf, subpgsize); + err = mtd->write(mtd, addr, subpgsize, &written, writebuf); + if (unlikely(err || written != subpgsize)) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr); + if (written != subpgsize) { + printk(PRINT_PREF " write size: %#x\n", subpgsize); + printk(PRINT_PREF " written: %#zx\n", written); + } + return err ? err : -1; + } + + addr += subpgsize; + + set_random_data(writebuf, subpgsize); + err = mtd->write(mtd, addr, subpgsize, &written, writebuf); + if (unlikely(err || written != subpgsize)) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr); + if (written != subpgsize) { + printk(PRINT_PREF " write size: %#x\n", subpgsize); + printk(PRINT_PREF " written: %#zx\n", written); + } + return err ? 
err : -1; + } + + return err; +} + +static int write_eraseblock2(int ebnum) +{ + size_t written = 0; + int err = 0, k; + loff_t addr = ebnum * mtd->erasesize; + + for (k = 1; k < 33; ++k) { + if (addr + (subpgsize * k) > (ebnum + 1) * mtd->erasesize) + break; + set_random_data(writebuf, subpgsize * k); + err = mtd->write(mtd, addr, subpgsize * k, &written, writebuf); + if (unlikely(err || written != subpgsize * k)) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr); + if (written != subpgsize) { + printk(PRINT_PREF " write size: %#x\n", + subpgsize * k); + printk(PRINT_PREF " written: %#08zx\n", + written); + } + return err ? err : -1; + } + addr += subpgsize * k; + } + + return err; +} + +static void print_subpage(unsigned char *p) +{ + int i, j; + + for (i = 0; i < subpgsize; ) { + for (j = 0; i < subpgsize && j < 32; ++i, ++j) + printk("%02x", *p++); + printk("\n"); + } +} + +static int verify_eraseblock(int ebnum) +{ + size_t read = 0; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + set_random_data(writebuf, subpgsize); + clear_data(readbuf, subpgsize); + read = 0; + err = mtd->read(mtd, addr, subpgsize, &read, readbuf); + if (unlikely(err || read != subpgsize)) { + if (err == -EUCLEAN && read == subpgsize) { + printk(PRINT_PREF "ECC correction at %#llx\n", + (long long)addr); + err = 0; + } else { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + return err ? err : -1; + } + } + if (unlikely(memcmp(readbuf, writebuf, subpgsize))) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + printk(PRINT_PREF "------------- written----------------\n"); + print_subpage(writebuf); + printk(PRINT_PREF "------------- read ------------------\n"); + print_subpage(readbuf); + printk(PRINT_PREF "-------------------------------------\n"); + errcnt += 1; + } + + addr += subpgsize; + + set_random_data(writebuf, subpgsize); + clear_data(readbuf, subpgsize); + read = 0; + err = mtd->read(mtd, addr, subpgsize, &read, readbuf); + if (unlikely(err || read != subpgsize)) { + if (err == -EUCLEAN && read == subpgsize) { + printk(PRINT_PREF "ECC correction at %#llx\n", + (long long)addr); + err = 0; + } else { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + return err ? err : -1; + } + } + if (unlikely(memcmp(readbuf, writebuf, subpgsize))) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + printk(PRINT_PREF "------------- written----------------\n"); + print_subpage(writebuf); + printk(PRINT_PREF "------------- read ------------------\n"); + print_subpage(readbuf); + printk(PRINT_PREF "-------------------------------------\n"); + errcnt += 1; + } + + return err; +} + +static int verify_eraseblock2(int ebnum) +{ + size_t read = 0; + int err = 0, k; + loff_t addr = ebnum * mtd->erasesize; + + for (k = 1; k < 33; ++k) { + if (addr + (subpgsize * k) > (ebnum + 1) * mtd->erasesize) + break; + set_random_data(writebuf, subpgsize * k); + clear_data(readbuf, subpgsize * k); + read = 0; + err = mtd->read(mtd, addr, subpgsize * k, &read, readbuf); + if (unlikely(err || read != subpgsize * k)) { + if (err == -EUCLEAN && read == subpgsize * k) { + printk(PRINT_PREF "ECC correction at %#llx\n", + (long long)addr); + err = 0; + } else { + printk(PRINT_PREF "error: read failed at " + "%#llx\n", (long long)addr); + return err ? 
err : -1; + } + } + if (unlikely(memcmp(readbuf, writebuf, subpgsize * k))) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + } + addr += subpgsize * k; + } + + return err; +} + +static int verify_eraseblock_ff(int ebnum) +{ + uint32_t j; + size_t read = 0; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + memset(writebuf, 0xff, subpgsize); + for (j = 0; j < mtd->erasesize / subpgsize; ++j) { + clear_data(readbuf, subpgsize); + read = 0; + err = mtd->read(mtd, addr, subpgsize, &read, readbuf); + if (unlikely(err || read != subpgsize)) { + if (err == -EUCLEAN && read == subpgsize) { + printk(PRINT_PREF "ECC correction at %#llx\n", + (long long)addr); + err = 0; + } else { + printk(PRINT_PREF "error: read failed at " + "%#llx\n", (long long)addr); + return err ? err : -1; + } + } + if (unlikely(memcmp(readbuf, writebuf, subpgsize))) { + printk(PRINT_PREF "error: verify 0xff failed at " + "%#llx\n", (long long)addr); + errcnt += 1; + } + addr += subpgsize; + } + + return err; +} + +static int verify_all_eraseblocks_ff(void) +{ + int err; + unsigned int i; + + printk(PRINT_PREF "verifying all eraseblocks for 0xff\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock_ff(i); + if (err) + return err; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + return 0; +} + +static int is_block_bad(int ebnum) +{ + loff_t addr = ebnum * mtd->erasesize; + int ret; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 
1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + return 0; +} + +static int __init mtd_subpagetest_init(void) +{ + int err = 0; + uint32_t i; + uint64_t tmp; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->type != MTD_NANDFLASH) { + printk(PRINT_PREF "this test requires NAND flash\n"); + goto out; + } + + subpgsize = mtd->writesize >> mtd->subpage_sft; + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, subpage size %u, count of eraseblocks %u, " + "pages per eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + mtd->writesize, subpgsize, ebcnt, pgcnt, mtd->oobsize); + + err = -ENOMEM; + bufsize = subpgsize * 32; + writebuf = kmalloc(bufsize, GFP_KERNEL); + if (!writebuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + readbuf = kmalloc(bufsize, GFP_KERNEL); + if (!readbuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + err = erase_whole_device(); + if (err) + goto out; + + printk(PRINT_PREF "writing whole device\n"); + simple_srand(1); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock(i); + if (unlikely(err)) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + + simple_srand(1); + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock(i); + if (unlikely(err)) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + + err = erase_whole_device(); + if (err) + goto out; + + err = verify_all_eraseblocks_ff(); + if (err) + goto out; + + /* Write all eraseblocks */ + simple_srand(3); + printk(PRINT_PREF "writing whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock2(i); + if (unlikely(err)) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + + /* Check all eraseblocks */ + simple_srand(3); + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock2(i); + if (unlikely(err)) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + + err = erase_whole_device(); + if (err) + goto out; + + err = verify_all_eraseblocks_ff(); + if (err) + goto out; + + printk(PRINT_PREF "finished with %d errors\n", errcnt); + +out: + kfree(bbt); + kfree(readbuf); + kfree(writebuf); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_subpagetest_init); + +static void __exit 
mtd_subpagetest_exit(void) +{ + return; +} +module_exit(mtd_subpagetest_exit); + +MODULE_DESCRIPTION("Subpage test module"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_torturetest.c b/drivers/mtd/tests/mtd_torturetest.c new file mode 100644 index 00000000000..631a0ab3a33 --- /dev/null +++ b/drivers/mtd/tests/mtd_torturetest.c @@ -0,0 +1,530 @@ +/* + * Copyright (C) 2006-2008 Artem Bityutskiy + * Copyright (C) 2006-2008 Jarkko Lavinen + * Copyright (C) 2006-2008 Adrian Hunter + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Authors: Artem Bityutskiy, Jarkko Lavinen, Adria Hunter + * + * WARNING: this test program may kill your flash and your device. Do not + * use it unless you know what you do. Authors are not responsible for any + * damage caused by this program. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_torturetest: " +#define RETRIES 3 + +static int eb = 8; +module_param(eb, int, S_IRUGO); +MODULE_PARM_DESC(eb, "eraseblock number within the selected MTD device"); + +static int ebcnt = 32; +module_param(ebcnt, int, S_IRUGO); +MODULE_PARM_DESC(ebcnt, "number of consecutive eraseblocks to torture"); + +static int pgcnt; +module_param(pgcnt, int, S_IRUGO); +MODULE_PARM_DESC(pgcnt, "number of pages per eraseblock to torture (0 => all)"); + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static int gran = 512; +module_param(gran, int, S_IRUGO); +MODULE_PARM_DESC(gran, "how often the status information should be printed"); + +static int check = 1; +module_param(check, int, S_IRUGO); +MODULE_PARM_DESC(check, "if the written data should be checked"); + +static unsigned int cycles_count; +module_param(cycles_count, uint, S_IRUGO); +MODULE_PARM_DESC(cycles_count, "how many erase cycles to do " + "(infinite by default)"); + +static struct mtd_info *mtd; + +/* This buffer contains 0x555555...0xAAAAAA... pattern */ +static unsigned char *patt_5A5; +/* This buffer contains 0xAAAAAA...0x555555... pattern */ +static unsigned char *patt_A5A; +/* This buffer contains all 0xFF bytes */ +static unsigned char *patt_FF; +/* This a temporary buffer is use when checking data */ +static unsigned char *check_buf; +/* How many erase cycles were done */ +static unsigned int erase_cycles; + +static int pgsize; +static struct timeval start, finish; + +static void report_corrupt(unsigned char *read, unsigned char *written); + +static inline void start_timing(void) +{ + do_gettimeofday(&start); +} + +static inline void stop_timing(void) +{ + do_gettimeofday(&finish); +} + +/* + * Erase eraseblock number @ebnum. 
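+ * Returns zero on success or a negative error code on failure.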
+ */ +static inline int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (err) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (ei.state == MTD_ERASE_FAILED) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +/* + * Check that the contents of eraseblock number @enbum is equivalent to the + * @buf buffer. + */ +static inline int check_eraseblock(int ebnum, unsigned char *buf) +{ + int err, retries = 0; + size_t read = 0; + loff_t addr = ebnum * mtd->erasesize; + size_t len = mtd->erasesize; + + if (pgcnt) { + addr = (ebnum + 1) * mtd->erasesize - pgcnt * pgsize; + len = pgcnt * pgsize; + } + +retry: + err = mtd->read(mtd, addr, len, &read, check_buf); + if (err == -EUCLEAN) + printk(PRINT_PREF "single bit flip occurred at EB %d " + "MTD reported that it was fixed.\n", ebnum); + else if (err) { + printk(PRINT_PREF "error %d while reading EB %d, " + "read %zd\n", err, ebnum, read); + return err; + } + + if (read != len) { + printk(PRINT_PREF "failed to read %zd bytes from EB %d, " + "read only %zd, but no error reported\n", + len, ebnum, read); + return -EIO; + } + + if (memcmp(buf, check_buf, len)) { + printk(PRINT_PREF "read wrong data from EB %d\n", ebnum); + report_corrupt(check_buf, buf); + + if (retries++ < RETRIES) { + /* Try read again */ + yield(); + printk(PRINT_PREF "re-try reading data from EB %d\n", + ebnum); + goto retry; + } else { + printk(PRINT_PREF "retried %d times, still errors, " + "give-up\n", RETRIES); + return -EINVAL; + } + } + + if (retries != 0) + printk(PRINT_PREF "only attempt number %d was OK (!!!)\n", + retries); + + return 0; +} + +static inline int write_pattern(int ebnum, void *buf) +{ + int err; + size_t written = 0; + loff_t addr = ebnum * mtd->erasesize; + size_t len = mtd->erasesize; + + if (pgcnt) { + addr = (ebnum + 1) * mtd->erasesize - pgcnt * pgsize; + len = pgcnt * pgsize; + } + err = mtd->write(mtd, addr, len, &written, buf); + if (err) { + printk(PRINT_PREF "error %d while writing EB %d, written %zd" + " bytes\n", err, ebnum, written); + return err; + } + if (written != len) { + printk(PRINT_PREF "written only %zd bytes of %zd, but no error" + " reported\n", written, len); + return -EIO; + } + + return 0; +} + +static int __init tort_init(void) +{ + int err = 0, i, infinite = !cycles_count; + int bad_ebs[ebcnt]; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "Warning: this program is trying to wear out your " + "flash, stop it if this is not wanted.\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + printk(PRINT_PREF "torture %d eraseblocks (%d-%d) of mtd%d\n", + ebcnt, eb, eb + ebcnt - 1, dev); + if (pgcnt) + printk(PRINT_PREF "torturing just %d pages per eraseblock\n", + pgcnt); + printk(PRINT_PREF "write verify %s\n", check ? 
"enabled" : "disabled"); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->writesize == 1) { + printk(PRINT_PREF "not NAND flash, assume page size is 512 " + "bytes.\n"); + pgsize = 512; + } else + pgsize = mtd->writesize; + + if (pgcnt && (pgcnt > mtd->erasesize / pgsize || pgcnt < 0)) { + printk(PRINT_PREF "error: invalid pgcnt value %d\n", pgcnt); + goto out_mtd; + } + + err = -ENOMEM; + patt_5A5 = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!patt_5A5) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out_mtd; + } + + patt_A5A = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!patt_A5A) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out_patt_5A5; + } + + patt_FF = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!patt_FF) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out_patt_A5A; + } + + check_buf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!check_buf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out_patt_FF; + } + + err = 0; + + /* Initialize patterns */ + memset(patt_FF, 0xFF, mtd->erasesize); + for (i = 0; i < mtd->erasesize / pgsize; i++) { + if (!(i & 1)) { + memset(patt_5A5 + i * pgsize, 0x55, pgsize); + memset(patt_A5A + i * pgsize, 0xAA, pgsize); + } else { + memset(patt_5A5 + i * pgsize, 0xAA, pgsize); + memset(patt_A5A + i * pgsize, 0x55, pgsize); + } + } + + /* + * Check if there is a bad eraseblock among those we are going to test. + */ + memset(&bad_ebs[0], 0, sizeof(int) * ebcnt); + if (mtd->block_isbad) { + for (i = eb; i < eb + ebcnt; i++) { + err = mtd->block_isbad(mtd, + (loff_t)i * mtd->erasesize); + + if (err < 0) { + printk(PRINT_PREF "block_isbad() returned %d " + "for EB %d\n", err, i); + goto out; + } + + if (err) { + printk("EB %d is bad. Skip it.\n", i); + bad_ebs[i - eb] = 1; + } + } + } + + start_timing(); + while (1) { + int i; + void *patt; + + /* Erase all eraseblocks */ + for (i = eb; i < eb + ebcnt; i++) { + if (bad_ebs[i - eb]) + continue; + err = erase_eraseblock(i); + if (err) + goto out; + cond_resched(); + } + + /* Check if the eraseblocks contain only 0xFF bytes */ + if (check) { + for (i = eb; i < eb + ebcnt; i++) { + if (bad_ebs[i - eb]) + continue; + err = check_eraseblock(i, patt_FF); + if (err) { + printk(PRINT_PREF "verify failed" + " for 0xFF... pattern\n"); + goto out; + } + cond_resched(); + } + } + + /* Write the pattern */ + for (i = eb; i < eb + ebcnt; i++) { + if (bad_ebs[i - eb]) + continue; + if ((eb + erase_cycles) & 1) + patt = patt_5A5; + else + patt = patt_A5A; + err = write_pattern(i, patt); + if (err) + goto out; + cond_resched(); + } + + /* Verify what we wrote */ + if (check) { + for (i = eb; i < eb + ebcnt; i++) { + if (bad_ebs[i - eb]) + continue; + if ((eb + erase_cycles) & 1) + patt = patt_5A5; + else + patt = patt_A5A; + err = check_eraseblock(i, patt); + if (err) { + printk(PRINT_PREF "verify failed for %s" + " pattern\n", + ((eb + erase_cycles) & 1) ? + "0x55AA55..." 
: "0xAA55AA..."); + goto out; + } + cond_resched(); + } + } + + erase_cycles += 1; + + if (erase_cycles % gran == 0) { + long ms; + + stop_timing(); + ms = (finish.tv_sec - start.tv_sec) * 1000 + + (finish.tv_usec - start.tv_usec) / 1000; + printk(PRINT_PREF "%08u erase cycles done, took %lu " + "milliseconds (%lu seconds)\n", + erase_cycles, ms, ms / 1000); + start_timing(); + } + + if (!infinite && --cycles_count == 0) + break; + } +out: + + printk(PRINT_PREF "finished after %u erase cycles\n", + erase_cycles); + kfree(check_buf); +out_patt_FF: + kfree(patt_FF); +out_patt_A5A: + kfree(patt_A5A); +out_patt_5A5: + kfree(patt_5A5); +out_mtd: + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred during torturing\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(tort_init); + +static void __exit tort_exit(void) +{ + return; +} +module_exit(tort_exit); + +static int countdiffs(unsigned char *buf, unsigned char *check_buf, + unsigned offset, unsigned len, unsigned *bytesp, + unsigned *bitsp); +static void print_bufs(unsigned char *read, unsigned char *written, int start, + int len); + +/* + * Report the detailed information about how the read EB differs from what was + * written. + */ +static void report_corrupt(unsigned char *read, unsigned char *written) +{ + int i; + int bytes, bits, pages, first; + int offset, len; + size_t check_len = mtd->erasesize; + + if (pgcnt) + check_len = pgcnt * pgsize; + + bytes = bits = pages = 0; + for (i = 0; i < check_len; i += pgsize) + if (countdiffs(written, read, i, pgsize, &bytes, + &bits) >= 0) + pages++; + + printk(PRINT_PREF "verify fails on %d pages, %d bytes/%d bits\n", + pages, bytes, bits); + printk(PRINT_PREF "The following is a list of all differences between" + " what was read from flash and what was expected\n"); + + for (i = 0; i < check_len; i += pgsize) { + cond_resched(); + bytes = bits = 0; + first = countdiffs(written, read, i, pgsize, &bytes, + &bits); + if (first < 0) + continue; + + printk("-------------------------------------------------------" + "----------------------------------\n"); + + printk(PRINT_PREF "Page %zd has %d bytes/%d bits failing verify," + " starting at offset 0x%x\n", + (mtd->erasesize - check_len + i) / pgsize, + bytes, bits, first); + + offset = first & ~0x7; + len = ((first + bytes) | 0x7) + 1 - offset; + + print_bufs(read, written, offset, len); + } +} + +static void print_bufs(unsigned char *read, unsigned char *written, int start, + int len) +{ + int i = 0, j1, j2; + char *diff; + + printk("Offset Read Written\n"); + while (i < len) { + printk("0x%08x: ", start + i); + diff = " "; + for (j1 = 0; j1 < 8 && i + j1 < len; j1++) { + printk(" %02x", read[start + i + j1]); + if (read[start + i + j1] != written[start + i + j1]) + diff = "***"; + } + + while (j1 < 8) { + printk(" "); + j1 += 1; + } + + printk(" %s ", diff); + + for (j2 = 0; j2 < 8 && i + j2 < len; j2++) + printk(" %02x", written[start + i + j2]); + printk("\n"); + i += 8; + } +} + +/* + * Count the number of differing bytes and bits and return the first differing + * offset. 
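+ * Returns -1 when the two buffers are identical over the checked range.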
+ */ +static int countdiffs(unsigned char *buf, unsigned char *check_buf, + unsigned offset, unsigned len, unsigned *bytesp, + unsigned *bitsp) +{ + unsigned i, bit; + int first = -1; + + for (i = offset; i < offset + len; i++) + if (buf[i] != check_buf[i]) { + first = i; + break; + } + + while (i < offset + len) { + if (buf[i] != check_buf[i]) { + (*bytesp)++; + bit = 1; + while (bit < 256) { + if ((buf[i] & bit) != (check_buf[i] & bit)) + (*bitsp)++; + bit <<= 1; + } + } + i++; + } + + return first; +} + +MODULE_DESCRIPTION("Eraseblock torturing module"); +MODULE_AUTHOR("Artem Bityutskiy, Jarkko Lavinen, Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 7caf22cd5ad..9082768cc6c 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -561,7 +561,7 @@ static int io_init(struct ubi_device *ubi) */ ubi->peb_size = ubi->mtd->erasesize; - ubi->peb_count = ubi->mtd->size / ubi->mtd->erasesize; + ubi->peb_count = mtd_div_by_eb(ubi->mtd->size, ubi->mtd); ubi->flash_size = ubi->mtd->size; if (ubi->mtd->block_isbad && ubi->mtd->block_markbad) diff --git a/drivers/mtd/ubi/gluebi.c b/drivers/mtd/ubi/gluebi.c index 605812bb0b1..6dd4f5e77f8 100644 --- a/drivers/mtd/ubi/gluebi.c +++ b/drivers/mtd/ubi/gluebi.c @@ -215,7 +215,8 @@ static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) struct ubi_volume *vol; struct ubi_device *ubi; - dbg_gen("erase %u bytes at offset %u", instr->len, instr->addr); + dbg_gen("erase %llu bytes at offset %llu", (unsigned long long)instr->len, + (unsigned long long)instr->addr); if (instr->addr < 0 || instr->addr > mtd->size - mtd->erasesize) return -EINVAL; @@ -223,11 +224,11 @@ static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) if (instr->len < 0 || instr->addr + instr->len > mtd->size) return -EINVAL; - if (instr->addr % mtd->writesize || instr->len % mtd->writesize) + if (mtd_mod_by_ws(instr->addr, mtd) || mtd_mod_by_ws(instr->len, mtd)) return -EINVAL; - lnum = instr->addr / mtd->erasesize; - count = instr->len / mtd->erasesize; + lnum = mtd_div_by_eb(instr->addr, mtd); + count = mtd_div_by_eb(instr->len, mtd); vol = container_of(mtd, struct ubi_volume, gluebi_mtd); ubi = vol->ubi; @@ -255,7 +256,7 @@ static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) out_err: instr->state = MTD_ERASE_FAILED; - instr->fail_addr = lnum * mtd->erasesize; + instr->fail_addr = (long long)lnum * mtd->erasesize; return err; } @@ -294,7 +295,7 @@ int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol) * bytes. 
*/ if (vol->vol_type == UBI_DYNAMIC_VOLUME) - mtd->size = vol->usable_leb_size * vol->reserved_pebs; + mtd->size = (long long)vol->usable_leb_size * vol->reserved_pebs; else mtd->size = vol->used_bytes; @@ -304,8 +305,8 @@ int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol) return -ENFILE; } - dbg_gen("added mtd%d (\"%s\"), size %u, EB size %u", - mtd->index, mtd->name, mtd->size, mtd->erasesize); + dbg_gen("added mtd%d (\"%s\"), size %llu, EB size %u", + mtd->index, mtd->name, (unsigned long long)mtd->size, mtd->erasesize); return 0; } diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 65e8294a9e2..9da5a4b8113 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -1,11 +1,12 @@ /** * @file buffer_sync.c * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> * @author Barry Kasindorf + * @author Robert Richter <robert.richter@amd.com> * * This is the core of the buffer management. Each * CPU buffer is processed and entered into the @@ -315,88 +316,73 @@ static void add_trace_begin(void) add_event_entry(TRACE_BEGIN_CODE); } -#ifdef CONFIG_OPROFILE_IBS - -#define IBS_FETCH_CODE_SIZE 2 -#define IBS_OP_CODE_SIZE 5 - -/* - * Add IBS fetch and op entries to event buffer - */ -static void add_ibs_begin(int cpu, int code, struct mm_struct *mm) +static void add_data(struct op_entry *entry, struct mm_struct *mm) { - unsigned long rip; - int i, count; - unsigned long ibs_cookie = 0; + unsigned long code, pc, val; + unsigned long cookie; off_t offset; - struct op_sample *sample; - - sample = cpu_buffer_read_entry(cpu); - if (!sample) - goto Error; - rip = sample->eip; -#ifdef __LP64__ - rip += sample->event << 32; -#endif + if (!op_cpu_buffer_get_data(entry, &code)) + return; + if (!op_cpu_buffer_get_data(entry, &pc)) + return; + if (!op_cpu_buffer_get_size(entry)) + return; if (mm) { - ibs_cookie = lookup_dcookie(mm, rip, &offset); + cookie = lookup_dcookie(mm, pc, &offset); - if (ibs_cookie == NO_COOKIE) - offset = rip; - if (ibs_cookie == INVALID_COOKIE) { + if (cookie == NO_COOKIE) + offset = pc; + if (cookie == INVALID_COOKIE) { atomic_inc(&oprofile_stats.sample_lost_no_mapping); - offset = rip; + offset = pc; } - if (ibs_cookie != last_cookie) { - add_cookie_switch(ibs_cookie); - last_cookie = ibs_cookie; + if (cookie != last_cookie) { + add_cookie_switch(cookie); + last_cookie = cookie; } } else - offset = rip; + offset = pc; add_event_entry(ESCAPE_CODE); add_event_entry(code); add_event_entry(offset); /* Offset from Dcookie */ - /* we send the Dcookie offset, but send the raw Linear Add also*/ - add_event_entry(sample->eip); - add_event_entry(sample->event); - - if (code == IBS_FETCH_CODE) - count = IBS_FETCH_CODE_SIZE; /*IBS FETCH is 2 int64s*/ - else - count = IBS_OP_CODE_SIZE; /*IBS OP is 5 int64s*/ - - for (i = 0; i < count; i++) { - sample = cpu_buffer_read_entry(cpu); - if (!sample) - goto Error; - add_event_entry(sample->eip); - add_event_entry(sample->event); - } - - return; - -Error: - return; + while (op_cpu_buffer_get_data(entry, &val)) + add_event_entry(val); } -#endif - -static void add_sample_entry(unsigned long offset, unsigned long event) +static inline void add_sample_entry(unsigned long offset, unsigned long event) { add_event_entry(offset); add_event_entry(event); } -static int add_us_sample(struct mm_struct *mm, struct op_sample *s) +/* + * Add a sample to the global event 
buffer. If possible the + * sample is converted into a persistent dentry/offset pair + * for later lookup from userspace. Return 0 on failure. + */ +static int +add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel) { unsigned long cookie; off_t offset; + if (in_kernel) { + add_sample_entry(s->eip, s->event); + return 1; + } + + /* add userspace sample */ + + if (!mm) { + atomic_inc(&oprofile_stats.sample_lost_no_mm); + return 0; + } + cookie = lookup_dcookie(mm, s->eip, &offset); if (cookie == INVALID_COOKIE) { @@ -415,25 +401,6 @@ static int add_us_sample(struct mm_struct *mm, struct op_sample *s) } -/* Add a sample to the global event buffer. If possible the - * sample is converted into a persistent dentry/offset pair - * for later lookup from userspace. - */ -static int -add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel) -{ - if (in_kernel) { - add_sample_entry(s->eip, s->event); - return 1; - } else if (mm) { - return add_us_sample(mm, s); - } else { - atomic_inc(&oprofile_stats.sample_lost_no_mm); - } - return 0; -} - - static void release_mm(struct mm_struct *mm) { if (!mm) @@ -526,66 +493,69 @@ void sync_buffer(int cpu) { struct mm_struct *mm = NULL; struct mm_struct *oldmm; + unsigned long val; struct task_struct *new; unsigned long cookie = 0; int in_kernel = 1; sync_buffer_state state = sb_buffer_start; unsigned int i; unsigned long available; + unsigned long flags; + struct op_entry entry; + struct op_sample *sample; mutex_lock(&buffer_mutex); add_cpu_switch(cpu); - cpu_buffer_reset(cpu); - available = cpu_buffer_entries(cpu); + op_cpu_buffer_reset(cpu); + available = op_cpu_buffer_entries(cpu); for (i = 0; i < available; ++i) { - struct op_sample *s = cpu_buffer_read_entry(cpu); - if (!s) + sample = op_cpu_buffer_read_entry(&entry, cpu); + if (!sample) break; - if (is_code(s->eip)) { - switch (s->event) { - case 0: - case CPU_IS_KERNEL: + if (is_code(sample->eip)) { + flags = sample->event; + if (flags & TRACE_BEGIN) { + state = sb_bt_start; + add_trace_begin(); + } + if (flags & KERNEL_CTX_SWITCH) { /* kernel/userspace switch */ - in_kernel = s->event; + in_kernel = flags & IS_KERNEL; if (state == sb_buffer_start) state = sb_sample_start; - add_kernel_ctx_switch(s->event); - break; - case CPU_TRACE_BEGIN: - state = sb_bt_start; - add_trace_begin(); - break; -#ifdef CONFIG_OPROFILE_IBS - case IBS_FETCH_BEGIN: - state = sb_bt_start; - add_ibs_begin(cpu, IBS_FETCH_CODE, mm); - break; - case IBS_OP_BEGIN: - state = sb_bt_start; - add_ibs_begin(cpu, IBS_OP_CODE, mm); - break; -#endif - default: + add_kernel_ctx_switch(flags & IS_KERNEL); + } + if (flags & USER_CTX_SWITCH + && op_cpu_buffer_get_data(&entry, &val)) { /* userspace context switch */ + new = (struct task_struct *)val; oldmm = mm; - new = (struct task_struct *)s->event; release_mm(oldmm); mm = take_tasks_mm(new); if (mm != oldmm) cookie = get_exec_dcookie(mm); add_user_ctx_switch(new, cookie); - break; - } - } else if (state >= sb_bt_start && - !add_sample(mm, s, in_kernel)) { - if (state == sb_bt_start) { - state = sb_bt_ignore; - atomic_inc(&oprofile_stats.bt_lost_no_mapping); } + if (op_cpu_buffer_get_size(&entry)) + add_data(&entry, mm); + continue; + } + + if (state < sb_bt_start) + /* ignore sample */ + continue; + + if (add_sample(mm, sample, in_kernel)) + continue; + + /* ignore backtraces if failed to add a sample */ + if (state == sb_bt_start) { + state = sb_bt_ignore; + atomic_inc(&oprofile_stats.bt_lost_no_mapping); } } release_mm(mm); diff --git 
a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 61090969158..2e03b6d796d 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -1,11 +1,12 @@ /** * @file cpu_buffer.c * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> * @author Barry Kasindorf <barry.kasindorf@amd.com> + * @author Robert Richter <robert.richter@amd.com> * * Each CPU has a local buffer that stores PC value/event * pairs. We also log context switches when we notice them. @@ -45,8 +46,8 @@ * can be changed to a single buffer solution when the ring buffer * access is implemented as non-locking atomic code. */ -struct ring_buffer *op_ring_buffer_read; -struct ring_buffer *op_ring_buffer_write; +static struct ring_buffer *op_ring_buffer_read; +static struct ring_buffer *op_ring_buffer_write; DEFINE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer); static void wq_sync_buffer(struct work_struct *work); @@ -54,19 +55,9 @@ static void wq_sync_buffer(struct work_struct *work); #define DEFAULT_TIMER_EXPIRE (HZ / 10) static int work_enabled; -void free_cpu_buffers(void) -{ - if (op_ring_buffer_read) - ring_buffer_free(op_ring_buffer_read); - op_ring_buffer_read = NULL; - if (op_ring_buffer_write) - ring_buffer_free(op_ring_buffer_write); - op_ring_buffer_write = NULL; -} - unsigned long oprofile_get_cpu_buffer_size(void) { - return fs_cpu_buffer_size; + return oprofile_cpu_buffer_size; } void oprofile_cpu_buffer_inc_smpl_lost(void) @@ -77,11 +68,21 @@ void oprofile_cpu_buffer_inc_smpl_lost(void) cpu_buf->sample_lost_overflow++; } +void free_cpu_buffers(void) +{ + if (op_ring_buffer_read) + ring_buffer_free(op_ring_buffer_read); + op_ring_buffer_read = NULL; + if (op_ring_buffer_write) + ring_buffer_free(op_ring_buffer_write); + op_ring_buffer_write = NULL; +} + int alloc_cpu_buffers(void) { int i; - unsigned long buffer_size = fs_cpu_buffer_size; + unsigned long buffer_size = oprofile_cpu_buffer_size; op_ring_buffer_read = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS); if (!op_ring_buffer_read) @@ -97,8 +98,6 @@ int alloc_cpu_buffers(void) b->last_is_kernel = -1; b->tracing = 0; b->buffer_size = buffer_size; - b->tail_pos = 0; - b->head_pos = 0; b->sample_received = 0; b->sample_lost_overflow = 0; b->backtrace_aborted = 0; @@ -145,47 +144,156 @@ void end_cpu_work(void) flush_scheduled_work(); } -static inline int -add_sample(struct oprofile_cpu_buffer *cpu_buf, - unsigned long pc, unsigned long event) +/* + * This function prepares the cpu buffer to write a sample. + * + * Struct op_entry is used during operations on the ring buffer while + * struct op_sample contains the data that is stored in the ring + * buffer. Struct entry can be uninitialized. The function reserves a + * data array that is specified by size. Use + * op_cpu_buffer_write_commit() after preparing the sample. In case of + * errors a null pointer is returned, otherwise the pointer to the + * sample. 
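+ *
+ * Illustrative sketch only (not a real caller; 'code' and 'pc' are
+ * placeholder values) of reserving a sample with two extra data words:
+ *
+ *	struct op_entry entry;
+ *	struct op_sample *sample;
+ *
+ *	sample = op_cpu_buffer_write_reserve(&entry, 2);
+ *	if (!sample)
+ *		return;
+ *	sample->eip = ESCAPE_CODE;
+ *	sample->event = 0;
+ *	op_cpu_buffer_add_data(&entry, code);
+ *	op_cpu_buffer_add_data(&entry, pc);
+ *	op_cpu_buffer_write_commit(&entry);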
+ * + */ +struct op_sample +*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size) +{ + entry->event = ring_buffer_lock_reserve + (op_ring_buffer_write, sizeof(struct op_sample) + + size * sizeof(entry->sample->data[0]), &entry->irq_flags); + if (entry->event) + entry->sample = ring_buffer_event_data(entry->event); + else + entry->sample = NULL; + + if (!entry->sample) + return NULL; + + entry->size = size; + entry->data = entry->sample->data; + + return entry->sample; +} + +int op_cpu_buffer_write_commit(struct op_entry *entry) +{ + return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event, + entry->irq_flags); +} + +struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) +{ + struct ring_buffer_event *e; + e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); + if (e) + goto event; + if (ring_buffer_swap_cpu(op_ring_buffer_read, + op_ring_buffer_write, + cpu)) + return NULL; + e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); + if (e) + goto event; + return NULL; + +event: + entry->event = e; + entry->sample = ring_buffer_event_data(e); + entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample)) + / sizeof(entry->sample->data[0]); + entry->data = entry->sample->data; + return entry->sample; +} + +unsigned long op_cpu_buffer_entries(int cpu) +{ + return ring_buffer_entries_cpu(op_ring_buffer_read, cpu) + + ring_buffer_entries_cpu(op_ring_buffer_write, cpu); +} + +static int +op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace, + int is_kernel, struct task_struct *task) { struct op_entry entry; - int ret; + struct op_sample *sample; + unsigned long flags; + int size; + + flags = 0; + + if (backtrace) + flags |= TRACE_BEGIN; + + /* notice a switch from user->kernel or vice versa */ + is_kernel = !!is_kernel; + if (cpu_buf->last_is_kernel != is_kernel) { + cpu_buf->last_is_kernel = is_kernel; + flags |= KERNEL_CTX_SWITCH; + if (is_kernel) + flags |= IS_KERNEL; + } + + /* notice a task switch */ + if (cpu_buf->last_task != task) { + cpu_buf->last_task = task; + flags |= USER_CTX_SWITCH; + } + + if (!flags) + /* nothing to do */ + return 0; + + if (flags & USER_CTX_SWITCH) + size = 1; + else + size = 0; + + sample = op_cpu_buffer_write_reserve(&entry, size); + if (!sample) + return -ENOMEM; - ret = cpu_buffer_write_entry(&entry); - if (ret) - return ret; + sample->eip = ESCAPE_CODE; + sample->event = flags; - entry.sample->eip = pc; - entry.sample->event = event; + if (size) + op_cpu_buffer_add_data(&entry, (unsigned long)task); - ret = cpu_buffer_write_commit(&entry); - if (ret) - return ret; + op_cpu_buffer_write_commit(&entry); return 0; } static inline int -add_code(struct oprofile_cpu_buffer *buffer, unsigned long value) +op_add_sample(struct oprofile_cpu_buffer *cpu_buf, + unsigned long pc, unsigned long event) { - return add_sample(buffer, ESCAPE_CODE, value); + struct op_entry entry; + struct op_sample *sample; + + sample = op_cpu_buffer_write_reserve(&entry, 0); + if (!sample) + return -ENOMEM; + + sample->eip = pc; + sample->event = event; + + return op_cpu_buffer_write_commit(&entry); } -/* This must be safe from any context. It's safe writing here - * because of the head/tail separation of the writer and reader - * of the CPU buffer. +/* + * This must be safe from any context. * * is_kernel is needed because on some architectures you cannot * tell if you are in kernel or user space simply by looking at * pc. 
We tag this in the buffer by generating kernel enter/exit * events whenever is_kernel changes */ -static int log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, - int is_kernel, unsigned long event) +static int +log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, + unsigned long backtrace, int is_kernel, unsigned long event) { - struct task_struct *task; - cpu_buf->sample_received++; if (pc == ESCAPE_CODE) { @@ -193,25 +301,10 @@ static int log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, return 0; } - is_kernel = !!is_kernel; - - task = current; - - /* notice a switch from user->kernel or vice versa */ - if (cpu_buf->last_is_kernel != is_kernel) { - cpu_buf->last_is_kernel = is_kernel; - if (add_code(cpu_buf, is_kernel)) - goto fail; - } - - /* notice a task switch */ - if (cpu_buf->last_task != task) { - cpu_buf->last_task = task; - if (add_code(cpu_buf, (unsigned long)task)) - goto fail; - } + if (op_add_code(cpu_buf, backtrace, is_kernel, current)) + goto fail; - if (add_sample(cpu_buf, pc, event)) + if (op_add_sample(cpu_buf, pc, event)) goto fail; return 1; @@ -221,109 +314,102 @@ fail: return 0; } -static int oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf) +static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf) { - add_code(cpu_buf, CPU_TRACE_BEGIN); cpu_buf->tracing = 1; - return 1; } -static void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf) +static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf) { cpu_buf->tracing = 0; } -void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs, - unsigned long event, int is_kernel) +static inline void +__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs, + unsigned long event, int is_kernel) { struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); - - if (!backtrace_depth) { - log_sample(cpu_buf, pc, is_kernel, event); - return; - } - - if (!oprofile_begin_trace(cpu_buf)) - return; + unsigned long backtrace = oprofile_backtrace_depth; /* * if log_sample() fail we can't backtrace since we lost the * source of this event */ - if (log_sample(cpu_buf, pc, is_kernel, event)) - oprofile_ops.backtrace(regs, backtrace_depth); + if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event)) + /* failed */ + return; + + if (!backtrace) + return; + + oprofile_begin_trace(cpu_buf); + oprofile_ops.backtrace(regs, backtrace); oprofile_end_trace(cpu_buf); } +void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs, + unsigned long event, int is_kernel) +{ + __oprofile_add_ext_sample(pc, regs, event, is_kernel); +} + void oprofile_add_sample(struct pt_regs * const regs, unsigned long event) { int is_kernel = !user_mode(regs); unsigned long pc = profile_pc(regs); - oprofile_add_ext_sample(pc, regs, event, is_kernel); + __oprofile_add_ext_sample(pc, regs, event, is_kernel); } -#ifdef CONFIG_OPROFILE_IBS - -#define MAX_IBS_SAMPLE_SIZE 14 - -void oprofile_add_ibs_sample(struct pt_regs * const regs, - unsigned int * const ibs_sample, int ibs_code) +/* + * Add samples with data to the ring buffer. + * + * Use oprofile_add_data(&entry, val) to add data and + * oprofile_write_commit(&entry) to commit the sample. 
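+ *
+ * A hypothetical call sequence (the sample code and data values below
+ * are placeholders, not an existing user):
+ *
+ *	struct op_entry entry;
+ *
+ *	oprofile_write_reserve(&entry, regs, pc, MY_SAMPLE_CODE, 2);
+ *	oprofile_add_data(&entry, data0);
+ *	oprofile_add_data(&entry, data1);
+ *	oprofile_write_commit(&entry);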
+ */ +void +oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs, + unsigned long pc, int code, int size) { + struct op_sample *sample; int is_kernel = !user_mode(regs); struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); - struct task_struct *task; - int fail = 0; cpu_buf->sample_received++; - /* notice a switch from user->kernel or vice versa */ - if (cpu_buf->last_is_kernel != is_kernel) { - if (add_code(cpu_buf, is_kernel)) - goto fail; - cpu_buf->last_is_kernel = is_kernel; - } - - /* notice a task switch */ - if (!is_kernel) { - task = current; - if (cpu_buf->last_task != task) { - if (add_code(cpu_buf, (unsigned long)task)) - goto fail; - cpu_buf->last_task = task; - } - } - - fail = fail || add_code(cpu_buf, ibs_code); - fail = fail || add_sample(cpu_buf, ibs_sample[0], ibs_sample[1]); - fail = fail || add_sample(cpu_buf, ibs_sample[2], ibs_sample[3]); - fail = fail || add_sample(cpu_buf, ibs_sample[4], ibs_sample[5]); - - if (ibs_code == IBS_OP_BEGIN) { - fail = fail || add_sample(cpu_buf, ibs_sample[6], ibs_sample[7]); - fail = fail || add_sample(cpu_buf, ibs_sample[8], ibs_sample[9]); - fail = fail || add_sample(cpu_buf, ibs_sample[10], ibs_sample[11]); - } + /* no backtraces for samples with data */ + if (op_add_code(cpu_buf, 0, is_kernel, current)) + goto fail; - if (fail) + sample = op_cpu_buffer_write_reserve(entry, size + 2); + if (!sample) goto fail; + sample->eip = ESCAPE_CODE; + sample->event = 0; /* no flags */ - if (backtrace_depth) - oprofile_ops.backtrace(regs, backtrace_depth); + op_cpu_buffer_add_data(entry, code); + op_cpu_buffer_add_data(entry, pc); return; fail: cpu_buf->sample_lost_overflow++; - return; } -#endif +int oprofile_add_data(struct op_entry *entry, unsigned long val) +{ + return op_cpu_buffer_add_data(entry, val); +} + +int oprofile_write_commit(struct op_entry *entry) +{ + return op_cpu_buffer_write_commit(entry); +} void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event) { struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); - log_sample(cpu_buf, pc, is_kernel, event); + log_sample(cpu_buf, pc, 0, is_kernel, event); } void oprofile_add_trace(unsigned long pc) @@ -340,7 +426,7 @@ void oprofile_add_trace(unsigned long pc) if (pc == ESCAPE_CODE) goto fail; - if (add_sample(cpu_buf, pc, 0)) + if (op_add_sample(cpu_buf, pc, 0)) goto fail; return; diff --git a/drivers/oprofile/cpu_buffer.h b/drivers/oprofile/cpu_buffer.h index aacb0f0bc56..63f81c44846 100644 --- a/drivers/oprofile/cpu_buffer.h +++ b/drivers/oprofile/cpu_buffer.h @@ -1,10 +1,11 @@ /** * @file cpu_buffer.h * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> + * @author Robert Richter <robert.richter@amd.com> */ #ifndef OPROFILE_CPU_BUFFER_H @@ -31,17 +32,12 @@ void end_cpu_work(void); struct op_sample { unsigned long eip; unsigned long event; + unsigned long data[0]; }; -struct op_entry { - struct ring_buffer_event *event; - struct op_sample *sample; - unsigned long irq_flags; -}; +struct op_entry; struct oprofile_cpu_buffer { - volatile unsigned long head_pos; - volatile unsigned long tail_pos; unsigned long buffer_size; struct task_struct *last_task; int last_is_kernel; @@ -54,8 +50,6 @@ struct oprofile_cpu_buffer { struct delayed_work work; }; -extern struct ring_buffer *op_ring_buffer_read; -extern struct ring_buffer *op_ring_buffer_write; DECLARE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer); 
/* @@ -64,7 +58,7 @@ DECLARE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer); * reset these to invalid values; the next sample collected will * populate the buffer with proper values to initialize the buffer */ -static inline void cpu_buffer_reset(int cpu) +static inline void op_cpu_buffer_reset(int cpu) { struct oprofile_cpu_buffer *cpu_buf = &per_cpu(cpu_buffer, cpu); @@ -72,55 +66,48 @@ static inline void cpu_buffer_reset(int cpu) cpu_buf->last_task = NULL; } -static inline int cpu_buffer_write_entry(struct op_entry *entry) -{ - entry->event = ring_buffer_lock_reserve(op_ring_buffer_write, - sizeof(struct op_sample), - &entry->irq_flags); - if (entry->event) - entry->sample = ring_buffer_event_data(entry->event); - else - entry->sample = NULL; - - if (!entry->sample) - return -ENOMEM; - - return 0; -} +struct op_sample +*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size); +int op_cpu_buffer_write_commit(struct op_entry *entry); +struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu); +unsigned long op_cpu_buffer_entries(int cpu); -static inline int cpu_buffer_write_commit(struct op_entry *entry) +/* returns the remaining free size of data in the entry */ +static inline +int op_cpu_buffer_add_data(struct op_entry *entry, unsigned long val) { - return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event, - entry->irq_flags); + if (!entry->size) + return 0; + *entry->data = val; + entry->size--; + entry->data++; + return entry->size; } -static inline struct op_sample *cpu_buffer_read_entry(int cpu) +/* returns the size of data in the entry */ +static inline +int op_cpu_buffer_get_size(struct op_entry *entry) { - struct ring_buffer_event *e; - e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); - if (e) - return ring_buffer_event_data(e); - if (ring_buffer_swap_cpu(op_ring_buffer_read, - op_ring_buffer_write, - cpu)) - return NULL; - e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); - if (e) - return ring_buffer_event_data(e); - return NULL; + return entry->size; } -/* "acquire" as many cpu buffer slots as we can */ -static inline unsigned long cpu_buffer_entries(int cpu) +/* returns 0 if empty or the size of data including the current value */ +static inline +int op_cpu_buffer_get_data(struct op_entry *entry, unsigned long *val) { - return ring_buffer_entries_cpu(op_ring_buffer_read, cpu) - + ring_buffer_entries_cpu(op_ring_buffer_write, cpu); + int size = entry->size; + if (!size) + return 0; + *val = *entry->data; + entry->size--; + entry->data++; + return size; } -/* transient events for the CPU buffer -> event buffer */ -#define CPU_IS_KERNEL 1 -#define CPU_TRACE_BEGIN 2 -#define IBS_FETCH_BEGIN 3 -#define IBS_OP_BEGIN 4 +/* extra data flags */ +#define KERNEL_CTX_SWITCH (1UL << 0) +#define IS_KERNEL (1UL << 1) +#define TRACE_BEGIN (1UL << 2) +#define USER_CTX_SWITCH (1UL << 3) #endif /* OPROFILE_CPU_BUFFER_H */ diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index 191a3202cec..2b7ae366ceb 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -73,8 +73,8 @@ int alloc_event_buffer(void) unsigned long flags; spin_lock_irqsave(&oprofilefs_lock, flags); - buffer_size = fs_buffer_size; - buffer_watershed = fs_buffer_watershed; + buffer_size = oprofile_buffer_size; + buffer_watershed = oprofile_buffer_watershed; spin_unlock_irqrestore(&oprofilefs_lock, flags); if (buffer_watershed >= buffer_size) diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index 
cd375907f26..3cffce90f82 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -23,7 +23,7 @@ struct oprofile_operations oprofile_ops; unsigned long oprofile_started; -unsigned long backtrace_depth; +unsigned long oprofile_backtrace_depth; static unsigned long is_setup; static DEFINE_MUTEX(start_mutex); @@ -172,7 +172,7 @@ int oprofile_set_backtrace(unsigned long val) goto out; } - backtrace_depth = val; + oprofile_backtrace_depth = val; out: mutex_unlock(&start_mutex); diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index 5df0c21a608..c288d3c24b5 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -21,12 +21,12 @@ void oprofile_stop(void); struct oprofile_operations; -extern unsigned long fs_buffer_size; -extern unsigned long fs_cpu_buffer_size; -extern unsigned long fs_buffer_watershed; +extern unsigned long oprofile_buffer_size; +extern unsigned long oprofile_cpu_buffer_size; +extern unsigned long oprofile_buffer_watershed; extern struct oprofile_operations oprofile_ops; extern unsigned long oprofile_started; -extern unsigned long backtrace_depth; +extern unsigned long oprofile_backtrace_depth; struct super_block; struct dentry; diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index d8201998b0b..5d36ffc30dd 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -14,17 +14,18 @@ #include "oprofile_stats.h" #include "oprof.h" -#define FS_BUFFER_SIZE_DEFAULT 131072 -#define FS_CPU_BUFFER_SIZE_DEFAULT 8192 -#define FS_BUFFER_WATERSHED_DEFAULT 32768 /* FIXME: tune */ +#define BUFFER_SIZE_DEFAULT 131072 +#define CPU_BUFFER_SIZE_DEFAULT 8192 +#define BUFFER_WATERSHED_DEFAULT 32768 /* FIXME: tune */ -unsigned long fs_buffer_size; -unsigned long fs_cpu_buffer_size; -unsigned long fs_buffer_watershed; +unsigned long oprofile_buffer_size; +unsigned long oprofile_cpu_buffer_size; +unsigned long oprofile_buffer_watershed; static ssize_t depth_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { - return oprofilefs_ulong_to_user(backtrace_depth, buf, count, offset); + return oprofilefs_ulong_to_user(oprofile_backtrace_depth, buf, count, + offset); } @@ -125,16 +126,16 @@ static const struct file_operations dump_fops = { void oprofile_create_files(struct super_block *sb, struct dentry *root) { /* reinitialize default values */ - fs_buffer_size = FS_BUFFER_SIZE_DEFAULT; - fs_cpu_buffer_size = FS_CPU_BUFFER_SIZE_DEFAULT; - fs_buffer_watershed = FS_BUFFER_WATERSHED_DEFAULT; + oprofile_buffer_size = BUFFER_SIZE_DEFAULT; + oprofile_cpu_buffer_size = CPU_BUFFER_SIZE_DEFAULT; + oprofile_buffer_watershed = BUFFER_WATERSHED_DEFAULT; oprofilefs_create_file(sb, root, "enable", &enable_fops); oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666); oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops); - oprofilefs_create_ulong(sb, root, "buffer_size", &fs_buffer_size); - oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed); - oprofilefs_create_ulong(sb, root, "cpu_buffer_size", &fs_cpu_buffer_size); + oprofilefs_create_ulong(sb, root, "buffer_size", &oprofile_buffer_size); + oprofilefs_create_ulong(sb, root, "buffer_watershed", &oprofile_buffer_watershed); + oprofilefs_create_ulong(sb, root, "cpu_buffer_size", &oprofile_cpu_buffer_size); oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops); oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops); oprofilefs_create_file(sb, root, "pointer_size", 
&pointer_size_fops); diff --git a/drivers/parisc/asp.c b/drivers/parisc/asp.c index 82136913536..7931133526c 100644 --- a/drivers/parisc/asp.c +++ b/drivers/parisc/asp.c @@ -71,8 +71,7 @@ static void asp_choose_irq(struct parisc_device *dev, void *ctrl) */ #define ASP_INTERRUPT_ADDR 0xf0800000 -int __init -asp_init_chip(struct parisc_device *dev) +static int __init asp_init_chip(struct parisc_device *dev) { struct gsc_irq gsc_irq; int ret; diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index dcc1e9958d2..cd4dd7ed2c0 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -555,7 +555,7 @@ static u32 hint_lookup[] = { * (Load Coherence Index) instruction. The 8 bits used for the virtual * index are bits 12:19 of the value returned by LCI. */ -void CCIO_INLINE +static void CCIO_INLINE ccio_io_pdir_entry(u64 *pdir_ptr, space_t sid, unsigned long vba, unsigned long hints) { @@ -1578,8 +1578,6 @@ static int __init ccio_probe(struct parisc_device *dev) ioc_count++; - parisc_vmerge_boundary = IOVP_SIZE; - parisc_vmerge_max_size = BITS_PER_LONG * IOVP_SIZE; parisc_has_iommu(); return 0; } diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index 77cc8bfef8c..d539d9df88e 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -287,7 +287,7 @@ DINO_PORT_OUT(b, 8, 3) DINO_PORT_OUT(w, 16, 2) DINO_PORT_OUT(l, 32, 0) -struct pci_port_ops dino_port_ops = { +static struct pci_port_ops dino_port_ops = { .inb = dino_in8, .inw = dino_in16, .inl = dino_in32, @@ -690,7 +690,7 @@ dino_fixup_bus(struct pci_bus *bus) } -struct pci_bios_ops dino_bios_ops = { +static struct pci_bios_ops dino_bios_ops = { .init = dino_bios_init, .fixup_bus = dino_fixup_bus }; diff --git a/drivers/parisc/hppb.c b/drivers/parisc/hppb.c index 65eee67aa2a..13856415b43 100644 --- a/drivers/parisc/hppb.c +++ b/drivers/parisc/hppb.c @@ -29,7 +29,7 @@ struct hppb_card { struct hppb_card *next; }; -struct hppb_card hppb_card_head = { +static struct hppb_card hppb_card_head = { .hpa = 0, .next = NULL, }; diff --git a/drivers/parisc/lasi.c b/drivers/parisc/lasi.c index bee510098ce..e65727ca9fc 100644 --- a/drivers/parisc/lasi.c +++ b/drivers/parisc/lasi.c @@ -107,7 +107,7 @@ lasi_init_irq(struct gsc_asic *this_lasi) #else -void __init lasi_led_init(unsigned long lasi_hpa) +static void __init lasi_led_init(unsigned long lasi_hpa) { unsigned long datareg; @@ -163,8 +163,7 @@ static void lasi_power_off(void) gsc_writel(0x02, datareg); } -int __init -lasi_init_chip(struct parisc_device *dev) +static int __init lasi_init_chip(struct parisc_device *dev) { extern void (*chassis_power_off)(void); struct gsc_asic *lasi; diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c index a28c8946dea..d8233de8c75 100644 --- a/drivers/parisc/lba_pci.c +++ b/drivers/parisc/lba_pci.c @@ -824,7 +824,7 @@ lba_fixup_bus(struct pci_bus *bus) } -struct pci_bios_ops lba_bios_ops = { +static struct pci_bios_ops lba_bios_ops = { .init = lba_bios_init, .fixup_bus = lba_fixup_bus, }; diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index bc73b96346f..3fac8f81d59 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -561,7 +561,7 @@ typedef unsigned long space_t; * IOMMU uses little endian for the pdir. 
*/ -void SBA_INLINE +static void SBA_INLINE sba_io_pdir_entry(u64 *pdir_ptr, space_t sid, unsigned long vba, unsigned long hint) { @@ -1874,7 +1874,7 @@ static struct parisc_device_id sba_tbl[] = { { 0, } }; -int sba_driver_callback(struct parisc_device *); +static int sba_driver_callback(struct parisc_device *); static struct parisc_driver sba_driver = { .name = MODULE_NAME, @@ -1887,8 +1887,7 @@ static struct parisc_driver sba_driver = { ** If so, initialize the chip and tell other partners in crime they ** have work to do. */ -int -sba_driver_callback(struct parisc_device *dev) +static int sba_driver_callback(struct parisc_device *dev) { struct sba_device *sba_dev; u32 func_class; @@ -1979,8 +1978,6 @@ sba_driver_callback(struct parisc_device *dev) proc_create("sba_iommu-bitmap", 0, root, &sba_proc_bitmap_fops); #endif - parisc_vmerge_boundary = IOVP_SIZE; - parisc_vmerge_max_size = IOVP_SIZE * BITS_PER_LONG; parisc_has_iommu(); return 0; } diff --git a/drivers/parisc/wax.c b/drivers/parisc/wax.c index 892a83bbe73..da9d5ad1353 100644 --- a/drivers/parisc/wax.c +++ b/drivers/parisc/wax.c @@ -68,8 +68,7 @@ wax_init_irq(struct gsc_asic *wax) // gsc_writel(0xFFFFFFFF, base+0x2000); /* RS232-B on Wax */ } -int __init -wax_init_chip(struct parisc_device *dev) +static int __init wax_init_chip(struct parisc_device *dev) { struct gsc_asic *wax; struct parisc_device *parent; diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index c62ab8d240a..1c114180106 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -33,7 +33,6 @@ #include <linux/pci-acpi.h> #include <acpi/acpi.h> #include <acpi/acpi_bus.h> -#include <acpi/actypes.h> #define MY_NAME "acpi_pcihp" diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 27fd18f019f..db85284ffb6 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -217,7 +217,6 @@ struct hpc_ops { #ifdef CONFIG_ACPI #include <acpi/acpi.h> #include <acpi/acpi_bus.h> -#include <acpi/actypes.h> #include <linux/pci-acpi.h> extern void __init pciehp_acpi_slot_detection_init(void); diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 3582512e722..deea8a187eb 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -13,8 +13,6 @@ #include <linux/module.h> #include <linux/pci-aspm.h> #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acresrc.h> #include <acpi/acpi_bus.h> #include <linux/pci-acpi.h> diff --git a/drivers/platform/Kconfig b/drivers/platform/Kconfig new file mode 100644 index 00000000000..9652c3fe7f5 --- /dev/null +++ b/drivers/platform/Kconfig @@ -0,0 +1,5 @@ +# drivers/platform/Kconfig + +if X86 +source "drivers/platform/x86/Kconfig" +endif diff --git a/drivers/platform/Makefile b/drivers/platform/Makefile new file mode 100644 index 00000000000..782953ae4c0 --- /dev/null +++ b/drivers/platform/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for linux/drivers/platform +# + +obj-$(CONFIG_X86) += x86/ diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig new file mode 100644 index 00000000000..e65448e99b4 --- /dev/null +++ b/drivers/platform/x86/Kconfig @@ -0,0 +1,375 @@ +# +# X86 Platform Specific Drivers +# + +menuconfig X86_PLATFORM_DEVICES + bool "X86 Platform Specific Device Drivers" + default y + ---help--- + Say Y here to get to see options for device drivers for various + x86 platforms, including vendor-specific laptop extension drivers. + This option alone does not add any kernel code. 
+ + If you say N, all options in this submenu will be skipped and disabled. + +if X86_PLATFORM_DEVICES + +config ACER_WMI + tristate "Acer WMI Laptop Extras (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on ACPI + depends on LEDS_CLASS + depends on NEW_LEDS + depends on BACKLIGHT_CLASS_DEVICE + depends on SERIO_I8042 + depends on RFKILL + select ACPI_WMI + ---help--- + This is a driver for newer Acer (and Wistron) laptops. It adds + wireless radio and bluetooth control, and on some laptops, + exposes the mail LED and LCD backlight. + + For more information about this driver see + <file:Documentation/laptops/acer-wmi.txt> + + If you have an ACPI-WMI compatible Acer/ Wistron laptop, say Y or M + here. + +config ASUS_LAPTOP + tristate "Asus Laptop Extras (EXPERIMENTAL)" + depends on ACPI + depends on EXPERIMENTAL && !ACPI_ASUS + depends on LEDS_CLASS + depends on NEW_LEDS + depends on BACKLIGHT_CLASS_DEVICE + ---help--- + This is the new Linux driver for Asus laptops. It may also support some + MEDION, JVC or VICTOR laptops. It makes all the extra buttons generate + standard ACPI events that go through /proc/acpi/events. It also adds + support for video output switching, LCD backlight control, Bluetooth and + Wlan control, and most importantly, allows you to blink those fancy LEDs. + + For more information and a userspace daemon for handling the extra + buttons see <http://acpi4asus.sf.net/>. + + If you have an ACPI-compatible ASUS laptop, say Y or M here. + +config FUJITSU_LAPTOP + tristate "Fujitsu Laptop Extras" + depends on ACPI + depends on INPUT + depends on BACKLIGHT_CLASS_DEVICE + ---help--- + This is a driver for laptops built by Fujitsu: + + * P2xxx/P5xxx/S6xxx/S7xxx series Lifebooks + * Possibly other Fujitsu laptop models + * Tested with S6410 and S7020 + + It adds support for LCD brightness control and some hotkeys. + + If you have a Fujitsu laptop, say Y or M here. + +config FUJITSU_LAPTOP_DEBUG + bool "Verbose debug mode for Fujitsu Laptop Extras" + depends on FUJITSU_LAPTOP + default n + ---help--- + Enables extra debug output from the fujitsu extras driver, at the + expense of a slight increase in driver size. + + If you are not sure, say N here. + +config TC1100_WMI + tristate "HP Compaq TC1100 Tablet WMI Extras (EXPERIMENTAL)" + depends on !X86_64 + depends on EXPERIMENTAL + depends on ACPI + select ACPI_WMI + ---help--- + This is a driver for the WMI extensions (wireless and bluetooth power + control) of the HP Compaq TC1100 tablet. + +config HP_WMI + tristate "HP WMI extras" + depends on ACPI_WMI + depends on INPUT + depends on RFKILL + help + Say Y here if you want to support WMI-based hotkeys on HP laptops and + to read data from WMI such as docking or ambient light sensor state. + + To compile this driver as a module, choose M here: the module will + be called hp-wmi. + +config MSI_LAPTOP + tristate "MSI Laptop Extras" + depends on ACPI + depends on BACKLIGHT_CLASS_DEVICE + ---help--- + This is a driver for laptops built by MSI (MICRO-STAR + INTERNATIONAL): + + MSI MegaBook S270 (MS-1013) + Cytron/TCM/Medion/Tchibo MD96100/SAM2000 + + It adds support for Bluetooth, WLAN and LCD brightness control. + + More information about this driver is available at + <http://0pointer.de/lennart/tchibo.html>. + + If you have an MSI S270 laptop, say Y or M here. 
+ +config PANASONIC_LAPTOP + tristate "Panasonic Laptop Extras" + depends on INPUT && ACPI + depends on BACKLIGHT_CLASS_DEVICE + ---help--- + This driver adds support for access to backlight control and hotkeys + on Panasonic Let's Note laptops. + + If you have a Panasonic Let's note laptop (such as the R1(N variant), + R2, R3, R5, T2, W2 and Y2 series), say Y. + +config COMPAL_LAPTOP + tristate "Compal Laptop Extras" + depends on ACPI + depends on BACKLIGHT_CLASS_DEVICE + ---help--- + This is a driver for laptops built by Compal: + + Compal FL90/IFL90 + Compal FL91/IFL91 + Compal FL92/JFL92 + Compal FT00/IFT00 + + It adds support for Bluetooth, WLAN and LCD brightness control. + + If you have an Compal FL9x/IFL9x/FT00 laptop, say Y or M here. + +config SONY_LAPTOP + tristate "Sony Laptop Extras" + depends on ACPI + select BACKLIGHT_CLASS_DEVICE + depends on INPUT + ---help--- + This mini-driver drives the SNC and SPIC devices present in the ACPI + BIOS of the Sony Vaio laptops. + + It gives access to some extra laptop functionalities like Bluetooth, + screen brightness control, Fn keys and allows powering on/off some + devices. + + Read <file:Documentation/laptops/sony-laptop.txt> for more information. + +config SONYPI_COMPAT + bool "Sonypi compatibility" + depends on SONY_LAPTOP + ---help--- + Build the sonypi driver compatibility code into the sony-laptop driver. + +config THINKPAD_ACPI + tristate "ThinkPad ACPI Laptop Extras" + depends on ACPI + select BACKLIGHT_LCD_SUPPORT + select BACKLIGHT_CLASS_DEVICE + select HWMON + select NVRAM + select INPUT + select NEW_LEDS + select LEDS_CLASS + select NET + select RFKILL + ---help--- + This is a driver for the IBM and Lenovo ThinkPad laptops. It adds + support for Fn-Fx key combinations, Bluetooth control, video + output switching, ThinkLight control, UltraBay eject and more. + For more information about this driver see + <file:Documentation/laptops/thinkpad-acpi.txt> and + <http://ibm-acpi.sf.net/> . + + This driver was formerly known as ibm-acpi. + + If you have an IBM or Lenovo ThinkPad laptop, say Y or M here. + +config THINKPAD_ACPI_DEBUG + bool "Verbose debug mode" + depends on THINKPAD_ACPI + default n + ---help--- + Enables extra debugging information, at the expense of a slightly + increase in driver size. + + If you are not sure, say N here. + +config THINKPAD_ACPI_DOCK + bool "Legacy Docking Station Support" + depends on THINKPAD_ACPI + depends on ACPI_DOCK=n + default n + ---help--- + Allows the thinkpad_acpi driver to handle docking station events. + This support was made obsolete by the generic ACPI docking station + support (CONFIG_ACPI_DOCK). It will allow locking and removing the + laptop from the docking station, but will not properly connect PCI + devices. + + If you are not sure, say N here. + +config THINKPAD_ACPI_BAY + bool "Legacy Removable Bay Support" + depends on THINKPAD_ACPI + default y + ---help--- + Allows the thinkpad_acpi driver to handle removable bays. It will + electrically disable the device in the bay, and also generate + notifications when the bay lever is ejected or inserted. + + If you are not sure, say Y here. + +config THINKPAD_ACPI_VIDEO + bool "Video output control support" + depends on THINKPAD_ACPI + default y + ---help--- + Allows the thinkpad_acpi driver to provide an interface to control + the various video output ports. 
+ + This feature often won't work well, depending on ThinkPad model, + display state, video output devices in use, whether there is a X + server running, phase of the moon, and the current mood of + Schroedinger's cat. If you can use X.org's RandR to control + your ThinkPad's video output ports instead of this feature, + don't think twice: do it and say N here to save some memory. + + If you are not sure, say Y here. + +config THINKPAD_ACPI_HOTKEY_POLL + bool "Support NVRAM polling for hot keys" + depends on THINKPAD_ACPI + default y + ---help--- + Some thinkpad models benefit from NVRAM polling to detect a few of + the hot key press events. If you know your ThinkPad model does not + need to do NVRAM polling to support any of the hot keys you use, + unselecting this option will save about 1kB of memory. + + ThinkPads T40 and newer, R52 and newer, and X31 and newer are + unlikely to need NVRAM polling in their latest BIOS versions. + + NVRAM polling can detect at most the following keys: ThinkPad/Access + IBM, Zoom, Switch Display (fn+F7), ThinkLight, Volume up/down/mute, + Brightness up/down, Display Expand (fn+F8), Hibernate (fn+F12). + + If you are not sure, say Y here. The driver enables polling only if + it is strictly necessary to do so. + +config INTEL_MENLOW + tristate "Thermal Management driver for Intel menlow platform" + depends on ACPI_THERMAL + select THERMAL + ---help--- + ACPI thermal management enhancement driver on + Intel Menlow platform. + + If unsure, say N. + +config EEEPC_LAPTOP + tristate "Eee PC Hotkey Driver (EXPERIMENTAL)" + depends on ACPI + depends on EXPERIMENTAL + select BACKLIGHT_CLASS_DEVICE + select HWMON + select RFKILL + ---help--- + This driver supports the Fn-Fx keys on Eee PC laptops. + It also adds the ability to switch camera/wlan on/off. + + If you have an Eee PC laptop, say Y or M here. + + +config ACPI_WMI + tristate "WMI (EXPERIMENTAL)" + depends on ACPI + depends on EXPERIMENTAL + help + This driver adds support for the ACPI-WMI (Windows Management + Instrumentation) mapper device (PNP0C14) found on some systems. + + ACPI-WMI is a proprietary extension to ACPI to expose parts of the + ACPI firmware to userspace - this is done through various vendor + defined methods and data blocks in a PNP0C14 device, which are then + made available for userspace to call. + + The implementation of this in Linux currently only exposes this to + other kernel space drivers. + + This driver is a required dependency to build the firmware specific + drivers needed on many machines, including Acer and HP laptops. + + It is safe to enable this driver even if your DSDT doesn't define + any ACPI-WMI devices. + +config ACPI_ASUS + tristate "ASUS/Medion Laptop Extras" + depends on ACPI + select BACKLIGHT_CLASS_DEVICE + ---help--- + This driver provides support for extra features of ACPI-compatible + ASUS laptops. As some of Medion laptops are made by ASUS, it may also + support some Medion laptops (such as 9675 for example). It makes all + the extra buttons generate standard ACPI events that go through + /proc/acpi/events, and (on some models) adds support for changing the + display brightness and output, switching the LCD backlight on and off, + and most importantly, allows you to blink those fancy LEDs intended + for reporting mail and wireless status. + + Note: display switching code is currently considered EXPERIMENTAL, + toying with these values may even lock your machine. + + All settings are changed via /proc/acpi/asus directory entries. 
Owner + and group for these entries can be set with asus_uid and asus_gid + parameters. + + More information and a userspace daemon for handling the extra buttons + at <http://sourceforge.net/projects/acpi4asus/>. + + If you have an ACPI-compatible ASUS laptop, say Y or M here. This + driver is still under development, so if your laptop is unsupported or + something works not quite as expected, please use the mailing list + available on the above page (acpi4asus-user@lists.sourceforge.net). + + NOTE: This driver is deprecated and will probably be removed soon, + use asus-laptop instead. + +config ACPI_TOSHIBA + tristate "Toshiba Laptop Extras" + depends on ACPI + depends on INPUT + select INPUT_POLLDEV + select NET + select RFKILL + select BACKLIGHT_CLASS_DEVICE + ---help--- + This driver adds support for access to certain system settings + on "legacy free" Toshiba laptops. These laptops can be recognized by + their lack of a BIOS setup menu and APM support. + + On these machines, all system configuration is handled through the + ACPI. This driver is required for access to controls not covered + by the general ACPI drivers, such as LCD brightness, video output, + etc. + + This driver differs from the non-ACPI Toshiba laptop driver (located + under "Processor type and features") in several aspects. + Configuration is accessed by reading and writing text files in the + /proc tree instead of by program interface to /dev. Furthermore, no + power management functions are exposed, as those are handled by the + general ACPI drivers. + + More information about this driver is available at + <http://memebeam.org/toys/ToshibaAcpiDriver>. + + If you have a legacy free Toshiba laptop (such as the Libretto L1 + series), say Y. +endif # X86_PLATFORM_DEVICES diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile new file mode 100644 index 00000000000..1e9de2ae0de --- /dev/null +++ b/drivers/platform/x86/Makefile @@ -0,0 +1,19 @@ +# +# Makefile for linux/drivers/platform/x86 +# x86 Platform-Specific Drivers +# +obj-$(CONFIG_ASUS_LAPTOP) += asus-laptop.o +obj-$(CONFIG_EEEPC_LAPTOP) += eeepc-laptop.o +obj-$(CONFIG_MSI_LAPTOP) += msi-laptop.o +obj-$(CONFIG_COMPAL_LAPTOP) += compal-laptop.o +obj-$(CONFIG_ACER_WMI) += acer-wmi.o +obj-$(CONFIG_HP_WMI) += hp-wmi.o +obj-$(CONFIG_TC1100_WMI) += tc1100-wmi.o +obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o +obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o +obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o +obj-$(CONFIG_PANASONIC_LAPTOP) += panasonic-laptop.o +obj-$(CONFIG_INTEL_MENLOW) += intel_menlow.o +obj-$(CONFIG_ACPI_WMI) += wmi.o +obj-$(CONFIG_ACPI_ASUS) += asus_acpi.o +obj-$(CONFIG_ACPI_TOSHIBA) += toshiba_acpi.o diff --git a/drivers/misc/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index 94c9f911824..94c9f911824 100644 --- a/drivers/misc/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c diff --git a/drivers/misc/asus-laptop.c b/drivers/platform/x86/asus-laptop.c index 8fb8b359104..8fb8b359104 100644 --- a/drivers/misc/asus-laptop.c +++ b/drivers/platform/x86/asus-laptop.c diff --git a/drivers/acpi/asus_acpi.c b/drivers/platform/x86/asus_acpi.c index 1e74988c7b2..1e74988c7b2 100644 --- a/drivers/acpi/asus_acpi.c +++ b/drivers/platform/x86/asus_acpi.c diff --git a/drivers/misc/compal-laptop.c b/drivers/platform/x86/compal-laptop.c index 11003bba10d..11003bba10d 100644 --- a/drivers/misc/compal-laptop.c +++ b/drivers/platform/x86/compal-laptop.c diff --git a/drivers/misc/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c index 
02fe2b8b893..02fe2b8b893 100644 --- a/drivers/misc/eeepc-laptop.c +++ b/drivers/platform/x86/eeepc-laptop.c diff --git a/drivers/misc/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index a7dd3e9fb79..65dc41540c6 100644 --- a/drivers/misc/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -3,6 +3,7 @@ /* Copyright (C) 2007,2008 Jonathan Woithe <jwoithe@physics.adelaide.edu.au> Copyright (C) 2008 Peter Gruber <nokos@gmx.net> + Copyright (C) 2008 Tony Vroon <tony@linx.net> Based on earlier work: Copyright (C) 2003 Shane Spencer <shane@bogomip.com> Adrian Yee <brewt-fujitsu@brewt.org> @@ -65,8 +66,11 @@ #include <linux/kfifo.h> #include <linux/video_output.h> #include <linux/platform_device.h> +#ifdef CONFIG_LEDS_CLASS +#include <linux/leds.h> +#endif -#define FUJITSU_DRIVER_VERSION "0.4.3" +#define FUJITSU_DRIVER_VERSION "0.5.0" #define FUJITSU_LCD_N_LEVELS 8 @@ -83,6 +87,24 @@ #define ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS 0x86 #define ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS 0x87 +/* FUNC interface - command values */ +#define FUNC_RFKILL 0x1000 +#define FUNC_LEDS 0x1001 +#define FUNC_BUTTONS 0x1002 +#define FUNC_BACKLIGHT 0x1004 + +/* FUNC interface - responses */ +#define UNSUPPORTED_CMD 0x80000000 + +#ifdef CONFIG_LEDS_CLASS +/* FUNC interface - LED control */ +#define FUNC_LED_OFF 0x1 +#define FUNC_LED_ON 0x30001 +#define KEYBOARD_LAMPS 0x100 +#define LOGOLAMP_POWERON 0x2000 +#define LOGOLAMP_ALWAYS 0x4000 +#endif + /* Hotkey details */ #define KEY1_CODE 0x410 /* codes for the keys in the GIRB register */ #define KEY2_CODE 0x411 @@ -133,7 +155,6 @@ struct fujitsu_t { static struct fujitsu_t *fujitsu; static int use_alt_lcd_levels = -1; -static int disable_brightness_keys = -1; static int disable_brightness_adjust = -1; /* Device used to access other hotkeys on the laptop */ @@ -145,8 +166,9 @@ struct fujitsu_hotkey_t { struct platform_device *pf_device; struct kfifo *fifo; spinlock_t fifo_lock; - - unsigned int irb; /* info about the pressed buttons */ + int rfkill_state; + int logolamp_registered; + int kblamps_registered; }; static struct fujitsu_hotkey_t *fujitsu_hotkey; @@ -154,12 +176,139 @@ static struct fujitsu_hotkey_t *fujitsu_hotkey; static void acpi_fujitsu_hotkey_notify(acpi_handle handle, u32 event, void *data); +#ifdef CONFIG_LEDS_CLASS +static enum led_brightness logolamp_get(struct led_classdev *cdev); +static void logolamp_set(struct led_classdev *cdev, + enum led_brightness brightness); + +struct led_classdev logolamp_led = { + .name = "fujitsu::logolamp", + .brightness_get = logolamp_get, + .brightness_set = logolamp_set +}; + +static enum led_brightness kblamps_get(struct led_classdev *cdev); +static void kblamps_set(struct led_classdev *cdev, + enum led_brightness brightness); + +struct led_classdev kblamps_led = { + .name = "fujitsu::kblamps", + .brightness_get = kblamps_get, + .brightness_set = kblamps_set +}; +#endif + #ifdef CONFIG_FUJITSU_LAPTOP_DEBUG static u32 dbg_level = 0x03; #endif static void acpi_fujitsu_notify(acpi_handle handle, u32 event, void *data); +/* Fujitsu ACPI interface function */ + +static int call_fext_func(int cmd, int arg0, int arg1, int arg2) +{ + acpi_status status = AE_OK; + union acpi_object params[4] = { + { .type = ACPI_TYPE_INTEGER }, + { .type = ACPI_TYPE_INTEGER }, + { .type = ACPI_TYPE_INTEGER }, + { .type = ACPI_TYPE_INTEGER } + }; + struct acpi_object_list arg_list = { 4, ¶ms[0] }; + struct acpi_buffer output; + union acpi_object out_obj; + acpi_handle handle = NULL; + + status = 
acpi_get_handle(fujitsu_hotkey->acpi_handle, "FUNC", &handle); + if (ACPI_FAILURE(status)) { + vdbg_printk(FUJLAPTOP_DBG_ERROR, + "FUNC interface is not present\n"); + return -ENODEV; + } + + params[0].integer.value = cmd; + params[1].integer.value = arg0; + params[2].integer.value = arg1; + params[3].integer.value = arg2; + + output.length = sizeof(out_obj); + output.pointer = &out_obj; + + status = acpi_evaluate_object(handle, NULL, &arg_list, &output); + if (ACPI_FAILURE(status)) { + vdbg_printk(FUJLAPTOP_DBG_WARN, + "FUNC 0x%x (args 0x%x, 0x%x, 0x%x) call failed\n", + cmd, arg0, arg1, arg2); + return -ENODEV; + } + + if (out_obj.type != ACPI_TYPE_INTEGER) { + vdbg_printk(FUJLAPTOP_DBG_WARN, + "FUNC 0x%x (args 0x%x, 0x%x, 0x%x) did not " + "return an integer\n", + cmd, arg0, arg1, arg2); + return -ENODEV; + } + + vdbg_printk(FUJLAPTOP_DBG_TRACE, + "FUNC 0x%x (args 0x%x, 0x%x, 0x%x) returned 0x%x\n", + cmd, arg0, arg1, arg2, (int)out_obj.integer.value); + return out_obj.integer.value; +} + +#ifdef CONFIG_LEDS_CLASS +/* LED class callbacks */ + +static void logolamp_set(struct led_classdev *cdev, + enum led_brightness brightness) +{ + if (brightness >= LED_FULL) { + call_fext_func(FUNC_LEDS, 0x1, LOGOLAMP_POWERON, FUNC_LED_ON); + call_fext_func(FUNC_LEDS, 0x1, LOGOLAMP_ALWAYS, FUNC_LED_ON); + } else if (brightness >= LED_HALF) { + call_fext_func(FUNC_LEDS, 0x1, LOGOLAMP_POWERON, FUNC_LED_ON); + call_fext_func(FUNC_LEDS, 0x1, LOGOLAMP_ALWAYS, FUNC_LED_OFF); + } else { + call_fext_func(FUNC_LEDS, 0x1, LOGOLAMP_POWERON, FUNC_LED_OFF); + } +} + +static void kblamps_set(struct led_classdev *cdev, + enum led_brightness brightness) +{ + if (brightness >= LED_FULL) + call_fext_func(FUNC_LEDS, 0x1, KEYBOARD_LAMPS, FUNC_LED_ON); + else + call_fext_func(FUNC_LEDS, 0x1, KEYBOARD_LAMPS, FUNC_LED_OFF); +} + +static enum led_brightness logolamp_get(struct led_classdev *cdev) +{ + enum led_brightness brightness = LED_OFF; + int poweron, always; + + poweron = call_fext_func(FUNC_LEDS, 0x2, LOGOLAMP_POWERON, 0x0); + if (poweron == FUNC_LED_ON) { + brightness = LED_HALF; + always = call_fext_func(FUNC_LEDS, 0x2, LOGOLAMP_ALWAYS, 0x0); + if (always == FUNC_LED_ON) + brightness = LED_FULL; + } + return brightness; +} + +static enum led_brightness kblamps_get(struct led_classdev *cdev) +{ + enum led_brightness brightness = LED_OFF; + + if (call_fext_func(FUNC_LEDS, 0x2, KEYBOARD_LAMPS, 0x0) == FUNC_LED_ON) + brightness = LED_FULL; + + return brightness; +} +#endif + /* Hardware access for LCD brightness control */ static int set_lcd_level(int level) @@ -263,44 +412,34 @@ static int get_max_brightness(void) return fujitsu->max_brightness; } -static int get_lcd_level_alt(void) -{ - unsigned long long state = 0; - acpi_status status = AE_OK; - - vdbg_printk(FUJLAPTOP_DBG_TRACE, "get lcd level via GBLS\n"); - - status = - acpi_evaluate_integer(fujitsu->acpi_handle, "GBLS", NULL, &state); - if (status < 0) - return status; - - fujitsu->brightness_level = state & 0x0fffffff; - - if (state & 0x80000000) - fujitsu->brightness_changed = 1; - else - fujitsu->brightness_changed = 0; - - return fujitsu->brightness_level; -} - /* Backlight device stuff */ static int bl_get_brightness(struct backlight_device *b) { - if (use_alt_lcd_levels) - return get_lcd_level_alt(); - else - return get_lcd_level(); + return get_lcd_level(); } static int bl_update_status(struct backlight_device *b) { + int ret; + if (b->props.power == 4) + ret = call_fext_func(FUNC_BACKLIGHT, 0x1, 0x4, 0x3); + else + ret = call_fext_func(FUNC_BACKLIGHT, 
0x1, 0x4, 0x0); + if (ret != 0) + vdbg_printk(FUJLAPTOP_DBG_ERROR, + "Unable to adjust backlight power, error code %i\n", + ret); + if (use_alt_lcd_levels) - return set_lcd_level_alt(b->props.brightness); + ret = set_lcd_level_alt(b->props.brightness); else - return set_lcd_level(b->props.brightness); + ret = set_lcd_level(b->props.brightness); + if (ret != 0) + vdbg_printk(FUJLAPTOP_DBG_ERROR, + "Unable to adjust LCD brightness, error code %i\n", + ret); + return ret; } static struct backlight_ops fujitsubl_ops = { @@ -344,10 +483,7 @@ static ssize_t show_lcd_level(struct device *dev, int ret; - if (use_alt_lcd_levels) - ret = get_lcd_level_alt(); - else - ret = get_lcd_level(); + ret = get_lcd_level(); if (ret < 0) return ret; @@ -372,52 +508,71 @@ static ssize_t store_lcd_level(struct device *dev, if (ret < 0) return ret; - if (use_alt_lcd_levels) - ret = get_lcd_level_alt(); - else - ret = get_lcd_level(); + ret = get_lcd_level(); if (ret < 0) return ret; return count; } -/* Hardware access for hotkey device */ - -static int get_irb(void) +static ssize_t +ignore_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { - unsigned long long state = 0; - acpi_status status = AE_OK; - - vdbg_printk(FUJLAPTOP_DBG_TRACE, "Get irb\n"); - - status = - acpi_evaluate_integer(fujitsu_hotkey->acpi_handle, "GIRB", NULL, - &state); - if (status < 0) - return status; + return count; +} - fujitsu_hotkey->irb = state; +static ssize_t +show_lid_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (fujitsu_hotkey->rfkill_state == UNSUPPORTED_CMD) + return sprintf(buf, "unknown\n"); + if (fujitsu_hotkey->rfkill_state & 0x100) + return sprintf(buf, "open\n"); + else + return sprintf(buf, "closed\n"); +} - return fujitsu_hotkey->irb; +static ssize_t +show_dock_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (fujitsu_hotkey->rfkill_state == UNSUPPORTED_CMD) + return sprintf(buf, "unknown\n"); + if (fujitsu_hotkey->rfkill_state & 0x200) + return sprintf(buf, "docked\n"); + else + return sprintf(buf, "undocked\n"); } static ssize_t -ignore_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) +show_radios_state(struct device *dev, + struct device_attribute *attr, char *buf) { - return count; + if (fujitsu_hotkey->rfkill_state == UNSUPPORTED_CMD) + return sprintf(buf, "unknown\n"); + if (fujitsu_hotkey->rfkill_state & 0x20) + return sprintf(buf, "on\n"); + else + return sprintf(buf, "killed\n"); } static DEVICE_ATTR(max_brightness, 0444, show_max_brightness, ignore_store); static DEVICE_ATTR(brightness_changed, 0444, show_brightness_changed, ignore_store); static DEVICE_ATTR(lcd_level, 0644, show_lcd_level, store_lcd_level); +static DEVICE_ATTR(lid, 0444, show_lid_state, ignore_store); +static DEVICE_ATTR(dock, 0444, show_dock_state, ignore_store); +static DEVICE_ATTR(radios, 0444, show_radios_state, ignore_store); static struct attribute *fujitsupf_attributes[] = { &dev_attr_brightness_changed.attr, &dev_attr_max_brightness.attr, &dev_attr_lcd_level.attr, + &dev_attr_lid.attr, + &dev_attr_dock.attr, + &dev_attr_radios.attr, NULL }; @@ -435,24 +590,16 @@ static struct platform_driver fujitsupf_driver = { static void dmi_check_cb_common(const struct dmi_system_id *id) { acpi_handle handle; - int have_blnf; printk(KERN_INFO "fujitsu-laptop: Identified laptop model '%s'.\n", id->ident); - have_blnf = ACPI_SUCCESS - (acpi_get_handle(NULL, "\\_SB.PCI0.GFX0.LCD.BLNF", &handle)); if 
(use_alt_lcd_levels == -1) { - vdbg_printk(FUJLAPTOP_DBG_TRACE, "auto-detecting usealt\n"); - use_alt_lcd_levels = 1; - } - if (disable_brightness_keys == -1) { - vdbg_printk(FUJLAPTOP_DBG_TRACE, - "auto-detecting disable_keys\n"); - disable_brightness_keys = have_blnf ? 1 : 0; - } - if (disable_brightness_adjust == -1) { - vdbg_printk(FUJLAPTOP_DBG_TRACE, - "auto-detecting disable_adjust\n"); - disable_brightness_adjust = have_blnf ? 0 : 1; + if (ACPI_SUCCESS(acpi_get_handle(NULL, + "\\_SB.PCI0.LPCB.FJEX.SBL2", &handle))) + use_alt_lcd_levels = 1; + else + use_alt_lcd_levels = 0; + vdbg_printk(FUJLAPTOP_DBG_TRACE, "auto-detected usealt as " + "%i\n", use_alt_lcd_levels); } } @@ -581,19 +728,14 @@ static int acpi_fujitsu_add(struct acpi_device *device) /* do config (detect defaults) */ use_alt_lcd_levels = use_alt_lcd_levels == 1 ? 1 : 0; - disable_brightness_keys = disable_brightness_keys == 1 ? 1 : 0; disable_brightness_adjust = disable_brightness_adjust == 1 ? 1 : 0; vdbg_printk(FUJLAPTOP_DBG_INFO, - "config: [alt interface: %d], [key disable: %d], [adjust disable: %d]\n", - use_alt_lcd_levels, disable_brightness_keys, - disable_brightness_adjust); + "config: [alt interface: %d], [adjust disable: %d]\n", + use_alt_lcd_levels, disable_brightness_adjust); if (get_max_brightness() <= 0) fujitsu->max_brightness = FUJITSU_LCD_N_LEVELS; - if (use_alt_lcd_levels) - get_lcd_level_alt(); - else - get_lcd_level(); + get_lcd_level(); return result; @@ -644,43 +786,23 @@ static void acpi_fujitsu_notify(acpi_handle handle, u32 event, void *data) case ACPI_FUJITSU_NOTIFY_CODE1: keycode = 0; oldb = fujitsu->brightness_level; - get_lcd_level(); /* the alt version always yields changed */ + get_lcd_level(); newb = fujitsu->brightness_level; vdbg_printk(FUJLAPTOP_DBG_TRACE, "brightness button event [%i -> %i (%i)]\n", oldb, newb, fujitsu->brightness_changed); - if (oldb == newb && fujitsu->brightness_changed) { - keycode = 0; - if (disable_brightness_keys != 1) { - if (oldb == 0) { - acpi_bus_generate_proc_event - (fujitsu->dev, - ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS, - 0); - keycode = KEY_BRIGHTNESSDOWN; - } else if (oldb == - (fujitsu->max_brightness) - 1) { - acpi_bus_generate_proc_event - (fujitsu->dev, - ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS, - 0); - keycode = KEY_BRIGHTNESSUP; - } - } - } else if (oldb < newb) { + if (oldb < newb) { if (disable_brightness_adjust != 1) { if (use_alt_lcd_levels) set_lcd_level_alt(newb); else set_lcd_level(newb); } - if (disable_brightness_keys != 1) { - acpi_bus_generate_proc_event(fujitsu->dev, - ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS, 0); - keycode = KEY_BRIGHTNESSUP; - } + acpi_bus_generate_proc_event(fujitsu->dev, + ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS, 0); + keycode = KEY_BRIGHTNESSUP; } else if (oldb > newb) { if (disable_brightness_adjust != 1) { if (use_alt_lcd_levels) @@ -688,13 +810,9 @@ static void acpi_fujitsu_notify(acpi_handle handle, u32 event, void *data) else set_lcd_level(newb); } - if (disable_brightness_keys != 1) { - acpi_bus_generate_proc_event(fujitsu->dev, - ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS, 0); - keycode = KEY_BRIGHTNESSDOWN; - } - } else { - keycode = KEY_UNKNOWN; + acpi_bus_generate_proc_event(fujitsu->dev, + ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS, 0); + keycode = KEY_BRIGHTNESSDOWN; } break; default: @@ -771,7 +889,8 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) input->id.bustype = BUS_HOST; input->id.product = 0x06; input->dev.parent = &device->dev; - input->evbit[0] = BIT(EV_KEY); + + set_bit(EV_KEY, input->evbit); set_bit(fujitsu->keycode1, 
input->keybit); set_bit(fujitsu->keycode2, input->keybit); set_bit(fujitsu->keycode3, input->keybit); @@ -803,10 +922,44 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) printk(KERN_ERR "_INI Method failed\n"); } - i = 0; /* Discard hotkey ringbuffer */ - while (get_irb() != 0 && (i++) < MAX_HOTKEY_RINGBUFFER_SIZE) ; + i = 0; + while (call_fext_func(FUNC_BUTTONS, 0x1, 0x0, 0x0) != 0 + && (i++) < MAX_HOTKEY_RINGBUFFER_SIZE) + ; /* No action, result is discarded */ vdbg_printk(FUJLAPTOP_DBG_INFO, "Discarded %i ringbuffer entries\n", i); + fujitsu_hotkey->rfkill_state = + call_fext_func(FUNC_RFKILL, 0x4, 0x0, 0x0); + + /* Suspect this is a keymap of the application panel, print it */ + printk(KERN_INFO "fujitsu-laptop: BTNI: [0x%x]\n", + call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0)); + + #ifdef CONFIG_LEDS_CLASS + if (call_fext_func(FUNC_LEDS, 0x0, 0x0, 0x0) & LOGOLAMP_POWERON) { + result = led_classdev_register(&fujitsu->pf_device->dev, + &logolamp_led); + if (result == 0) { + fujitsu_hotkey->logolamp_registered = 1; + } else { + printk(KERN_ERR "fujitsu-laptop: Could not register " + "LED handler for logo lamp, error %i\n", result); + } + } + + if ((call_fext_func(FUNC_LEDS, 0x0, 0x0, 0x0) & KEYBOARD_LAMPS) && + (call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0) == 0x0)) { + result = led_classdev_register(&fujitsu->pf_device->dev, + &kblamps_led); + if (result == 0) { + fujitsu_hotkey->kblamps_registered = 1; + } else { + printk(KERN_ERR "fujitsu-laptop: Could not register " + "LED handler for keyboard lamps, error %i\n", result); + } + } + #endif + return result; end: @@ -852,16 +1005,15 @@ static void acpi_fujitsu_hotkey_notify(acpi_handle handle, u32 event, input = fujitsu_hotkey->input; - vdbg_printk(FUJLAPTOP_DBG_TRACE, "Hotkey event\n"); + fujitsu_hotkey->rfkill_state = + call_fext_func(FUNC_RFKILL, 0x4, 0x0, 0x0); switch (event) { case ACPI_FUJITSU_NOTIFY_CODE1: i = 0; - while ((irb = get_irb()) != 0 - && (i++) < MAX_HOTKEY_RINGBUFFER_SIZE) { - vdbg_printk(FUJLAPTOP_DBG_TRACE, "GIRB result [%x]\n", - irb); - + while ((irb = + call_fext_func(FUNC_BUTTONS, 0x1, 0x0, 0x0)) != 0 + && (i++) < MAX_HOTKEY_RINGBUFFER_SIZE) { switch (irb & 0x4ff) { case KEY1_CODE: keycode = fujitsu->keycode1; @@ -1035,6 +1187,15 @@ static int __init fujitsu_init(void) goto fail_hotkey1; } + /* Sync backlight power status (needs FUJ02E3 device, hence deferred) */ + + if (!acpi_video_backlight_support()) { + if (call_fext_func(FUNC_BACKLIGHT, 0x2, 0x4, 0x0) == 3) + fujitsu->bl_device->props.power = 4; + else + fujitsu->bl_device->props.power = 0; + } + printk(KERN_INFO "fujitsu-laptop: driver " FUJITSU_DRIVER_VERSION " successfully loaded.\n"); @@ -1074,6 +1235,14 @@ fail_acpi: static void __exit fujitsu_cleanup(void) { + #ifdef CONFIG_LEDS_CLASS + if (fujitsu_hotkey->logolamp_registered != 0) + led_classdev_unregister(&logolamp_led); + + if (fujitsu_hotkey->kblamps_registered != 0) + led_classdev_unregister(&kblamps_led); + #endif + sysfs_remove_group(&fujitsu->pf_device->dev.kobj, &fujitsupf_attribute_group); platform_device_unregister(fujitsu->pf_device); @@ -1098,9 +1267,6 @@ module_exit(fujitsu_cleanup); module_param(use_alt_lcd_levels, uint, 0644); MODULE_PARM_DESC(use_alt_lcd_levels, "Use alternative interface for lcd_levels (needed for Lifebook s6410)."); -module_param(disable_brightness_keys, uint, 0644); -MODULE_PARM_DESC(disable_brightness_keys, - "Disable brightness keys (eg. 
if they are already handled by the generic ACPI_VIDEO device)."); module_param(disable_brightness_adjust, uint, 0644); MODULE_PARM_DESC(disable_brightness_adjust, "Disable brightness adjustment ."); #ifdef CONFIG_FUJITSU_LAPTOP_DEBUG @@ -1108,12 +1274,13 @@ module_param_named(debug, dbg_level, uint, 0644); MODULE_PARM_DESC(debug, "Sets debug level bit-mask"); #endif -MODULE_AUTHOR("Jonathan Woithe, Peter Gruber"); +MODULE_AUTHOR("Jonathan Woithe, Peter Gruber, Tony Vroon"); MODULE_DESCRIPTION("Fujitsu laptop extras support"); MODULE_VERSION(FUJITSU_DRIVER_VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS("dmi:*:svnFUJITSUSIEMENS:*:pvr:rvnFUJITSU:rnFJNB1D3:*:cvrS6410:*"); +MODULE_ALIAS("dmi:*:svnFUJITSUSIEMENS:*:pvr:rvnFUJITSU:rnFJNB1E6:*:cvrS6420:*"); MODULE_ALIAS("dmi:*:svnFUJITSU:*:pvr:rvnFUJITSU:rnFJNB19C:*:cvrS7020:*"); static struct pnp_device_id pnp_ids[] = { diff --git a/drivers/misc/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index 4b7c24c519c..4b7c24c519c 100644 --- a/drivers/misc/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c diff --git a/drivers/misc/intel_menlow.c b/drivers/platform/x86/intel_menlow.c index 27b7662955b..27b7662955b 100644 --- a/drivers/misc/intel_menlow.c +++ b/drivers/platform/x86/intel_menlow.c diff --git a/drivers/misc/msi-laptop.c b/drivers/platform/x86/msi-laptop.c index 759763d18e4..759763d18e4 100644 --- a/drivers/misc/msi-laptop.c +++ b/drivers/platform/x86/msi-laptop.c diff --git a/drivers/misc/panasonic-laptop.c b/drivers/platform/x86/panasonic-laptop.c index 4a1bc64485d..f30db367c82 100644 --- a/drivers/misc/panasonic-laptop.c +++ b/drivers/platform/x86/panasonic-laptop.c @@ -241,8 +241,6 @@ static int acpi_pcc_write_sset(struct pcc_acpi *pcc, int func, int val) }; acpi_status status = AE_OK; - ACPI_FUNCTION_TRACE("acpi_pcc_write_sset"); - status = acpi_evaluate_object(pcc->handle, METHOD_HKEY_SSET, ¶ms, NULL); @@ -254,8 +252,6 @@ static inline int acpi_pcc_get_sqty(struct acpi_device *device) unsigned long long s; acpi_status status; - ACPI_FUNCTION_TRACE("acpi_pcc_get_sqty"); - status = acpi_evaluate_integer(device->handle, METHOD_HKEY_SQTY, NULL, &s); if (ACPI_SUCCESS(status)) @@ -274,8 +270,6 @@ static int acpi_pcc_retrieve_biosdata(struct pcc_acpi *pcc, u32 *sinf) union acpi_object *hkey = NULL; int i; - ACPI_FUNCTION_TRACE("acpi_pcc_retrieve_biosdata"); - status = acpi_evaluate_object(pcc->handle, METHOD_HKEY_SINF, 0, &buffer); if (ACPI_FAILURE(status)) { @@ -501,8 +495,6 @@ static void acpi_pcc_generate_keyinput(struct pcc_acpi *pcc) int key_code, hkey_num; unsigned long long result; - ACPI_FUNCTION_TRACE("acpi_pcc_generate_keyinput"); - rc = acpi_evaluate_integer(pcc->handle, METHOD_HKEY_QUERY, NULL, &result); if (!ACPI_SUCCESS(rc)) { @@ -538,8 +530,6 @@ static void acpi_pcc_hotkey_notify(acpi_handle handle, u32 event, void *data) { struct pcc_acpi *pcc = (struct pcc_acpi *) data; - ACPI_FUNCTION_TRACE("acpi_pcc_hotkey_notify"); - switch (event) { case HKEY_NOTIFY: acpi_pcc_generate_keyinput(pcc); @@ -554,8 +544,6 @@ static int acpi_pcc_init_input(struct pcc_acpi *pcc) { int i, rc; - ACPI_FUNCTION_TRACE("acpi_pcc_init_input"); - pcc->input_dev = input_allocate_device(); if (!pcc->input_dev) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, @@ -597,8 +585,6 @@ static int acpi_pcc_hotkey_resume(struct acpi_device *device) struct pcc_acpi *pcc = acpi_driver_data(device); acpi_status status = AE_OK; - ACPI_FUNCTION_TRACE("acpi_pcc_hotkey_resume"); - if (device == NULL || pcc == NULL) return -EINVAL; @@ -616,8 +602,6 @@ static int acpi_pcc_hotkey_add(struct acpi_device 
*device) struct pcc_acpi *pcc; int num_sifr, result; - ACPI_FUNCTION_TRACE("acpi_pcc_hotkey_add"); - if (!device) return -EINVAL; @@ -714,8 +698,6 @@ static int __init acpi_pcc_init(void) { int result = 0; - ACPI_FUNCTION_TRACE("acpi_pcc_init"); - if (acpi_disabled) return -ENODEV; @@ -733,8 +715,6 @@ static int acpi_pcc_hotkey_remove(struct acpi_device *device, int type) { struct pcc_acpi *pcc = acpi_driver_data(device); - ACPI_FUNCTION_TRACE("acpi_pcc_hotkey_remove"); - if (!device || !pcc) return -EINVAL; @@ -757,8 +737,6 @@ static int acpi_pcc_hotkey_remove(struct acpi_device *device, int type) static void __exit acpi_pcc_exit(void) { - ACPI_FUNCTION_TRACE("acpi_pcc_exit"); - acpi_bus_unregister_driver(&acpi_pcc_driver); } diff --git a/drivers/misc/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 571b211608d..537959d0714 100644 --- a/drivers/misc/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -935,14 +935,17 @@ static void sony_acpi_notify(acpi_handle handle, u32 event, void *data) static acpi_status sony_walk_callback(acpi_handle handle, u32 level, void *context, void **return_value) { - struct acpi_namespace_node *node; - union acpi_operand_object *operand; + struct acpi_device_info *info; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; - node = (struct acpi_namespace_node *)handle; - operand = (union acpi_operand_object *)node->object; + if (ACPI_SUCCESS(acpi_get_object_info(handle, &buffer))) { + info = buffer.pointer; - printk(KERN_WARNING DRV_PFX "method: name: %4.4s, args %X\n", node->name.ascii, - (u32) operand->method.param_count); + printk(KERN_WARNING DRV_PFX "method: name: %4.4s, args %X\n", + (char *)&info->name, info->param_count); + + kfree(buffer.pointer); + } return AE_OK; } diff --git a/drivers/misc/tc1100-wmi.c b/drivers/platform/x86/tc1100-wmi.c index f25e4c974dc..b4a4aa9ee48 100644 --- a/drivers/misc/tc1100-wmi.c +++ b/drivers/platform/x86/tc1100-wmi.c @@ -30,7 +30,6 @@ #include <linux/init.h> #include <linux/types.h> #include <acpi/acpi.h> -#include <acpi/actypes.h> #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> #include <linux/platform_device.h> diff --git a/drivers/misc/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 899766e16fa..3478453eba7 100644 --- a/drivers/misc/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -76,7 +76,6 @@ #include <linux/workqueue.h> #include <acpi/acpi_drivers.h> -#include <acpi/acnamesp.h> #include <linux/pci_ids.h> diff --git a/drivers/acpi/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 40e60fc2e59..40e60fc2e59 100644 --- a/drivers/acpi/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c diff --git a/drivers/acpi/wmi.c b/drivers/platform/x86/wmi.c index 8a8b377712c..8a8b377712c 100644 --- a/drivers/acpi/wmi.c +++ b/drivers/platform/x86/wmi.c diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c index 383e47c392a..2834846a185 100644 --- a/drivers/pnp/pnpacpi/core.c +++ b/drivers/pnp/pnpacpi/core.c @@ -23,7 +23,6 @@ #include <linux/pnp.h> #include <linux/mod_devicetable.h> #include <acpi/acpi_bus.h> -#include <acpi/actypes.h> #include "../base.h" #include "pnpacpi.h" diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index c68c496b2c4..7aa35248181 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -1412,6 +1412,97 @@ int wm8350_register_regulator(struct wm8350 *wm8350, int reg, } EXPORT_SYMBOL_GPL(wm8350_register_regulator); +/** + * wm8350_register_led - 
Register a WM8350 LED output + * + * @param wm8350 The WM8350 device to configure. + * @param lednum LED device index to create. + * @param dcdc The DCDC to use for the LED. + * @param isink The ISINK to use for the LED. + * @param pdata Configuration for the LED. + * + * The WM8350 supports the use of an ISINK together with a DCDC to + * provide a power-efficient LED driver. This function registers the + * regulators and instantiates the platform device for a LED. The + * operating modes for the LED regulators must be configured using + * wm8350_isink_set_flash(), wm8350_dcdc25_set_mode() and + * wm8350_dcdc_set_slot() prior to calling this function. + */ +int wm8350_register_led(struct wm8350 *wm8350, int lednum, int dcdc, int isink, + struct wm8350_led_platform_data *pdata) +{ + struct wm8350_led *led; + struct platform_device *pdev; + int ret; + + if (lednum > ARRAY_SIZE(wm8350->pmic.led) || lednum < 0) { + dev_err(wm8350->dev, "Invalid LED index %d\n", lednum); + return -ENODEV; + } + + led = &wm8350->pmic.led[lednum]; + + if (led->pdev) { + dev_err(wm8350->dev, "LED %d already allocated\n", lednum); + return -EINVAL; + } + + pdev = platform_device_alloc("wm8350-led", lednum); + if (pdev == NULL) { + dev_err(wm8350->dev, "Failed to allocate LED %d\n", lednum); + return -ENOMEM; + } + + led->isink_consumer.dev = &pdev->dev; + led->isink_consumer.supply = "led_isink"; + led->isink_init.num_consumer_supplies = 1; + led->isink_init.consumer_supplies = &led->isink_consumer; + led->isink_init.constraints.min_uA = 0; + led->isink_init.constraints.max_uA = pdata->max_uA; + led->isink_init.constraints.valid_ops_mask = REGULATOR_CHANGE_CURRENT; + led->isink_init.constraints.valid_modes_mask = REGULATOR_MODE_NORMAL; + ret = wm8350_register_regulator(wm8350, isink, &led->isink_init); + if (ret != 0) { + platform_device_put(pdev); + return ret; + } + + led->dcdc_consumer.dev = &pdev->dev; + led->dcdc_consumer.supply = "led_vcc"; + led->dcdc_init.num_consumer_supplies = 1; + led->dcdc_init.consumer_supplies = &led->dcdc_consumer; + led->dcdc_init.constraints.valid_modes_mask = REGULATOR_MODE_NORMAL; + ret = wm8350_register_regulator(wm8350, dcdc, &led->dcdc_init); + if (ret != 0) { + platform_device_put(pdev); + return ret; + } + + switch (isink) { + case WM8350_ISINK_A: + wm8350->pmic.isink_A_dcdc = dcdc; + break; + case WM8350_ISINK_B: + wm8350->pmic.isink_B_dcdc = dcdc; + break; + } + + pdev->dev.platform_data = pdata; + pdev->dev.parent = wm8350->dev; + ret = platform_device_add(pdev); + if (ret != 0) { + dev_err(wm8350->dev, "Failed to register LED %d: %d\n", + lednum, ret); + platform_device_put(pdev); + return ret; + } + + led->pdev = pdev; + + return 0; +} +EXPORT_SYMBOL_GPL(wm8350_register_led); + static struct platform_driver wm8350_regulator_driver = { .probe = wm8350_regulator_probe, .remove = wm8350_regulator_remove, diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c index 346d633655e..c6bfa6fe1a2 100644 --- a/drivers/rtc/rtc-parisc.c +++ b/drivers/rtc/rtc-parisc.c @@ -34,7 +34,8 @@ static int parisc_get_time(struct device *dev, struct rtc_time *tm) static int parisc_set_time(struct device *dev, struct rtc_time *tm) { struct parisc_rtc *p = dev_get_drvdata(dev); - unsigned long flags, ret; + unsigned long flags; + int ret; spin_lock_irqsave(&p->lock, flags); ret = set_rtc_time(tm); diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 4a4dd9adc32..72facb9eb7d 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig 
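A hedged example of how board-support code might consume the wm8350_register_led()
interface documented above. The wm8350_led_platform_data fields, the WM8350_DCDC_5
and WM8350_ISINK_A identifiers and all numeric values are illustrative assumptions,
and the prerequisite wm8350_isink_set_flash()/wm8350_dcdc25_set_mode()/
wm8350_dcdc_set_slot() configuration required by the kernel-doc is omitted:

	static struct wm8350_led_platform_data board_led_pdata = {
		.name		 = "board::status",	/* hypothetical LED name */
		.default_trigger = "heartbeat",
		.max_uA		 = 27899,		/* hypothetical current limit */
	};

	static int board_add_status_led(struct wm8350 *wm8350)
	{
		/* ISINK flash mode, DCDC mode and hardware slot are assumed
		 * to have been configured already, as the kernel-doc requires */
		return wm8350_register_led(wm8350, 0, WM8350_DCDC_5,
					   WM8350_ISINK_A, &board_led_pdata);
	}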
@@ -52,11 +52,11 @@ config LCD_ILI9320 then say y to include a power driver for it. config LCD_TDO24M - tristate "Toppoly TDO24M LCD Panels support" + tristate "Toppoly TDO24M and TDO35S LCD Panels support" depends on LCD_CLASS_DEVICE && SPI_MASTER default n help - If you have a Toppoly TDO24M series LCD panel, say y here to + If you have a Toppoly TDO24M/TDO35S series LCD panel, say y here to include the support for it. config LCD_VGG2432A4 @@ -123,17 +123,14 @@ config BACKLIGHT_ATMEL_PWM To compile this driver as a module, choose M here: the module will be called atmel-pwm-bl. -config BACKLIGHT_CORGI - tristate "Generic (aka Sharp Corgi) Backlight Driver (DEPRECATED)" +config BACKLIGHT_GENERIC + tristate "Generic (aka Sharp Corgi) Backlight Driver" depends on BACKLIGHT_CLASS_DEVICE - default n + default y help Say y to enable the generic platform backlight driver previously known as the Corgi backlight driver. If you have a Sharp Zaurus - SL-C7xx, SL-Cxx00 or SL-6000x say y. Most users can say n. - - Note: this driver is marked as deprecated, try enable SPI and - use the new corgi_lcd driver with integrated backlight control + SL-C7xx, SL-Cxx00 or SL-6000x say y. config BACKLIGHT_LOCOMO tristate "Sharp LOCOMO LCD/Backlight Driver" diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile index 103427de670..363b3cb2f01 100644 --- a/drivers/video/backlight/Makefile +++ b/drivers/video/backlight/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_LCD_TOSA) += tosa_lcd.o obj-$(CONFIG_BACKLIGHT_CLASS_DEVICE) += backlight.o obj-$(CONFIG_BACKLIGHT_ATMEL_PWM) += atmel-pwm-bl.o -obj-$(CONFIG_BACKLIGHT_CORGI) += corgi_bl.o +obj-$(CONFIG_BACKLIGHT_GENERIC) += generic_bl.o obj-$(CONFIG_BACKLIGHT_HP680) += hp680_bl.o obj-$(CONFIG_BACKLIGHT_LOCOMO) += locomolcd.o obj-$(CONFIG_BACKLIGHT_OMAP1) += omap1_bl.o diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 0664fc03223..157057c79ca 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -40,6 +40,10 @@ static int fb_notifier_callback(struct notifier_block *self, if (!bd->ops->check_fb || bd->ops->check_fb(evdata->info)) { bd->props.fb_blank = *(int *)evdata->data; + if (bd->props.fb_blank == FB_BLANK_UNBLANK) + bd->props.state &= ~BL_CORE_FBBLANK; + else + bd->props.state |= BL_CORE_FBBLANK; backlight_update_status(bd); } mutex_unlock(&bd->ops_lock); @@ -80,20 +84,18 @@ static ssize_t backlight_show_power(struct device *dev, static ssize_t backlight_store_power(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int rc = -ENXIO; - char *endp; + int rc; struct backlight_device *bd = to_backlight_device(dev); - int power = simple_strtoul(buf, &endp, 0); - size_t size = endp - buf; + unsigned long power; - if (*endp && isspace(*endp)) - size++; - if (size != count) - return -EINVAL; + rc = strict_strtoul(buf, 0, &power); + if (rc) + return rc; + rc = -ENXIO; mutex_lock(&bd->ops_lock); if (bd->ops) { - pr_debug("backlight: set power to %d\n", power); + pr_debug("backlight: set power to %lu\n", power); if (bd->props.power != power) { bd->props.power = power; backlight_update_status(bd); @@ -116,28 +118,25 @@ static ssize_t backlight_show_brightness(struct device *dev, static ssize_t backlight_store_brightness(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int rc = -ENXIO; - char *endp; + int rc; struct backlight_device *bd = to_backlight_device(dev); - int brightness = simple_strtoul(buf, &endp, 
0); - size_t size = endp - buf; + unsigned long brightness; + + rc = strict_strtoul(buf, 0, &brightness); + if (rc) + return rc; - if (*endp && isspace(*endp)) - size++; - if (size != count) - return -EINVAL; + rc = -ENXIO; mutex_lock(&bd->ops_lock); if (bd->ops) { if (brightness > bd->props.max_brightness) rc = -EINVAL; else { - pr_debug("backlight: set brightness to %d\n", + pr_debug("backlight: set brightness to %lu\n", brightness); - if (bd->props.brightness != brightness) { - bd->props.brightness = brightness; - backlight_update_status(bd); - } + bd->props.brightness = brightness; + backlight_update_status(bd); rc = count; } } @@ -170,6 +169,34 @@ static ssize_t backlight_show_actual_brightness(struct device *dev, static struct class *backlight_class; +static int backlight_suspend(struct device *dev, pm_message_t state) +{ + struct backlight_device *bd = to_backlight_device(dev); + + if (bd->ops->options & BL_CORE_SUSPENDRESUME) { + mutex_lock(&bd->ops_lock); + bd->props.state |= BL_CORE_SUSPENDED; + backlight_update_status(bd); + mutex_unlock(&bd->ops_lock); + } + + return 0; +} + +static int backlight_resume(struct device *dev) +{ + struct backlight_device *bd = to_backlight_device(dev); + + if (bd->ops->options & BL_CORE_SUSPENDRESUME) { + mutex_lock(&bd->ops_lock); + bd->props.state &= ~BL_CORE_SUSPENDED; + backlight_update_status(bd); + mutex_unlock(&bd->ops_lock); + } + + return 0; +} + static void bl_device_release(struct device *dev) { struct backlight_device *bd = to_backlight_device(dev); @@ -286,6 +313,8 @@ static int __init backlight_class_init(void) } backlight_class->dev_attrs = bl_device_attributes; + backlight_class->suspend = backlight_suspend; + backlight_class->resume = backlight_resume; return 0; } diff --git a/drivers/video/backlight/corgi_bl.c b/drivers/video/backlight/corgi_bl.c deleted file mode 100644 index 4d4d037e3ec..00000000000 --- a/drivers/video/backlight/corgi_bl.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Backlight Driver for Sharp Zaurus Handhelds (various models) - * - * Copyright (c) 2004-2006 Richard Purdie - * - * Based on Sharp's 2.4 Backlight Driver - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
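The BL_CORE_* state bits and the BL_CORE_SUSPENDRESUME option introduced above move
fb-blank and suspend bookkeeping into the backlight core; the generic_bl driver added
later in this patch is the in-tree user. As a condensed sketch (the hardware hook is
hypothetical), a driver now only needs to honour props.state in its update_status
callback and opt in to core-driven suspend/resume:

	static int mydrv_update_status(struct backlight_device *bd)
	{
		int brightness = bd->props.brightness;

		/* the core sets these bits on fb blank and on class suspend */
		if (bd->props.state & (BL_CORE_SUSPENDED | BL_CORE_FBBLANK))
			brightness = 0;

		return mydrv_set_hw_brightness(brightness);	/* hypothetical */
	}

	static struct backlight_ops mydrv_ops = {
		.options	= BL_CORE_SUSPENDRESUME, /* core drives suspend/resume */
		.update_status	= mydrv_update_status,
	};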
- * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/platform_device.h> -#include <linux/mutex.h> -#include <linux/fb.h> -#include <linux/backlight.h> - -static int corgibl_intensity; -static struct backlight_properties corgibl_data; -static struct backlight_device *corgi_backlight_device; -static struct generic_bl_info *bl_machinfo; - -static unsigned long corgibl_flags; -#define CORGIBL_SUSPENDED 0x01 -#define CORGIBL_BATTLOW 0x02 - -static int corgibl_send_intensity(struct backlight_device *bd) -{ - int intensity = bd->props.brightness; - - if (bd->props.power != FB_BLANK_UNBLANK) - intensity = 0; - if (bd->props.fb_blank != FB_BLANK_UNBLANK) - intensity = 0; - if (corgibl_flags & CORGIBL_SUSPENDED) - intensity = 0; - if (corgibl_flags & CORGIBL_BATTLOW) - intensity &= bl_machinfo->limit_mask; - - bl_machinfo->set_bl_intensity(intensity); - - corgibl_intensity = intensity; - - if (bl_machinfo->kick_battery) - bl_machinfo->kick_battery(); - - return 0; -} - -#ifdef CONFIG_PM -static int corgibl_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct backlight_device *bd = platform_get_drvdata(pdev); - - corgibl_flags |= CORGIBL_SUSPENDED; - backlight_update_status(bd); - return 0; -} - -static int corgibl_resume(struct platform_device *pdev) -{ - struct backlight_device *bd = platform_get_drvdata(pdev); - - corgibl_flags &= ~CORGIBL_SUSPENDED; - backlight_update_status(bd); - return 0; -} -#else -#define corgibl_suspend NULL -#define corgibl_resume NULL -#endif - -static int corgibl_get_intensity(struct backlight_device *bd) -{ - return corgibl_intensity; -} - -/* - * Called when the battery is low to limit the backlight intensity. - * If limit==0 clear any limit, otherwise limit the intensity - */ -void corgibl_limit_intensity(int limit) -{ - if (limit) - corgibl_flags |= CORGIBL_BATTLOW; - else - corgibl_flags &= ~CORGIBL_BATTLOW; - backlight_update_status(corgi_backlight_device); -} -EXPORT_SYMBOL(corgibl_limit_intensity); - - -static struct backlight_ops corgibl_ops = { - .get_brightness = corgibl_get_intensity, - .update_status = corgibl_send_intensity, -}; - -static int corgibl_probe(struct platform_device *pdev) -{ - struct generic_bl_info *machinfo = pdev->dev.platform_data; - const char *name = "generic-bl"; - - bl_machinfo = machinfo; - if (!machinfo->limit_mask) - machinfo->limit_mask = -1; - - if (machinfo->name) - name = machinfo->name; - - corgi_backlight_device = backlight_device_register (name, - &pdev->dev, NULL, &corgibl_ops); - if (IS_ERR (corgi_backlight_device)) - return PTR_ERR (corgi_backlight_device); - - platform_set_drvdata(pdev, corgi_backlight_device); - - corgi_backlight_device->props.max_brightness = machinfo->max_intensity; - corgi_backlight_device->props.power = FB_BLANK_UNBLANK; - corgi_backlight_device->props.brightness = machinfo->default_intensity; - backlight_update_status(corgi_backlight_device); - - printk("Corgi Backlight Driver Initialized.\n"); - return 0; -} - -static int corgibl_remove(struct platform_device *pdev) -{ - struct backlight_device *bd = platform_get_drvdata(pdev); - - corgibl_data.power = 0; - corgibl_data.brightness = 0; - backlight_update_status(bd); - - backlight_device_unregister(bd); - - printk("Corgi Backlight Driver Unloaded\n"); - return 0; -} - -static struct platform_driver corgibl_driver = { - .probe = corgibl_probe, - .remove = corgibl_remove, - .suspend = corgibl_suspend, - .resume = corgibl_resume, - .driver = { - .name = "generic-bl", - }, -}; - 
-static int __init corgibl_init(void) -{ - return platform_driver_register(&corgibl_driver); -} - -static void __exit corgibl_exit(void) -{ - platform_driver_unregister(&corgibl_driver); -} - -module_init(corgibl_init); -module_exit(corgibl_exit); - -MODULE_AUTHOR("Richard Purdie <rpurdie@rpsys.net>"); -MODULE_DESCRIPTION("Corgi Backlight Driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/video/backlight/cr_bllcd.c b/drivers/video/backlight/cr_bllcd.c index 26add889860..b9fe62b475c 100644 --- a/drivers/video/backlight/cr_bllcd.c +++ b/drivers/video/backlight/cr_bllcd.c @@ -259,22 +259,18 @@ static int __init cr_backlight_init(void) { int ret = platform_driver_register(&cr_backlight_driver); - if (!ret) { - crp = platform_device_alloc("cr_backlight", -1); - if (!crp) - return -ENOMEM; + if (ret) + return ret; - ret = platform_device_add(crp); - - if (ret) { - platform_device_put(crp); - platform_driver_unregister(&cr_backlight_driver); - } + crp = platform_device_register_simple("cr_backlight", -1, NULL, 0); + if (IS_ERR(crp)) { + platform_driver_unregister(&cr_backlight_driver); + return PTR_ERR(crp); } printk("Carillo Ranch Backlight Driver Initialized.\n"); - return ret; + return 0; } static void __exit cr_backlight_exit(void) diff --git a/drivers/video/backlight/generic_bl.c b/drivers/video/backlight/generic_bl.c new file mode 100644 index 00000000000..6d27f62fdcd --- /dev/null +++ b/drivers/video/backlight/generic_bl.c @@ -0,0 +1,147 @@ +/* + * Generic Backlight Driver + * + * Copyright (c) 2004-2008 Richard Purdie + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/mutex.h> +#include <linux/fb.h> +#include <linux/backlight.h> + +static int genericbl_intensity; +static struct backlight_device *generic_backlight_device; +static struct generic_bl_info *bl_machinfo; + +/* Flag to signal when the battery is low */ +#define GENERICBL_BATTLOW BL_CORE_DRIVER1 + +static int genericbl_send_intensity(struct backlight_device *bd) +{ + int intensity = bd->props.brightness; + + if (bd->props.power != FB_BLANK_UNBLANK) + intensity = 0; + if (bd->props.state & BL_CORE_FBBLANK) + intensity = 0; + if (bd->props.state & BL_CORE_SUSPENDED) + intensity = 0; + if (bd->props.state & GENERICBL_BATTLOW) + intensity &= bl_machinfo->limit_mask; + + bl_machinfo->set_bl_intensity(intensity); + + genericbl_intensity = intensity; + + if (bl_machinfo->kick_battery) + bl_machinfo->kick_battery(); + + return 0; +} + +static int genericbl_get_intensity(struct backlight_device *bd) +{ + return genericbl_intensity; +} + +/* + * Called when the battery is low to limit the backlight intensity. 
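The cr_bllcd init conversion just above (and the matching hp680_bl and progear_bl conversions later in this patch) replaces the open-coded platform_device_alloc()/platform_device_add() pair with platform_device_register_simple(), collapsing the error handling into a single IS_ERR() check. A minimal sketch of the resulting idiom, with the example_* names standing in for a real driver:

#include <linux/platform_device.h>
#include <linux/err.h>

static struct platform_driver example_driver;	/* .probe/.remove filled in elsewhere */
static struct platform_device *example_device;

static int __init example_init(void)
{
	int ret = platform_driver_register(&example_driver);

	if (ret)
		return ret;

	example_device = platform_device_register_simple("example", -1, NULL, 0);
	if (IS_ERR(example_device)) {
		/* unwind the driver registration before reporting the error */
		platform_driver_unregister(&example_driver);
		return PTR_ERR(example_device);
	}
	return 0;
}

The unregister-before-return ordering is the same in all three converted drivers.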
+ * If limit==0 clear any limit, otherwise limit the intensity + */ +void corgibl_limit_intensity(int limit) +{ + struct backlight_device *bd = generic_backlight_device; + + mutex_lock(&bd->ops_lock); + if (limit) + bd->props.state |= GENERICBL_BATTLOW; + else + bd->props.state &= ~GENERICBL_BATTLOW; + backlight_update_status(generic_backlight_device); + mutex_unlock(&bd->ops_lock); +} +EXPORT_SYMBOL(corgibl_limit_intensity); + +static struct backlight_ops genericbl_ops = { + .options = BL_CORE_SUSPENDRESUME, + .get_brightness = genericbl_get_intensity, + .update_status = genericbl_send_intensity, +}; + +static int genericbl_probe(struct platform_device *pdev) +{ + struct generic_bl_info *machinfo = pdev->dev.platform_data; + const char *name = "generic-bl"; + struct backlight_device *bd; + + bl_machinfo = machinfo; + if (!machinfo->limit_mask) + machinfo->limit_mask = -1; + + if (machinfo->name) + name = machinfo->name; + + bd = backlight_device_register (name, + &pdev->dev, NULL, &genericbl_ops); + if (IS_ERR (bd)) + return PTR_ERR (bd); + + platform_set_drvdata(pdev, bd); + + bd->props.max_brightness = machinfo->max_intensity; + bd->props.power = FB_BLANK_UNBLANK; + bd->props.brightness = machinfo->default_intensity; + backlight_update_status(bd); + + generic_backlight_device = bd; + + printk("Generic Backlight Driver Initialized.\n"); + return 0; +} + +static int genericbl_remove(struct platform_device *pdev) +{ + struct backlight_device *bd = platform_get_drvdata(pdev); + + bd->props.power = 0; + bd->props.brightness = 0; + backlight_update_status(bd); + + backlight_device_unregister(bd); + + printk("Generic Backlight Driver Unloaded\n"); + return 0; +} + +static struct platform_driver genericbl_driver = { + .probe = genericbl_probe, + .remove = genericbl_remove, + .driver = { + .name = "generic-bl", + }, +}; + +static int __init genericbl_init(void) +{ + return platform_driver_register(&genericbl_driver); +} + +static void __exit genericbl_exit(void) +{ + platform_driver_unregister(&genericbl_driver); +} + +module_init(genericbl_init); +module_exit(genericbl_exit); + +MODULE_AUTHOR("Richard Purdie <rpurdie@rpsys.net>"); +MODULE_DESCRIPTION("Generic Backlight Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/video/backlight/hp680_bl.c b/drivers/video/backlight/hp680_bl.c index d4cfed0b26d..5be55a20d8c 100644 --- a/drivers/video/backlight/hp680_bl.c +++ b/drivers/video/backlight/hp680_bl.c @@ -151,19 +151,15 @@ static int __init hp680bl_init(void) int ret; ret = platform_driver_register(&hp680bl_driver); - if (!ret) { - hp680bl_device = platform_device_alloc("hp680-bl", -1); - if (!hp680bl_device) - return -ENOMEM; - - ret = platform_device_add(hp680bl_device); - - if (ret) { - platform_device_put(hp680bl_device); - platform_driver_unregister(&hp680bl_driver); - } + if (ret) + return ret; + hp680bl_device = platform_device_register_simple("hp680-bl", -1, + NULL, 0); + if (IS_ERR(hp680bl_device)) { + platform_driver_unregister(&hp680bl_driver); + return PTR_ERR(hp680bl_device); } - return ret; + return 0; } static void __exit hp680bl_exit(void) diff --git a/drivers/video/backlight/mbp_nvidia_bl.c b/drivers/video/backlight/mbp_nvidia_bl.c index 06964af761c..65864c50045 100644 --- a/drivers/video/backlight/mbp_nvidia_bl.c +++ b/drivers/video/backlight/mbp_nvidia_bl.c @@ -70,6 +70,7 @@ static int mbp_get_intensity(struct backlight_device *bd) } static struct backlight_ops mbp_ops = { + .options = BL_CORE_SUSPENDRESUME, .get_brightness = mbp_get_intensity, .update_status = 
mbp_send_intensity, }; diff --git a/drivers/video/backlight/progear_bl.c b/drivers/video/backlight/progear_bl.c index 15fb4d58b5b..9edaf24fd82 100644 --- a/drivers/video/backlight/progear_bl.c +++ b/drivers/video/backlight/progear_bl.c @@ -119,20 +119,16 @@ static int __init progearbl_init(void) { int ret = platform_driver_register(&progearbl_driver); - if (!ret) { - progearbl_device = platform_device_alloc("progear-bl", -1); - if (!progearbl_device) - return -ENOMEM; - - ret = platform_device_add(progearbl_device); - - if (ret) { - platform_device_put(progearbl_device); - platform_driver_unregister(&progearbl_driver); - } + if (ret) + return ret; + progearbl_device = platform_device_register_simple("progear-bl", -1, + NULL, 0); + if (IS_ERR(progearbl_device)) { + platform_driver_unregister(&progearbl_driver); + return PTR_ERR(progearbl_device); } - return ret; + return 0; } static void __exit progearbl_exit(void) diff --git a/drivers/video/backlight/tdo24m.c b/drivers/video/backlight/tdo24m.c index 8427669162e..1dae7f8f3c6 100644 --- a/drivers/video/backlight/tdo24m.c +++ b/drivers/video/backlight/tdo24m.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/device.h> #include <linux/spi/spi.h> +#include <linux/spi/tdo24m.h> #include <linux/fb.h> #include <linux/lcd.h> @@ -31,6 +32,9 @@ struct tdo24m { struct spi_transfer xfer; uint8_t *buf; + int (*adj_mode)(struct tdo24m *lcd, int mode); + int color_invert; + int power; int mode; }; @@ -66,7 +70,7 @@ static uint32_t lcd_panel_off[] = { CMD_NULL, }; -static uint32_t lcd_vga_pass_through[] = { +static uint32_t lcd_vga_pass_through_tdo24m[] = { CMD1(0xB0, 0x16), CMD1(0xBC, 0x80), CMD1(0xE1, 0x00), @@ -75,7 +79,7 @@ static uint32_t lcd_vga_pass_through[] = { CMD_NULL, }; -static uint32_t lcd_qvga_pass_through[] = { +static uint32_t lcd_qvga_pass_through_tdo24m[] = { CMD1(0xB0, 0x16), CMD1(0xBC, 0x81), CMD1(0xE1, 0x00), @@ -84,7 +88,7 @@ static uint32_t lcd_qvga_pass_through[] = { CMD_NULL, }; -static uint32_t lcd_vga_transfer[] = { +static uint32_t lcd_vga_transfer_tdo24m[] = { CMD1(0xcf, 0x02), /* Blanking period control (1) */ CMD2(0xd0, 0x08, 0x04), /* Blanking period control (2) */ CMD1(0xd1, 0x01), /* CKV timing control on/off */ @@ -110,6 +114,35 @@ static uint32_t lcd_qvga_transfer[] = { CMD_NULL, }; +static uint32_t lcd_vga_pass_through_tdo35s[] = { + CMD1(0xB0, 0x16), + CMD1(0xBC, 0x80), + CMD1(0xE1, 0x00), + CMD1(0x3B, 0x00), + CMD_NULL, +}; + +static uint32_t lcd_qvga_pass_through_tdo35s[] = { + CMD1(0xB0, 0x16), + CMD1(0xBC, 0x81), + CMD1(0xE1, 0x00), + CMD1(0x3B, 0x22), + CMD_NULL, +}; + +static uint32_t lcd_vga_transfer_tdo35s[] = { + CMD1(0xcf, 0x02), /* Blanking period control (1) */ + CMD2(0xd0, 0x08, 0x04), /* Blanking period control (2) */ + CMD1(0xd1, 0x01), /* CKV timing control on/off */ + CMD2(0xd2, 0x00, 0x1e), /* CKV 1,2 timing control */ + CMD2(0xd3, 0x14, 0x28), /* OEV timing control */ + CMD2(0xd4, 0x28, 0x64), /* ASW timing control (1) */ + CMD1(0xd5, 0x28), /* ASW timing control (2) */ + CMD0(0x21), /* Invert for normally black display */ + CMD0(0x29), /* Display on */ + CMD_NULL, +}; + static uint32_t lcd_panel_config[] = { CMD2(0xb8, 0xff, 0xf9), /* Output control */ CMD0(0x11), /* sleep out */ @@ -148,6 +181,8 @@ static int tdo24m_writes(struct tdo24m *lcd, uint32_t *array) int nparams, err = 0; for (; *p != CMD_NULL; p++) { + if (!lcd->color_invert && *p == CMD0(0x21)) + continue; nparams = (*p >> 30) & 0x3; @@ -184,12 +219,33 @@ static int tdo24m_adj_mode(struct tdo24m *lcd, int mode) { switch (mode) 
{ case MODE_VGA: - tdo24m_writes(lcd, lcd_vga_pass_through); + tdo24m_writes(lcd, lcd_vga_pass_through_tdo24m); tdo24m_writes(lcd, lcd_panel_config); - tdo24m_writes(lcd, lcd_vga_transfer); + tdo24m_writes(lcd, lcd_vga_transfer_tdo24m); break; case MODE_QVGA: - tdo24m_writes(lcd, lcd_qvga_pass_through); + tdo24m_writes(lcd, lcd_qvga_pass_through_tdo24m); + tdo24m_writes(lcd, lcd_panel_config); + tdo24m_writes(lcd, lcd_qvga_transfer); + break; + default: + return -EINVAL; + } + + lcd->mode = mode; + return 0; +} + +static int tdo35s_adj_mode(struct tdo24m *lcd, int mode) +{ + switch (mode) { + case MODE_VGA: + tdo24m_writes(lcd, lcd_vga_pass_through_tdo35s); + tdo24m_writes(lcd, lcd_panel_config); + tdo24m_writes(lcd, lcd_vga_transfer_tdo35s); + break; + case MODE_QVGA: + tdo24m_writes(lcd, lcd_qvga_pass_through_tdo35s); tdo24m_writes(lcd, lcd_panel_config); tdo24m_writes(lcd, lcd_qvga_transfer); break; @@ -213,7 +269,7 @@ static int tdo24m_power_on(struct tdo24m *lcd) if (err) goto out; - err = tdo24m_adj_mode(lcd, lcd->mode); + err = lcd->adj_mode(lcd, lcd->mode); out: return err; } @@ -262,7 +318,7 @@ static int tdo24m_set_mode(struct lcd_device *ld, struct fb_videomode *m) if (lcd->mode == mode) return 0; - return tdo24m_adj_mode(lcd, mode); + return lcd->adj_mode(lcd, mode); } static struct lcd_ops tdo24m_ops = { @@ -276,8 +332,16 @@ static int __devinit tdo24m_probe(struct spi_device *spi) struct tdo24m *lcd; struct spi_message *m; struct spi_transfer *x; + struct tdo24m_platform_data *pdata; + enum tdo24m_model model; int err; + pdata = spi->dev.platform_data; + if (pdata) + model = pdata->model; + else + model = TDO24M; + spi->bits_per_word = 8; spi->mode = SPI_MODE_3; err = spi_setup(spi); @@ -306,6 +370,20 @@ static int __devinit tdo24m_probe(struct spi_device *spi) x->tx_buf = &lcd->buf[0]; spi_message_add_tail(x, m); + switch (model) { + case TDO24M: + lcd->color_invert = 1; + lcd->adj_mode = tdo24m_adj_mode; + break; + case TDO35S: + lcd->adj_mode = tdo35s_adj_mode; + lcd->color_invert = 0; + break; + default: + dev_err(&spi->dev, "Unsupported model"); + goto out_free; + } + lcd->lcd_dev = lcd_device_register("tdo24m", &spi->dev, lcd, &tdo24m_ops); if (IS_ERR(lcd->lcd_dev)) { diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c index 57a26649f1a..b7fbc75a62f 100644 --- a/drivers/video/backlight/tosa_lcd.c +++ b/drivers/video/backlight/tosa_lcd.c @@ -39,6 +39,7 @@ struct tosa_lcd_data { struct i2c_client *i2c; int lcd_power; + bool is_vga; }; static int tosa_tg_send(struct spi_device *spi, int adrs, uint8_t data) @@ -81,8 +82,12 @@ static void tosa_lcd_tg_init(struct tosa_lcd_data *data) static void tosa_lcd_tg_on(struct tosa_lcd_data *data) { struct spi_device *spi = data->spi; - const int value = TG_REG0_COLOR | TG_REG0_UD | TG_REG0_LR; - tosa_tg_send(spi, TG_PNLCTL, value | TG_REG0_VQV); /* this depends on mode */ + int value = TG_REG0_COLOR | TG_REG0_UD | TG_REG0_LR; + + if (data->is_vga) + value |= TG_REG0_VQV; + + tosa_tg_send(spi, TG_PNLCTL, value); /* TG LCD pannel power up */ tosa_tg_send(spi, TG_PINICTL,0x4); @@ -142,9 +147,25 @@ static int tosa_lcd_get_power(struct lcd_device *lcd) return data->lcd_power; } +static int tosa_lcd_set_mode(struct lcd_device *lcd, struct fb_videomode *mode) +{ + struct tosa_lcd_data *data = lcd_get_data(lcd); + + if (mode->xres == 320 || mode->yres == 320) + data->is_vga = false; + else + data->is_vga = true; + + if (POWER_IS_ON(data->lcd_power)) + tosa_lcd_tg_on(data); + + return 0; +} + static struct 
lcd_ops tosa_lcd_ops = { .set_power = tosa_lcd_set_power, .get_power = tosa_lcd_get_power, + .set_mode = tosa_lcd_set_mode, }; static int __devinit tosa_lcd_probe(struct spi_device *spi) @@ -156,6 +177,8 @@ static int __devinit tosa_lcd_probe(struct spi_device *spi) if (!data) return -ENOMEM; + data->is_vga = true; /* defaut to VGA mode */ + /* * bits_per_word cannot be configured in platform data */ diff --git a/drivers/video/backlight/vgg2432a4.c b/drivers/video/backlight/vgg2432a4.c index 593c7687d54..8e653b8a6f1 100644 --- a/drivers/video/backlight/vgg2432a4.c +++ b/drivers/video/backlight/vgg2432a4.c @@ -137,7 +137,7 @@ static int vgg2432a4_lcd_init(struct ili9320 *lcd, ili9320_write(lcd, ILI9320_RGB_IF1, cfg->rgb_if1); ili9320_write(lcd, ILI9320_FRAMEMAKER, 0x0); - ili9320_write(lcd, ILI9320_RGB_IF2, ILI9320_RGBIF2_DPL); + ili9320_write(lcd, ILI9320_RGB_IF2, cfg->rgb_if2); ret = ili9320_write_regs(lcd, vgg_init1, ARRAY_SIZE(vgg_init1)); if (ret != 0) diff --git a/fs/Kconfig b/fs/Kconfig index 32883589ee5..02cff86af1b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -269,6 +269,25 @@ config OCFS2_FS_POSIX_ACL Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. +config BTRFS_FS + tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" + depends on EXPERIMENTAL + select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE + help + Btrfs is a new filesystem with extents, writable snapshotting, + support for multiple devices and many more features. + + Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET + FINALIZED. You should say N here unless you are interested in + testing Btrfs with non-critical data. + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. + endif # BLOCK source "fs/notify/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index c830611550d..bc4e14df108 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -119,4 +119,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ +obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile new file mode 100644 index 00000000000..d2cf5a54a4b --- /dev/null +++ b/fs/btrfs/Makefile @@ -0,0 +1,25 @@ +ifneq ($(KERNELRELEASE),) +# kbuild part of makefile + +obj-$(CONFIG_BTRFS_FS) := btrfs.o +btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + file-item.o inode-item.o inode-map.o disk-io.o \ + transaction.o inode.o file.o tree-defrag.o \ + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o +else + +# Normal Makefile + +KERNELDIR := /lib/modules/`uname -r`/build +all: + $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules + +modules_install: + $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install +clean: + $(MAKE) -C $(KERNELDIR) M=`pwd` clean + +endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c new file mode 100644 index 00000000000..1d53b62dbba --- /dev/null +++ b/fs/btrfs/acl.c @@ -0,0 +1,351 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
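Stepping back to the tdo24m changes a little earlier in this patch: tdo24m_probe() now reads an optional platform data block from the SPI device to choose between the TDO24M and TDO35S command sequences, defaulting to TDO24M when none is supplied. Board-side wiring would look roughly like the sketch below. This assumes the new <linux/spi/tdo24m.h> header added by this patch (not shown in this hunk) exposes a platform-data struct with a 'model' field, as the probe code implies; the spi_board_info numbers are placeholders.

#include <linux/spi/spi.h>
#include <linux/spi/tdo24m.h>

static struct tdo24m_platform_data example_lcd_pdata = {
	.model = TDO35S,		/* select the TDO35S init sequences */
};

static struct spi_board_info example_board_info[] __initdata = {
	{
		.modalias	= "tdo24m",
		.platform_data	= &example_lcd_pdata,
		.max_speed_hz	= 1000000,	/* placeholder */
		.bus_num	= 1,		/* placeholder */
		.chip_select	= 0,		/* placeholder */
	},
};

/* registered from board init code via spi_register_board_info() */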
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> + +#include "ctree.h" +#include "btrfs_inode.h" +#include "xattr.h" + +#ifdef CONFIG_FS_POSIX_ACL + +static void btrfs_update_cached_acl(struct inode *inode, + struct posix_acl **p_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(*p_acl); + *p_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + +static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl = NULL, **p_acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &BTRFS_I(inode)->i_acl; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &BTRFS_I(inode)->i_default_acl; + break; + default: + return ERR_PTR(-EINVAL); + } + + spin_lock(&inode->i_lock); + if (*p_acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(*p_acl); + spin_unlock(&inode->i_lock); + + if (acl) + return acl; + + + size = __btrfs_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __btrfs_getxattr(inode, name, value, size); + if (size > 0) { + acl = posix_acl_from_xattr(value, size); + btrfs_update_cached_acl(inode, p_acl, acl); + } + kfree(value); + } else if (size == -ENOENT) { + acl = NULL; + btrfs_update_cached_acl(inode, p_acl, acl); + } + + return acl; +} + +static int btrfs_xattr_get_acl(struct inode *inode, int type, + void *value, size_t size) +{ + struct posix_acl *acl; + int ret = 0; + + acl = btrfs_get_acl(inode, type); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, value, size); + posix_acl_release(acl); + + return ret; +} + +/* + * Needs to be called with fs_mutex held + */ +static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret, size = 0; + const char *name; + struct posix_acl **p_acl; + char *value = NULL; + mode_t mode; + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + return ret; + ret = 0; + } + + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + ret = 0; + inode->i_mode = mode; + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &BTRFS_I(inode)->i_acl; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EINVAL : 0; + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &BTRFS_I(inode)->i_default_acl; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(acl, value, size); + if (ret < 0) + goto out; + } + + ret = __btrfs_setxattr(inode, name, value, size, 0); + +out: + kfree(value); + + if (!ret) + btrfs_update_cached_acl(inode, p_acl, acl); + + return ret; +} + +static int btrfs_xattr_set_acl(struct inode *inode, int type, + const void *value, size_t size) +{ + int ret = 0; + struct posix_acl *acl = NULL; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (acl == NULL) { + value = NULL; + size = 0; + } else if (IS_ERR(acl)) { + return PTR_ERR(acl); + } + } + + ret = btrfs_set_acl(inode, acl, type); + + posix_acl_release(acl); + + return ret; +} + + +static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl; + int error = -EAGAIN; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } + + return error; +} + +/* + * btrfs_init_acl is already generally called under fs_mutex, so the locking + * stuff has been fixed to work with that. If the locking stuff changes, we + * need to re-evaluate the acl locking stuff. 
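btrfs_check_acl() above follows the usual check_acl contract: -EAGAIN means "no ACL applies, fall back to the ordinary mode bits", anything else is the final answer. btrfs's permission op lives in inode.c and is not part of this hunk, so treat the sketch below as an assumption about how the callback is wired into generic_permission(), not a quote of the real code.

#include <linux/fs.h>

int btrfs_check_acl(struct inode *inode, int mask);	/* from acl.c above */

/* hedged sketch of a ->permission implementation using the ACL callback */
static int example_permission(struct inode *inode, int mask)
{
	return generic_permission(inode, mask, btrfs_check_acl);
}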
+ */ +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int ret = 0; + + /* this happens with subvols */ + if (!dir) + return 0; + + if (!S_ISLNK(inode->i_mode)) { + if (IS_POSIXACL(dir)) { + acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + if (!acl) + inode->i_mode &= ~current->fs->umask; + } + + if (IS_POSIXACL(dir) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); + if (ret) + goto failed; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto failed; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + inode->i_mode = mode; + if (ret > 0) { + /* we need an acl */ + ret = btrfs_set_acl(inode, clone, + ACL_TYPE_ACCESS); + } + } + } +failed: + posix_acl_release(acl); + + return ret; +} + +int btrfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int ret = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!IS_POSIXACL(inode)) + return 0; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); + + posix_acl_release(clone); + + return ret; +} + +struct xattr_handler btrfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .get = btrfs_xattr_acl_default_get, + .set = btrfs_xattr_acl_default_set, +}; + +struct xattr_handler btrfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .get = btrfs_xattr_acl_access_get, + .set = btrfs_xattr_acl_access_set, +}; + +#else /* CONFIG_FS_POSIX_ACL */ + +int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + return 0; +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + return 0; +} + +#endif /* CONFIG_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 00000000000..8e2fec05dbe --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,419 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/version.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/spinlock.h> +# include <linux/freezer.h> +#include "async-thread.h" + +#define WORK_QUEUED_BIT 0 +#define WORK_DONE_BIT 1 +#define WORK_ORDER_DONE_BIT 2 + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. 
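The two xattr_handler structures at the end of acl.c above only become reachable once they are listed in the filesystem's s_xattr table, which the generic getxattr/setxattr code walks by prefix. btrfs's real table lives in xattr.c elsewhere in this patch; the sketch below only illustrates the shape of such a table and is not a copy of it.

#include <linux/xattr.h>

extern struct xattr_handler btrfs_xattr_acl_access_handler;
extern struct xattr_handler btrfs_xattr_acl_default_handler;

/* illustrative only; the full table also carries the non-ACL handlers */
static struct xattr_handler *example_xattr_handlers[] = {
	&btrfs_xattr_acl_access_handler,
	&btrfs_xattr_acl_default_handler,
	NULL,				/* the VFS stops at the NULL entry */
};

/* wired up at mount time, roughly: sb->s_xattr = example_xattr_handlers; */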
+ */ +struct btrfs_worker_thread { + /* pool we belong to */ + struct btrfs_workers *workers; + + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + unsigned long sequence; + + /* protects the pending list. */ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; + + /* are we currently idle */ + int idle; +}; + +/* + * helper function to move a thread onto the idle list after it + * has finished some requests. + */ +static void check_idle_worker(struct btrfs_worker_thread *worker) +{ + if (!worker->idle && atomic_read(&worker->num_pending) < + worker->workers->idle_thresh / 2) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; + list_move(&worker->worker_list, &worker->workers->idle_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +/* + * helper function to move a thread off the idle list after new + * pending work is added. + */ +static void check_busy_worker(struct btrfs_worker_thread *worker) +{ + if (worker->idle && atomic_read(&worker->num_pending) >= + worker->workers->idle_thresh) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ + unsigned long flags; + + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + + spin_lock_irqsave(&workers->lock, flags); + + while (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + + spin_unlock_irqrestore(&workers->lock, flags); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ + spin_lock_irqsave(&workers->lock, flags); + list_del(&work->order_list); + work->ordered_free(work); + } + + spin_unlock_irqrestore(&workers->lock, flags); + return 0; +} + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head *cur; + struct btrfs_work *work; + do { + spin_lock_irq(&worker->lock); + while (!list_empty(&worker->pending)) { + cur = worker->pending.next; + work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; + spin_unlock_irq(&worker->lock); + + work->func(work); + + atomic_dec(&worker->num_pending); + /* + * unless this is an ordered work queue, + * 'work' was probably freed by func above. 
+ */ + run_ordered_completions(worker->workers, work); + + spin_lock_irq(&worker->lock); + check_idle_worker(worker); + + } + worker->working = 0; + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&worker->lock); + if (!kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); + kthread_stop(worker->task); + list_del(&worker->worker_list); + kfree(worker); + } + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +{ + workers->num_workers = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + spin_lock_init(&workers->lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + atomic_set(&worker->num_pending, 0); + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + i); + worker->workers = workers; + if (IS_ERR(worker->task)) { + kfree(worker); + ret = PTR_ERR(worker->task); + goto fail; + } + + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. + */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + int enforce_min = workers->num_workers < workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the + * idle list. This improves the chance that the next submission + * will reuse the same thread, and maybe catch it while it is still + * working + */ + if (!list_empty(&workers->idle_list)) { + next = workers->idle_list.next; + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + return worker; + } + if (enforce_min || list_empty(&workers->worker_list)) + return NULL; + + /* + * if we pick a busy task, move the task to the end of the list. + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. 
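By this point async-thread.c provides everything a user of the pool needs: btrfs_init_workers() and btrfs_start_workers() to bring a pool up, btrfs_stop_workers() to tear it down, and btrfs_queue_worker() (defined a little further down in this file) to hand it work. The usage model, also spelled out in async-thread.h later in this patch, is to embed a struct btrfs_work in the caller's own structure and recover it with container_of() inside the work function. A hedged usage sketch with arbitrary names and pool size; the ordered_func/ordered_free hooks are omitted because this example does not set workers->ordered:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/types.h>
#include "async-thread.h"

static struct btrfs_workers example_workers;

struct example_job {
	struct btrfs_work work;		/* must be embedded, not pointed to */
	u64 start;			/* caller-specific payload */
};

static void example_job_func(struct btrfs_work *work)
{
	struct example_job *job = container_of(work, struct example_job, work);

	/* ... process job->start ... */

	/* safe for non-ordered pools; worker_loop notes 'work' may be freed here */
	kfree(job);
}

static int example_pool_setup(void)
{
	/* up to 8 kthreads named btrfs-example-N; more are started on demand */
	btrfs_init_workers(&example_workers, "example", 8);
	return btrfs_start_workers(&example_workers, 1);
}

static int example_submit(u64 start)
{
	struct example_job *job = kzalloc(sizeof(*job), GFP_NOFS);

	if (!job)
		return -ENOMEM;
	job->start = start;
	job->work.func = example_job_func;	/* flags already zeroed by kzalloc */
	return btrfs_queue_worker(&example_workers, &job->work);
}

static void example_pool_teardown(void)
{
	btrfs_stop_workers(&example_workers);
}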
+ */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); + atomic_inc(&worker->num_pending); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) + list_move_tail(next, &workers->worker_list); + return worker; +} + +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. + */ +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { + spin_lock_irqsave(&workers->lock, flags); + if (workers->num_workers >= workers->max_workers) { + struct list_head *fallback = NULL; + /* + * we have failed to find any workers, just + * return the force one + */ + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, + struct btrfs_worker_thread, worker_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + btrfs_start_workers(workers, 1); + goto again; + } + } + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others. + */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + + /* by definition we're busy, take ourselves off the idle + * list + */ + if (worker->idle) { + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + + spin_unlock_irqrestore(&worker->lock, flags); + +out: + return 0; +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + worker = find_worker(workers); + if (workers->ordered) { + spin_lock_irqsave(&workers->lock, flags); + list_add_tail(&work->order_list, &workers->order_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + INIT_LIST_HEAD(&work->order_list); + } + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + check_busy_worker(worker); + list_add_tail(&work->list, &worker->pending); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + spin_unlock_irqrestore(&worker->lock, flags); + + if (wake) + wake_up_process(worker->task); +out: + return 0; +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h 
new file mode 100644 index 00000000000..31be4ed8b63 --- /dev/null +++ b/fs/btrfs/async-thread.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ + +struct btrfs_worker_thread; + +/* + * This is similar to a workqueue, but it is meant to spread the operations + * across all available cpus instead of just the CPU that was used to + * queue the work. There is also some batching introduced to try and + * cut down on context switches. + * + * By default threads are added on demand up to 2 * the number of cpus. + * Changing struct btrfs_workers->max_workers is one way to prevent + * demand creation of kthreads. + * + * the basic model of these worker threads is to embed a btrfs_work + * structure in your own data struct, and use container_of in a + * work function to get back to your data struct. + */ +struct btrfs_work { + /* + * func should be set to the function you want called + * your work struct is passed as the only arg + * + * ordered_func must be set for work sent to an ordered work queue, + * and it is called to complete a given work item in the same + * order they were sent to the queue. + */ + void (*func)(struct btrfs_work *work); + void (*ordered_func)(struct btrfs_work *work); + void (*ordered_free)(struct btrfs_work *work); + + /* + * flags should be set to zero. It is used to make sure the + * struct is only inserted once into the list. + */ + unsigned long flags; + + /* don't touch these */ + struct btrfs_worker_thread *worker; + struct list_head list; + struct list_head order_list; +}; + +struct btrfs_workers { + /* current number of running workers */ + int num_workers; + + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + + /* once a worker has this many requests or fewer, it is idle */ + int idle_thresh; + + /* force completions in the order they were queued */ + int ordered; + + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. 
+ */ + struct list_head worker_list; + struct list_head idle_list; + + /* + * when operating in ordered mode, this maintains the list + * of work items waiting for completion + */ + struct list_head order_list; + + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + + /* extra name for this worker, used for current->name */ + char *name; +}; + +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +int btrfs_requeue_work(struct btrfs_work *work); +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h new file mode 100644 index 00000000000..a8c9693b75a --- /dev/null +++ b/fs/btrfs/btrfs_inode.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_I__ +#define __BTRFS_I__ + +#include "extent_map.h" +#include "extent_io.h" +#include "ordered-data.h" + +/* in memory btrfs inode */ +struct btrfs_inode { + /* which subvolume this inode belongs to */ + struct btrfs_root *root; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ + struct btrfs_key location; + + /* the extent_tree has caches of all the extent mappings to disk */ + struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ + struct extent_io_tree io_failure_tree; + + /* held while inesrting or deleting extents from files */ + struct mutex extent_mutex; + + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + + /* used to order data wrt metadata */ + struct btrfs_ordered_inode_tree ordered_tree; + + /* standard acl pointers */ + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; + + /* for keeping track of orphaned inodes */ + struct list_head i_orphan; + + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. + */ + struct list_head delalloc_inodes; + + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ + u64 generation; + + /* sequence number for NFS changes */ + u64 sequence; + + /* + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; + /* + * transid that last logged this inode + */ + u64 logged_trans; + + /* + * trans that last made a change that should be fully fsync'd. 
This + * gets reset to zero each time the inode is logged + */ + u64 log_dirty_trans; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ + u64 delalloc_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ + u64 disk_i_size; + + /* flags field from the on disk inode */ + u32 flags; + + /* + * if this is a directory then index_cnt is the counter for the index + * number for new files that are created + */ + u64 index_cnt; + + /* the start of block group preferred for allocations. */ + u64 block_group; + + struct inode vfs_inode; +}; + +static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +{ + return container_of(inode, struct btrfs_inode, vfs_inode); +} + +static inline void btrfs_i_size_write(struct inode *inode, u64 size) +{ + inode->i_size = size; + BTRFS_I(inode)->disk_i_size = size; +} + + +#endif diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h new file mode 100644 index 00000000000..7c4503ef6ef --- /dev/null +++ b/fs/btrfs/compat.h @@ -0,0 +1,7 @@ +#ifndef _COMPAT_H_ +#define _COMPAT_H_ + +#define btrfs_drop_nlink(inode) drop_nlink(inode) +#define btrfs_inc_nlink(inode) inc_nlink(inode) + +#endif /* _COMPAT_H_ */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 00000000000..ee848d8585d --- /dev/null +++ b/fs/btrfs/compression.c @@ -0,0 +1,709 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
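btrfs_inode.h above uses the standard in-memory inode embedding pattern: the VFS struct inode sits at the tail of struct btrfs_inode and BTRFS_I() recovers the container with container_of(). A small illustrative snippet showing how code moves from a plain struct inode * to the btrfs-specific fields; the helper names here are hypothetical, only BTRFS_I() and btrfs_i_size_write() come from the header above.

#include <linux/fs.h>
#include "btrfs_inode.h"

/* hypothetical helper: how much delalloc is still outstanding on this inode */
static u64 example_outstanding_delalloc(struct inode *inode)
{
	return BTRFS_I(inode)->delalloc_bytes;
}

/* hypothetical helper: growing a file updates the VFS size and disk_i_size together */
static void example_extend(struct inode *inode, u64 new_size)
{
	btrfs_i_size_write(inode, new_size);
}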
+ */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/bit_spinlock.h> +#include <linux/version.h> +#include <linux/pagevec.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + +static inline int compressed_bio_size(struct btrfs_root *root, + unsigned long disk_size) +{ + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + return sizeof(struct compressed_bio) + + ((disk_size + root->sectorsize - 1) / root->sectorsize) * + csum_size; +} + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + struct bio *bio; + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_byte >> 9; + } + return bio; +} + +static int check_compressed_csum(struct inode *inode, + struct compressed_bio *cb, + u64 disk_start) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *page; + unsigned long i; + char *kaddr; + u32 csum; + u32 *cb_sum = &cb->sums; + + if (btrfs_test_flag(inode, NODATASUM)) + return 0; + + for (i = 0; i < cb->nr_pages; i++) { + page = cb->compressed_pages[i]; + csum = ~(u32)0; + + kaddr = kmap_atomic(page, KM_USER0); + csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_USER0); + + if (csum != *cb_sum) { + printk(KERN_INFO "btrfs csum failed ino %lu " + "extent %llu csum %u " + "wanted %u mirror %d\n", inode->i_ino, + (unsigned long long)disk_start, + csum, *cb_sum, cb->mirror_num); + ret = -EIO; + goto fail; + } + cb_sum++; + + } + ret = 0; +fail: + return ret; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). 
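compressed_bio_size() above sizes the allocation so that a per-sector checksum array can ride at the end of struct compressed_bio, starting at the 'sums' member: one checksum for every sectorsize chunk of the compressed extent, rounded up. A worked example with illustrative numbers (a 4 KiB sectorsize and the 4-byte crc32c checksums btrfs currently uses):

	/* illustrative values only */
	unsigned long disk_size = 96 * 1024;			 /* compressed bytes on disk */
	unsigned long nr_csums  = (disk_size + 4096 - 1) / 4096; /* 24 sectors */
	size_t alloc = sizeof(struct compressed_bio) + nr_csums * 4; /* struct + 96 bytes of csums */

So the kmalloc() calls later in this file get the struct plus 96 trailing checksum bytes, and the read path treats &cb->sums as the first element of that trailing array.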
+ * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + inode = cb->inode; + ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + if (ret) + goto csum_failed; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + tree = &BTRFS_I(inode)->io_tree; + ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); +csum_failed: + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + int bio_index = 0; + struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + + /* + * we have verified the checksum already, set page + * checked so the end_io handlers know about it + */ + while (bio_index < cb->orig_bio->bi_vcnt) { + SetPageChecked(bvec->bv_page); + bvec++; + bio_index++; + } + bio_endio(cb->orig_bio, 0); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline int end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ + return 0; +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. 
+ */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, 1); + cb->compressed_pages[0]->mapping = NULL; + + end_compressed_writeback(inode, cb->start, cb->len); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int page_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->mirror_num = 0; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + for (page_index = 0; page_index < cb->nr_pages; page_index++) { + page = compressed_pages[page_index]; + page->mapping = inode->i_mapping; + if (bio->bi_size) + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. 
Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + if (bytes_left < PAGE_CACHE_SIZE) { + printk("bytes left %lu compress len %lu nr %lu\n", + bytes_left, cb->compressed_len, cb->nr_pages); + } + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + cond_resched(); + } + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + return 0; +} + +static noinline int add_ra_bio_pages(struct inode *inode, + u64 compressed_end, + struct compressed_bio *cb) +{ + unsigned long end_index; + unsigned long page_index; + u64 last_offset; + u64 isize = i_size_read(inode); + int ret; + struct page *page; + unsigned long nr_pages = 0; + struct extent_map *em; + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + struct extent_map_tree *em_tree; + struct extent_io_tree *tree; + u64 end; + int misses = 0; + + page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; + last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + em_tree = &BTRFS_I(inode)->extent_tree; + tree = &BTRFS_I(inode)->io_tree; + + if (isize == 0) + return 0; + + end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + while (last_offset < compressed_end) { + page_index = last_offset >> PAGE_CACHE_SHIFT; + + if (page_index > end_index) + break; + + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, page_index); + rcu_read_unlock(); + if (page) { + misses++; + if (misses > 4) + break; + goto next; + } + + page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); + if (!page) + break; + + page->index = page_index; + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (add_to_page_cache(page, mapping, + page->index, GFP_NOFS)) { + page_cache_release(page); + goto next; + } + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + + end = last_offset + PAGE_CACHE_SIZE - 1; + /* + * at this point, we have a locked page in the page cache + * for these bytes in the file. But, we have to make + * sure they map to this compressed extent on disk. 
+ */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (em->block_start >> 9) != cb->orig_bio->bi_sector) { + free_extent_map(em); + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } + free_extent_map(em); + + if (page->index == end_index) { + char *userpage; + size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + int zeros; + zeros = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, zeros); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + + ret = bio_add_page(cb->orig_bio, page, + PAGE_CACHE_SIZE, 0); + + if (ret == PAGE_CACHE_SIZE) { + nr_pages++; + page_cache_release(page); + } else { + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } +next: + last_offset += PAGE_CACHE_SIZE; + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long page_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 em_len; + u64 em_start; + struct extent_map *em; + int ret; + u32 *sums; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->mirror_num = mirror_num; + sums = &cb->sums; + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + + free_extent_map(em); + em = NULL; + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->orig_bio = bio; + + nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + GFP_NOFS); + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (page_index = 0; page_index < nr_pages; page_index++) { + cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + } + cb->nr_pages = nr_pages; + + add_ra_bio_pages(inode, em_start + em_len, cb); + + 
/* include any pages we added in add_ra-bio_pages */ + uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + cb->len = uncompressed_len; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (page_index = 0; page_index < nr_pages; page_index++) { + page = cb->compressed_pages[page_index]; + page->mapping = inode->i_mapping; + page->index = em_start >> PAGE_CACHE_SHIFT; + + if (comp_bio->bi_size) + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + + if (!btrfs_test_flag(inode, NODATASUM)) { + btrfs_lookup_bio_sums(root, inode, comp_bio, + sums); + } + sums += (comp_bio->bi_size + root->sectorsize - 1) / + root->sectorsize; + + ret = btrfs_map_bio(root, READ, comp_bio, + mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + + bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + if (!btrfs_test_flag(inode, NODATASUM)) + btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + + ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + return 0; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 00000000000..421f5b4aa71 --- /dev/null +++ b/fs/btrfs/compression.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); +void btrfs_zlib_exit(void); +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +#endif diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h new file mode 100644 index 00000000000..6e1b3de3670 --- /dev/null +++ b/fs/btrfs/crc32c.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CRC32C__ +#define __BTRFS_CRC32C__ +#include <linux/crc32c.h> + +/* + * this file used to do more for selecting the HW version of crc32c, + * perhaps it will one day again soon. + */ +#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) +#endif + diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c new file mode 100644 index 00000000000..9e46c077681 --- /dev/null +++ b/fs/btrfs/ctree.c @@ -0,0 +1,3953 @@ +/* + * Copyright (C) 2007,2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "locking.h" + +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, int extend); +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); + +inline void btrfs_init_path(struct btrfs_path *p) +{ + memset(p, 0, sizeof(*p)); +} + +struct btrfs_path *btrfs_alloc_path(void) +{ + struct btrfs_path *path; + path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); + if (path) { + btrfs_init_path(path); + path->reada = 1; + } + return path; +} + +/* this also releases the path */ +void btrfs_free_path(struct btrfs_path *p) +{ + btrfs_release_path(NULL, p); + kmem_cache_free(btrfs_path_cachep, p); +} + +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ +noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; + if (!p->nodes[i]) + continue; + if (p->locks[i]) { + btrfs_tree_unlock(p->nodes[i]); + p->locks[i] = 0; + } + free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; + } +} + +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. + */ +struct extent_buffer *btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + spin_lock(&root->node_lock); + eb = root->node; + extent_buffer_get(eb); + spin_unlock(&root->node_lock); + return eb; +} + +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + + spin_lock(&root->node_lock); + if (eb == root->node) { + spin_unlock(&root->node_lock); + break; + } + spin_unlock(&root->node_lock); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. + */ +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + if (root->track_dirty && list_empty(&root->dirty_list)) { + list_add(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + } +} + +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. 
The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid) +{ + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + struct btrfs_root *new_root; + + new_root = kmalloc(sizeof(*new_root), GFP_NOFS); + if (!new_root) + return -ENOMEM; + + memcpy(new_root, root, sizeof(*new_root)); + new_root->root_key.objectid = new_root_objectid; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, + new_root_objectid, trans->transid, + level, buf->start, 0); + if (IS_ERR(cow)) { + kfree(new_root); + return PTR_ERR(cow); + } + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, new_root_objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); + kfree(new_root); + + if (ret) + return ret; + + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. The new buffer is marked + * dirty and returned locked. If you modify the block it needs to be marked + * dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in + * bytes the allocator should try to find free next to the block it returns. + * This is just a hint and may be ignored by the allocator. + * + * prealloc_dest -- if you have already reserved a destination for the cow, + * this uses that block instead of allocating a new one. + * btrfs_alloc_reserved_extent is used to finish the allocation. 
+ */ +static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + u64 prealloc_dest) +{ + u64 parent_start; + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + int unlock_orig = 0; + + if (*cow_ret == buf) + unlock_orig = 1; + + WARN_ON(!btrfs_tree_locked(buf)); + + if (parent) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + if (prealloc_dest) { + struct btrfs_key ins; + + ins.objectid = prealloc_dest; + ins.offset = buf->len; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_alloc_reserved_extent(trans, root, parent_start, + root->root_key.objectid, + trans->transid, level, &ins); + BUG_ON(ret); + cow = btrfs_init_new_buffer(trans, root, prealloc_dest, + buf->len); + } else { + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, + root->root_key.objectid, + trans->transid, level, + search_start, empty_size); + } + if (IS_ERR(cow)) + return PTR_ERR(cow); + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (btrfs_header_generation(buf) != trans->transid) { + u32 nr_extents; + ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); + if (ret) + return ret; + + ret = btrfs_cache_ref(trans, root, buf, nr_extents); + WARN_ON(ret); + } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { + /* + * There are only two places that can drop reference to + * tree blocks owned by living reloc trees, one is here, + * the other place is btrfs_drop_subtree. In both places, + * we check reference count while tree block is locked. + * Furthermore, if reference count is one, it won't get + * increased by someone else. 
+ */ + u32 refs; + ret = btrfs_lookup_extent_ref(trans, root, buf->start, + buf->len, &refs); + BUG_ON(ret); + if (refs == 1) { + ret = btrfs_update_ref(trans, root, buf, cow, + 0, nritems); + clean_tree_block(trans, root, buf); + } else { + ret = btrfs_inc_ref(trans, root, buf, cow, NULL); + } + BUG_ON(ret); + } else { + ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); + if (ret) + return ret; + clean_tree_block(trans, root, buf); + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); + WARN_ON(ret); + } + + if (buf == root->node) { + WARN_ON(parent && parent != buf); + + spin_lock(&root->node_lock); + root->node = cow; + extent_buffer_get(cow); + spin_unlock(&root->node_lock); + + if (buf != root->commit_root) { + btrfs_free_extent(trans, root, buf->start, + buf->len, buf->start, + root->root_key.objectid, + btrfs_header_generation(buf), + level, 1); + } + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { + btrfs_set_node_blockptr(parent, parent_slot, + cow->start); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(parent); + WARN_ON(btrfs_header_generation(parent) != trans->transid); + btrfs_free_extent(trans, root, buf->start, buf->len, + parent_start, btrfs_header_owner(parent), + btrfs_header_generation(parent), level, 1); + } + if (unlock_orig) + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * cows a single block, see __btrfs_cow_block for the real work. + * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ +noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, u64 prealloc_dest) +{ + u64 search_start; + int ret; + + if (trans->transaction != root->fs_info->running_transaction) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long) + root->fs_info->running_transaction->transid); + WARN_ON(1); + } + if (trans->transid != root->fs_info->generation) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + + spin_lock(&root->fs_info->hash_lock); + if (btrfs_header_generation(buf) == trans->transid && + btrfs_header_owner(buf) == root->root_key.objectid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + *cow_ret = buf; + spin_unlock(&root->fs_info->hash_lock); + WARN_ON(prealloc_dest); + return 0; + } + spin_unlock(&root->fs_info->hash_lock); + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + ret = __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0, + prealloc_dest); + return ret; +} + +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ +static int close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < 32768) + return 1; + if (blocknr > other && blocknr - (other + blocksize) < 32768) + return 1; + return 0; +} + +/* + * compare two keys in a memcmp fashion + */ +static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) +{ + struct btrfs_key k1; + + 
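+	/* the disk key is stored little-endian; convert it to cpu byte order before comparing */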
btrfs_disk_key_to_cpu(&k1, disk); + + if (k1.objectid > k2->objectid) + return 1; + if (k1.objectid < k2->objectid) + return -1; + if (k1.type > k2->type) + return 1; + if (k1.type < k2->type) + return -1; + if (k1.offset > k2->offset) + return 1; + if (k1.offset < k2->offset) + return -1; + return 0; +} + +/* + * same as comp_keys only with two btrfs_key's + */ +static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress) +{ + struct extent_buffer *cur; + u64 blocknr; + u64 gen; + u64 search_start = *last_ret; + u64 last_block = 0; + u64 other; + u32 parent_nritems; + int end_slot; + int i; + int err = 0; + int parent_level; + int uptodate; + u32 blocksize; + int progress_passed = 0; + struct btrfs_disk_key disk_key; + + parent_level = btrfs_header_level(parent); + if (cache_only && parent_level != 1) + return 0; + + if (trans->transaction != root->fs_info->running_transaction) + WARN_ON(1); + if (trans->transid != root->fs_info->generation) + WARN_ON(1); + + parent_nritems = btrfs_header_nritems(parent); + blocksize = btrfs_level_size(root, parent_level - 1); + end_slot = parent_nritems; + + if (parent_nritems == 1) + return 0; + + for (i = start_slot; i < end_slot; i++) { + int close = 1; + + if (!parent->map_token) { + map_extent_buffer(parent, + btrfs_node_key_ptr_offset(i), + sizeof(struct btrfs_key_ptr), + &parent->map_token, &parent->kaddr, + &parent->map_start, &parent->map_len, + KM_USER1); + } + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = 1; + blocknr = btrfs_node_blockptr(parent, i); + gen = btrfs_node_ptr_generation(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot - 2) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + + cur = btrfs_find_tree_block(root, blocknr, blocksize); + if (cur) + uptodate = btrfs_buffer_uptodate(cur, gen); + else + uptodate = 0; + if (!cur || !uptodate) { + if (cache_only) { + free_extent_buffer(cur); + continue; + } + if (!cur) { + cur = read_tree_block(root, blocknr, + blocksize, gen); + } else if (!uptodate) { + btrfs_read_buffer(cur, gen); + } + } + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + err = __btrfs_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize), 0); + if (err) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + 
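+		/* finished relocating this block, drop our reference on it */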
free_extent_buffer(cur); + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + return err; +} + +/* + * The leaf data grows from end-to-front in the node. + * this returns the address of the start of the last item, + * which is the stop of the leaf data stack + */ +static inline unsigned int leaf_data_end(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(root); + return btrfs_item_offset_nr(leaf, nr - 1); +} + +/* + * extra debugging checks to make sure all the items in a key are + * well formed and in the proper order + */ +static int check_node(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *parent = NULL; + struct extent_buffer *node = path->nodes[level]; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key node_key; + int parent_slot; + int slot; + struct btrfs_key cpukey; + u32 nritems = btrfs_header_nritems(node); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + slot = path->slots[level]; + BUG_ON(nritems == 0); + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_node_key(node, &node_key, 0); + BUG_ON(memcmp(&parent_key, &node_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(node)); + } + BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); + if (slot != 0) { + btrfs_node_key_to_cpu(node, &cpukey, slot - 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) <= 0); + } + if (slot < nritems - 1) { + btrfs_node_key_to_cpu(node, &cpukey, slot + 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) >= 0); + } + return 0; +} + +/* + * extra checking to make sure all the items in a leaf are + * well formed and in the proper order + */ +static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *leaf = path->nodes[level]; + struct extent_buffer *parent = NULL; + int parent_slot; + struct btrfs_key cpukey; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key leaf_key; + int slot = path->slots[0]; + + u32 nritems = btrfs_header_nritems(leaf); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + if (nritems == 0) + return 0; + + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_item_key(leaf, &leaf_key, 0); + + BUG_ON(memcmp(&parent_key, &leaf_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(leaf)); + } + if (slot != 0 && slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); + if (comp_keys(&leaf_key, &cpukey) <= 0) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad key\n", slot); + BUG_ON(1); + } + if (btrfs_item_offset_nr(leaf, slot - 1) != + btrfs_item_end_nr(leaf, slot)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad\n", slot); + BUG_ON(1); + } + } + if (slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); + BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad\n", 
slot); + BUG_ON(1); + } + } + BUG_ON(btrfs_item_offset_nr(leaf, 0) + + btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); + return 0; +} + +static noinline int check_block(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + return 0; + if (level == 0) + return check_leaf(root, path, level); + return check_node(root, path, level); +} + +/* + * search for key in the extent_buffer. The items start at offset p, + * and they are item_size apart. There are 'max' items in p. + * + * the slot in the array is returned via slot, and it points to + * the place where you would insert key if it is not found in + * the array. + * + * slot may point to max if the key is bigger than all of the keys + */ +static noinline int generic_bin_search(struct extent_buffer *eb, + unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) +{ + int low = 0; + int high = max; + int mid; + int ret; + struct btrfs_disk_key *tmp = NULL; + struct btrfs_disk_key unaligned; + unsigned long offset; + char *map_token = NULL; + char *kaddr = NULL; + unsigned long map_start = 0; + unsigned long map_len = 0; + int err; + + while (low < high) { + mid = (low + high) / 2; + offset = p + mid * item_size; + + if (!map_token || offset < map_start || + (offset + sizeof(struct btrfs_disk_key)) > + map_start + map_len) { + if (map_token) { + unmap_extent_buffer(eb, map_token, KM_USER0); + map_token = NULL; + } + + err = map_private_extent_buffer(eb, offset, + sizeof(struct btrfs_disk_key), + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + + if (!err) { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } else { + read_extent_buffer(eb, &unaligned, + offset, sizeof(unaligned)); + tmp = &unaligned; + } + + } else { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } + ret = comp_keys(tmp, key); + + if (ret < 0) + low = mid + 1; + else if (ret > 0) + high = mid; + else { + *slot = mid; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 0; + } + } + *slot = low; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 1; +} + +/* + * simple bin_search frontend that does the right thing for + * leaves vs nodes + */ +static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + if (level == 0) { + return generic_bin_search(eb, + offsetof(struct btrfs_leaf, items), + sizeof(struct btrfs_item), + key, btrfs_header_nritems(eb), + slot); + } else { + return generic_bin_search(eb, + offsetof(struct btrfs_node, ptrs), + sizeof(struct btrfs_key_ptr), + key, btrfs_header_nritems(eb), + slot); + } + return -1; +} + +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */ +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, + struct extent_buffer *parent, int slot) +{ + int level = btrfs_header_level(parent); + if (slot < 0) + return NULL; + if (slot >= btrfs_header_nritems(parent)) + return NULL; + + BUG_ON(level == 0); + + return read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); +} + +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. 
+ */ +static noinline int balance_level(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + int err_on_enospc = 0; + u64 orig_ptr; + + if (level == 0) + return 0; + + mid = path->nodes[level]; + WARN_ON(!path->locks[level]); + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + /* + * deal with the case where there is only one pointer in the root + * by promoting the node below to a root + */ + if (!parent) { + struct extent_buffer *child; + + if (btrfs_header_nritems(mid) != 1) + return 0; + + /* promote the child to a root */ + child = read_node_slot(root, mid, 0); + btrfs_tree_lock(child); + BUG_ON(!child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); + BUG_ON(ret); + + spin_lock(&root->node_lock); + root->node = child; + spin_unlock(&root->node_lock); + + ret = btrfs_update_extent_ref(trans, root, child->start, + mid->start, child->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + path->locks[level] = 0; + path->nodes[level] = NULL; + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); + ret = btrfs_free_extent(trans, root, mid->start, mid->len, + mid->start, root->root_key.objectid, + btrfs_header_generation(mid), + level, 1); + /* once for the root ptr */ + free_extent_buffer(mid); + return ret; + } + if (btrfs_header_nritems(mid) > + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + return 0; + + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + + left = read_node_slot(root, parent, pslot - 1); + if (left) { + btrfs_tree_lock(left); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left, 0); + if (wret) { + ret = wret; + goto enospc; + } + } + right = read_node_slot(root, parent, pslot + 1); + if (right) { + btrfs_tree_lock(right); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right, 0); + if (wret) { + ret = wret; + goto enospc; + } + } + + /* first, try to make some room in the middle buffer */ + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + } + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + wret = push_node_left(trans, root, mid, right, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { + u64 bytenr = right->start; + u64 generation = btrfs_header_generation(parent); + u32 blocksize = right->len; + + clean_tree_block(trans, root, right); + btrfs_tree_unlock(right); + free_extent_buffer(right); + right = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot + + 1); + if (wret) + ret = wret; + wret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + generation, level, 1); + if (wret) + ret = wret; + } else { + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + btrfs_set_node_key(parent, &right_key, pslot + 1); + 
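+			/* right still has items, record its new first key in the parent */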
btrfs_mark_buffer_dirty(parent); + } + } + if (btrfs_header_nritems(mid) == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. + * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + BUG_ON(!left); + wret = balance_node_right(trans, root, mid, left); + if (wret < 0) { + ret = wret; + goto enospc; + } + if (wret == 1) { + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + } + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { + /* we've managed to empty the middle node, drop it */ + u64 root_gen = btrfs_header_generation(parent); + u64 bytenr = mid->start; + u32 blocksize = mid->len; + + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + mid = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot); + if (wret) + ret = wret; + wret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, + btrfs_header_owner(parent), + root_gen, level, 1); + if (wret) + ret = wret; + } else { + /* update the parent key to reflect our changes */ + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(parent); + } + + /* update the path */ + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + extent_buffer_get(left); + /* left was locked after cow */ + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + if (mid) { + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } + } else { + orig_slot -= btrfs_header_nritems(left); + path->slots[level] = orig_slot; + } + } + /* double check we haven't messed things up */ + check_block(root, path, level); + if (orig_ptr != + btrfs_node_blockptr(path->nodes[level], path->slots[level])) + BUG(); +enospc: + if (right) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return ret; +} + +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. 
+ */ +static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + u64 orig_ptr; + + if (level == 0) + return 1; + + mid = path->nodes[level]; + WARN_ON(btrfs_header_generation(mid) != trans->transid); + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + if (!parent) + return 1; + + left = read_node_slot(root, parent, pslot - 1); + + /* first, try to make some room in the middle buffer */ + if (left) { + u32 left_nr; + + btrfs_tree_lock(left); + left_nr = btrfs_header_nritems(left); + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left, 0); + if (ret) + wret = 1; + else { + wret = push_node_left(trans, root, + left, mid, 0); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + orig_slot += left_nr; + btrfs_node_key(mid, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + orig_slot -= + btrfs_header_nritems(left); + path->slots[level] = orig_slot; + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return 0; + } + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + right = read_node_slot(root, parent, pslot + 1); + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + u32 right_nr; + btrfs_tree_lock(right); + right_nr = btrfs_header_nritems(right); + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right, 0); + if (ret) + wret = 1; + else { + wret = balance_node_right(trans, root, + right, mid); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; + path->slots[level + 1] += 1; + path->slots[level] = orig_slot - + btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + } + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 1; +} + +/* + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. 
+ */ +static noinline void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) +{ + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + u32 nritems; + u64 search; + u64 lowest_read; + u64 highest_read; + u64 nread = 0; + int direction = path->reada; + struct extent_buffer *eb; + u32 nr; + u32 blocksize; + u32 nscan = 0; + + if (level != 1) + return; + + if (!path->nodes[level]) + return; + + node = path->nodes[level]; + + search = btrfs_node_blockptr(node, slot); + blocksize = btrfs_level_size(root, level - 1); + eb = btrfs_find_tree_block(root, search, blocksize); + if (eb) { + free_extent_buffer(eb); + return; + } + + highest_read = search; + lowest_read = search; + + nritems = btrfs_header_nritems(node); + nr = slot; + while (1) { + if (direction < 0) { + if (nr == 0) + break; + nr--; + } else if (direction > 0) { + nr++; + if (nr >= nritems) + break; + } + if (path->reada < 0 && objectid) { + btrfs_node_key(node, &disk_key, nr); + if (btrfs_disk_key_objectid(&disk_key) != objectid) + break; + } + search = btrfs_node_blockptr(node, nr); + if ((search >= lowest_read && search <= highest_read) || + (search < lowest_read && lowest_read - search <= 16384) || + (search > highest_read && search - highest_read <= 16384)) { + readahead_tree_block(root, search, blocksize, + btrfs_node_ptr_generation(node, nr)); + nread += blocksize; + } + nscan++; + if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) + break; + + if (nread > (256 * 1024) || nscan > 128) + break; + + if (search < lowest_read) + lowest_read = search; + if (search > highest_read) + highest_read = search; + } +} + +/* + * when we walk down the tree, it is usually safe to unlock the higher layers + * in the tree. The exceptions are when our path goes through slot 0, because + * operations on the tree might require changing key pointers higher up in the + * tree. + * + * callers might also have set path->keep_locks, which tells this code to keep + * the lock if the path points to the last slot in the block. This is part of + * walking through the tree, and selecting the next slot in the higher block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so + * if lowest_unlock is 1, level 0 won't be unlocked + */ +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock) +{ + int i; + int skip_level = level; + int no_skips = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + if (!no_skips && path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + if (!no_skips && path->keep_locks) { + u32 nritems; + t = path->nodes[i]; + nritems = btrfs_header_nritems(t); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + if (skip_level < i && i >= lowest_unlock) + no_skips = 1; + + t = path->nodes[i]; + if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + btrfs_tree_unlock(t); + path->locks[i] = 0; + } + } +} + +/* + * look for key in the tree. path is filled in with nodes along the way + * if key is found, we return zero and you can find the item in the leaf + * level of the path (level 0) + * + * If the key isn't found, the path points to the slot where it should + * be inserted, and 1 is returned. If there are other errors during the + * search a negative error number is returned. + * + * if ins_len > 0, nodes and leaves will be split as we walk down the + * tree. 
if ins_len < 0, nodes will be merged as we walk down the tree (if + * possible) + */ +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow) +{ + struct extent_buffer *b; + struct extent_buffer *tmp; + int slot; + int ret; + int level; + int should_reada = p->reada; + int lowest_unlock = 1; + int blocksize; + u8 lowest_level = 0; + u64 blocknr; + u64 gen; + struct btrfs_key prealloc_block; + + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len > 0); + WARN_ON(p->nodes[0] != NULL); + + if (ins_len < 0) + lowest_unlock = 2; + + prealloc_block.objectid = 0; + +again: + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + + while (b) { + level = btrfs_header_level(b); + + /* + * setup the path here so we can release it under lock + * contention with the cow code + */ + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + if (cow) { + int wret; + + /* is a cow on this block not required */ + spin_lock(&root->fs_info->hash_lock); + if (btrfs_header_generation(b) == trans->transid && + btrfs_header_owner(b) == root->root_key.objectid && + !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { + spin_unlock(&root->fs_info->hash_lock); + goto cow_done; + } + spin_unlock(&root->fs_info->hash_lock); + + /* ok, we have to cow, is our old prealloc the right + * size? + */ + if (prealloc_block.objectid && + prealloc_block.offset != b->len) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + prealloc_block.objectid = 0; + } + + /* + * for higher level blocks, try not to allocate blocks + * with the block and the parent locks held. + */ + if (level > 1 && !prealloc_block.objectid && + btrfs_path_lock_waiting(p, level)) { + u32 size = b->len; + u64 hint = b->start; + + btrfs_release_path(root, p); + ret = btrfs_reserve_extent(trans, root, + size, size, 0, + hint, (u64)-1, + &prealloc_block, 0); + BUG_ON(ret); + goto again; + } + + wret = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], + &b, prealloc_block.objectid); + prealloc_block.objectid = 0; + if (wret) { + free_extent_buffer(b); + ret = wret; + goto done; + } + } +cow_done: + BUG_ON(!cow && ins_len); + if (level != btrfs_header_level(b)) + WARN_ON(1); + level = btrfs_header_level(b); + + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + ret = check_block(root, p, level); + if (ret) { + ret = -1; + goto done; + } + + ret = bin_search(b, key, level, &slot); + if (level != 0) { + if (ret && slot > 0) + slot -= 1; + p->slots[level] = slot; + if ((p->search_for_split || ins_len > 0) && + btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret = split_node(trans, root, p, level); + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + } else if (ins_len < 0) { + int sret = balance_level(trans, root, p, + level); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + slot = p->slots[level]; + BUG_ON(btrfs_header_nritems(b) == 1); + } + unlock_up(p, level, lowest_unlock); + + /* this is only true while dropping a snapshot */ + if (level == lowest_level) { + ret = 0; + goto done; + } + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = 
btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + b = tmp; + } else { + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. + */ + if (level > 1) { + btrfs_release_path(NULL, p); + if (tmp) + free_extent_buffer(tmp); + if (should_reada) + reada_for_search(root, p, + level, slot, + key->objectid); + + tmp = read_tree_block(root, blocknr, + blocksize, gen); + if (tmp) + free_extent_buffer(tmp); + goto again; + } else { + if (tmp) + free_extent_buffer(tmp); + if (should_reada) + reada_for_search(root, p, + level, slot, + key->objectid); + b = read_node_slot(root, b, slot); + } + } + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + p->slots[level] = slot; + if (ins_len > 0 && + btrfs_leaf_free_space(root, b) < ins_len) { + int sret = split_leaf(trans, root, key, + p, ins_len, ret == 0); + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + } + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock); + goto done; + } + } + ret = 1; +done: + if (prealloc_block.objectid) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + } + + return ret; +} + +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 bytenr; + u64 generation; + u32 blocksize; + int level; + int slot; + int key_match; + int ret; + + eb = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); + BUG_ON(ret); + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + if (level == 0 || level <= lowest_level) + break; + + ret = bin_search(parent, &node_keys[lowest_level], level, + &slot); + if (ret && slot > 0) + slot--; + + bytenr = btrfs_node_blockptr(parent, slot); + if (nodes[level - 1] == bytenr) + break; + + blocksize = btrfs_level_size(root, level - 1); + generation = btrfs_node_ptr_generation(parent, slot); + btrfs_node_key_to_cpu(eb, &key, slot); + key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); + + if (generation == trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + } + + /* + * if node keys match and node pointer hasn't been modified + * in the running transaction, we can merge the path. for + * blocks owened by reloc trees, the node pointer check is + * skipped, this is because these blocks are fully controlled + * by the space balance code, no one else can modify them. 
+ */ + if (!nodes[level - 1] || !key_match || + (generation == trans->transid && + btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { + if (level == 1 || level == lowest_level + 1) { + if (generation == trans->transid) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + break; + } + + if (generation != trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + } + + ret = btrfs_cow_block(trans, root, eb, parent, slot, + &eb, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + if (!nodes[level - 1]) { + nodes[level - 1] = eb->start; + memcpy(&node_keys[level - 1], &key, + sizeof(node_keys[0])); + } else { + WARN_ON(1); + } + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + parent = eb; + continue; + } + + btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); + btrfs_set_node_ptr_generation(parent, slot, trans->transid); + btrfs_mark_buffer_dirty(parent); + + ret = btrfs_inc_extent_ref(trans, root, + nodes[level - 1], + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1); + BUG_ON(ret); + + /* + * If the block was created in the running transaction, + * it's possible this is the last reference to it, so we + * should drop the subtree. + */ + if (generation == trans->transid) { + ret = btrfs_drop_subtree(trans, root, eb, parent); + BUG_ON(ret); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } else { + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1, 1); + BUG_ON(ret); + } + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return 0; +} + +/* + * adjust the pointers going up the tree, starting at level + * making sure the right key of each node is points to 'key'. + * This is used after shifting pointers to the left, so it stops + * fixing up pointers when a given leaf/node is not in slot 0 of the + * higher levels + * + * If this fails to write a tree block, it returns -1, but continues + * fixing up the blocks in ram so the tree is consistent. + */ +static int fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) +{ + int i; + int ret = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + int tslot = path->slots[i]; + if (!path->nodes[i]) + break; + t = path->nodes[i]; + btrfs_set_node_key(t, key, tslot); + btrfs_mark_buffer_dirty(path->nodes[i]); + if (tslot != 0) + break; + } + return ret; +} + +/* + * update item key. + * + * This function isn't completely safe. 
It's the caller's responsibility + * that the new key won't break the order + */ +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *eb; + int slot; + + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot > 0) { + btrfs_item_key(eb, &disk_key, slot - 1); + if (comp_keys(&disk_key, new_key) >= 0) + return -1; + } + if (slot < btrfs_header_nritems(eb) - 1) { + btrfs_item_key(eb, &disk_key, slot + 1); + if (comp_keys(&disk_key, new_key) <= 0) + return -1; + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(eb, &disk_key, slot); + btrfs_mark_buffer_dirty(eb); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + return 0; +} + +/* + * try to push data from one node into the next node left in the + * tree. + * + * returns 0 if some ptrs were pushed left, < 0 if there was some horrible + * error, and > 0 if there was no room in the left hand block. + */ +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty) +{ + int push_items = 0; + int src_nritems; + int dst_nritems; + int ret = 0; + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + if (!empty && src_nritems <= 8) + return 1; + + if (push_items <= 0) + return 1; + + if (empty) { + push_items = min(src_nritems, push_items); + if (push_items < src_nritems) { + /* leave at least 8 pointers in the node if + * we aren't going to empty it + */ + if (src_nritems - push_items < 8) { + if (push_items <= 8) + return 1; + push_items -= 8; + } + } + } else + push_items = min(src_nritems - 8, push_items); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst_nritems), + btrfs_node_key_ptr_offset(0), + push_items * sizeof(struct btrfs_key_ptr)); + + if (push_items < src_nritems) { + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(push_items), + (src_nritems - push_items) * + sizeof(struct btrfs_key_ptr)); + } + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); + BUG_ON(ret); + + return ret; +} + +/* + * try to push data from one node into the next node right in the + * tree. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. 
+ * + * this will only push up to 1/2 the contents of the left node over + */ +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst, + struct extent_buffer *src) +{ + int push_items = 0; + int max_push; + int src_nritems; + int dst_nritems; + int ret = 0; + + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + if (push_items <= 0) + return 1; + + if (src_nritems < 4) + return 1; + + max_push = src_nritems / 2 + 1; + /* don't try to empty the node */ + if (max_push >= src_nritems) + return 1; + + if (max_push < push_items) + push_items = max_push; + + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), + btrfs_node_key_ptr_offset(0), + (dst_nritems) * + sizeof(struct btrfs_key_ptr)); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(src_nritems - push_items), + push_items * sizeof(struct btrfs_key_ptr)); + + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); + BUG_ON(ret); + + return ret; +} + +/* + * helper function to insert a new root level in the tree. + * A new node is allocated, and a single item is inserted to + * point to the existing root + * + * returns zero on success or < 0 on failure. + */ +static noinline int insert_new_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + u64 lower_gen; + struct extent_buffer *lower; + struct extent_buffer *c; + struct extent_buffer *old; + struct btrfs_disk_key lower_key; + int ret; + + BUG_ON(path->nodes[level]); + BUG_ON(path->nodes[level-1] != root->node); + + lower = path->nodes[level-1]; + if (level == 1) + btrfs_item_key(lower, &lower_key, 0); + else + btrfs_node_key(lower, &lower_key, 0); + + c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + root->root_key.objectid, trans->transid, + level, root->node->start, 0); + if (IS_ERR(c)) + return PTR_ERR(c); + + memset_extent_buffer(c, 0, 0, root->nodesize); + btrfs_set_header_nritems(c, 1); + btrfs_set_header_level(c, level); + btrfs_set_header_bytenr(c, c->start); + btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_owner(c, root->root_key.objectid); + + write_extent_buffer(c, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(c), + BTRFS_FSID_SIZE); + + write_extent_buffer(c, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(c), + BTRFS_UUID_SIZE); + + btrfs_set_node_key(c, &lower_key, 0); + btrfs_set_node_blockptr(c, 0, lower->start); + lower_gen = btrfs_header_generation(lower); + WARN_ON(lower_gen != trans->transid); + + btrfs_set_node_ptr_generation(c, 0, lower_gen); + + btrfs_mark_buffer_dirty(c); + + spin_lock(&root->node_lock); + old = root->node; + root->node = c; + spin_unlock(&root->node_lock); + + ret = btrfs_update_extent_ref(trans, root, lower->start, + lower->start, c->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + + /* the super has an extra ref to root->node */ + free_extent_buffer(old); + + add_root_to_dirty_list(root); + extent_buffer_get(c); + path->nodes[level] = c; + path->locks[level] = 1; + 
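+	/* the new root has a single pointer to the old root, so the path uses slot 0 */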
path->slots[level] = 0; + return 0; +} + +/* + * worker function to insert a single pointer in a node. + * the node should have enough room for the pointer already + * + * slot and level indicate where you want the key to go, and + * blocknr is the block the key points to. + * + * returns zero on success and < 0 on any error + */ +static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_disk_key + *key, u64 bytenr, int slot, int level) +{ + struct extent_buffer *lower; + int nritems; + + BUG_ON(!path->nodes[level]); + lower = path->nodes[level]; + nritems = btrfs_header_nritems(lower); + if (slot > nritems) + BUG(); + if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) + BUG(); + if (slot != nritems) { + memmove_extent_buffer(lower, + btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_key_ptr)); + } + btrfs_set_node_key(lower, key, slot); + btrfs_set_node_blockptr(lower, slot, bytenr); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(lower, slot, trans->transid); + btrfs_set_header_nritems(lower, nritems + 1); + btrfs_mark_buffer_dirty(lower); + return 0; +} + +/* + * split the node at the specified level in path in two. + * The path is corrected to point to the appropriate node after the split + * + * Before splitting this tries to make some room in the node by pushing + * left and right, if either one works, it returns right away. + * + * returns 0 on success and < 0 on failure + */ +static noinline int split_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *c; + struct extent_buffer *split; + struct btrfs_disk_key disk_key; + int mid; + int ret; + int wret; + u32 c_nritems; + + c = path->nodes[level]; + WARN_ON(btrfs_header_generation(c) != trans->transid); + if (c == root->node) { + /* trying to split the root, lets make a new one */ + ret = insert_new_root(trans, root, path, level + 1); + if (ret) + return ret; + } else { + ret = push_nodes_for_insert(trans, root, path, level); + c = path->nodes[level]; + if (!ret && btrfs_header_nritems(c) < + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) + return 0; + if (ret < 0) + return ret; + } + + c_nritems = btrfs_header_nritems(c); + + split = btrfs_alloc_free_block(trans, root, root->nodesize, + path->nodes[level + 1]->start, + root->root_key.objectid, + trans->transid, level, c->start, 0); + if (IS_ERR(split)) + return PTR_ERR(split); + + btrfs_set_header_flags(split, btrfs_header_flags(c)); + btrfs_set_header_level(split, btrfs_header_level(c)); + btrfs_set_header_bytenr(split, split->start); + btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_owner(split, root->root_key.objectid); + btrfs_set_header_flags(split, 0); + write_extent_buffer(split, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(split), + BTRFS_FSID_SIZE); + write_extent_buffer(split, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(split), + BTRFS_UUID_SIZE); + + mid = (c_nritems + 1) / 2; + + copy_extent_buffer(split, c, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(mid), + (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); + btrfs_set_header_nritems(split, c_nritems - mid); + btrfs_set_header_nritems(c, mid); + ret = 0; + + btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(split); + + btrfs_node_key(split, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, split->start, + 
path->slots[level + 1] + 1, + level + 1); + if (wret) + ret = wret; + + ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); + BUG_ON(ret); + + if (path->slots[level] >= mid) { + path->slots[level] -= mid; + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = split; + path->slots[level + 1] += 1; + } else { + btrfs_tree_unlock(split); + free_extent_buffer(split); + } + return ret; +} + +/* + * how many bytes are required to store the items in a leaf. start + * and nr indicate which items in the leaf to check. This totals up the + * space used both by the item structs and the item data + */ +static int leaf_space_used(struct extent_buffer *l, int start, int nr) +{ + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; + data_len = btrfs_item_end_nr(l, start); + data_len = data_len - btrfs_item_offset_nr(l, end); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +} + +/* + * The space between the end of the leaf items and + * the start of the leaf data. IOW, how much room + * the leaf has left for both items and data + */ +noinline int btrfs_leaf_free_space(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + int nritems = btrfs_header_nritems(leaf); + int ret; + ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + if (ret < 0) { + printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " + "used %d nritems %d\n", + ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), + leaf_space_used(leaf, 0, nritems), nritems); + } + return ret; +} + +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. 
+ */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + struct btrfs_disk_key disk_key; + int slot; + u32 i; + int free_space; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 left_nritems; + u32 nr; + u32 right_nritems; + u32 data_end; + u32 this_item_size; + int ret; + + slot = path->slots[1]; + if (!path->nodes[1]) + return 1; + + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + WARN_ON(!btrfs_tree_locked(path->nodes[1])); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right, 0); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + if (empty) + nr = 0; + else + nr = 1; + + if (path->slots[0] >= left_nritems) + push_space += data_size; + + i = left_nritems - 1; + while (i >= nr) { + item = btrfs_item_nr(left, i); + + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, left); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + this_item_size = btrfs_item_size(left, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + if (i == 0) + break; + i--; + } + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + if (push_items == 0) + goto out_unlock; + + if (!empty && push_items == left_nritems) + WARN_ON(1); + + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + + push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space -= leaf_data_end(root, left); + + /* make room in the right data area */ + data_end = leaf_data_end(root, right); + memmove_extent_buffer(right, + btrfs_leaf_data(right) + data_end - push_space, + btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_SIZE(root) - data_end); + + /* copy from the left data area */ + copy_extent_buffer(right, left, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(left) + leaf_data_end(root, left), + push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), + btrfs_item_nr_offset(0), + right_nritems * sizeof(struct btrfs_item)); + + /* copy the items from left to right */ + copy_extent_buffer(right, left, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left_nritems - push_items), + push_items * sizeof(struct btrfs_item)); + + /* update the item pointers */ + right_nritems += push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, 
(unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + push_space -= btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + left_nritems -= push_items; + btrfs_set_header_nritems(left, left_nritems); + + if (left_nritems) + btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(right); + + ret = btrfs_update_ref(trans, root, left, right, 0, push_items); + BUG_ON(ret); + + btrfs_item_key(right, &disk_key, 0); + btrfs_set_node_key(upper, &disk_key, slot + 1); + btrfs_mark_buffer_dirty(upper); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int i; + int free_space; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 old_left_nritems; + u32 right_nritems; + u32 nr; + int ret = 0; + int wret; + u32 this_item_size; + u32 old_left_item_size; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + WARN_ON(!btrfs_tree_locked(path->nodes[1])); + + left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left, 0); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + if (empty) + nr = right_nritems; + else + nr = right_nritems - 1; + + for (i = 0; i < nr; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, right); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(right, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + if 
(push_items == 0) { + ret = 1; + goto out; + } + if (!empty && push_items == btrfs_header_nritems(right)) + WARN_ON(1); + + /* push data from right to left */ + copy_extent_buffer(left, right, + btrfs_item_nr_offset(btrfs_header_nritems(left)), + btrfs_item_nr_offset(0), + push_items * sizeof(struct btrfs_item)); + + push_space = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_offset_nr(right, push_items - 1); + + copy_extent_buffer(left, right, btrfs_leaf_data(left) + + leaf_data_end(root, left) - push_space, + btrfs_leaf_data(right) + + btrfs_item_offset_nr(right, push_items - 1), + push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + + item = btrfs_item_nr(left, i); + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(left, item); + btrfs_set_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + /* fixup right node */ + if (push_items > right_nritems) { + printk(KERN_CRIT "push items %d nr %u\n", push_items, + right_nritems); + WARN_ON(1); + } + + if (push_items < right_nritems) { + push_space = btrfs_item_offset_nr(right, push_items - 1) - + leaf_data_end(root, right); + memmove_extent_buffer(right, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(right) + + leaf_data_end(root, right), push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(push_items), + (btrfs_header_nritems(right) - push_items) * + sizeof(struct btrfs_item)); + } + right_nritems -= push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + push_space = push_space - btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_mark_buffer_dirty(left); + if (right_nritems) + btrfs_mark_buffer_dirty(right); + + ret = btrfs_update_ref(trans, root, right, left, + old_left_nritems, push_items); + BUG_ON(ret); + + btrfs_item_key(right, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, 1); + if (wret) + ret = wret; + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] < push_items) { + path->slots[0] += old_left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = left; + path->slots[1] -= 1; + } else { + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->slots[0] -= push_items; + } + BUG_ON(path->slots[0] < 0); + return ret; +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in 
two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) +{ + struct extent_buffer *l; + u32 nritems; + int mid; + int slot; + struct extent_buffer *right; + int data_copy_size; + int rt_data_off; + int i; + int ret = 0; + int wret; + int double_split; + int num_doubles = 0; + struct btrfs_disk_key disk_key; + + /* first try to make some room by pushing left and right */ + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + wret = push_leaf_right(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + if (wret) { + wret = push_leaf_left(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + } + l = path->nodes[0]; + + /* did the pushes work? */ + if (btrfs_leaf_free_space(root, l) >= data_size) + return 0; + } + + if (!path->nodes[1]) { + ret = insert_new_root(trans, root, path, 1); + if (ret) + return ret; + } +again: + double_split = 0; + l = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(l); + mid = (nritems + 1) / 2; + + right = btrfs_alloc_free_block(trans, root, root->leafsize, + path->nodes[1]->start, + root->root_key.objectid, + trans->transid, 0, l->start, 0); + if (IS_ERR(right)) { + BUG_ON(1); + return PTR_ERR(right); + } + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(right, right->start); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); + + write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + btrfs_mark_buffer_dirty(right); + return ret; + } + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(right); + return ret; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + 
} + } + } + } + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); + + ret = btrfs_update_ref(trans, root, l, right, 0, nritems); + BUG_ON(ret); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + + if (double_split) { + BUG_ON(num_doubles != 0); + num_doubles++; + goto again; + } + return ret; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. 
+ */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + u32 item_size; + struct extent_buffer *leaf; + struct btrfs_key orig_key; + struct btrfs_item *item; + struct btrfs_item *new_item; + int ret = 0; + int slot; + u32 nritems; + u32 orig_offset; + struct btrfs_disk_key disk_key; + char *buf; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]); + if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item)) + goto split; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + btrfs_release_path(root, path); + + path->search_for_split = 1; + path->keep_locks = 1; + + ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1); + path->search_for_split = 0; + + /* if our item isn't there or got smaller, return now */ + if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0], + path->slots[0])) { + path->keep_locks = 0; + return -EAGAIN; + } + + ret = split_leaf(trans, root, &orig_key, path, + sizeof(struct btrfs_item), 1); + path->keep_locks = 0; + BUG_ON(ret); + + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + +split: + item = btrfs_item_nr(leaf, path->slots[0]); + orig_offset = btrfs_item_offset(leaf, item); + item_size = btrfs_item_size(leaf, item); + + + buf = kmalloc(item_size, GFP_NOFS); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, + path->slots[0]), item_size); + slot = path->slots[0] + 1; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + + if (slot != nritems) { + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + + new_item = btrfs_item_nr(leaf, slot); + + btrfs_set_item_offset(leaf, new_item, orig_offset); + btrfs_set_item_size(leaf, new_item, item_size - split_offset); + + btrfs_set_item_offset(leaf, item, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, item, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + + /* write the data for the start of the original item */ + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + split_offset); + + /* write the data for the new item */ + write_extent_buffer(leaf, buf + split_offset, + btrfs_item_ptr_offset(leaf, slot), + item_size - split_offset); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + kfree(buf); + return ret; +} + +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. 
+ */ +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) +{ + int ret = 0; + int slot; + int slot_orig; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; + unsigned int old_size; + unsigned int size_diff; + int i; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + slot = path->slots[0]; + + old_size = btrfs_item_size_nr(leaf, slot); + if (old_size == new_size) + return 0; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + old_data_start = btrfs_item_offset_nr(leaf, slot); + + size_diff = old_size - new_size; + + BUG_ON(slot < 0); + BUG_ON(slot >= nritems); + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + size_diff); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + if (from_end) { + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start + new_size - data_end); + } else { + struct btrfs_disk_key disk_key; + u64 offset; + + btrfs_item_key(leaf, &disk_key, slot); + + if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { + unsigned long ptr; + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + fi = (struct btrfs_file_extent_item *)( + (unsigned long)fi - size_diff); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + ptr = btrfs_item_ptr_offset(leaf, slot); + memmove_extent_buffer(leaf, ptr, + (unsigned long)fi, + offsetof(struct btrfs_file_extent_item, + disk_bytenr)); + } + } + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start - data_end); + + offset = btrfs_disk_key_offset(&disk_key); + btrfs_set_disk_key_offset(&disk_key, offset + size_diff); + btrfs_set_item_key(leaf, &disk_key, slot); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + } + + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, new_size); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * make the item pointed to by the path bigger, data_size is the new size. 
+ */ +int btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) +{ + int ret = 0; + int slot; + int slot_orig; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data; + unsigned int old_size; + int i; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < data_size) { + btrfs_print_leaf(root, leaf); + BUG(); + } + slot = path->slots[0]; + old_data = btrfs_item_end_nr(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d too large, nritems %d\n", + slot, nritems); + BUG_ON(1); + } + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - data_size); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - data_size, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + + data_end = old_data; + old_size = btrfs_item_size_nr(leaf, slot); + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + * Returns the number of keys that were inserted. 
+ */ +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int i; + u32 nritems; + u32 total_data = 0; + u32 total_size = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + struct btrfs_key found_key; + + for (i = 0; i < nr; i++) { + if (total_size + data_size[i] + sizeof(struct btrfs_item) > + BTRFS_LEAF_DATA_SIZE(root)) { + break; + nr = i; + } + total_data += data_size[i]; + total_size += data_size[i] + sizeof(struct btrfs_item); + } + BUG_ON(nr == 0); + + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + for (i = nr; i >= 0; i--) { + total_data -= data_size[i]; + total_size -= data_size[i] + sizeof(struct btrfs_item); + if (total_size < btrfs_leaf_free_space(root, leaf)) + break; + } + nr = i; + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* figure out how many keys we can insert in here */ + total_data = data_size[0]; + for (i = 1; i < nr; i++) { + if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) + break; + total_data += data_size[i]; + } + nr = i; + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } else { + /* + * this sucks but it has to be done, if we are inserting at + * the end of the leaf only insert 1 of the items, since we + * have no way of knowing whats on the next leaf and we'd have + * to drop our current locks to figure it out + */ + nr = 1; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + if (!ret) + ret = nr; + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int slot_orig; + int i; + u32 nritems; + u32 total_size = 0; + u32 total_data = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "not enough freespace need %u have %d\n", + total_size, btrfs_leaf_free_space(root, leaf)); + BUG(); + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + return ret; +} + +/* + * Given a key and some data, insert an item into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *cpu_key, void *data, u32 + data_size) +{ + int ret = 0; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (!ret) { + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, data, ptr, data_size); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_free_path(path); + return ret; +} + +/* + * delete the pointer from a given node. + * + * the tree should have been previously balanced so the deletion does not + * empty a node. + */ +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) +{ + struct extent_buffer *parent = path->nodes[level]; + u32 nritems; + int ret = 0; + int wret; + + nritems = btrfs_header_nritems(parent); + if (slot != nritems - 1) { + memmove_extent_buffer(parent, + btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(slot + 1), + sizeof(struct btrfs_key_ptr) * + (nritems - slot - 1)); + } + nritems--; + btrfs_set_header_nritems(parent, nritems); + if (nritems == 0 && parent == root->node) { + BUG_ON(btrfs_header_level(root->node) != 1); + /* just turn the root into a leaf and break */ + btrfs_set_header_level(root->node, 0); + } else if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(parent, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(parent); + return ret; +} + +/* + * a helper function to delete the leaf pointed to by path->slots[1] and + * path->nodes[1]. 
bytenr is the node block pointer, but since the callers + * already know it, it is faster to have them pass it down than to + * read it out of the node again. + * + * This deletes the pointer in path->nodes[1] and frees the leaf + * block extent. zero is returned if it all worked out, < 0 otherwise. + * + * The path must have already been setup for deleting the leaf, including + * all the proper balancing. path->nodes[1] must be locked. + */ +noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 bytenr) +{ + int ret; + u64 root_gen = btrfs_header_generation(path->nodes[1]); + + ret = del_ptr(trans, root, path, 1, path->slots[1]); + if (ret) + return ret; + + ret = btrfs_free_extent(trans, root, bytenr, + btrfs_level_size(root, 0), + path->nodes[1]->start, + btrfs_header_owner(path->nodes[1]), + root_gen, 0, 1); + return ret; +} +/* + * delete the item at the leaf level in path. If that empties + * the leaf, remove it from the tree + */ +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int last_off; + int dsize = 0; + int ret = 0; + int wret; + int i; + u32 nritems; + + leaf = path->nodes[0]; + last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size_nr(leaf, slot + i); + + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { + int data_end = leaf_data_end(root, leaf); + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + dsize, + btrfs_leaf_data(leaf) + data_end, + last_off - data_end); + + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + dsize); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), + btrfs_item_nr_offset(slot + nr), + sizeof(struct btrfs_item) * + (nritems - slot - nr)); + } + btrfs_set_header_nritems(leaf, nritems - nr); + nritems -= nr; + + /* delete the leaf if we've emptied it */ + if (nritems == 0) { + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { + ret = btrfs_del_leaf(trans, root, path, leaf->start); + BUG_ON(ret); + } + } else { + int used = leaf_space_used(leaf, 0, nritems); + if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_item_key(leaf, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, + &disk_key, 1); + if (wret) + ret = wret; + } + + /* delete the leaf if it is mostly empty */ + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { + /* push_leaf_left fixes the path. 
+ * make sure the path still points to our leaf + * for possible call to del_ptr below + */ + slot = path->slots[1]; + extent_buffer_get(leaf); + + wret = push_leaf_left(trans, root, path, 1, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { + wret = push_leaf_right(trans, root, path, 1, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } + + if (btrfs_header_nritems(leaf) == 0) { + path->slots[1] = slot; + ret = btrfs_del_leaf(trans, root, path, + leaf->start); + BUG_ON(ret); + free_extent_buffer(leaf); + } else { + /* if we're still in the path, make sure + * we're dirty. Otherwise, one of the + * push_leaf functions must have already + * dirtied this buffer + */ + if (path->nodes[0] == leaf) + btrfs_mark_buffer_dirty(leaf); + free_extent_buffer(leaf); + } + } else { + btrfs_mark_buffer_dirty(leaf); + } + } + return ret; +} + +/* + * search the tree again to find a leaf with lesser keys + * returns 0 if it found something or 1 if there are no lesser leaves. + * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + + if (key.offset > 0) + key.offset--; + else if (key.type > 0) + key.type--; + else if (key.objectid > 0) + key.objectid--; + else + return 1; + + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + if (ret < 0) + return 0; + return 1; +} + +/* + * A helper function to walk down the tree starting at min_key, and looking + * for nodes or leaves that are either in cache or have a minimum + * transaction id. This is used by the btree defrag code, and tree logging + * + * This does not cow, but it does stuff the starting key it finds back + * into min_key, so you can call btrfs_search_slot with cow=1 on the + * key and get a writable path. + * + * This does lock as it descends, and path->keep_locks should be set + * to 1 by the caller. + * + * This honors path->lowest_level to prevent descent past a given level + * of the tree. + * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * + * returns zero if something useful was found, < 0 on error and 1 if there + * was nothing in the tree that matched the search criteria. 
+ */ +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans) +{ + struct extent_buffer *cur; + struct btrfs_key found_key; + int slot; + int sret; + u32 nritems; + int level; + int ret = 1; + + WARN_ON(!path->keep_locks); +again: + cur = btrfs_lock_root_node(root); + level = btrfs_header_level(cur); + WARN_ON(path->nodes[level]); + path->nodes[level] = cur; + path->locks[level] = 1; + + if (btrfs_header_generation(cur) < min_trans) { + ret = 1; + goto out; + } + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); + sret = bin_search(cur, min_key, level, &slot); + + /* at the lowest level, we're done, setup the path and exit */ + if (level == path->lowest_level) { + if (slot >= nritems) + goto find_next_key; + ret = 0; + path->slots[level] = slot; + btrfs_item_key_to_cpu(cur, &found_key, slot); + goto out; + } + if (sret && slot > 0) + slot--; + /* + * check this node pointer against the cache_only and + * min_trans parameters. If it isn't in cache or is too + * old, skip to the next one. + */ + while (slot < nritems) { + u64 blockptr; + u64 gen; + struct extent_buffer *tmp; + struct btrfs_disk_key disk_key; + + blockptr = btrfs_node_blockptr(cur, slot); + gen = btrfs_node_ptr_generation(cur, slot); + if (gen < min_trans) { + slot++; + continue; + } + if (!cache_only) + break; + + if (max_key) { + btrfs_node_key(cur, &disk_key, slot); + if (comp_keys(&disk_key, max_key) >= 0) { + ret = 1; + goto out; + } + } + + tmp = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + free_extent_buffer(tmp); + break; + } + if (tmp) + free_extent_buffer(tmp); + slot++; + } +find_next_key: + /* + * we didn't find a candidate key in this node, walk forward + * and find another one + */ + if (slot >= nritems) { + path->slots[level] = slot; + sret = btrfs_find_next_key(root, path, min_key, level, + cache_only, min_trans); + if (sret == 0) { + btrfs_release_path(root, path); + goto again; + } else { + goto out; + } + } + /* save our key for returning back */ + btrfs_node_key_to_cpu(cur, &found_key, slot); + path->slots[level] = slot; + if (level == path->lowest_level) { + ret = 0; + unlock_up(path, level, 1); + goto out; + } + cur = read_node_slot(root, cur, slot); + + btrfs_tree_lock(cur); + path->locks[level - 1] = 1; + path->nodes[level - 1] = cur; + unlock_up(path, level, 1); + } +out: + if (ret == 0) + memcpy(min_key, &found_key, sizeof(found_key)); + return ret; +} + +/* + * this is similar to btrfs_next_leaf, but does not try to preserve + * and fixup the path. It looks for and returns the next key in the + * tree based on the current path and the cache_only and min_trans + * parameters. + * + * 0 is returned if another key is found, < 0 if there are any errors + * and 1 is returned if there are no higher keys in the tree + * + * path->keep_locks should be set to 1 on the search made before + * calling this function. 
+ */ +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans) +{ + int level = lowest_level; + int slot; + struct extent_buffer *c; + + WARN_ON(!path->keep_locks); + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; +next: + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) + return 1; + continue; + } + if (level == 0) + btrfs_item_key_to_cpu(c, key, slot); + else { + u64 blockptr = btrfs_node_blockptr(c, slot); + u64 gen = btrfs_node_ptr_generation(c, slot); + + if (cache_only) { + struct extent_buffer *cur; + cur = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + if (!cur || !btrfs_buffer_uptodate(cur, gen)) { + slot++; + if (cur) + free_extent_buffer(cur); + goto next; + } + free_extent_buffer(cur); + } + if (gen < min_trans) { + slot++; + goto next; + } + btrfs_node_key_to_cpu(c, key, slot); + } + return 0; + } + return 1; +} + +/* + * search the tree again to find a leaf with greater keys + * returns 0 if it found something or 1 if there are no greater leaves. + * returns < 0 on io errors. + */ +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + int slot; + int level = 1; + struct extent_buffer *c; + struct extent_buffer *next = NULL; + struct btrfs_key key; + u32 nritems; + int ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) + return 1; + + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + + btrfs_release_path(root, path); + path->keep_locks = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->keep_locks = 0; + + if (ret < 0) + return ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * by releasing the path above we dropped all our locks. A balance + * could have added more items next to the key that used to be + * at the very end of the block. So, check again here and + * advance the path if there are now more items available. 
+ */ + if (nritems > 0 && path->slots[0] < nritems - 1) { + path->slots[0]++; + goto done; + } + + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) + return 1; + continue; + } + + if (next) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + } + + if (level == 1 && (path->locks[1] || path->skip_locking) && + path->reada) + reada_for_search(root, path, level, slot, 0); + + next = read_node_slot(root, c, slot); + if (!path->skip_locking) { + WARN_ON(!btrfs_tree_locked(c)); + btrfs_tree_lock(next); + } + break; + } + path->slots[level] = slot; + while (1) { + level--; + c = path->nodes[level]; + if (path->locks[level]) + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = next; + path->slots[level] = 0; + if (!path->skip_locking) + path->locks[level] = 1; + if (!level) + break; + if (level == 1 && path->locks[1] && path->reada) + reada_for_search(root, path, level, slot, 0); + next = read_node_slot(root, next, 0); + if (!path->skip_locking) { + WARN_ON(!btrfs_tree_locked(path->nodes[level])); + btrfs_tree_lock(next); + } + } +done: + unlock_up(path, 0, 1); + return 0; +} + +/* + * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps + * searching until it gets past min_objectid or finds an item of 'type' + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.type == type) + return 0; + if (found_key.objectid < min_objectid) + break; + if (found_key.objectid == min_objectid && + found_key.type < type) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h new file mode 100644 index 00000000000..eee060f8811 --- /dev/null +++ b/fs/btrfs/ctree.h @@ -0,0 +1,2129 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_CTREE__ +#define __BTRFS_CTREE__ + +#include <linux/version.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/fs.h> +#include <linux/completion.h> +#include <linux/backing-dev.h> +#include <linux/wait.h> +#include <asm/kmap_types.h> +#include "extent_io.h" +#include "extent_map.h" +#include "async-thread.h" + +struct btrfs_trans_handle; +struct btrfs_transaction; +extern struct kmem_cache *btrfs_trans_handle_cachep; +extern struct kmem_cache *btrfs_transaction_cachep; +extern struct kmem_cache *btrfs_bit_radix_cachep; +extern struct kmem_cache *btrfs_path_cachep; +struct btrfs_ordered_sum; + +#define BTRFS_MAGIC "_BHRfS_M" + +#define BTRFS_ACL_NOT_CACHED ((void *)-1) + +#ifdef CONFIG_LOCKDEP +# define BTRFS_MAX_LEVEL 7 +#else +# define BTRFS_MAX_LEVEL 8 +#endif + +/* holds pointers to all of the tree roots */ +#define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ +#define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 3ULL + +/* + * stores information about which areas of a given device are in use. + * one per device. The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 4ULL + +/* one per subvolume, storing files and directories */ +#define BTRFS_FS_TREE_OBJECTID 5ULL + +/* directory objectid inside the root tree */ +#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + +/* holds checksums of all the data extents */ +#define BTRFS_CSUM_TREE_OBJECTID 7ULL + +/* orhpan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + +/* for space balancing */ +#define BTRFS_TREE_RELOC_OBJECTID -8ULL +#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + +/* + * extent checksums all have this objectid + * this allows them to share the logging tree + * for fsyncs + */ +#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + +/* dummy objectid represents multiple objectids */ +#define BTRFS_MULTIPLE_OBJECTIDS -255ULL + +/* + * All files have objectids in this range. + */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_LAST_FREE_OBJECTID -256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + + +/* + * the device items go into the chunk tree. The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + +/* + * we can actually store much bigger names, but lets not confuse the rest + * of linux + */ +#define BTRFS_NAME_LEN 255 + +/* 32 bytes in various csum fields */ +#define BTRFS_CSUM_SIZE 32 + +/* csum types */ +#define BTRFS_CSUM_TYPE_CRC32 0 + +static int btrfs_csum_sizes[] = { 4, 0 }; + +/* four bytes for CRC32 */ +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_FT_UNKNOWN 0 +#define BTRFS_FT_REG_FILE 1 +#define BTRFS_FT_DIR 2 +#define BTRFS_FT_CHRDEV 3 +#define BTRFS_FT_BLKDEV 4 +#define BTRFS_FT_FIFO 5 +#define BTRFS_FT_SOCK 6 +#define BTRFS_FT_SYMLINK 7 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 + +/* + * the key defines the order in the tree, and so it also defines (optimal) + * block layout. objectid corresonds to the inode number. The flags + * tells us things about the object, and is a kind of stream selector. 
+ * so for a given inode, keys with flags of 1 might refer to the inode + * data, flags of 2 may point to file data in the btree and flags == 3 + * may point to extents. + * + * offset is the starting byte offset for this key in the stream. + * + * btrfs_disk_key is in disk byte order. struct btrfs_key is always + * in cpu native order. Otherwise they are identical and their sizes + * should be the same (ie both packed) + */ +struct btrfs_disk_key { + __le64 objectid; + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_key { + u64 objectid; + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_mapping_tree { + struct extent_map_tree map_tree; +}; + +#define BTRFS_UUID_SIZE 16 +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* type and info about this device */ + __le64 type; + + /* expected generation for this device */ + __le64 generation; + + /* + * starting byte of this partition on the device, + * to allowr for stripe alignment in the future + */ + __le64 start_offset; + + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + u8 bandwidth; + + /* btrfs generated uuid for this device */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* uuid of FS who owns this device */ + u8 fsid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; + u8 dev_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ + __le64 owner; + + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + + /* sub stripes only matter for raid10 */ + __le16 sub_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + BUG_ON(num_stripes == 0); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + +#define BTRFS_FSID_SIZE 16 +#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) + +/* + * every tree block (leaf or node) starts with this header. 
+ */ +struct btrfs_header { + /* these first four must match the super block */ + u8 csum[BTRFS_CSUM_SIZE]; + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* which block this node is supposed to live in */ + __le64 flags; + + /* allowed to be different from the super from here on down */ + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + u8 level; +} __attribute__ ((__packed__)); + +#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ + sizeof(struct btrfs_header)) / \ + sizeof(struct btrfs_key_ptr)) +#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) +#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) - \ + sizeof(struct btrfs_file_extent_item)) + +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) + +/* + * this is a very generous portion of the super block, giving us + * room to translate 14 chunks with 3 stripes each. + */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 +#define BTRFS_LABEL_SIZE 256 + +/* + * the super block basically lists the main trees of the FS + * it currently lacks any block count etc etc + */ +struct btrfs_super_block { + u8 csum[BTRFS_CSUM_SIZE]; + /* the first 4 fields must match struct btrfs_header */ + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* this block number */ + __le64 flags; + + /* allowed to be different from the btrfs_header from here own down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* this will help find the new super based on the log root */ + __le64 log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + u8 root_level; + u8 chunk_root_level; + u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + /* future expansion */ + __le64 reserved[32]; + u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; +} __attribute__ ((__packed__)); + +/* + * Compat flags that we support. If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_COMPAT_SUPP 0x0 +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 +#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 + +/* + * A leaf is full of items. offset and size tell us where to find + * the item in the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together + * during searches. 
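+ *
+ * (editor's note, added for clarity: item N's data starts at
+ * btrfs_leaf_data(leaf) + btrfs_item_offset_nr(leaf, N) and is
+ * btrfs_item_size_nr(leaf, N) bytes long, so the data area grows
+ * downward toward the item array as items are inserted)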
+ */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * all non-leaf blocks are nodes, they hold only keys and pointers to + * other blocks + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + +/* + * btrfs_paths remember the path taken from the root down to the leaf. + * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point + * to any other levels that are present. + * + * The slots array records the index of the item or block pointer + * used while walking the tree. + */ +struct btrfs_path { + struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; + int slots[BTRFS_MAX_LEVEL]; + /* if there is real range locking, this locks field will change */ + int locks[BTRFS_MAX_LEVEL]; + int reada; + /* keep some upper locks as we walk down */ + int keep_locks; + int skip_locking; + int lowest_level; + + /* + * set by btrfs_split_item, tells search_slot to keep all locks + * and to force calls to keep space in the nodes + */ + int search_for_split; +}; + +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ +struct btrfs_extent_item { + __le32 refs; +} __attribute__ ((__packed__)); + +struct btrfs_extent_ref { + __le64 root; + __le64 generation; + __le64 objectid; + __le32 num_refs; +} __attribute__ ((__packed__)); + +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent. The chunk tree uuid field is a way to double check the owner + */ +struct btrfs_dev_extent { + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; + __le64 length; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_inode_ref { + __le64 index; + __le16 name_len; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_timespec { + __le64 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +typedef enum { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LAST = 2, +} btrfs_compression_type; + +/* we don't understand any encryption methods right now */ +typedef enum { + BTRFS_ENCRYPTION_NONE = 0, + BTRFS_ENCRYPTION_LAST = 1, +} btrfs_encryption_type; + +struct btrfs_inode_item { + /* nfs style generation number */ + __le64 generation; + /* transid that last touched this inode */ + __le64 transid; + __le64 size; + __le64 nbytes; + __le64 block_group; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le64 rdev; + __le64 flags; + + /* modification sequence number for NFS */ + __le64 sequence; + + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ + __le64 reserved[4]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; +} __attribute__ ((__packed__)); + +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + struct btrfs_disk_key location; + __le64 transid; + __le16 data_len; + __le16 name_len; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_root_item { + struct btrfs_inode_item inode; + __le64 generation; + __le64 root_dirid; + __le64 bytenr; + __le64 byte_limit; + __le64 bytes_used; + __le64 
last_snapshot;
+	__le64 flags;
+	__le32 refs;
+	struct btrfs_disk_key drop_progress;
+	u8 drop_level;
+	u8 level;
+} __attribute__ ((__packed__));
+
+/*
+ * this is used for both forward and backward root refs
+ */
+struct btrfs_root_ref {
+	__le64 dirid;
+	__le64 sequence;
+	__le16 name_len;
+} __attribute__ ((__packed__));
+
+#define BTRFS_FILE_EXTENT_INLINE 0
+#define BTRFS_FILE_EXTENT_REG 1
+#define BTRFS_FILE_EXTENT_PREALLOC 2
+
+struct btrfs_file_extent_item {
+	/*
+	 * transaction id that created this extent
+	 */
+	__le64 generation;
+	/*
+	 * max number of bytes to hold this extent in ram
+	 * when we split a compressed extent we can't know how big
+	 * each of the resulting pieces will be. So, this is
+	 * an upper limit on the size of the extent in ram instead of
+	 * an exact limit.
+	 */
+	__le64 ram_bytes;
+
+	/*
+	 * 32 bits for the various ways we might encode the data,
+	 * including compression and encryption. If any of these
+	 * are set to something a given disk format doesn't understand
+	 * it is treated like an incompat flag for reading and writing,
+	 * but not for stat.
+	 */
+	u8 compression;
+	u8 encryption;
+	__le16 other_encoding; /* spare for later use */
+
+	/* are we inline data or a real extent? */
+	u8 type;
+
+	/*
+	 * disk space consumed by the extent, checksum blocks are included
+	 * in these numbers
+	 */
+	__le64 disk_bytenr;
+	__le64 disk_num_bytes;
+	/*
+	 * the logical offset in file blocks (no csums)
+	 * this extent record is for. This allows a file extent to point
+	 * into the middle of an existing extent on disk, sharing it
+	 * between two snapshots (useful if some bytes in the middle of the
+	 * extent have changed)
+	 */
+	__le64 offset;
+	/*
+	 * the logical number of file blocks (no csums included). This
+	 * always reflects the size uncompressed and without encoding.
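+	 * (editor's example, added for clarity: an extent written as a
+	 * single 1MiB chunk that compresses to 64KiB on disk has
+	 * num_bytes == ram_bytes == 1MiB and disk_num_bytes == 64KiB)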
+ */ + __le64 num_bytes; + +} __attribute__ ((__packed__)); + +struct btrfs_csum_item { + u8 csum; +} __attribute__ ((__packed__)); + +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1 << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) +#define BTRFS_BLOCK_GROUP_DUP (1 << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) + +struct btrfs_block_group_item { + __le64 used; + __le64 chunk_objectid; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_space_info { + u64 flags; + u64 total_bytes; + u64 bytes_used; + u64 bytes_pinned; + u64 bytes_reserved; + u64 bytes_readonly; + int full; + int force_alloc; + struct list_head list; + + /* for block groups in our same type */ + struct list_head block_groups; + spinlock_t lock; + struct rw_semaphore groups_sem; +}; + +struct btrfs_free_space { + struct rb_node bytes_index; + struct rb_node offset_index; + u64 offset; + u64 bytes; +}; + +struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; + spinlock_t lock; + struct mutex alloc_mutex; + struct mutex cache_mutex; + u64 pinned; + u64 reserved; + u64 flags; + int cached; + int ro; + int dirty; + + struct btrfs_space_info *space_info; + + /* free space cache stuff */ + struct rb_root free_space_bytes; + struct rb_root free_space_offset; + + /* block group cache stuff */ + struct rb_node cache_node; + + /* for block groups in the same raid type */ + struct list_head list; + + /* usage count */ + atomic_t count; +}; + +struct btrfs_leaf_ref_tree { + struct rb_root root; + struct list_head list; + spinlock_t lock; +}; + +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info { + u8 fsid[BTRFS_FSID_SIZE]; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + struct btrfs_root *extent_root; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *csum_root; + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + + struct extent_io_tree pinned_extents; + struct extent_io_tree pending_del; + struct extent_io_tree extent_ins; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; + + u64 generation; + u64 last_trans_committed; + u64 last_trans_new_blockgroup; + u64 open_ioctl_trans; + unsigned long mount_opt; + u64 max_extent; + u64 max_inline; + u64 alloc_start; + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + + wait_queue_head_t async_submit_wait; + wait_queue_head_t tree_log_wait; + + struct btrfs_super_block super_copy; + struct btrfs_super_block super_for_commit; + struct block_device *__bdev; + struct super_block *sb; + struct inode *btree_inode; + struct backing_dev_info bdi; + spinlock_t hash_lock; + struct mutex trans_mutex; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex extent_ins_mutex; + struct mutex pinned_mutex; + struct mutex chunk_mutex; + struct mutex drop_mutex; + struct mutex volume_mutex; + struct mutex tree_reloc_mutex; + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; + 
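+	/*
+	 * (editor's note, added for clarity: the atomic counters below
+	 * track async checksumming and bio submission work that is still
+	 * queued to the worker pools declared later in this struct)
+	 */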
+ atomic_t nr_async_submits; + atomic_t async_submit_draining; + atomic_t nr_async_bios; + atomic_t async_delalloc_pages; + atomic_t tree_log_writers; + atomic_t tree_log_commit; + unsigned long tree_log_batch; + u64 tree_log_transid; + + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + struct list_head ordered_extents; + struct list_head delalloc_inodes; + + /* + * there is a pool of worker threads for checksumming during writes + * and a pool for checksumming after reads. This is because readers + * can run with FS locks held, and the writers may be waiting for + * those locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; + struct btrfs_workers endio_meta_workers; + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers submit_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. It happens + * for the sys_munmap function call path + */ + struct btrfs_workers fixup_workers; + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + int thread_pool_size; + + /* tree relocation relocated fields */ + struct list_head dead_reloc_roots; + struct btrfs_leaf_ref_tree reloc_ref_tree; + struct btrfs_leaf_ref_tree shared_ref_tree; + + struct kobject super_kobj; + struct completion kobj_unregister; + int do_barriers; + int closing; + int log_root_recovering; + atomic_t throttles; + atomic_t throttle_gen; + + u64 total_pinned; + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + struct list_head space_info; + spinlock_t delalloc_lock; + spinlock_t new_trans_lock; + u64 delalloc_bytes; + u64 last_alloc; + u64 last_data_alloc; + + spinlock_t ref_cache_lock; + u64 total_ref_cache_size; + + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + u64 data_alloc_profile; + u64 metadata_alloc_profile; + u64 system_alloc_profile; + + void *bdev_holder; +}; + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. 
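+ *
+ * (editor's note, added for clarity: there is one struct btrfs_root for
+ * each tree: the root tree, extent tree, chunk tree, device tree, csum
+ * tree, log tree and one per subvolume; they all point at the same
+ * struct btrfs_fs_info)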
+ */ +struct btrfs_dirty_root; +struct btrfs_root { + struct extent_buffer *node; + + /* the node lock is held while changing the node pointer */ + spinlock_t node_lock; + + struct extent_buffer *commit_root; + struct btrfs_leaf_ref_tree *ref_tree; + struct btrfs_leaf_ref_tree ref_tree_struct; + struct btrfs_dirty_root *dirty_root; + struct btrfs_root *log_root; + struct btrfs_root *reloc_root; + + struct btrfs_root_item root_item; + struct btrfs_key root_key; + struct btrfs_fs_info *fs_info; + struct extent_io_tree dirty_log_pages; + + struct kobject root_kobj; + struct completion kobj_unregister; + struct mutex objectid_mutex; + struct mutex log_mutex; + + u64 objectid; + u64 last_trans; + + /* data allocations are done in sectorsize units */ + u32 sectorsize; + + /* node allocations are done in nodesize units */ + u32 nodesize; + + /* leaf allocations are done in leafsize units */ + u32 leafsize; + + u32 stripesize; + + u32 type; + u64 highest_inode; + u64 last_inode_alloc; + int ref_cows; + int track_dirty; + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; + int defrag_running; + int defrag_level; + char *name; + int in_sysfs; + + /* the dirty list is only used by non-reference counted roots */ + struct list_head dirty_list; + + spinlock_t list_lock; + struct list_head dead_list; + struct list_head orphan_list; + + /* + * right now this just gets used so that a root has its own devid + * for stat. It may be used for more later + */ + struct super_block anon_super; +}; + +/* + + * inode items have the data typically returned from stat and store other + * info about object characteristics. There is one for every file and dir in + * the FS + */ +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_INODE_REF_KEY 12 +#define BTRFS_XATTR_ITEM_KEY 24 +#define BTRFS_ORPHAN_ITEM_KEY 48 +/* reserve 2-15 close to the inode for later flexibility */ + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. + */ +#define BTRFS_DIR_LOG_ITEM_KEY 60 +#define BTRFS_DIR_LOG_INDEX_KEY 72 +#define BTRFS_DIR_ITEM_KEY 84 +#define BTRFS_DIR_INDEX_KEY 96 +/* + * extent data is for file data + */ +#define BTRFS_EXTENT_DATA_KEY 108 + +/* + * extent csums are stored in a separate tree and hold csums for + * an entire extent on disk. + */ +#define BTRFS_EXTENT_CSUM_KEY 128 + +/* + * root items point to tree roots. There are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 132 + +/* + * root backrefs tie subvols and snapshots to the directory entries that + * reference them + */ +#define BTRFS_ROOT_BACKREF_KEY 144 + +/* + * root refs make a fast index for listing all of the snapshots and + * subvolumes referenced by a given root. They point directly to the + * directory item in the root that references the subvol + */ +#define BTRFS_ROOT_REF_KEY 156 + +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 168 +#define BTRFS_EXTENT_REF_KEY 180 + +/* + * block groups give us hints into the extent allocation trees. Which + * blocks are free etc etc + */ +#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + +#define BTRFS_DEV_EXTENT_KEY 204 +#define BTRFS_DEV_ITEM_KEY 216 +#define BTRFS_CHUNK_ITEM_KEY 228 + +/* + * string items are for debugging. 
They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 253 + +#define BTRFS_MOUNT_NODATASUM (1 << 0) +#define BTRFS_MOUNT_NODATACOW (1 << 1) +#define BTRFS_MOUNT_NOBARRIER (1 << 2) +#define BTRFS_MOUNT_SSD (1 << 3) +#define BTRFS_MOUNT_DEGRADED (1 << 4) +#define BTRFS_MOUNT_COMPRESS (1 << 5) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ + BTRFS_MOUNT_##opt) +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1 << 0) +#define BTRFS_INODE_NODATACOW (1 << 1) +#define BTRFS_INODE_READONLY (1 << 2) +#define BTRFS_INODE_NOCOMPRESS (1 << 3) +#define BTRFS_INODE_PREALLOC (1 << 4) +#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ + ~BTRFS_INODE_##flag) +#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ + BTRFS_INODE_##flag) +#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ + BTRFS_INODE_##flag) +/* some macros to generate set/get funcs for the struct fields. This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple + * one for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +#define read_eb_member(eb, ptr, type, member, result) ( \ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) ( \ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#ifndef BTRFS_SETGET_FUNCS +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); +#endif + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + u##bits res = le##bits##_to_cpu(p->member); \ + kunmap_atomic(p, KM_USER0); \ + return res; \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + p->member = cpu_to_le##bits(val); \ + kunmap_atomic(p, KM_USER0); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(type *s) \ +{ \ + return le##bits##_to_cpu(s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + s->member = cpu_to_le##bits(val); \ +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, + start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, 
generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, + dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, + int nr) +{ + unsigned long offset = (unsigned long)c; + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 
btrfs_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + +static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(disk_block_group_flags, + struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); + +static inline struct btrfs_timespec * +btrfs_inode_atime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, atime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_mtime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, mtime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_ctime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, ctime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_otime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, otime); + return (struct btrfs_timespec *)ptr; +} + +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct 
btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); + +static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +{ + unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); + return (u8 *)((unsigned long)dev + ptr); +} + +/* struct btrfs_extent_ref */ +BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); +BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); +BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); + +BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, + num_refs, 32); + +/* struct btrfs_extent_item */ +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, + refs, 32); + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); + +static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, + int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +static inline u32 
btrfs_item_end(struct extent_buffer *eb, + struct btrfs_item *item) +{ + return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); +} + +static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); +} + +static inline void btrfs_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* + * struct btrfs_root_ref + */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); + +static inline void btrfs_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + + +static inline u8 btrfs_key_type(struct btrfs_key *key) +{ + return key->type; +} + +static inline 
void btrfs_set_key_type(struct btrfs_key *key, u8 val) +{ + key->type = val; +} + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); + +static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags | flag); + return (flags & flag) == flag; +} + +static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags & ~flag); + return (flags & flag) == flag; +} + +static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_super_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_super_block, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_csum(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, csum); + return (u8 *)ptr; +} + +static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb) +{ + return NULL; +} + +static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb) +{ + return NULL; +} + +static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb) +{ + return NULL; +} + +static inline int btrfs_is_leaf(struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); + +/* struct btrfs_super_block */ + +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); 
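+/*
+ * (editor's illustration, not part of the original patch) each
+ * BTRFS_SETGET_STACK_FUNCS() invocation in this file expands to a pair of
+ * trivial accessors that convert the named little-endian member of an
+ * in-memory copy of the structure to and from cpu byte order, e.g. the
+ * super_root line above generates roughly:
+ *
+ *	static inline u64 btrfs_super_root(struct btrfs_super_block *s)
+ *	{
+ *		return le64_to_cpu(s->root);
+ *	}
+ *	static inline void btrfs_set_super_root(struct btrfs_super_block *s,
+ *						u64 val)
+ *	{
+ *		s->root = cpu_to_le64(val);
+ *	}
+ */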
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, + log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, + log_root_transid, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, + leafsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); + +static inline int btrfs_super_csum_size(struct btrfs_super_block *s) +{ + int t = btrfs_super_csum_type(s); + BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); + return btrfs_csum_sizes[t]; +} + +static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) +{ + return offsetof(struct btrfs_leaf, items); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); + +static inline unsigned long +btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) +{ + unsigned long offset = (unsigned long)e; + offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); + return offset; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* this returns the number of 
file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + struct btrfs_file_extent_item *e) +{ + return btrfs_file_extent_ram_bytes(eb, e); +} + +/* + * this returns the number of bytes used by the item on disk, minus the + * size of any extent headers. If a file is compressed on disk, this is + * the compressed size + */ +static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, + struct btrfs_item *e) +{ + unsigned long offset; + offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); + return btrfs_item_size(eb, e) - offset; +} + +static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline int btrfs_set_root_name(struct btrfs_root *root, + const char *name, int len) +{ + /* if we already have a name just free it */ + kfree(root->name); + + root->name = kmalloc(len+1, GFP_KERNEL); + if (!root->name) + return -ENOMEM; + + memcpy(root->name, name, len); + root->name[len] = '\0'; + + return 0; +} + +static inline u32 btrfs_level_size(struct btrfs_root *root, int level) +{ + if (level == 0) + return root->leafsize; + return root->nodesize; +} + +/* helper function to cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +static inline struct dentry *fdentry(struct file *file) +{ + return file->f_path.dentry; +} + +/* extent-tree.c */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs); +int btrfs_update_pinned_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int pin); +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 bytenr); +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr); +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner); +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize, u64 parent, + u64 root_objectid, + u64 ref_generation, + int level, + u64 hint, + u64 empty_size); +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int btrfs_alloc_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 min_bytes, + u64 root_objectid, u64 ref_generation, + u64 owner, u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, u64 data); +int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins); +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 
owner, struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *orig_buf, struct extent_buffer *buf, + u32 *nr_extents); +int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, u32 nr_extents); +int btrfs_update_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *orig_buf, + struct extent_buffer *buf, int start_slot, int nr); +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin); +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_io_tree *unpin); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid); +int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid); +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); +int btrfs_free_block_groups(struct btrfs_fs_info *info); +int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start); +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); +int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, u64 orig_start); +int btrfs_add_dead_reloc_root(struct btrfs_root *root); +int btrfs_cleanup_reloc_trees(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +/* ctree.c */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type); +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level); +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); +struct extent_buffer *btrfs_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans); +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans); +int 
btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, u64 prealloc_dest); +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, u32 data_size); +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end); +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset); +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow); +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress); +void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); +struct btrfs_path *btrfs_alloc_path(void); +void btrfs_free_path(struct btrfs_path *p); +void btrfs_init_path(struct btrfs_path *p); +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr); +int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 bytenr); +static inline int btrfs_del_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + return btrfs_del_items(trans, root, path, path->slots[0], 1); +} + +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, void *data, u32 data_size); +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr); +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, int nr); + +static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u32 data_size) +{ + return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); +} + +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); +/* root-item.c */ +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id, + u64 dirid, u64 sequence, + const char *name, int name_len); +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct 
btrfs_root_item + *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + btrfs_root_item *item, struct btrfs_key *key); +int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid); +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, + struct btrfs_root *latest_root); +/* dir-item.c */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, u64 dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + u16 name_len, const void *data, u16 data_len, + u64 dir); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); + +/* orphan.c */ +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + +/* inode-map.c */ +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *fs_root, + u64 dirid, u64 *objectid); +int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid); + +/* inode-item.c */ +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +/* file-item.c */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst); +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct 
btrfs_ordered_sum *sums); +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig); +int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, + u64 start, unsigned long len); +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow); +int btrfs_csum_truncate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u64 isize); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, + u64 end, struct list_head *list); +/* inode.c */ + +/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ +#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) +#define ClearPageChecked ClearPageFsMisc +#define SetPageChecked SetPageFsMisc +#define PageChecked PageFsMisc +#endif + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, + u32 min_type); + +int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, struct dentry *dentry, + u64 new_dirid, u64 alloc_hint); +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); + +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index); +int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, + int for_del); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_readpage(struct file *file, struct page *page); +void btrfs_delete_inode(struct inode *inode); +void btrfs_put_inode(struct inode *inode); +void btrfs_read_locked_inode(struct inode *inode); +int btrfs_write_inode(struct inode *inode, int wait); +void btrfs_dirty_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +int btrfs_init_cachep(void); +void btrfs_destroy_cachep(void); +long btrfs_ioctl_trans_end(struct file *file); +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + struct btrfs_root *root, int wait); +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root); +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *is_new); +int btrfs_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to); +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t page_offset, u64 start, u64 end, + int create); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle 
*trans, struct inode *inode); +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); +void btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct inode *inode, loff_t size); + +/* ioctl.c */ +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* file.c */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); +int btrfs_check_file(struct btrfs_root *root, struct inode *inode); +extern struct file_operations btrfs_file_operations; +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 inline_limit, u64 *hint_block); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end); +int btrfs_release_file(struct inode *inode, struct file *file); + +/* tree-defrag.c */ +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only); + +/* sysfs.c */ +int btrfs_init_sysfs(void); +void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs); +int btrfs_sysfs_add_root(struct btrfs_root *root); +void btrfs_sysfs_del_root(struct btrfs_root *root); +void btrfs_sysfs_del_super(struct btrfs_fs_info *root); + +/* xattr.c */ +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* super.c */ +u64 btrfs_parse_size(char *str); +int btrfs_parse_options(struct btrfs_root *root, char *options); +int btrfs_sync_fs(struct super_block *sb, int wait); + +/* acl.c */ +int btrfs_check_acl(struct inode *inode, int mask); +int btrfs_init_acl(struct inode *inode, struct inode *dir); +int btrfs_acl_chmod(struct inode *inode); + +/* free-space-cache.c */ +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c new file mode 100644 index 00000000000..926a0b287a7 --- /dev/null +++ b/fs/btrfs/dir-item.c @@ -0,0 +1,386 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" + +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */ +static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, + u32 data_size, + const char *name, + int name_len) +{ + int ret; + char *ptr; + struct btrfs_item *item; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (ret == -EEXIST) { + struct btrfs_dir_item *di; + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return ERR_PTR(-EEXIST); + ret = btrfs_extend_item(trans, root, path, data_size); + WARN_ON(ret > 0); + } + if (ret < 0) + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + BUG_ON(data_size > btrfs_item_size(leaf, item)); + ptr += btrfs_item_size(leaf, item) - data_size; + return (struct btrfs_dir_item *)ptr; +} + +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + u16 name_len, const void *data, u16 data_len, + u64 dir) +{ + int ret = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + u32 data_size; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + if (name_len + data_len + sizeof(struct btrfs_dir_item) > + BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item)) + return -ENOSPC; + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + /* + * FIXME: at some point we should handle xattr's that are larger than + * what we can fit in our leaf. We set location to NULL b/c we arent + * pointing at anything else, that will change if we store the xattr + * data in a separate inode. + */ + BUG_ON(IS_ERR(dir_item)); + memset(&location, 0, sizeof(location)); + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, &location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + btrfs_set_dir_data_len(leaf, dir_item, data_len); + name_ptr = (unsigned long)(dir_item + 1); + data_ptr = (unsigned long)((char *)name_ptr + name_len); + + write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_free_path(path); + return ret; +} + +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 
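 * The first index is a BTRFS_DIR_ITEM_KEY item whose key offset is the
 * hash of the name, used for lookups by name.  The second is a
 * BTRFS_DIR_INDEX_KEY item whose key offset is the caller supplied
 * sequence number, presumably so entries can be returned in insertion
 * order for readdir.  The tree root is special cased below and only
 * gets the first index.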
'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). + */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, const char *name, int name_len, u64 dir, + struct btrfs_key *location, u8 type, u64 index) +{ + int ret = 0; + int ret2 = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + struct btrfs_disk_key disk_key; + u32 data_size; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + data_size = sizeof(*dir_item) + name_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + if (ret == -EEXIST) + goto second_insert; + goto out; + } + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + +second_insert: + /* FIXME, use some real flag for selecting the extra index */ + if (root == root->fs_info->tree_root) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); + + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = index; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret2 = PTR_ERR(dir_item); + goto out; + } + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + if (ret) + return ret; + if (ret2) + return ret2; + return 0; +} + +/* + * lookup a directory item based on name. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */ +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * lookup a directory item based on index. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */ +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. 
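 * Because insert_with_overflow() extends an existing item on a hash
 * collision, a single leaf item may hold several btrfs_dir_item structs
 * packed back to back, each one immediately followed by its name (and,
 * for xattrs, its value).  total_len below is the size of the whole
 * leaf item and this_len is the size of the entry currently examined.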
+ */ +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) +{ + struct btrfs_dir_item *dir_item; + unsigned long name_ptr; + u32 total_len; + u32 cur = 0; + u32 this_len; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + total_len = btrfs_item_size_nr(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + + btrfs_dir_data_len(leaf, dir_item); + name_ptr = (unsigned long)(dir_item + 1); + + if (btrfs_dir_name_len(leaf, dir_item) == name_len && + memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return dir_item; + + cur += this_len; + dir_item = (struct btrfs_dir_item *)((char *)dir_item + + this_len); + } + return NULL; +} + +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. + */ +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di) +{ + + struct extent_buffer *leaf; + u32 sub_item_len; + u32 item_len; + int ret = 0; + + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); + item_len = btrfs_item_size_nr(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { + /* MARKER */ + unsigned long ptr = (unsigned long)di; + unsigned long start; + + start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_len - (ptr + sub_item_len - start)); + ret = btrfs_truncate_item(trans, root, path, + item_len - sub_item_len, 1); + } + return 0; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c new file mode 100644 index 00000000000..81a313874ae --- /dev/null +++ b/fs/btrfs/disk-io.c @@ -0,0 +1,2343 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/version.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/scatterlist.h> +#include <linux/swap.h> +#include <linux/radix-tree.h> +#include <linux/writeback.h> +#include <linux/buffer_head.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include "compat.h" +#include "crc32c.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "print-tree.h" +#include "async-thread.h" +#include "locking.h" +#include "ref-cache.h" +#include "tree-log.h" + +static struct extent_io_ops btree_extent_io_ops; +static void end_workqueue_fn(struct btrfs_work *work); + +/* + * end_io_wq structs are used to do processing in task context when an IO is + * complete. 
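 * btrfs_bio_wq_end_io() saves the bio's original bi_private and
 * bi_end_io here and points the bio at end_workqueue_bio(), which
 * records the error and queues this struct on one of the endio worker
 * pools; end_workqueue_fn() later restores the saved callback and calls
 * bio_endio() from task context.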
This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */ +struct end_io_wq { + struct bio *bio; + bio_end_io_t *end_io; + void *private; + struct btrfs_fs_info *info; + int error; + int metadata; + struct list_head list; + struct btrfs_work work; +}; + +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. They checksum file and metadata bios + * just before they are sent down the IO stack. + */ +struct async_submit_bio { + struct inode *inode; + struct bio *bio; + struct list_head list; + extent_submit_bio_hook_t *submit_bio_start; + extent_submit_bio_hook_t *submit_bio_done; + int rw; + int mirror_num; + unsigned long bio_flags; + struct btrfs_work work; +}; + +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */ +static struct extent_map *btree_get_extent(struct inode *inode, + struct page *page, size_t page_offset, u64 start, u64 len, + int create) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); + goto out; + } + spin_unlock(&em_tree->lock); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + em->start = 0; + em->len = (u64)-1; + em->block_len = (u64)-1; + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; + u64 failed_len = em->len; + + free_extent_map(em); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + ret = 0; + } else { + em = lookup_extent_mapping(em_tree, failed_start, + failed_len); + ret = -EIO; + } + } else if (ret) { + free_extent_map(em); + em = NULL; + } + spin_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +out: + return em; +} + +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) +{ + return btrfs_crc32c(seed, data, len); +} + +void btrfs_csum_final(u32 crc, char *result) +{ + *(__le32 *)result = ~cpu_to_le32(crc); +} + +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. 
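 * The checksum covers everything past the first BTRFS_CSUM_SIZE bytes
 * of the block; the crc32c is seeded with ~0 and finished by
 * btrfs_csum_final(), which stores the inverted crc little-endian.
 * Returns 0 on success, 1 on a verify mismatch or a mapping/allocation
 * failure.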
+ */ +static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, + int verify) +{ + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + char *result = NULL; + unsigned long len; + unsigned long cur_len; + unsigned long offset = BTRFS_CSUM_SIZE; + char *map_token = NULL; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + int err; + u32 crc = ~(u32)0; + unsigned long inline_result; + + len = buf->len - offset; + while (len > 0) { + err = map_private_extent_buffer(buf, offset, 32, + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + if (err) + return 1; + cur_len = min(len, map_len - (offset - map_start)); + crc = btrfs_csum_data(root, kaddr + offset - map_start, + crc, cur_len); + len -= cur_len; + offset += cur_len; + unmap_extent_buffer(buf, map_token, KM_USER0); + } + if (csum_size > sizeof(inline_result)) { + result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + if (!result) + return 1; + } else { + result = (char *)&inline_result; + } + + btrfs_csum_final(crc, result); + + if (verify) { + if (memcmp_extent_buffer(buf, result, 0, csum_size)) { + u32 val; + u32 found = 0; + memcpy(&found, result, csum_size); + + read_extent_buffer(buf, &val, 0, csum_size); + printk(KERN_INFO "btrfs: %s checksum verify failed " + "on %llu wanted %X found %X level %d\n", + root->fs_info->sb->s_id, + buf->start, val, found, btrfs_header_level(buf)); + if (result != (char *)&inline_result) + kfree(result); + return 1; + } + } else { + write_extent_buffer(buf, result, 0, csum_size); + } + if (result != (char *)&inline_result) + kfree(result); + return 0; +} + +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. + */ +static int verify_parent_transid(struct extent_io_tree *io_tree, + struct extent_buffer *eb, u64 parent_transid) +{ + int ret; + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 0; + + lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); + if (extent_buffer_uptodate(io_tree, eb) && + btrfs_header_generation(eb) == parent_transid) { + ret = 0; + goto out; + } + printk("parent transid verify failed on %llu wanted %llu found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + ret = 1; + clear_extent_buffer_uptodate(io_tree, eb); +out: + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + return ret; +} + +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. + */ +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start, u64 parent_transid) +{ + struct extent_io_tree *io_tree; + int ret; + int num_copies = 0; + int mirror_num = 0; + + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, 1, + btree_get_extent, mirror_num); + if (!ret && + !verify_parent_transid(io_tree, eb, parent_transid)) + return ret; + + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + eb->start, eb->len); + if (num_copies == 1) + return ret; + + mirror_num++; + if (mirror_num > num_copies) + return ret; + } + return -EIO; +} + +/* + * checksum a dirty tree block before IO. 
This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block + */ + +static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +{ + struct extent_io_tree *tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, + btrfs_header_generation(eb)); + BUG_ON(ret); + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + WARN_ON(1); + goto err; + } + if (eb->first_page != page) { + WARN_ON(1); + goto err; + } + if (!PageUptodate(page)) { + WARN_ON(1); + goto err; + } + found_level = btrfs_header_level(eb); + + csum_tree_block(root, eb, 0); +err: + free_extent_buffer(eb); +out: + return 0; +} + +static int check_tree_block_fsid(struct btrfs_root *root, + struct extent_buffer *eb) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + u8 fsid[BTRFS_UUID_SIZE]; + int ret = 1; + + read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), + BTRFS_FSID_SIZE); + while (fs_devices) { + if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + ret = 0; + break; + } + fs_devices = fs_devices->seed; + } + return ret; +} + +static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + if (eb->first_page != page) { + printk(KERN_INFO "btrfs bad first page %lu %lu\n", + eb->first_page->index, page->index); + WARN_ON(1); + ret = -EIO; + goto err; + } + if (check_tree_block_fsid(root, eb)) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + found_level = btrfs_header_level(eb); + + ret = csum_tree_block(root, eb, 1); + if (ret) + ret = -EIO; + + end = min_t(u64, eb->len, PAGE_CACHE_SIZE); + end = eb->start + end - 1; +err: + free_extent_buffer(eb); +out: + return ret; +} + +static void end_workqueue_bio(struct bio *bio, int err) +{ + struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_fs_info *fs_info; + + fs_info = end_io_wq->info; + end_io_wq->error = err; + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + + if (bio->bi_rw & (1 << BIO_RW)) { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_write_workers, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_write_workers, + &end_io_wq->work); + } else { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + else + 
btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + } +} + +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata) +{ + struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + if (!end_io_wq) + return -ENOMEM; + + end_io_wq->private = bio->bi_private; + end_io_wq->end_io = bio->bi_end_io; + end_io_wq->info = info; + end_io_wq->error = 0; + end_io_wq->bio = bio; + end_io_wq->metadata = metadata; + + bio->bi_private = end_io_wq; + bio->bi_end_io = end_workqueue_bio; + return 0; +} + +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) +{ + unsigned long limit = min_t(unsigned long, + info->workers.max_workers, + info->fs_devices->open_devices); + return 256 * limit; +} + +int btrfs_congested_async(struct btrfs_fs_info *info, int iodone) +{ + return atomic_read(&info->nr_async_bios) > + btrfs_async_submit_limit(info); +} + +static void run_one_async_start(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags); +} + +static void run_one_async_done(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + int limit; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + + atomic_dec(&fs_info->nr_async_submits); + + if (atomic_read(&fs_info->nr_async_submits) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + async->submit_bio_done(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + kfree(async); +} + +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) +{ + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->inode = inode; + async->rw = rw; + async->bio = bio; + async->mirror_num = mirror_num; + async->submit_bio_start = submit_bio_start; + async->submit_bio_done = submit_bio_done; + + async->work.func = run_one_async_start; + async->work.ordered_func = run_one_async_done; + async->work.ordered_free = run_one_async_free; + + async->work.flags = 0; + async->bio_flags = bio_flags; + + atomic_inc(&fs_info->nr_async_submits); + btrfs_queue_worker(&fs_info->workers, &async->work); +#if 0 + int limit = btrfs_async_submit_limit(fs_info); + if (atomic_read(&fs_info->nr_async_submits) > limit) { + wait_event_timeout(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) < limit), + HZ/10); + + wait_event_timeout(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_bios) < limit), + HZ/10); + } +#endif + while (atomic_read(&fs_info->async_submit_draining) && + atomic_read(&fs_info->nr_async_submits)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0)); + } + + return 0; +} + +static int btree_csum_one_bio(struct bio *bio) +{ + struct bio_vec *bvec 
= bio->bi_io_vec; + int bio_index = 0; + struct btrfs_root *root; + + WARN_ON(bio->bi_vcnt <= 0); + while (bio_index < bio->bi_vcnt) { + root = BTRFS_I(bvec->bv_page->mapping->host)->root; + csum_dirty_buffer(root, bvec->bv_page); + bio_index++; + bvec++; + } + return 0; +} + +static int __btree_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + btree_csum_one_bio(bio); + return 0; +} + +static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +} + +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + int ret; + + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + BUG_ON(ret); + + if (!(rw & (1 << BIO_RW))) { + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + __btree_submit_bio_start, + __btree_submit_bio_done); +} + +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + return extent_write_full_page(tree, page, btree_get_extent, wbc); +} + +static int btree_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + if (wbc->sync_mode == WB_SYNC_NONE) { + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 32 * 1024 * 1024; + + if (wbc->for_kupdate) + return 0; + + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty < thresh) + return 0; + } + return extent_writepages(tree, mapping, btree_get_extent, wbc); +} + +static int btree_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btree_get_extent); +} + +static int btree_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + if (PageWriteback(page) || PageDirty(page)) + return 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + + ret = try_release_extent_state(map, tree, page, gfp_flags); + if (!ret) + return 0; + + ret = try_release_extent_buffer(tree, page); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + + return ret; +} + +static void btree_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + extent_invalidatepage(tree, page, offset); + btree_releasepage(page, GFP_NOFS); + if (PagePrivate(page)) { + 
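		/*
		 * btree_releasepage() above normally detaches the extent
		 * buffer state; if page->private is still set here, warn
		 * and drop the leftover private reference ourselves.
		 */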
printk(KERN_WARNING "btrfs warning page private not zero " + "on page %llu\n", (unsigned long long)page_offset(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +#if 0 +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct buffer_head *bh; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct buffer_head *head; + if (!page_has_buffers(page)) { + create_empty_buffers(page, root->fs_info->sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + head = page_buffers(page); + bh = head; + do { + if (buffer_dirty(bh)) + csum_tree_block(root, bh, 0); + bh = bh->b_this_page; + } while (bh != head); + return block_write_full_page(page, btree_get_block, wbc); +} +#endif + +static struct address_space_operations btree_aops = { + .readpage = btree_readpage, + .writepage = btree_writepage, + .writepages = btree_writepages, + .releasepage = btree_releasepage, + .invalidatepage = btree_invalidatepage, + .sync_page = block_sync_page, +}; + +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + int ret = 0; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, + buf, 0, 0, btree_get_extent, 0); + free_extent_buffer(buf); + return ret; +} + +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, GFP_NOFS); + return eb; +} + +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + + eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, NULL, GFP_NOFS); + return eb; +} + + +int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1, WB_SYNC_ALL); +} + +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); +} + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree; + int ret; + + io_tree = &BTRFS_I(btree_inode)->io_tree; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return NULL; + + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + + if (ret == 0) + buf->flags |= EXTENT_UPTODATE; + else + WARN_ON(1); + return buf; + +} + +int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + if (btrfs_header_generation(buf) == + root->fs_info->running_transaction->transid) { + WARN_ON(!btrfs_tree_locked(buf)); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + } + return 0; +} + +static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 
objectid) +{ + root->node = NULL; + root->commit_root = NULL; + root->ref_tree = NULL; + root->sectorsize = sectorsize; + root->nodesize = nodesize; + root->leafsize = leafsize; + root->stripesize = stripesize; + root->ref_cows = 0; + root->track_dirty = 0; + + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; + root->highest_inode = 0; + root->last_inode_alloc = 0; + root->name = NULL; + root->in_sysfs = 0; + + INIT_LIST_HEAD(&root->dirty_list); + INIT_LIST_HEAD(&root->orphan_list); + INIT_LIST_HEAD(&root->dead_list); + spin_lock_init(&root->node_lock); + spin_lock_init(&root->list_lock); + mutex_init(&root->objectid_mutex); + mutex_init(&root->log_mutex); + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping, GFP_NOFS); + + btrfs_leaf_ref_tree_init(&root->ref_tree_struct); + root->ref_tree = &root->ref_tree_struct; + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + memset(&root->root_kobj, 0, sizeof(root->root_kobj)); + root->defrag_trans_start = fs_info->generation; + init_completion(&root->kobj_unregister); + root->defrag_running = 0; + root->defrag_level = 0; + root->root_key.objectid = objectid; + root->anon_super.s_root = NULL; + root->anon_super.s_dev = 0; + INIT_LIST_HEAD(&root->anon_super.s_list); + INIT_LIST_HEAD(&root->anon_super.s_instances); + init_rwsem(&root->anon_super.s_umount); + + return 0; +} + +static int find_and_setup_root(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, + struct btrfs_root *root) +{ + int ret; + u32 blocksize; + u64 generation; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + BUG_ON(!root->node); + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct extent_buffer *eb; + struct btrfs_root *log_root_tree = fs_info->log_root_tree; + u64 start = 0; + u64 end = 0; + int ret; + + if (!log_root_tree) + return 0; + + while (1) { + ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY); + if (ret) + break; + + clear_extent_dirty(&log_root_tree->dirty_log_pages, + start, end, GFP_NOFS); + } + eb = fs_info->log_root_tree->node; + + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) != 0); + + ret = btrfs_free_reserved_extent(fs_info->tree_root, + eb->start, eb->len); + BUG_ON(ret); + + free_extent_buffer(eb); + kfree(fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + return 0; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct btrfs_root *tree_root = fs_info->tree_root; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return -ENOMEM; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + 
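	/*
	 * the key offset repeats the tree-log objectid, so the log root's
	 * key ends up as (TREE_LOG, ROOT_ITEM, TREE_LOG)
	 */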
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + root->ref_cows = 0; + + root->node = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, BTRFS_TREE_LOG_OBJECTID, + trans->transid, 0, 0, 0); + + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + btrfs_set_header_bytenr(root->node, root->node->start); + btrfs_set_header_generation(root->node, trans->transid); + btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); + + write_extent_buffer(root->node, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(root->node), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(root->node); + btrfs_tree_unlock(root->node); + fs_info->log_root_tree = root; + return 0; +} + +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; + u64 highest_inode; + u64 generation; + u32 blocksize; + int ret = 0; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + if (location->offset == (u64)-1) { + ret = find_and_setup_root(tree_root, fs_info, + location->objectid, root); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + goto insert; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, location->objectid); + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); + if (ret != 0) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + l = path->nodes[0]; + read_extent_buffer(l, &root->root_item, + btrfs_item_ptr_offset(l, path->slots[0]), + sizeof(root->root_item)); + memcpy(&root->root_key, location, sizeof(*location)); + ret = 0; +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + BUG_ON(!root->node); +insert: + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { + root->ref_cows = 1; + ret = btrfs_find_highest_inode(root, &highest_inode); + if (ret == 0) { + root->highest_inode = highest_inode; + root->last_inode_alloc = highest_inode; + } + } + return root; +} + +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_root *root; + + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_objectid); + return root; +} + +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + struct btrfs_root *root; + int ret; + + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); + if (root) + return root; 
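	/*
	 * not in the radix tree yet: read the root from the tree of tree
	 * roots, give it an anonymous super and insert it into
	 * fs_roots_radix so the lookup above hits next time.  For
	 * writable mounts, dead roots and orphans are cleaned up here too.
	 */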
+ + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + + set_anon_super(&root->anon_super, NULL); + + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret) { + free_extent_buffer(root->node); + kfree(root); + return ERR_PTR(ret); + } + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid, root); + BUG_ON(ret); + btrfs_orphan_cleanup(root); + } + return root; +} + +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen) +{ + struct btrfs_root *root; + int ret; + + root = btrfs_read_fs_root_no_name(fs_info, location); + if (!root) + return NULL; + + if (root->in_sysfs) + return root; + + ret = btrfs_set_root_name(root, name, namelen); + if (ret) { + free_extent_buffer(root->node); + kfree(root); + return ERR_PTR(ret); + } +#if 0 + ret = btrfs_sysfs_add_root(root); + if (ret) { + free_extent_buffer(root->node); + kfree(root->name); + kfree(root); + return ERR_PTR(ret); + } +#endif + root->in_sysfs = 1; + return root; +} + +static int btrfs_congested_fn(void *congested_data, int bdi_bits) +{ + struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; + int ret = 0; + struct list_head *cur; + struct btrfs_device *device; + struct backing_dev_info *bdi; +#if 0 + if ((bdi_bits & (1 << BDI_write_congested)) && + btrfs_congested_async(info, 0)) + return 1; +#endif + list_for_each(cur, &info->fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi && bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + return ret; +} + +/* + * this unplugs every device on the box, and it is only used when page + * is null + */ +static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct list_head *cur; + struct btrfs_device *device; + struct btrfs_fs_info *info; + + info = (struct btrfs_fs_info *)bdi->unplug_io_data; + list_for_each(cur, &info->fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; + + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, page); + } +} + +static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct inode *inode; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct address_space *mapping; + u64 offset; + + /* the generic O_DIRECT read code does this */ + if (1 || !page) { + __unplug_io_fn(bdi, page); + return; + } + + /* + * page->mapping may change at any time. 
Get a consistent copy + * and use that for everything below + */ + smp_mb(); + mapping = page->mapping; + if (!mapping) + return; + + inode = mapping->host; + + /* + * don't do the expensive searching for a small number of + * devices + */ + if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) { + __unplug_io_fn(bdi, page); + return; + } + + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + if (!em) { + __unplug_io_fn(bdi, page); + return; + } + + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + __unplug_io_fn(bdi, page); + return; + } + offset = offset - em->start; + btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, + em->block_start + offset, page); + free_extent_map(em); +} + +static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) +{ + bdi_init(bdi); + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->state = 0; + bdi->capabilities = default_backing_dev_info.capabilities; + bdi->unplug_io_fn = btrfs_unplug_io_fn; + bdi->unplug_io_data = info; + bdi->congested_fn = btrfs_congested_fn; + bdi->congested_data = info; + return 0; +} + +static int bio_ready_for_csum(struct bio *bio) +{ + u64 length = 0; + u64 buf_len = 0; + u64 start = 0; + struct page *page; + struct extent_io_tree *io_tree = NULL; + struct btrfs_fs_info *info = NULL; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page->private == EXTENT_PAGE_PRIVATE) { + length += bvec->bv_len; + continue; + } + if (!page->private) { + length += bvec->bv_len; + continue; + } + length = bvec->bv_len; + buf_len = page->private >> 2; + start = page_offset(page) + bvec->bv_offset; + io_tree = &BTRFS_I(page->mapping->host)->io_tree; + info = BTRFS_I(page->mapping->host)->root->fs_info; + } + /* are we fully contained in this bio? */ + if (buf_len <= length) + return 1; + + ret = extent_range_uptodate(io_tree, start + length, + start + buf_len - 1); + if (ret == 1) + return ret; + return ret; +} + +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) +{ + struct bio *bio; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; + int error; + + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; + + /* metadata bio reads are special because the whole tree block must + * be checksummed at once. This makes sure the entire block is in + * ram and up to date before trying to verify things. 
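 * If bio_ready_for_csum() finds that only part of the block has
 * completed so far, the work item is just requeued on the
 * endio_meta_workers pool and retried later.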
For + * blocksize <= pagesize, it is basically a noop + */ + if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && + !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); + bio_endio(bio, error); +} + +static int cleaner_kthread(void *arg) +{ + struct btrfs_root *root = arg; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + smp_mb(); + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +static int transaction_kthread(void *arg) +{ + struct btrfs_root *root = arg; + struct btrfs_trans_handle *trans; + struct btrfs_transaction *cur; + unsigned long now; + unsigned long delay; + int ret; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + delay = HZ * 30; + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) { + printk(KERN_INFO "btrfs: total reference cache " + "size %llu\n", + root->fs_info->total_ref_cache_size); + } + + mutex_lock(&root->fs_info->trans_mutex); + cur = root->fs_info->running_transaction; + if (!cur) { + mutex_unlock(&root->fs_info->trans_mutex); + goto sleep; + } + + now = get_seconds(); + if (now < cur->start_time || now - cur->start_time < 30) { + mutex_unlock(&root->fs_info->trans_mutex); + delay = HZ * 5; + goto sleep; + } + mutex_unlock(&root->fs_info->trans_mutex); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); +sleep: + wake_up_process(root->fs_info->cleaner_kthread); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 leafsize; + u32 blocksize; + u32 stripesize; + u64 generation; + u64 features; + struct btrfs_key location; + struct buffer_head *bh; + struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), + GFP_NOFS); + struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *log_tree_root; + + int ret; + int err = -EINVAL; + + struct btrfs_super_block *disk_super; + + if (!extent_root || !tree_root || !fs_info || + !chunk_root || !dev_root || !csum_root) { + err = -ENOMEM; + goto fail; + } + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + 
INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + spin_lock_init(&fs_info->hash_lock); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; + fs_info->extent_root = extent_root; + fs_info->csum_root = csum_root; + fs_info->chunk_root = chunk_root; + fs_info->dev_root = dev_root; + fs_info->fs_devices = fs_devices; + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + INIT_LIST_HEAD(&fs_info->space_info); + btrfs_mapping_init(&fs_info->mapping_tree); + atomic_set(&fs_info->nr_async_submits, 0); + atomic_set(&fs_info->async_delalloc_pages, 0); + atomic_set(&fs_info->async_submit_draining, 0); + atomic_set(&fs_info->nr_async_bios, 0); + atomic_set(&fs_info->throttles, 0); + atomic_set(&fs_info->throttle_gen, 0); + fs_info->sb = sb; + fs_info->max_extent = (u64)-1; + fs_info->max_inline = 8192 * 1024; + setup_bdi(fs_info, &fs_info->bdi); + fs_info->btree_inode = new_inode(sb); + fs_info->btree_inode->i_ino = 1; + fs_info->btree_inode->i_nlink = 1; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); + + INIT_LIST_HEAD(&fs_info->ordered_extents); + spin_lock_init(&fs_info->ordered_extent_lock); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping, + GFP_NOFS); + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, + GFP_NOFS); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree.rb_node = NULL; + + extent_io_tree_init(&fs_info->pinned_extents, + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->pending_del, + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->extent_ins, + fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->do_barriers = 1; + + INIT_LIST_HEAD(&fs_info->dead_reloc_roots); + btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); + btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + insert_inode_hash(fs_info->btree_inode); + + mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->tree_log_mutex); + mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->extent_ins_mutex); + mutex_init(&fs_info->pinned_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); + mutex_init(&fs_info->tree_reloc_mutex); + init_waitqueue_head(&fs_info->transaction_throttle); + init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + init_waitqueue_head(&fs_info->tree_log_wait); + atomic_set(&fs_info->tree_log_commit, 0); + atomic_set(&fs_info->tree_log_writers, 0); + fs_info->tree_log_transid = 0; + + __setup_root(4096, 4096, 4096, 4096, tree_root, + fs_info, 
BTRFS_ROOT_TREE_OBJECTID); + + + bh = btrfs_read_dev_super(fs_devices->latest_bdev); + if (!bh) + goto fail_iput; + + memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); + memcpy(&fs_info->super_for_commit, &fs_info->super_copy, + sizeof(fs_info->super_for_commit)); + brelse(bh); + + memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + + disk_super = &fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + goto fail_iput; + + ret = btrfs_parse_options(tree_root, options); + if (ret) { + err = ret; + goto fail_iput; + } + + features = btrfs_super_incompat_flags(disk_super) & + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "BTRFS: couldn't mount because of " + "unsupported optional features (%Lx).\n", + features); + err = -EINVAL; + goto fail_iput; + } + + features = btrfs_super_compat_ro_flags(disk_super) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " + "unsupported option features (%Lx).\n", + features); + err = -EINVAL; + goto fail_iput; + } + + /* + * we need to start all the end_io workers up front because the + * queue work function gets called at interrupt time, and so it + * cannot dynamically grow. + */ + btrfs_init_workers(&fs_info->workers, "worker", + fs_info->thread_pool_size); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", + fs_info->thread_pool_size); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, + fs_info->thread_pool_size)); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers.idle_thresh = 64; + + fs_info->workers.idle_thresh = 16; + fs_info->workers.ordered = 1; + + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->endio_workers, "endio", + fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", + fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_meta_write_workers, + "endio-meta-write", fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", + fs_info->thread_pool_size); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_write_workers.idle_thresh = 64; + fs_info->endio_meta_write_workers.idle_thresh = 64; + + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); + btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_meta_workers, + fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_meta_write_workers, + fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_write_workers, + fs_info->thread_pool_size); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, + 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + tree_root->nodesize = nodesize; + tree_root->leafsize = 
leafsize; + tree_root->sectorsize = sectorsize; + tree_root->stripesize = stripesize; + + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); + + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) { + printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read the system " + "array on %s\n", sb->s_id); + goto fail_sys_array; + } + + blocksize = btrfs_level_size(tree_root, + btrfs_super_chunk_root_level(disk_super)); + generation = btrfs_super_chunk_root_generation(disk_super); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + + chunk_root->node = read_tree_block(chunk_root, + btrfs_super_chunk_root(disk_super), + blocksize, generation); + BUG_ON(!chunk_root->node); + + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + sb->s_id); + goto fail_chunk_root; + } + + btrfs_close_extra_devices(fs_devices); + + blocksize = btrfs_level_size(tree_root, + btrfs_super_root_level(disk_super)); + generation = btrfs_super_generation(disk_super); + + tree_root->node = read_tree_block(tree_root, + btrfs_super_root(disk_super), + blocksize, generation); + if (!tree_root->node) + goto fail_chunk_root; + + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_EXTENT_TREE_OBJECTID, extent_root); + if (ret) + goto fail_tree_root; + extent_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_DEV_TREE_OBJECTID, dev_root); + dev_root->track_dirty = 1; + + if (ret) + goto fail_extent_root; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_CSUM_TREE_OBJECTID, csum_root); + if (ret) + goto fail_extent_root; + + csum_root->track_dirty = 1; + + btrfs_read_block_groups(extent_root); + + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + "btrfs-cleaner"); + if (!fs_info->cleaner_kthread) + goto fail_csum_root; + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); + if (!fs_info->transaction_kthread) + goto fail_cleaner; + + if (btrfs_super_log_root(disk_super) != 0) { + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "Btrfs log replay required " + "on RO media\n"); + err = -EIO; + goto fail_trans_kthread; + } + blocksize = + btrfs_level_size(tree_root, + btrfs_super_log_root_level(disk_super)); + + log_tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + blocksize, + generation + 1); + ret = btrfs_recover_log_trees(log_tree_root); + BUG_ON(ret); + + if (sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + 
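+ /*
+ * log replay has modified the trees; a read-only mount will never
+ * commit a transaction later, so the commit above writes the
+ * replayed state out now.
+ */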
BUG_ON(ret); + } + } + + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_reloc_trees(tree_root); + BUG_ON(ret); + } + + location.objectid = BTRFS_FS_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + if (!fs_info->fs_root) + goto fail_trans_kthread; + return tree_root; + +fail_trans_kthread: + kthread_stop(fs_info->transaction_kthread); +fail_cleaner: + kthread_stop(fs_info->cleaner_kthread); + + /* + * make sure we're done with the btree inode before we stop our + * kthreads + */ + filemap_write_and_wait(fs_info->btree_inode->i_mapping); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + +fail_csum_root: + free_extent_buffer(csum_root->node); +fail_extent_root: + free_extent_buffer(extent_root->node); +fail_tree_root: + free_extent_buffer(tree_root->node); +fail_chunk_root: + free_extent_buffer(chunk_root->node); +fail_sys_array: + free_extent_buffer(dev_root->node); +fail_sb_buffer: + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); +fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); +fail: + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + kfree(extent_root); + kfree(tree_root); + bdi_destroy(&fs_info->bdi); + kfree(fs_info); + kfree(chunk_root); + kfree(dev_root); + kfree(csum_root); + return ERR_PTR(err); +} + +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + /* note, we dont' set_buffer_write_io_error because we have + * our own ways of dealing with the IO errors + */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +{ + struct buffer_head *bh; + struct buffer_head *latest = NULL; + struct btrfs_super_block *super; + int i; + u64 transid = 0; + u64 bytenr; + + /* we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. 
+ * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + for (i = 0; i < 1; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) + break; + bh = __bread(bdev, bytenr / 4096, 4096); + if (!bh) + continue; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + strncmp((char *)(&super->magic), BTRFS_MAGIC, + sizeof(super->magic))) { + brelse(bh); + continue; + } + + if (!latest || btrfs_super_generation(super) > transid) { + brelse(latest); + latest = bh; + transid = btrfs_super_generation(super); + } else { + brelse(bh); + } + } + return latest; +} + +static int write_dev_supers(struct btrfs_device *device, + struct btrfs_super_block *sb, + int do_barriers, int wait, int max_mirrors) +{ + struct buffer_head *bh; + int i; + int ret; + int errors = 0; + u32 crc; + u64 bytenr; + int last_barrier = 0; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + /* make sure only the last submit_bh does a barrier */ + if (do_barriers) { + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->total_bytes) + break; + last_barrier = i; + } + } + + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + break; + + if (wait) { + bh = __find_get_block(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + BUG_ON(!bh); + brelse(bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) { + brelse(bh); + continue; + } + } else { + btrfs_set_super_bytenr(sb, bytenr); + + crc = ~(u32)0; + crc = btrfs_csum_data(NULL, (char *)sb + + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - + BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + bh = __getblk(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + + set_buffer_uptodate(bh); + get_bh(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + } + + if (i == last_barrier && do_barriers && device->barriers) { + ret = submit_bh(WRITE_BARRIER, bh); + if (ret == -EOPNOTSUPP) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + set_buffer_uptodate(bh); + device->barriers = 0; + get_bh(bh); + lock_buffer(bh); + ret = submit_bh(WRITE, bh); + } + } else { + ret = submit_bh(WRITE, bh); + } + + if (!ret && wait) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + errors++; + } else if (ret) { + errors++; + } + if (wait) + brelse(bh); + } + return errors < i ? 
0 : -1; +} + +int write_all_supers(struct btrfs_root *root, int max_mirrors) +{ + struct list_head *cur; + struct list_head *head = &root->fs_info->fs_devices->devices; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u64 flags; + + max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + do_barriers = !btrfs_test_opt(root, NOBARRIER); + + sb = &root->fs_info->super_for_commit; + dev_item = &sb->dev_item; + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + btrfs_set_stack_device_generation(dev_item, 0); + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + + total_errors = 0; + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + return 0; +} + +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors) +{ + int ret; + + ret = write_all_supers(root, max_mirrors); + return ret; +} + +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } + if (root->node) + free_extent_buffer(root->node); + if (root->commit_root) + free_extent_buffer(root->commit_root); + kfree(root->name); + kfree(root); + return 0; +} + +static int del_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_free_fs_root(fs_info, gang[i]); + } + return 0; +} + +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i; + int ret; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) { + root_objectid = gang[i]->root_key.objectid; + ret = btrfs_find_dead_roots(fs_info->tree_root, + root_objectid, gang[i]); + BUG_ON(ret); + btrfs_orphan_cleanup(gang[i]); + 
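+ /*
+ * root_objectid now holds the objectid of the root just handled;
+ * after this batch it is incremented so the next gang lookup
+ * resumes past everything already processed.
+ */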
} + root_objectid++; + } + return 0; +} + +int btrfs_commit_super(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + /* run commit again to drop the original snapshot */ + trans = btrfs_start_transaction(root, 1); + btrfs_commit_transaction(trans, root); + ret = btrfs_write_and_wait_transaction(NULL, root); + BUG_ON(ret); + + ret = write_ctree_super(NULL, root, 0); + return ret; +} + +int close_ctree(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + fs_info->closing = 1; + smp_mb(); + + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + if (fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", + fs_info->delalloc_bytes); + } + if (fs_info->total_ref_cache_size) { + printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", + (unsigned long long)fs_info->total_ref_cache_size); + } + + if (fs_info->extent_root->node) + free_extent_buffer(fs_info->extent_root->node); + + if (fs_info->tree_root->node) + free_extent_buffer(fs_info->tree_root->node); + + if (root->fs_info->chunk_root->node) + free_extent_buffer(root->fs_info->chunk_root->node); + + if (root->fs_info->dev_root->node) + free_extent_buffer(root->fs_info->dev_root->node); + + if (root->fs_info->csum_root->node) + free_extent_buffer(root->fs_info->csum_root->node); + + btrfs_free_block_groups(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); + +#if 0 + while (!list_empty(&fs_info->hashers)) { + struct btrfs_hasher *hasher; + hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, + hashers); + list_del(&hasher->hashers); + crypto_free_hash(&fs_info->hash_tfm); + kfree(hasher); + } +#endif + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + return 0; +} + +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) +{ + int ret; + struct inode *btree_inode = buf->first_page->mapping->host; + + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); + if (!ret) + return ret; + + ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, + parent_transid); + return !ret; +} + +int btrfs_set_buffer_uptodate(struct extent_buffer *buf) +{ + struct inode *btree_inode = buf->first_page->mapping->host; + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, + buf); +} + +void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + u64 transid = 
btrfs_header_generation(buf); + struct inode *btree_inode = root->fs_info->btree_inode; + + WARN_ON(!btrfs_tree_locked(buf)); + if (transid != root->fs_info->generation) { + printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + "found %llu running %llu\n", + (unsigned long long)buf->start, + (unsigned long long)transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); +} + +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + struct extent_io_tree *tree; + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 32 * 1024 * 1024; + tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + + if (current_is_pdflush() || current->flags & PF_MEMALLOC) + return; + + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; +} + +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + int ret; + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret == 0) + buf->flags |= EXTENT_UPTODATE; + return ret; +} + +int btree_lock_page_hook(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_buffer *eb; + unsigned long len; + u64 bytenr = page_offset(page); + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + + len = page->private >> 2; + eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); + if (!eb) + goto out; + + btrfs_tree_lock(eb); + spin_lock(&root->fs_info->hash_lock); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + spin_unlock(&root->fs_info->hash_lock); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); +out: + lock_page(page); + return 0; +} + +static struct extent_io_ops btree_extent_io_ops = { + .write_cache_pages_lock_hook = btree_lock_page_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, + .submit_bio_hook = btree_submit_bio_hook, + /* note we're sharing with inode.c for the merge bio hook */ + .merge_bio_hook = btrfs_merge_bio_hook, +}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h new file mode 100644 index 00000000000..c0ff404c31b --- /dev/null +++ b/fs/btrfs/disk-io.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __DISKIO__ +#define __DISKIO__ + +#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_SIZE 4096 + +#define BTRFS_SUPER_MIRROR_MAX 3 +#define BTRFS_SUPER_MIRROR_SHIFT 12 + +static inline u64 btrfs_sb_offset(int mirror) +{ + u64 start = 16 * 1024; + if (mirror) + return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); + return BTRFS_SUPER_INFO_OFFSET; +} + +struct btrfs_device; +struct btrfs_fs_devices; + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid); +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid); +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); +int close_ctree(struct btrfs_root *root); +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors); +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_commit_super(struct btrfs_root *root); +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid); +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen); +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location); +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); +int btrfs_insert_dev_radix(struct btrfs_root *root, + struct block_device *bdev, + u64 device_id, + u64 block_start, + u64 num_blocks); +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); +int btrfs_set_buffer_uptodate(struct extent_buffer *buf); +int wait_on_tree_block_writeback(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); +void btrfs_csum_final(u32 crc, char *result); +int btrfs_open_device(struct btrfs_device *dev); +int btrfs_verify_block_csum(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata); +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); + +int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); +int btrfs_write_tree_block(struct extent_buffer *buf); +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int 
btree_lock_page_hook(struct page *page); +#endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c new file mode 100644 index 00000000000..85315d2c90d --- /dev/null +++ b/fs/btrfs/export.c @@ -0,0 +1,203 @@ +#include <linux/fs.h> +#include <linux/types.h> +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "compat.h" + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_root_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type; + + if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || + (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + return 255; + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = BTRFS_I(inode)->location.objectid; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + u64 parent_root_id; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; + + spin_unlock(&dentry->d_lock); + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation) +{ + struct btrfs_root *root; + struct inode *inode; + struct btrfs_key key; + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + + root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); + if (IS_ERR(root)) + return ERR_CAST(root); + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) + return (void *)inode; + + if (generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return d_obtain_alias(inode); +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != 
FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int slot; + u64 objectid; + int ret; + + path = btrfs_alloc_path(); + + key.objectid = dir->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + /* Error */ + btrfs_free_path(path); + return ERR_PTR(ret); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + if (ret) { + /* btrfs_search_slot() returns the slot where we'd want to + insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. + The _real_ backref, telling us what the parent inode + _actually_ is, will be in the slot _before_ the one + that btrfs_search_slot() returns. */ + if (!slot) { + /* Unless there is _no_ key in the tree before... */ + btrfs_free_path(path); + return ERR_PTR(-EIO); + } + slot--; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_free_path(path); + + if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) + return ERR_PTR(-EINVAL); + + objectid = key.offset; + + /* If we are already at the root of a subvol, return the real root */ + if (objectid == dir->i_ino) + return dget(dir->i_sb->s_root); + + /* Build a new key for the inode item */ + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, +}; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h new file mode 100644 index 00000000000..074348a9584 --- /dev/null +++ b/fs/btrfs/export.h @@ -0,0 +1,19 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include <linux/exportfs.h> + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +#endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c new file mode 100644 index 00000000000..293da650873 --- /dev/null +++ b/fs/btrfs/extent-tree.c @@ -0,0 +1,5986 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/version.h> +#include "compat.h" +#include "hash.h" +#include "crc32c.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "ref-cache.h" +#include "compat.h" + +#define PENDING_EXTENT_INSERT 0 +#define PENDING_EXTENT_DELETE 1 +#define PENDING_BACKREF_UPDATE 2 + +struct pending_extent_op { + int type; + u64 bytenr; + u64 num_bytes; + u64 parent; + u64 orig_parent; + u64 generation; + u64 orig_generation; + int level; + struct list_head list; + int del; +}; + +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int is_data); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free); + +static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +{ + return (cache->flags & bits) == bits; +} + +/* + * this adds the block group to the fs_info rb tree for the block group + * cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache * +block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) + atomic_inc(&ret->count); + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +/* + * this is only called by cache_block_group, 
since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet + * since their free space will be released as soon as the transaction commits. + */ +static int add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) +{ + u64 extent_start, extent_end, size; + int ret; + + mutex_lock(&info->pinned_mutex); + while (start < end) { + ret = find_first_extent_bit(&info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY); + if (ret) + break; + + if (extent_start == start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); + } + mutex_unlock(&info->pinned_mutex); + + return 0; +} + +static int remove_sb_from_cache(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, 0, + &logical, &nr, &stripe_len); + BUG_ON(ret); + while (nr--) { + btrfs_remove_free_space(cache, logical[nr], + stripe_len); + } + kfree(logical); + } + return 0; +} + +static int cache_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_path *path; + int ret = 0; + struct btrfs_key key; + struct extent_buffer *leaf; + int slot; + u64 last; + + if (!block_group) + return 0; + + root = root->fs_info->extent_root; + + if (block_group->cached) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + /* + * we get into deadlocks with paths held by callers of this function. 
+ * since the alloc_mutex is protecting things right now, just + * skip the locking here + */ + path->skip_locking = 1; + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + key.objectid = last; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + if (ret == 0) + continue; + else + break; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid < block_group->key.objectid) + goto next; + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { + add_new_free_space(block_group, root->fs_info, last, + key.objectid); + + last = key.objectid + key.offset; + } +next: + path->slots[0]++; + } + + add_new_free_space(block_group, root->fs_info, last, + block_group->key.objectid + + block_group->key.offset); + + remove_sb_from_cache(root, block_group); + block_group->cached = 1; + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +/* + * return the block group that starts at or after bytenr + */ +static struct btrfs_block_group_cache * +btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 0); + + return cache; +} + +/* + * return the block group that contains teh given bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 1); + + return cache; +} + +static inline void put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) + kfree(cache); +} + +static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct list_head *cur; + struct btrfs_space_info *found; + list_for_each(cur, head) { + found = list_entry(cur, struct btrfs_space_info, list); + if (found->flags == flags) + return found; + } + return NULL; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner) +{ + struct btrfs_block_group_cache *cache; + u64 used; + u64 last = max(search_hint, search_start); + u64 group_start = 0; + int full_search = 0; + int factor = 9; + int wrapped = 0; +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + if (!cache) + break; + + spin_lock(&cache->lock); + last = cache->key.objectid + cache->key.offset; + used = btrfs_block_group_used(&cache->item); + + if ((full_search || !cache->ro) && + block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { + if (used + cache->pinned + cache->reserved < + div_factor(cache->key.offset, factor)) { + group_start = cache->key.objectid; + spin_unlock(&cache->lock); + put_block_group(cache); + goto found; + } + } + spin_unlock(&cache->lock); + put_block_group(cache); + cond_resched(); + } + if (!wrapped) { + last = search_start; + wrapped = 1; + goto again; + } + if (!full_search && factor < 10) { + last = search_start; + full_search = 1; + factor = 10; + goto again; + } 
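+ /*
+ * falling through to here means no suitable group was found even
+ * after wrapping the search and retrying with full_search set;
+ * group_start is still zero in that case.
+ */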
+found: + return group_start; +} + +/* simple helper to search for an existing extent at a given offset */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +{ + int ret; + struct btrfs_key key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = start; + key.offset = len; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, + 0, 0); + btrfs_free_path(path); + return ret; +} + +/* + * Back reference rules. Back refs have three main goals: + * + * 1) differentiate between all holders of references to an extent so that + * when a reference is dropped we can make sure it was a valid reference + * before freeing the extent. + * + * 2) Provide enough information to quickly find the holders of an extent + * if we notice a given block is corrupted or bad. + * + * 3) Make it easy to migrate blocks for FS shrinking or storage pool + * maintenance. This is actually the same as #2, but with a slightly + * different use case. + * + * File extents can be referenced by: + * + * - multiple snapshots, subvolumes, or different generations in one subvol + * - different files inside a single subvolume + * - different offsets inside a file (bookend extents in file.c) + * + * The extent ref structure has fields for: + * + * - Objectid of the subvolume root + * - Generation number of the tree holding the reference + * - objectid of the file holding the reference + * - number of references holding by parent node (alway 1 for tree blocks) + * + * Btree leaf may hold multiple references to a file extent. In most cases, + * these references are from same file and the corresponding offsets inside + * the file are close together. + * + * When a file extent is allocated the fields are filled in: + * (root_key.objectid, trans->transid, inode objectid, 1) + * + * When a leaf is cow'd new references are added for every file extent found + * in the leaf. It looks similar to the create case, but trans->transid will + * be different when the block is cow'd. + * + * (root_key.objectid, trans->transid, inode objectid, + * number of references in the leaf) + * + * When a file extent is removed either during snapshot deletion or + * file truncation, we find the corresponding back reference and check + * the following fields: + * + * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), + * inode objectid) + * + * Btree extents can be referenced by: + * + * - Different subvolumes + * - Different generations of the same subvolume + * + * When a tree block is created, back references are inserted: + * + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a tree block is cow'd, new back references are added for all the + * blocks it points to. If the tree block isn't in reference counted root, + * the old back references are removed. These new back references are of + * the form (trans->transid will have increased since creation): + * + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a backref is in deleting, the following fields are checked: + * + * if backref was for a tree root: + * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) + * else + * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, the key + * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first + * byte of parent extent. 
If a extent is tree root, the key offset is set + * to the key objectid. + */ + +static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid, int del) +{ + struct btrfs_key key; + struct btrfs_extent_ref *ref; + struct extent_buffer *leaf; + u64 ref_objectid; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; + + ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation || + (ref_objectid != owner_objectid && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + ret = -EIO; + WARN_ON(1); + goto out; + } + ret = 0; +out: + return ret; +} + +/* + * updates all the backrefs that are pending on update_list for the + * extent_root + */ +static noinline int update_backrefs(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *update_list) +{ + struct btrfs_key key; + struct btrfs_extent_ref *ref; + struct btrfs_fs_info *info = extent_root->fs_info; + struct pending_extent_op *op; + struct extent_buffer *leaf; + int ret = 0; + struct list_head *cur = update_list->next; + u64 ref_objectid; + u64 ref_root = extent_root->root_key.objectid; + + op = list_entry(cur, struct pending_extent_op, list); + +search: + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = op->orig_parent; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + +loop: + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + + ref_objectid = btrfs_ref_objectid(leaf, ref); + + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != op->orig_generation || + (ref_objectid != op->level && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, " + "root %llu, owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)op->orig_parent, + (unsigned long long)ref_root, op->level); + btrfs_print_leaf(extent_root, leaf); + BUG(); + } + + key.objectid = op->bytenr; + key.offset = op->parent; + key.type = BTRFS_EXTENT_REF_KEY; + ret = btrfs_set_item_key_safe(trans, extent_root, path, &key); + BUG_ON(ret); + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + btrfs_set_ref_generation(leaf, ref, op->generation); + + cur = cur->next; + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + + if (cur == update_list) { + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto out; + } + + op = list_entry(cur, struct pending_extent_op, list); + + path->slots[0]++; + while (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid == op->bytenr && + key.type == BTRFS_EXTENT_REF_KEY) + goto loop; + path->slots[0]++; + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto search; + +out: + return 0; +} + +static noinline int insert_extents(struct btrfs_trans_handle 
*trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *insert_list, int nr) +{ + struct btrfs_key *keys; + u32 *data_size; + struct pending_extent_op *op; + struct extent_buffer *leaf; + struct list_head *cur = insert_list->next; + struct btrfs_fs_info *info = extent_root->fs_info; + u64 ref_root = extent_root->root_key.objectid; + int i = 0, last = 0, ret; + int total = nr * 2; + + if (!nr) + return 0; + + keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS); + if (!keys) + return -ENOMEM; + + data_size = kzalloc(total * sizeof(u32), GFP_NOFS); + if (!data_size) { + kfree(keys); + return -ENOMEM; + } + + list_for_each_entry(op, insert_list, list) { + keys[i].objectid = op->bytenr; + keys[i].offset = op->num_bytes; + keys[i].type = BTRFS_EXTENT_ITEM_KEY; + data_size[i] = sizeof(struct btrfs_extent_item); + i++; + + keys[i].objectid = op->bytenr; + keys[i].offset = op->parent; + keys[i].type = BTRFS_EXTENT_REF_KEY; + data_size[i] = sizeof(struct btrfs_extent_ref); + i++; + } + + op = list_entry(cur, struct pending_extent_op, list); + i = 0; + while (i < total) { + int c; + ret = btrfs_insert_some_items(trans, extent_root, path, + keys+i, data_size+i, total-i); + BUG_ON(ret < 0); + + if (last && ret > 1) + BUG(); + + leaf = path->nodes[0]; + for (c = 0; c < ret; c++) { + int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY; + + /* + * if the first item we inserted was a backref, then + * the EXTENT_ITEM will be the odd c's, else it will + * be the even c's + */ + if ((ref_first && (c % 2)) || + (!ref_first && !(c % 2))) { + struct btrfs_extent_item *itm; + + itm = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_item); + btrfs_set_extent_refs(path->nodes[0], itm, 1); + op->del++; + } else { + struct btrfs_extent_ref *ref; + + ref = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, + op->generation); + btrfs_set_ref_objectid(leaf, ref, op->level); + btrfs_set_ref_num_refs(leaf, ref, 1); + op->del++; + } + + /* + * using del to see when its ok to free up the + * pending_extent_op. In the case where we insert the + * last item on the list in order to help do batching + * we need to not free the extent op until we actually + * insert the extent_item + */ + if (op->del == 2) { + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, + GFP_NOFS); + cur = cur->next; + list_del_init(&op->list); + kfree(op); + if (cur != insert_list) + op = list_entry(cur, + struct pending_extent_op, + list); + } + } + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); + + /* + * Ok backref's and items usually go right next to eachother, + * but if we could only insert 1 item that means that we + * inserted on the end of a leaf, and we have no idea what may + * be on the next leaf so we just play it safe. In order to + * try and help this case we insert the last thing on our + * insert list so hopefully it will end up being the last + * thing on the leaf and everything else will be before it, + * which will let us insert a whole bunch of items at the same + * time. 
+ */ + if (ret == 1 && !last && (i + ret < total)) { + /* + * last: where we will pick up the next time around + * i: our current key to insert, will be total - 1 + * cur: the current op we are screwing with + * op: duh + */ + last = i + ret; + i = total - 1; + cur = insert_list->prev; + op = list_entry(cur, struct pending_extent_op, list); + } else if (last) { + /* + * ok we successfully inserted the last item on the + * list, lets reset everything + * + * i: our current key to insert, so where we left off + * last time + * last: done with this + * cur: the op we are messing with + * op: duh + * total: since we inserted the last key, we need to + * decrement total so we dont overflow + */ + i = last; + last = 0; + total--; + if (i < total) { + cur = insert_list->next; + op = list_entry(cur, struct pending_extent_op, + list); + } + } else { + i += ret; + } + + cond_resched(); + } + ret = 0; + kfree(keys); + kfree(data_size); + return ret; +} + +static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref; + u32 num_refs; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; + + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); + if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, ref_generation); + btrfs_set_ref_objectid(leaf, ref, owner_objectid); + btrfs_set_ref_num_refs(leaf, ref, 1); + } else if (ret == -EEXIST) { + u64 existing_owner; + BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation) { + ret = -EIO; + WARN_ON(1); + goto out; + } + + num_refs = btrfs_ref_num_refs(leaf, ref); + BUG_ON(num_refs == 0); + btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); + + existing_owner = btrfs_ref_objectid(leaf, ref); + if (existing_owner != owner_objectid && + existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { + btrfs_set_ref_objectid(leaf, ref, + BTRFS_MULTIPLE_OBJECTIDS); + } + ret = 0; + } else { + goto out; + } + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_release_path(root, path); + return ret; +} + +static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref; + u32 num_refs; + int ret = 0; + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + num_refs = btrfs_ref_num_refs(leaf, ref); + BUG_ON(num_refs == 0); + num_refs -= 1; + if (num_refs == 0) { + ret = btrfs_del_item(trans, root, path); + } else { + btrfs_set_ref_num_refs(leaf, ref, num_refs); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_release_path(root, path); + return ret; +} + +#ifdef BIO_RW_DISCARD +static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) +{ + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); +} +#endif + +static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ +#ifdef BIO_RW_DISCARD + int ret; + u64 map_length = num_bytes; + struct 
btrfs_multi_bio *multi = NULL; + + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); + if (!ret) { + struct btrfs_bio_stripe *stripe = multi->stripes; + int i; + + if (map_length > num_bytes) + map_length = num_bytes; + + for (i = 0; i < multi->num_stripes; i++, stripe++) { + btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + map_length); + } + kfree(multi); + } + + return ret; +#else + return 0; +#endif +} + +static noinline int free_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct list_head *del_list) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct list_head *cur; + struct pending_extent_op *op; + struct btrfs_extent_item *ei; + int ret, num_to_del, extent_slot = 0, found_extent = 0; + u32 refs; + u64 bytes_freed = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + +search: + /* search for the backref for the current ref we want to delete */ + cur = del_list->next; + op = list_entry(cur, struct pending_extent_op, list); + ret = lookup_extent_backref(trans, extent_root, path, op->bytenr, + op->orig_parent, + extent_root->root_key.objectid, + op->orig_generation, op->level, 1); + if (ret) { + printk(KERN_ERR "btrfs unable to find backref byte nr %llu " + "root %llu gen %llu owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)extent_root->root_key.objectid, + (unsigned long long)op->orig_generation, op->level); + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + goto out; + } + + extent_slot = path->slots[0]; + num_to_del = 1; + found_extent = 0; + + /* + * if we aren't the first item on the leaf we can move back one and see + * if our ref is right next to our extent item + */ + if (likely(extent_slot)) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + extent_slot); + if (found_key.objectid == op->bytenr && + found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == op->num_bytes) { + num_to_del++; + found_extent = 1; + } + } + + /* + * if we didn't find the extent we need to delete the backref and then + * search for the extent item key so we can update its ref count + */ + if (!found_extent) { + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = op->num_bytes; + + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); + BUG_ON(ret); + extent_slot = path->slots[0]; + } + + /* this is where we update the ref count for the extent */ + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs--; + btrfs_set_extent_refs(leaf, ei, refs); + + btrfs_mark_buffer_dirty(leaf); + + /* + * This extent needs deleting. The reason cur_slot is extent_slot + + * num_to_del is because extent_slot points to the slot where the extent + * is, and if the backref was not right next to the extent we will be + * deleting at least 1 item, and will want to start searching at the + * slot directly next to extent_slot. 
However if we did find the + * backref next to the extent item then we will be deleting at least 2 + * items and will want to start searching directly after the ref slot + */ + if (!refs) { + struct list_head *pos, *n, *end; + int cur_slot = extent_slot+num_to_del; + u64 super_used; + u64 root_used; + + path->slots[0] = extent_slot; + bytes_freed = op->num_bytes; + + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, op->bytenr, + op->num_bytes, op->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + op->del = ret; + + /* + * we need to see if we can delete multiple things at once, so + * start looping through the list of extents we are wanting to + * delete and see if their extent/backrefs are right next to + * each other and the extents only have 1 ref + */ + for (pos = cur->next; pos != del_list; pos = pos->next) { + struct pending_extent_op *tmp; + + tmp = list_entry(pos, struct pending_extent_op, list); + + /* we only want to delete extent+ref at this stage */ + if (cur_slot >= btrfs_header_nritems(leaf) - 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY || + found_key.offset != tmp->num_bytes) + break; + + /* check to make sure this extent only has one ref */ + ei = btrfs_item_ptr(leaf, cur_slot, + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, ei) != 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY || + found_key.offset != tmp->orig_parent) + break; + + /* + * the ref is right next to the extent, we can set the + * ref count to 0 since we will delete them both now + */ + btrfs_set_extent_refs(leaf, ei, 0); + + /* pin down the bytes for this extent */ + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, tmp->bytenr, + tmp->num_bytes, tmp->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + + /* + * use the del field to tell if we need to go ahead and + * free up the extent when we delete the item or not.
+ */ + tmp->del = ret; + bytes_freed += tmp->num_bytes; + + num_to_del += 2; + cur_slot += 2; + } + end = pos; + + /* update the free space counters */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - bytes_freed); + + root_used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + root_used - bytes_freed); + spin_unlock(&info->delalloc_lock); + + /* delete the items */ + ret = btrfs_del_items(trans, extent_root, path, + path->slots[0], num_to_del); + BUG_ON(ret); + + /* + * loop through the extents we deleted and do the cleanup work + * on them + */ + for (pos = cur, n = pos->next; pos != end; + pos = n, n = pos->next) { + struct pending_extent_op *tmp; + tmp = list_entry(pos, struct pending_extent_op, list); + + /* + * remember tmp->del tells us whether or not we pinned + * down the extent + */ + ret = update_block_group(trans, extent_root, + tmp->bytenr, tmp->num_bytes, 0, + tmp->del); + BUG_ON(ret); + + list_del_init(&tmp->list); + unlock_extent(&info->extent_ins, tmp->bytenr, + tmp->bytenr + tmp->num_bytes - 1, + GFP_NOFS); + kfree(tmp); + } + } else if (refs && found_extent) { + /* + * the ref and extent were right next to each other, but the + * extent still has a ref, so just free the backref and keep + * going + */ + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } else { + /* + * the extent has multiple refs and the backref we were looking + * for was not right next to it, so just unlock and go next, + * we're good to go + */ + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } + + btrfs_release_path(extent_root, path); + if (!list_empty(del_list)) + goto search; + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; + + if (root == root->fs_info->extent_root) { + struct pending_extent_op *extent_op; + u64 num_bytes; + + BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL); + num_bytes = btrfs_level_size(root, (int)owner_objectid); + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + BUG_ON(extent_op->parent != orig_parent); + BUG_ON(extent_op->generation != orig_generation); + + extent_op->parent = parent; + extent_op->generation = ref_generation; + } else { + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_BACKREF_UPDATE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent = orig_parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = orig_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + +
set_extent_bits(&root->fs_info->extent_ins, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->extent_ins, + bytenr, (unsigned long)extent_op); + } + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, orig_parent, orig_root, + orig_generation, owner_objectid, 1); + if (ret) + goto out; + ret = remove_extent_backref(trans, extent_root, path); + if (ret) + goto out; + ret = insert_extent_backref(trans, extent_root, path, bytenr, + parent, ref_root, ref_generation, + owner_objectid); + BUG_ON(ret); + finish_current_insert(trans, extent_root, 0); + del_pending_extents(trans, extent_root, 0); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; + ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, + parent, ref_root, ref_root, + ref_generation, ref_generation, + owner_objectid); + return ret; +} + +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; + u32 refs; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 1); + if (ret < 0) + return ret; + BUG_ON(ret == 0 || path->slots[0] == 0); + + path->slots[0]--; + l = path->nodes[0]; + + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + if (key.objectid != bytenr) { + btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); + printk(KERN_ERR "btrfs wanted %llu found %llu\n", + (unsigned long long)bytenr, + (unsigned long long)key.objectid); + BUG(); + } + BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); + + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(l, item); + btrfs_set_extent_refs(l, item, refs + 1); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_release_path(root->fs_info->extent_root, path); + + path->reada = 1; + ret = insert_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, parent, + ref_root, ref_generation, + owner_objectid); + BUG_ON(ret); + finish_current_insert(trans, root->fs_info->extent_root, 0); + del_pending_extents(trans, root->fs_info->extent_root, 0); + + btrfs_free_path(path); + return 0; +} + +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; + ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, + 0, ref_root, 0, ref_generation, + owner_objectid); + return ret; +} + +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + finish_current_insert(trans, 
root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); + return 0; +} + +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; + + WARN_ON(num_bytes < root->sectorsize); + path = btrfs_alloc_path(); + path->reada = 1; + key.objectid = bytenr; + key.offset = num_bytes; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 0); + if (ret < 0) + goto out; + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_INFO "btrfs failed to find block number %llu\n", + (unsigned long long)bytenr); + BUG(); + } + l = path->nodes[0]; + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + *refs = btrfs_extent_refs(l, item); +out: + btrfs_free_path(path); + return 0; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 bytenr) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref_item; + struct btrfs_key key; + struct btrfs_key found_key; + u64 ref_root; + u64 last_snapshot; + u32 nritems; + int ret; + + key.objectid = bytenr; + key.offset = (u64)-1; + key.type = BTRFS_EXTENT_ITEM_KEY; + + path = btrfs_alloc_path(); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = -ENOENT; + if (path->slots[0] == 0) + goto out; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY) + goto out; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret == 0) + continue; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr) + break; + + if (found_key.type != BTRFS_EXTENT_REF_KEY) { + path->slots[0]++; + continue; + } + + ref_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_root = btrfs_ref_root(leaf, ref_item); + if ((ref_root != root->root_key.objectid && + ref_root != BTRFS_TREE_LOG_OBJECTID) || + objectid != btrfs_ref_objectid(leaf, ref_item)) { + ret = 1; + goto out; + } + if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { + ret = 1; + goto out; + } + + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, u32 nr_extents) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 root_gen; + u32 nritems; + int i; + int level; + int ret = 0; + int shared = 0; + + if (!root->ref_cows) + return 0; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + shared = 0; + root_gen = root->root_key.offset; + } else { + shared = 1; + root_gen = trans->transid - 1; + } + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + if (level == 0) { + struct btrfs_leaf_ref *ref; + struct btrfs_extent_info *info; + + ref = btrfs_alloc_leaf_ref(root, nr_extents); + if (!ref) { 
+ ret = -ENOMEM; + goto out; + } + + ref->root_gen = root_gen; + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + ref->nritems = nr_extents; + info = ref->extents; + + for (i = 0; nr_extents > 0 && i < nritems; i++) { + u64 disk_bytenr; + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (disk_bytenr == 0) + continue; + + info->bytenr = disk_bytenr; + info->num_bytes = + btrfs_file_extent_disk_num_bytes(buf, fi); + info->objectid = key.objectid; + info->offset = key.offset; + info++; + } + + ret = btrfs_add_leaf_ref(root, ref, shared); + if (ret == -EEXIST && shared) { + struct btrfs_leaf_ref *old; + old = btrfs_lookup_leaf_ref(root, ref->bytenr); + BUG_ON(!old); + btrfs_remove_leaf_ref(root, old); + btrfs_free_leaf_ref(root, old); + ret = btrfs_add_leaf_ref(root, ref, shared); + } + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } +out: + return ret; +} + +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *orig_buf, struct extent_buffer *buf, + u32 *nr_extents) +{ + u64 bytenr; + u64 ref_root; + u64 orig_root; + u64 ref_generation; + u64 orig_generation; + u32 nritems; + u32 nr_file_extents = 0; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int level; + int ret = 0; + int faili = 0; + int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, + u64, u64, u64, u64, u64, u64, u64, u64); + + ref_root = btrfs_header_owner(buf); + ref_generation = btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + + nritems = btrfs_header_nritems(buf); + level = btrfs_header_level(buf); + + if (root->ref_cows) { + process_func = __btrfs_inc_extent_ref; + } else { + if (level == 0 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + goto out; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + goto out; + process_func = __btrfs_update_extent_ref; + } + + for (i = 0; i < nritems; i++) { + cond_resched(); + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + nr_file_extents++; + + ret = process_func(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); + + if (ret) { + faili = i; + WARN_ON(1); + goto fail; + } + } else { + bytenr = btrfs_node_blockptr(buf, i); + ret = process_func(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + level - 1); + if (ret) { + faili = i; + WARN_ON(1); + goto fail; + } + } + } +out: + if (nr_extents) { + if (level == 0) + *nr_extents = nr_file_extents; + else + *nr_extents = nritems; + } + return 0; +fail: + WARN_ON(1); + return ret; +} + +int btrfs_update_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *orig_buf, + struct extent_buffer *buf, int start_slot, int nr) + +{ + u64 bytenr; + u64 ref_root; + 
u64 orig_root; + u64 ref_generation; + u64 orig_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int ret; + int slot; + int level; + + BUG_ON(start_slot < 0); + BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); + + ref_root = btrfs_header_owner(buf); + ref_generation = btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + level = btrfs_header_level(buf); + + if (!root->ref_cows) { + if (level == 0 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + return 0; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; + } + + for (i = 0, slot = start_slot; i < nr; i++, slot++) { + cond_resched(); + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, slot, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + ret = __btrfs_update_extent_ref(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); + if (ret) + goto fail; + } else { + bytenr = btrfs_node_blockptr(buf, slot); + ret = __btrfs_update_extent_ref(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + level - 1); + if (ret) + goto fail; + } + } + return 0; +fail: + WARN_ON(1); + return -1; +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_block_group_cache *cache) +{ + int ret; + int pending_ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + + ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + if (ret < 0) + goto fail; + BUG_ON(ret); + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); +fail: + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); + if (ret) + return ret; + if (pending_ret) + return pending_ret; + return 0; + +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache, *entry; + struct rb_node *n; + int err = 0; + int werr = 0; + struct btrfs_path *path; + u64 last = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + cache = NULL; + spin_lock(&root->fs_info->block_group_cache_lock); + for (n = rb_first(&root->fs_info->block_group_cache_tree); + n; n = rb_next(n)) { + entry = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + if (entry->dirty) { + cache = entry; + break; + } + } + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (!cache) + break; + + cache->dirty = 0; + last += cache->key.offset; + + err = write_one_cache_group(trans, root, + path, cache); + /* + * if we fail to write the cache group, we want + * to keep it marked dirty in hopes that a later + * write will work + */ + if (err) { + werr = err; + continue; + } + } + btrfs_free_path(path); + return werr; +} + +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache 
*block_group; + int readonly = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = 1; + if (block_group) + put_block_group(block_group); + return readonly; +} + +static int update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + + found = __find_space_info(info, flags); + if (found) { + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->bytes_used += bytes_used; + found->full = 0; + spin_unlock(&found->lock); + *space_info = found; + return 0; + } + found = kzalloc(sizeof(*found), GFP_NOFS); + if (!found) + return -ENOMEM; + + list_add(&found->list, &info->space_info); + INIT_LIST_HEAD(&found->block_groups); + init_rwsem(&found->groups_sem); + spin_lock_init(&found->lock); + found->flags = flags; + found->total_bytes = total_bytes; + found->bytes_used = bytes_used; + found->bytes_pinned = 0; + found->bytes_reserved = 0; + found->bytes_readonly = 0; + found->full = 0; + found->force_alloc = 0; + *space_info = found; + return 0; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP); + if (extra_flags) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + } +} + +static void set_block_group_readonly(struct btrfs_block_group_cache *cache) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (!cache->ro) { + cache->space_info->bytes_readonly += cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->ro = 1; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); +} + +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +{ + u64 num_devices = root->fs_info->fs_devices->rw_devices; + + if (num_devices == 1) + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + if (num_devices < 4) + flags &= ~BTRFS_BLOCK_GROUP_RAID10; + + if ((flags & BTRFS_BLOCK_GROUP_DUP) && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) { + flags &= ~BTRFS_BLOCK_GROUP_DUP; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID1) && + (flags & BTRFS_BLOCK_GROUP_RAID10)) { + flags &= ~BTRFS_BLOCK_GROUP_RAID1; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID0) && + ((flags & BTRFS_BLOCK_GROUP_RAID1) | + (flags & BTRFS_BLOCK_GROUP_RAID10) | + (flags & BTRFS_BLOCK_GROUP_DUP))) + flags &= ~BTRFS_BLOCK_GROUP_RAID0; + return flags; +} + +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force) +{ + struct btrfs_space_info *space_info; + u64 thresh; + int ret = 0; + + mutex_lock(&extent_root->fs_info->chunk_mutex); + + flags = btrfs_reduce_alloc_profile(extent_root, flags); + + space_info = __find_space_info(extent_root->fs_info, flags); + if (!space_info) { + ret = update_space_info(extent_root->fs_info, flags, + 0, 0, &space_info); + BUG_ON(ret); + } + BUG_ON(!space_info); + + spin_lock(&space_info->lock); + if (space_info->force_alloc) { + force = 1; + space_info->force_alloc = 0; + } + if (space_info->full) { + spin_unlock(&space_info->lock); + goto out; + } + + thresh = 
space_info->total_bytes - space_info->bytes_readonly; + thresh = div_factor(thresh, 6); + if (!force && + (space_info->bytes_used + space_info->bytes_pinned + + space_info->bytes_reserved + alloc_bytes) < thresh) { + spin_unlock(&space_info->lock); + goto out; + } + spin_unlock(&space_info->lock); + + ret = btrfs_alloc_chunk(trans, extent_root, flags); + if (ret) + space_info->full = 1; +out: + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +} + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -1; + byte_in_group = bytenr - cache->key.objectid; + WARN_ON(byte_in_group > cache->key.offset); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->dirty = 1; + old_val = btrfs_block_group_used(&cache->item); + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; + cache->space_info->bytes_used += num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + cache->space_info->bytes_used -= num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + if (mark_free) { + int ret; + + ret = btrfs_discard_extent(root, bytenr, + num_bytes); + WARN_ON(ret); + + ret = btrfs_add_free_space(cache, bytenr, + num_bytes); + WARN_ON(ret); + } + } + put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + return 0; +} + +static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) +{ + struct btrfs_block_group_cache *cache; + u64 bytenr; + + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); + if (!cache) + return 0; + + bytenr = cache->key.objectid; + put_block_group(cache); + + return bytenr; +} + +int btrfs_update_pinned_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int pin) +{ + u64 len; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; + + WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); + if (pin) { + set_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + num - 1, GFP_NOFS); + } else { + clear_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + num - 1, GFP_NOFS); + } + while (num > 0) { + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); + len = min(num, cache->key.offset - + (bytenr - cache->key.objectid)); + if (pin) { + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += len; + cache->space_info->bytes_pinned += len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fs_info->total_pinned += len; + } else { + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fs_info->total_pinned -= len; + if (cache->cached) + btrfs_add_free_space(cache, bytenr, len); + } + put_block_group(cache); + bytenr += len; + num -= len; + } + return 0; +} + 
+static int update_reserved_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int reserve) +{ + u64 len; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; + + while (num > 0) { + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); + len = min(num, cache->key.offset - + (bytenr - cache->key.objectid)); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + cache->reserved += len; + cache->space_info->bytes_reserved += len; + } else { + cache->reserved -= len; + cache->space_info->bytes_reserved -= len; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + put_block_group(cache); + bytenr += len; + num -= len; + } + return 0; +} + +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) +{ + u64 last = 0; + u64 start; + u64 end; + struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; + int ret; + + mutex_lock(&root->fs_info->pinned_mutex); + while (1) { + ret = find_first_extent_bit(pinned_extents, last, + &start, &end, EXTENT_DIRTY); + if (ret) + break; + set_extent_dirty(copy, start, end, GFP_NOFS); + last = end + 1; + } + mutex_unlock(&root->fs_info->pinned_mutex); + return 0; +} + +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_io_tree *unpin) +{ + u64 start; + u64 end; + int ret; + + mutex_lock(&root->fs_info->pinned_mutex); + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + ret = btrfs_discard_extent(root, start, end + 1 - start); + + btrfs_update_pinned_extents(root, start, end + 1 - start, 0); + clear_extent_dirty(unpin, start, end, GFP_NOFS); + + if (need_resched()) { + mutex_unlock(&root->fs_info->pinned_mutex); + cond_resched(); + mutex_lock(&root->fs_info->pinned_mutex); + } + } + mutex_unlock(&root->fs_info->pinned_mutex); + return ret; +} + +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) +{ + u64 start; + u64 end; + u64 priv; + u64 search = 0; + u64 skipped = 0; + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct pending_extent_op *extent_op, *tmp; + struct list_head insert_list, update_list; + int ret; + int num_inserts = 0, max_inserts; + + path = btrfs_alloc_path(); + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + + max_inserts = extent_root->leafsize / + (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) + + sizeof(struct btrfs_extent_ref) + + sizeof(struct btrfs_extent_item)); +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(&info->extent_ins, search, &start, + &end, EXTENT_WRITEBACK); + if (ret) { + if (skipped && all && !num_inserts) { + skipped = 0; + search = 0; + continue; + } + mutex_unlock(&info->extent_ins_mutex); + break; + } + + ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); + if (!ret) { + skipped = 1; + search = end + 1; + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + continue; + } + + ret = get_state_private(&info->extent_ins, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long) priv; + + if (extent_op->type == PENDING_EXTENT_INSERT) { + num_inserts++; + list_add_tail(&extent_op->list, &insert_list); + search = end + 1; + if (num_inserts == max_inserts) { + mutex_unlock(&info->extent_ins_mutex); 
+ break; + } + } else if (extent_op->type == PENDING_BACKREF_UPDATE) { + list_add_tail(&extent_op->list, &update_list); + search = end + 1; + } else { + BUG(); + } + } + + /* + * process the update list, clear the writeback bit for it, and if + * somebody marked this thing for deletion then just unlock it and be + * done, the free_extents will handle it + */ + mutex_lock(&info->extent_ins_mutex); + list_for_each_entry_safe(extent_op, tmp, &update_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + kfree(extent_op); + } + } + mutex_unlock(&info->extent_ins_mutex); + + /* + * still have things left on the update list, go ahead an update + * everything + */ + if (!list_empty(&update_list)) { + ret = update_backrefs(trans, extent_root, path, &update_list); + BUG_ON(ret); + } + + /* + * if no inserts need to be done, but we skipped some extents and we + * need to make sure everything is cleaned then reset everything and + * go back to the beginning + */ + if (!num_inserts && all && skipped) { + search = 0; + skipped = 0; + INIT_LIST_HEAD(&update_list); + INIT_LIST_HEAD(&insert_list); + goto again; + } else if (!num_inserts) { + goto out; + } + + /* + * process the insert extents list. Again if we are deleting this + * extent, then just unlock it, pin down the bytes if need be, and be + * done with it. Saves us from having to actually insert the extent + * into the tree and then subsequently come along and delete it + */ + mutex_lock(&info->extent_ins_mutex); + list_for_each_entry_safe(extent_op, tmp, &insert_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + u64 used; + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + spin_lock(&info->delalloc_lock); + used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + used - extent_op->num_bytes); + used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + used - extent_op->num_bytes); + spin_unlock(&info->delalloc_lock); + + ret = update_block_group(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, + 0, ret > 0); + BUG_ON(ret); + kfree(extent_op); + num_inserts--; + } + } + mutex_unlock(&info->extent_ins_mutex); + + ret = insert_extents(trans, extent_root, path, &insert_list, + num_inserts); + BUG_ON(ret); + + /* + * if we broke out of the loop in order to insert stuff because we hit + * the maximum number of inserts at a time we can handle, then loop + * back and pick up where we left off + */ + if (num_inserts == max_inserts) { + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + num_inserts = 0; + goto again; + } + + /* + * again, if we need to make absolutely sure there are no more pending + * extent operations left and we know that we skipped some, go back to + * the beginning and do it all again + */ + if (all && skipped) { + 
INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + search = 0; + skipped = 0; + num_inserts = 0; + goto again; + } +out: + btrfs_free_path(path); + return 0; +} + +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int is_data) +{ + int err = 0; + struct extent_buffer *buf; + + if (is_data) + goto pinit; + + buf = btrfs_find_tree_block(root, bytenr, num_bytes); + if (!buf) + goto pinit; + + /* we can reuse a block if it hasn't been written + * and it is from this transaction. We can't + * reuse anything from the tree log root because + * it has tiny sub-transactions. + */ + if (btrfs_buffer_uptodate(buf, 0) && + btrfs_try_tree_lock(buf)) { + u64 header_owner = btrfs_header_owner(buf); + u64 header_transid = btrfs_header_generation(buf); + if (header_owner != BTRFS_TREE_LOG_OBJECTID && + header_owner != BTRFS_TREE_RELOC_OBJECTID && + header_transid == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + clean_tree_block(NULL, root, buf); + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + return 1; + } + btrfs_tree_unlock(buf); + } + free_extent_buffer(buf); +pinit: + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + + BUG_ON(err < 0); + return 0; +} + +/* + * remove an extent from the root, returns 0 on success + */ +static int __free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, int mark_free) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct extent_buffer *leaf; + int ret; + int extent_slot = 0; + int found_extent = 0; + int num_to_del = 1; + struct btrfs_extent_item *ei; + u32 refs; + + key.objectid = bytenr; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.offset = num_bytes; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, parent, root_objectid, + ref_generation, owner_objectid, 1); + if (ret == 0) { + struct btrfs_key found_key; + extent_slot = path->slots[0]; + while (extent_slot > 0) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + extent_slot); + if (found_key.objectid != bytenr) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == num_bytes) { + found_extent = 1; + break; + } + if (path->slots[0] - extent_slot > 5) + break; + } + if (!found_extent) { + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + } + } else { + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + printk(KERN_ERR "btrfs unable to find ref byte nr %llu " + "root %llu gen %llu owner %llu\n", + (unsigned long long)bytenr, + (unsigned long long)root_objectid, + (unsigned long long)ref_generation, + (unsigned long long)owner_objectid); + } + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, extent_slot, + struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs -= 1; + btrfs_set_extent_refs(leaf, ei, 
refs); + + btrfs_mark_buffer_dirty(leaf); + + if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { + struct btrfs_extent_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); + /* if the back ref and the extent are next to each other + * they get deleted below in one shot + */ + path->slots[0] = extent_slot; + num_to_del = 2; + } else if (found_extent) { + /* otherwise delete the extent back ref */ + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + /* if refs are 0, we need to setup the path for deletion */ + if (refs == 0) { + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + BUG_ON(ret); + } + } + + if (refs == 0) { + u64 super_used; + u64 root_used; + + if (pin) { + mutex_lock(&root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, root, bytenr, num_bytes, + owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&root->fs_info->pinned_mutex); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); + } + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - num_bytes); + + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, + root_used - num_bytes); + spin_unlock(&info->delalloc_lock); + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], + num_to_del); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_del_csums(trans, root, bytenr, num_bytes); + BUG_ON(ret); + } + + ret = update_block_group(trans, root, bytenr, num_bytes, 0, + mark_free); + BUG_ON(ret); + } + btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); + return ret; +} + +/* + * find all the blocks marked as pending in the radix tree and remove + * them from the extent map + */ +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) +{ + int ret; + int err = 0; + u64 start; + u64 end; + u64 priv; + u64 search = 0; + int nr = 0, skipped = 0; + struct extent_io_tree *pending_del; + struct extent_io_tree *extent_ins; + struct pending_extent_op *extent_op; + struct btrfs_fs_info *info = extent_root->fs_info; + struct list_head delete_list; + + INIT_LIST_HEAD(&delete_list); + extent_ins = &extent_root->fs_info->extent_ins; + pending_del = &extent_root->fs_info->pending_del; + +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(pending_del, search, &start, &end, + EXTENT_WRITEBACK); + if (ret) { + if (all && skipped && !nr) { + search = 0; + continue; + } + mutex_unlock(&info->extent_ins_mutex); + break; + } + + ret = try_lock_extent(extent_ins, start, end, GFP_NOFS); + if (!ret) { + search = end+1; + skipped = 1; + + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + + continue; + } + BUG_ON(ret < 0); + + ret = get_state_private(pending_del, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long)priv; + + clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK, + GFP_NOFS); + if (!test_range_bit(extent_ins, start, end, + EXTENT_WRITEBACK, 0)) { + list_add_tail(&extent_op->list, &delete_list); + nr++; + } else { + kfree(extent_op); + + ret = 
get_state_private(&info->extent_ins, start, + &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + + clear_extent_bits(&info->extent_ins, start, end, + EXTENT_WRITEBACK, GFP_NOFS); + + if (extent_op->type == PENDING_BACKREF_UPDATE) { + list_add_tail(&extent_op->list, &delete_list); + search = end + 1; + nr++; + continue; + } + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, start, + end + 1 - start, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + ret = update_block_group(trans, extent_root, start, + end + 1 - start, 0, ret > 0); + + unlock_extent(extent_ins, start, end, GFP_NOFS); + BUG_ON(ret); + kfree(extent_op); + } + if (ret) + err = ret; + + search = end + 1; + + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + } + + if (nr) { + ret = free_extents(trans, extent_root, &delete_list); + BUG_ON(ret); + } + + if (all && skipped) { + INIT_LIST_HEAD(&delete_list); + search = 0; + nr = 0; + goto again; + } + + return err; +} + +/* + * remove an extent from the root, returns 0 on success + */ +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + int pending_ret; + int ret; + + WARN_ON(num_bytes < root->sectorsize); + if (root == extent_root) { + struct pending_extent_op *extent_op = NULL; + + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + + extent_op->del = 1; + if (extent_op->type == PENDING_EXTENT_INSERT) { + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + } + + if (extent_op) { + ref_generation = extent_op->orig_generation; + parent = extent_op->orig_parent; + } + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_DELETE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent = parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = ref_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + set_extent_bits(&root->fs_info->pending_del, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->pending_del, + bytenr, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + /* if metadata always pin */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_block_group_cache *cache; + + /* btrfs_free_reserved_extent */ + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + btrfs_add_free_space(cache, bytenr, num_bytes); + put_block_group(cache); + update_reserved_extents(root, bytenr, num_bytes, 0); + return 0; + } + pin = 1; + } + + /* if data pin when any transaction has committed this */ + if (ref_generation != trans->transid) + pin = 1; + + ret = __free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + 
owner_objectid, pin, pin == 0); + + finish_current_insert(trans, root->fs_info->extent_root, 0); + pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); + return ret ? ret : pending_ret; +} + +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) +{ + int ret; + + ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin); + return ret; +} + +static u64 stripe_align(struct btrfs_root *root, u64 val) +{ + u64 mask = ((u64)root->stripesize - 1); + u64 ret = (val + mask) & ~mask; + return ret; +} + +/* + * walks the btree of allocated extents and find a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == block start + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == number of blocks + * Any available blocks before search_start are skipped. + */ +static noinline int find_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *orig_root, + u64 num_bytes, u64 empty_size, + u64 search_start, u64 search_end, + u64 hint_byte, struct btrfs_key *ins, + u64 exclude_start, u64 exclude_nr, + int data) +{ + int ret = 0; + struct btrfs_root *root = orig_root->fs_info->extent_root; + u64 total_needed = num_bytes; + u64 *last_ptr = NULL; + u64 last_wanted = 0; + struct btrfs_block_group_cache *block_group = NULL; + int chunk_alloc_done = 0; + int empty_cluster = 2 * 1024 * 1024; + int allowed_chunk_alloc = 0; + struct list_head *head = NULL, *cur = NULL; + int loop = 0; + int extra_loop = 0; + struct btrfs_space_info *space_info; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->objectid = 0; + ins->offset = 0; + + if (orig_root->ref_cows || empty_size) + allowed_chunk_alloc = 1; + + if (data & BTRFS_BLOCK_GROUP_METADATA) { + last_ptr = &root->fs_info->last_alloc; + empty_cluster = 64 * 1024; + } + + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) + last_ptr = &root->fs_info->last_data_alloc; + + if (last_ptr) { + if (*last_ptr) { + hint_byte = *last_ptr; + last_wanted = *last_ptr; + } else + empty_size += empty_cluster; + } else { + empty_cluster = 0; + } + search_start = max(search_start, first_logical_byte(root, 0)); + search_start = max(search_start, hint_byte); + + if (last_wanted && search_start != last_wanted) { + last_wanted = 0; + empty_size += empty_cluster; + } + + total_needed += empty_size; + block_group = btrfs_lookup_block_group(root->fs_info, search_start); + if (!block_group) + block_group = btrfs_lookup_first_block_group(root->fs_info, + search_start); + space_info = __find_space_info(root->fs_info, data); + + down_read(&space_info->groups_sem); + while (1) { + struct btrfs_free_space *free_space; + /* + * the only way this happens if our hint points to a block + * group thats not of the proper type, while looping this + * should never happen + */ + if (empty_size) + extra_loop = 1; + + if (!block_group) + goto new_group_no_lock; + + if (unlikely(!block_group->cached)) { + mutex_lock(&block_group->cache_mutex); + ret = cache_block_group(root, block_group); + mutex_unlock(&block_group->cache_mutex); + if (ret) + break; + } + + mutex_lock(&block_group->alloc_mutex); + if (unlikely(!block_group_bits(block_group, data))) + goto new_group; + + if (unlikely(block_group->ro)) + goto new_group; + + free_space = btrfs_find_free_space(block_group, 
search_start, + total_needed); + if (free_space) { + u64 start = block_group->key.objectid; + u64 end = block_group->key.objectid + + block_group->key.offset; + + search_start = stripe_align(root, free_space->offset); + + /* move on to the next group */ + if (search_start + num_bytes >= search_end) + goto new_group; + + /* move on to the next group */ + if (search_start + num_bytes > end) + goto new_group; + + if (last_wanted && search_start != last_wanted) { + total_needed += empty_cluster; + empty_size += empty_cluster; + last_wanted = 0; + /* + * if search_start is still in this block group + * then we just re-search this block group + */ + if (search_start >= start && + search_start < end) { + mutex_unlock(&block_group->alloc_mutex); + continue; + } + + /* else we go to the next block group */ + goto new_group; + } + + if (exclude_nr > 0 && + (search_start + num_bytes > exclude_start && + search_start < exclude_start + exclude_nr)) { + search_start = exclude_start + exclude_nr; + /* + * if search_start is still in this block group + * then we just re-search this block group + */ + if (search_start >= start && + search_start < end) { + mutex_unlock(&block_group->alloc_mutex); + last_wanted = 0; + continue; + } + + /* else we go to the next block group */ + goto new_group; + } + + ins->objectid = search_start; + ins->offset = num_bytes; + + btrfs_remove_free_space_lock(block_group, search_start, + num_bytes); + /* we are all good, let's return */ + mutex_unlock(&block_group->alloc_mutex); + break; + } +new_group: + mutex_unlock(&block_group->alloc_mutex); + put_block_group(block_group); + block_group = NULL; +new_group_no_lock: + /* don't try to compare new allocations against the + * last allocation any more + */ + last_wanted = 0; + + /* + * Here's how this works. + * loop == 0: we were searching a block group via a hint + * and didn't find anything, so we start at + * the head of the block groups and keep searching + * loop == 1: we're searching through all of the block groups + * if we hit the head again we have searched + * all of the block groups for this space and we + * need to try and allocate; if we can't, error out. + * loop == 2: we allocated more space and are looping through + * all of the block groups again. + */ + if (loop == 0) { + head = &space_info->block_groups; + cur = head->next; + loop++; + } else if (loop == 1 && cur == head) { + int keep_going; + + /* at this point we give up on the empty_size + * allocations and just try to allocate the min + * space. + * + * The extra_loop field was set if an empty_size + * allocation was attempted above, and if this + * is true we need to try the loop again without + * the additional empty_size.
+ */ + total_needed -= empty_size; + empty_size = 0; + keep_going = extra_loop; + loop++; + + if (allowed_chunk_alloc && !chunk_alloc_done) { + up_read(&space_info->groups_sem); + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, 1); + down_read(&space_info->groups_sem); + if (ret < 0) + goto loop_check; + head = &space_info->block_groups; + /* + * we've allocated a new chunk, keep + * trying + */ + keep_going = 1; + chunk_alloc_done = 1; + } else if (!allowed_chunk_alloc) { + space_info->force_alloc = 1; + } +loop_check: + if (keep_going) { + cur = head->next; + extra_loop = 0; + } else { + break; + } + } else if (cur == head) { + break; + } + + block_group = list_entry(cur, struct btrfs_block_group_cache, + list); + atomic_inc(&block_group->count); + + search_start = block_group->key.objectid; + cur = cur->next; + } + + /* we found what we needed */ + if (ins->objectid) { + if (!(data & BTRFS_BLOCK_GROUP_DATA)) + trans->block_group = block_group->key.objectid; + + if (last_ptr) + *last_ptr = ins->objectid + ins->offset; + ret = 0; + } else if (!ret) { + printk(KERN_ERR "btrfs searching for %llu bytes, " + "num_bytes %llu, loop %d, allowed_alloc %d\n", + (unsigned long long)total_needed, + (unsigned long long)num_bytes, + loop, allowed_chunk_alloc); + ret = -ENOSPC; + } + if (block_group) + put_block_group(block_group); + + up_read(&space_info->groups_sem); + return ret; +} + +static void dump_space_info(struct btrfs_space_info *info, u64 bytes) +{ + struct btrfs_block_group_cache *cache; + struct list_head *l; + + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - + info->bytes_pinned - info->bytes_reserved), + (info->full) ? "" : "not "); + + down_read(&info->groups_sem); + list_for_each(l, &info->block_groups) { + cache = list_entry(l, struct btrfs_block_group_cache, list); + spin_lock(&cache->lock); + printk(KERN_INFO "block group %llu has %llu bytes, %llu used " + "%llu pinned %llu reserved\n", + (unsigned long long)cache->key.objectid, + (unsigned long long)cache->key.offset, + (unsigned long long)btrfs_block_group_used(&cache->item), + (unsigned long long)cache->pinned, + (unsigned long long)cache->reserved); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + up_read(&info->groups_sem); +} + +static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + u64 search_start = 0; + u64 alloc_profile; + struct btrfs_fs_info *info = root->fs_info; + + if (data) { + alloc_profile = info->avail_data_alloc_bits & + info->data_alloc_profile; + data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; + } else if (root == root->fs_info->chunk_root) { + alloc_profile = info->avail_system_alloc_bits & + info->system_alloc_profile; + data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; + } else { + alloc_profile = info->avail_metadata_alloc_bits & + info->metadata_alloc_profile; + data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; + } +again: + data = btrfs_reduce_alloc_profile(root, data); + /* + * the only place that sets empty_size is btrfs_realloc_node, which + * is not called recursively on allocations + */ + if (empty_size || root->ref_cows) { + if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, + BTRFS_BLOCK_GROUP_METADATA | + (info->metadata_alloc_profile 
& + info->avail_metadata_alloc_bits), 0); + } + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, data, 0); + } + + WARN_ON(num_bytes < root->sectorsize); + ret = find_free_extent(trans, root, num_bytes, empty_size, + search_start, search_end, hint_byte, ins, + trans->alloc_exclude_start, + trans->alloc_exclude_nr, data); + + if (ret == -ENOSPC && num_bytes > min_alloc_size) { + num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = max(num_bytes, min_alloc_size); + do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data, 1); + goto again; + } + if (ret) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); + dump_space_info(sinfo, num_bytes); + BUG(); + } + + return ret; +} + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + int ret = 0; + + cache = btrfs_lookup_block_group(root->fs_info, start); + if (!cache) { + printk(KERN_ERR "Unable to find block group for %llu\n", + (unsigned long long)start); + return -ENOSPC; + } + + ret = btrfs_discard_extent(root, start, len); + + btrfs_add_free_space(cache, start, len); + put_block_group(cache); + update_reserved_extents(root, start, len, 0); + + return ret; +} + +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, + empty_size, hint_byte, search_end, ins, + data); + update_reserved_extents(root, ins->objectid, ins->offset, 1); + return ret; +} + +static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + int pending_ret; + u64 super_used; + u64 root_used; + u64 num_bytes = ins->offset; + u32 sizes[2]; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct btrfs_extent_item *extent_item; + struct btrfs_extent_ref *ref; + struct btrfs_path *path; + struct btrfs_key keys[2]; + + if (parent == 0) + parent = ins->objectid; + + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); + + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, root_used + num_bytes); + spin_unlock(&info->delalloc_lock); + + if (root == extent_root) { + struct pending_extent_op *extent_op; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_INSERT; + extent_op->bytenr = ins->objectid; + extent_op->num_bytes = ins->offset; + extent_op->parent = parent; + extent_op->orig_parent = 0; + extent_op->generation = ref_generation; + extent_op->orig_generation = 0; + extent_op->level = (int)owner; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + mutex_lock(&root->fs_info->extent_ins_mutex); + set_extent_bits(&root->fs_info->extent_ins, ins->objectid, + ins->objectid + ins->offset - 1, + EXTENT_WRITEBACK, GFP_NOFS); + 
set_state_private(&root->fs_info->extent_ins, + ins->objectid, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + goto update_block; + } + + memcpy(&keys[0], ins, sizeof(*ins)); + keys[1].objectid = ins->objectid; + keys[1].type = BTRFS_EXTENT_REF_KEY; + keys[1].offset = parent; + sizes[0] = sizeof(*extent_item); + sizes[1] = sizeof(*ref); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + ret = btrfs_insert_empty_items(trans, extent_root, path, keys, + sizes, 2); + BUG_ON(ret); + + extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(path->nodes[0], extent_item, 1); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_extent_ref); + + btrfs_set_ref_root(path->nodes[0], ref, root_objectid); + btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); + btrfs_set_ref_objectid(path->nodes[0], ref, owner); + btrfs_set_ref_num_refs(path->nodes[0], ref, 1); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + trans->alloc_exclude_start = 0; + trans->alloc_exclude_nr = 0; + btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); + + if (ret) + goto out; + if (pending_ret) { + ret = pending_ret; + goto out; + } + +update_block: + ret = update_block_group(trans, root, ins->objectid, + ins->offset, 1, 0); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } +out: + return ret; +} + +int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins); + update_reserved_extents(root, ins->objectid, ins->offset, 0); + return ret; +} + +/* + * this is used by the tree logging recovery code. It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + struct btrfs_block_group_cache *block_group; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + mutex_lock(&block_group->cache_mutex); + cache_block_group(root, block_group); + mutex_unlock(&block_group->cache_mutex); + + ret = btrfs_remove_free_space(block_group, ins->objectid, + ins->offset); + BUG_ON(ret); + put_block_group(block_group); + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins); + return ret; +} + +/* + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * + * returns 0 if everything worked, non-zero otherwise. 
+ */ +int btrfs_alloc_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 min_alloc_size, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, u64 data) +{ + int ret; + + ret = __btrfs_reserve_extent(trans, root, num_bytes, + min_alloc_size, empty_size, hint_byte, + search_end, ins, data); + BUG_ON(ret); + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = __btrfs_alloc_reserved_extent(trans, root, parent, + root_objectid, ref_generation, + owner_objectid, ins); + BUG_ON(ret); + + } else { + update_reserved_extents(root, ins->objectid, ins->offset, 1); + } + return ret; +} + +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct extent_buffer *buf; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return ERR_PTR(-ENOMEM); + btrfs_set_header_generation(buf, trans->transid); + btrfs_tree_lock(buf); + clean_tree_block(trans, root, buf); + btrfs_set_buffer_uptodate(buf); + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } else { + set_extent_dirty(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } + trans->blocks_used++; + return buf; +} + +/* + * helper function to allocate a block for a given tree + * returns the tree buffer or NULL. + */ +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize, u64 parent, + u64 root_objectid, + u64 ref_generation, + int level, + u64 hint, + u64 empty_size) +{ + struct btrfs_key ins; + int ret; + struct extent_buffer *buf; + + ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, + root_objectid, ref_generation, level, + empty_size, hint, (u64)-1, &ins, 0); + if (ret) { + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + + buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); + return buf; +} + +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf) +{ + u64 leaf_owner; + u64 leaf_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int nritems; + int ret; + + BUG_ON(!btrfs_is_leaf(leaf)); + nritems = btrfs_header_nritems(leaf); + leaf_owner = btrfs_header_owner(leaf); + leaf_generation = btrfs_header_generation(leaf); + + for (i = 0; i < nritems; i++) { + u64 disk_bytenr; + cond_resched(); + + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + /* + * FIXME make sure to insert a trans record that + * repeats the snapshot del on crash + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (disk_bytenr == 0) + continue; + + ret = __btrfs_free_extent(trans, root, disk_bytenr, + btrfs_file_extent_disk_num_bytes(leaf, fi), + leaf->start, leaf_owner, leaf_generation, + key.objectid, 0); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + } + return 0; +} + +static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_leaf_ref *ref) +{ + int i; + int ret; + 
struct btrfs_extent_info *info = ref->extents; + + for (i = 0; i < ref->nritems; i++) { + ret = __btrfs_free_extent(trans, root, info->bytenr, + info->num_bytes, ref->bytenr, + ref->owner, ref->generation, + info->objectid, 0); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + + BUG_ON(ret); + info++; + } + + return 0; +} + +static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, + u64 len, u32 *refs) +{ + int ret; + + ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); + BUG_ON(ret); + +#if 0 /* some debugging code in case we see problems here */ + /* if the refs count is one, it won't get increased again. But + * if the ref count is > 1, someone may be decreasing it at + * the same time we are. + */ + if (*refs != 1) { + struct extent_buffer *eb = NULL; + eb = btrfs_find_create_tree_block(root, start, len); + if (eb) + btrfs_tree_lock(eb); + + mutex_lock(&root->fs_info->alloc_mutex); + ret = lookup_extent_ref(NULL, root, start, len, refs); + BUG_ON(ret); + mutex_unlock(&root->fs_info->alloc_mutex); + + if (eb) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + if (*refs == 1) { + printk(KERN_ERR "btrfs block %llu went down to one " + "during drop_snap\n", (unsigned long long)start); + } + + } +#endif + + cond_resched(); + return ret; +} + +/* + * helper function for drop_snapshot, this walks down the tree dropping ref + * counts as it goes. + */ +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) +{ + u64 root_owner; + u64 root_gen; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + struct btrfs_leaf_ref *ref; + u32 blocksize; + int ret; + u32 refs; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, + path->nodes[*level]->len, &refs); + BUG_ON(ret); + if (refs > 1) + goto out; + + /* + * walk down to the last node level and free all the leaves + */ + while (*level >= 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, cur); + BUG_ON(ret); + break; + } + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + BUG_ON(ret); + if (refs != 1) { + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + path->slots[*level]++; + + ret = __btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + root_owner, root_gen, + *level - 1, 1); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + + continue; + } + /* + * at this point, we have a single ref, and since the + * only place referencing this extent is a dead root + * the reference count should never go higher. 
+ * So, we don't need to check it again + */ + if (*level == 1) { + ref = btrfs_lookup_leaf_ref(root, bytenr); + if (ref && ref->generation != ptr_gen) { + btrfs_free_leaf_ref(root, ref); + ref = NULL; + } + if (ref) { + ret = cache_drop_leaf_ref(trans, root, ref); + BUG_ON(ret); + btrfs_remove_leaf_ref(root, ref); + btrfs_free_leaf_ref(root, ref); + *level = 0; + break; + } + } + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) { + free_extent_buffer(next); + + next = read_tree_block(root, bytenr, blocksize, + ptr_gen); + cond_resched(); +#if 0 + /* + * this is a debugging check and can go away + * the ref should never go all the way down to 1 + * at this point + */ + ret = lookup_extent_ref(NULL, root, bytenr, blocksize, + &refs); + BUG_ON(ret); + WARN_ON(refs != 1); +#endif + } + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } +out: + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + if (path->nodes[*level] == root->node) { + parent = path->nodes[*level]; + bytenr = path->nodes[*level]->start; + } else { + parent = path->nodes[*level + 1]; + bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); + } + + blocksize = btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, root_owner, root_gen, + *level, 1); + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + BUG_ON(ret); + + cond_resched(); + return 0; +} + +/* + * helper function for drop_subtree, this function is similar to + * walk_down_tree. The main difference is that it checks reference + * counts while tree blocks are locked. 
+ */ +static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) +{ + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u64 bytenr; + u64 ptr_gen; + u32 blocksize; + u32 refs; + int ret; + + cur = path->nodes[*level]; + ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, + &refs); + BUG_ON(ret); + if (refs > 1) + goto out; + + while (*level >= 0) { + cur = path->nodes[*level]; + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, cur); + BUG_ON(ret); + clean_tree_block(trans, root, cur); + break; + } + if (path->slots[*level] >= btrfs_header_nritems(cur)) { + clean_tree_block(trans, root, cur); + break; + } + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + + next = read_tree_block(root, bytenr, blocksize, ptr_gen); + btrfs_tree_lock(next); + + ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, + &refs); + BUG_ON(ret); + if (refs > 1) { + parent = path->nodes[*level]; + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + *level - 1, 1); + BUG_ON(ret); + path->slots[*level]++; + btrfs_tree_unlock(next); + free_extent_buffer(next); + continue; + } + + *level = btrfs_header_level(next); + path->nodes[*level] = next; + path->slots[*level] = 0; + path->locks[*level] = 1; + cond_resched(); + } +out: + parent = path->nodes[*level + 1]; + bytenr = path->nodes[*level]->start; + blocksize = path->nodes[*level]->len; + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, btrfs_header_owner(parent), + btrfs_header_generation(parent), *level, 1); + BUG_ON(ret); + + if (path->locks[*level]) { + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + cond_resched(); + return 0; +} + +/* + * helper for dropping snapshots. 
This walks back up the tree in the path + * to find the first node higher up where we haven't yet gone through + * all the slots + */ +static noinline int walk_up_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int *level, int max_level) +{ + u64 root_owner; + u64 root_gen; + struct btrfs_root_item *root_item = &root->root_item; + int i; + int slot; + int ret; + + for (i = *level; i < max_level && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + node = path->nodes[i]; + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + btrfs_node_key(node, &disk_key, path->slots[i]); + memcpy(&root_item->drop_progress, + &disk_key, sizeof(disk_key)); + root_item->drop_level = i; + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + clean_tree_block(trans, root, path->nodes[*level]); + ret = btrfs_free_extent(trans, root, + path->nodes[*level]->start, + path->nodes[*level]->len, + parent->start, root_owner, + root_gen, *level, 1); + BUG_ON(ret); + if (path->locks[*level]) { + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + struct btrfs_root_item *root_item = &root->root_item; + + WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); + path = btrfs_alloc_path(); + BUG_ON(!path); + + level = btrfs_header_level(root->node); + orig_level = level; + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + path->nodes[level] = root->node; + extent_buffer_get(root->node); + path->slots[level] = 0; + } else { + struct btrfs_key key; + struct btrfs_disk_key found_key; + struct extent_buffer *node; + + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + level = root_item->drop_level; + path->lowest_level = level; + wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (wret < 0) { + ret = wret; + goto out; + } + node = path->nodes[level]; + btrfs_node_key(node, &found_key, path->slots[level]); + WARN_ON(memcmp(&found_key, &root_item->drop_progress, + sizeof(found_key))); + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (path->nodes[i] && path->locks[i]) { + path->locks[i] = 0; + btrfs_tree_unlock(path->nodes[i]); + } + } + } + while (1) { + wret = walk_down_tree(trans, root, path, &level); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_tree(trans, root, path, &level, + BTRFS_MAX_LEVEL); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + if (trans->transaction->in_commit) { + ret = -EAGAIN; + break; + } + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + } + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + 
free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent) +{ + struct btrfs_path *path; + int level; + int parent_level; + int ret = 0; + int wret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + BUG_ON(!btrfs_tree_locked(parent)); + parent_level = btrfs_header_level(parent); + extent_buffer_get(parent); + path->nodes[parent_level] = parent; + path->slots[parent_level] = btrfs_header_nritems(parent); + + BUG_ON(!btrfs_tree_locked(node)); + level = btrfs_header_level(node); + extent_buffer_get(node); + path->nodes[level] = node; + path->slots[level] = 0; + + while (1) { + wret = walk_down_subtree(trans, root, path, &level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + + wret = walk_up_tree(trans, root, path, &level, parent_level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + } + + btrfs_free_path(path); + return ret; +} + +static unsigned long calc_ra(unsigned long start, unsigned long last, + unsigned long nr) +{ + return min(last, start + nr - 1); +} + +static noinline int relocate_inode_pages(struct inode *inode, u64 start, + u64 len) +{ + u64 page_start; + u64 page_end; + unsigned long first_index; + unsigned long last_index; + unsigned long i; + struct page *page; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct file_ra_state *ra; + struct btrfs_ordered_extent *ordered; + unsigned int total_read = 0; + unsigned int total_dirty = 0; + int ret = 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + + mutex_lock(&inode->i_mutex); + first_index = start >> PAGE_CACHE_SHIFT; + last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + + /* make sure the dirty trick played by the caller work */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + + for (i = first_index ; i <= last_index; i++) { + if (total_read % ra->ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, ra, NULL, i, + calc_ra(i, last_index, ra->ra_pages)); + } + total_read++; +again: + if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) + BUG_ON(1); + page = grab_cache_page(inode->i_mapping, i); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + if (i == first_index) + set_extent_bits(io_tree, page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); + total_dirty++; + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + } + +out_unlock: + kfree(ra); + mutex_unlock(&inode->i_mutex); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 
total_dirty); + return ret; +} + +static noinline int relocate_data_extent(struct inode *reloc_inode, + struct btrfs_key *extent_key, + u64 offset) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; + struct extent_map *em; + u64 start = extent_key->objectid - offset; + u64 end = start + extent_key->offset - 1; + + em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em || IS_ERR(em)); + + em->start = start; + em->len = extent_key->offset; + em->block_len = extent_key->offset; + em->block_start = extent_key->objectid; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* setup extent map to cheat btrfs_readpage */ + lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(reloc_inode, start, end, 0); + } + unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + + return relocate_inode_pages(reloc_inode, start, extent_key->offset); +} + +struct btrfs_ref_path { + u64 extent_start; + u64 nodes[BTRFS_MAX_LEVEL]; + u64 root_objectid; + u64 root_generation; + u64 owner_objectid; + u32 num_refs; + int lowest_level; + int current_level; + int shared_level; + + struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; + u64 new_nodes[BTRFS_MAX_LEVEL]; +}; + +struct disk_extent { + u64 ram_bytes; + u64 disk_bytenr; + u64 disk_num_bytes; + u64 offset; + u64 num_bytes; + u8 compression; + u8 encryption; + u16 other_encoding; +}; + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static noinline int __next_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path, + int first_time) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_extent_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; + u64 bytenr; + u32 nritems; + int level; + int ret = 1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (first_time) { + ref_path->lowest_level = -1; + ref_path->current_level = -1; + ref_path->shared_level = -1; + goto walk_up; + } +walk_down: + level = ref_path->current_level - 1; + while (level >= -1) { + u64 parent; + if (level < ref_path->lowest_level) + break; + + if (level >= 0) + bytenr = ref_path->nodes[level]; + else + bytenr = ref_path->extent_start; + BUG_ON(bytenr == 0); + + parent = ref_path->nodes[level + 1]; + ref_path->nodes[level + 1] = 0; + ref_path->current_level = level; + BUG_ON(parent == 0); + + key.objectid = bytenr; + key.offset = parent + 1; + key.type = BTRFS_EXTENT_REF_KEY; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret > 0) + goto next; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid == bytenr && + found_key.type == 
BTRFS_EXTENT_REF_KEY) { + if (level < ref_path->shared_level) + ref_path->shared_level = level; + goto found; + } +next: + level--; + btrfs_release_path(extent_root, path); + cond_resched(); + } + /* reached lowest level */ + ret = 1; + goto out; +walk_up: + level = ref_path->current_level; + while (level < BTRFS_MAX_LEVEL - 1) { + u64 ref_objectid; + + if (level >= 0) + bytenr = ref_path->nodes[level]; + else + bytenr = ref_path->extent_start; + + BUG_ON(bytenr == 0); + + key.objectid = bytenr; + key.offset = 0; + key.type = BTRFS_EXTENT_REF_KEY; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret > 0) { + /* the extent was freed by someone */ + if (ref_path->lowest_level == level) + goto out; + btrfs_release_path(extent_root, path); + goto walk_down; + } + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY) { + /* the extent was freed by someone */ + if (ref_path->lowest_level == level) { + ret = 1; + goto out; + } + btrfs_release_path(extent_root, path); + goto walk_down; + } +found: + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (first_time) { + level = (int)ref_objectid; + BUG_ON(level >= BTRFS_MAX_LEVEL); + ref_path->lowest_level = level; + ref_path->current_level = level; + ref_path->nodes[level] = bytenr; + } else { + WARN_ON(ref_objectid != level); + } + } else { + WARN_ON(level != -1); + } + first_time = 0; + + if (ref_path->lowest_level == level) { + ref_path->owner_objectid = ref_objectid; + ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); + } + + /* + * the block is tree root or the block isn't in reference + * counted tree. + */ + if (found_key.objectid == found_key.offset || + is_cowonly_root(btrfs_ref_root(leaf, ref))) { + ref_path->root_objectid = btrfs_ref_root(leaf, ref); + ref_path->root_generation = + btrfs_ref_generation(leaf, ref); + if (level < 0) { + /* special reference from the tree log */ + ref_path->nodes[0] = found_key.offset; + ref_path->current_level = 0; + } + ret = 0; + goto out; + } + + level++; + BUG_ON(ref_path->nodes[level] != 0); + ref_path->nodes[level] = found_key.offset; + ref_path->current_level = level; + + /* + * the reference was created in the running transaction, + * no need to continue walking up. + */ + if (btrfs_ref_generation(leaf, ref) == trans->transid) { + ref_path->root_objectid = btrfs_ref_root(leaf, ref); + ref_path->root_generation = + btrfs_ref_generation(leaf, ref); + ret = 0; + goto out; + } + + btrfs_release_path(extent_root, path); + cond_resched(); + } + /* reached max tree level, but no tree root found. 
*/ + BUG(); +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path, + u64 extent_start) +{ + memset(ref_path, 0, sizeof(*ref_path)); + ref_path->extent_start = extent_start; + + return __next_ref_path(trans, extent_root, ref_path, 1); +} + +static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path) +{ + return __next_ref_path(trans, extent_root, ref_path, 0); +} + +static noinline int get_new_locations(struct inode *reloc_inode, + struct btrfs_key *extent_key, + u64 offset, int no_fragment, + struct disk_extent **extents, + int *nr_extents) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct disk_extent *exts = *extents; + struct btrfs_key found_key; + u64 cur_pos; + u64 last_byte; + u32 nritems; + int nr = 0; + int max = *nr_extents; + int ret; + + WARN_ON(!no_fragment && *extents); + if (!exts) { + max = 1; + exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); + if (!exts) + return -ENOMEM; + } + + path = btrfs_alloc_path(); + BUG_ON(!path); + + cur_pos = extent_key->objectid - offset; + last_byte = extent_key->objectid + extent_key->offset; + ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, + cur_pos, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.offset != cur_pos || + found_key.type != BTRFS_EXTENT_DATA_KEY || + found_key.objectid != reloc_inode->i_ino) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) == 0) + break; + + if (nr == max) { + struct disk_extent *old = exts; + max *= 2; + exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); + memcpy(exts, old, sizeof(*exts) * nr); + if (old != *extents) + kfree(old); + } + + exts[nr].disk_bytenr = + btrfs_file_extent_disk_bytenr(leaf, fi); + exts[nr].disk_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); + exts[nr].offset = btrfs_file_extent_offset(leaf, fi); + exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + exts[nr].compression = btrfs_file_extent_compression(leaf, fi); + exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); + exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, + fi); + BUG_ON(exts[nr].offset > 0); + BUG_ON(exts[nr].compression || exts[nr].encryption); + BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); + + cur_pos += exts[nr].num_bytes; + nr++; + + if (cur_pos + offset >= last_byte) + break; + + if (no_fragment) { + ret = 1; + goto out; + } + path->slots[0]++; + } + + BUG_ON(cur_pos + offset > last_byte); + if (cur_pos + offset < last_byte) { + ret = -ENOENT; + goto out; + } + ret = 0; +out: + btrfs_free_path(path); + if (ret) { + if (exts != *extents) + kfree(exts); + } else { + *extents = exts; + *nr_extents = nr; + } + return ret; +} + +static noinline int replace_one_extent(struct btrfs_trans_handle 
*trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *extent_key, + struct btrfs_key *leaf_key, + struct btrfs_ref_path *ref_path, + struct disk_extent *new_extents, + int nr_extents) +{ + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + struct btrfs_key key; + u64 lock_start = 0; + u64 lock_end = 0; + u64 num_bytes; + u64 ext_offset; + u64 first_pos; + u32 nritems; + int nr_scaned = 0; + int extent_locked = 0; + int extent_type; + int ret; + + memcpy(&key, leaf_key, sizeof(key)); + first_pos = INT_LIMIT(loff_t) - extent_key->offset; + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { + if (key.objectid < ref_path->owner_objectid || + (key.objectid == ref_path->owner_objectid && + key.type < BTRFS_EXTENT_DATA_KEY)) { + key.objectid = ref_path->owner_objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + } + } + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); +next: + if (extent_locked && ret > 0) { + /* + * the file extent item was modified by someone + * before the extent got locked. + */ + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + extent_locked = 0; + } + + if (path->slots[0] >= nritems) { + if (++nr_scaned > 2) + break; + + BUG_ON(extent_locked); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { + if ((key.objectid > ref_path->owner_objectid) || + (key.objectid == ref_path->owner_objectid && + key.type > BTRFS_EXTENT_DATA_KEY) || + (key.offset >= first_pos + extent_key->offset)) + break; + } + + if (inode && key.objectid != inode->i_ino) { + BUG_ON(extent_locked); + btrfs_release_path(root, path); + mutex_unlock(&inode->i_mutex); + iput(inode); + inode = NULL; + continue; + } + + if (key.type != BTRFS_EXTENT_DATA_KEY) { + path->slots[0]++; + ret = 1; + goto next; + } + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + if ((extent_type != BTRFS_FILE_EXTENT_REG && + extent_type != BTRFS_FILE_EXTENT_PREALLOC) || + (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid)) { + path->slots[0]++; + ret = 1; + goto next; + } + + num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + ext_offset = btrfs_file_extent_offset(leaf, fi); + + if (first_pos > key.offset - ext_offset) + first_pos = key.offset - ext_offset; + + if (!extent_locked) { + lock_start = key.offset; + lock_end = lock_start + num_bytes - 1; + } else { + if (lock_start > key.offset || + lock_end + 1 < key.offset + num_bytes) { + unlock_extent(&BTRFS_I(inode)->io_tree, + lock_start, lock_end, GFP_NOFS); + extent_locked = 0; + } + } + + if (!inode) { + btrfs_release_path(root, path); + + inode = btrfs_iget_locked(root->fs_info->sb, + key.objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = + key.objectid; + BTRFS_I(inode)->location.type = + BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + /* + * some code call btrfs_commit_transaction while + * holding the i_mutex, so we can't use mutex_lock + * here. 
+ */ + if (is_bad_inode(inode) || + !mutex_trylock(&inode->i_mutex)) { + iput(inode); + inode = NULL; + key.offset = (u64)-1; + goto skip; + } + } + + if (!extent_locked) { + struct btrfs_ordered_extent *ordered; + + btrfs_release_path(root, path); + + lock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + lock_end); + if (ordered && + ordered->file_offset <= lock_end && + ordered->file_offset + ordered->len > lock_start) { + unlock_extent(&BTRFS_I(inode)->io_tree, + lock_start, lock_end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + key.offset += num_bytes; + goto skip; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + extent_locked = 1; + continue; + } + + if (nr_extents == 1) { + /* update extent pointer in place */ + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extents[0].disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extents[0].disk_num_bytes); + btrfs_mark_buffer_dirty(leaf); + + btrfs_drop_extent_cache(inode, key.offset, + key.offset + num_bytes - 1, 0); + + ret = btrfs_inc_extent_ref(trans, root, + new_extents[0].disk_bytenr, + new_extents[0].disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, + key.objectid); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, + extent_key->objectid, + extent_key->offset, + leaf->start, + btrfs_header_owner(leaf), + btrfs_header_generation(leaf), + key.objectid, 0); + BUG_ON(ret); + + btrfs_release_path(root, path); + key.offset += num_bytes; + } else { + BUG_ON(1); +#if 0 + u64 alloc_hint; + u64 extent_len; + int i; + /* + * drop old extent pointer at first, then insert the + * new pointers one bye one + */ + btrfs_release_path(root, path); + ret = btrfs_drop_extents(trans, root, inode, key.offset, + key.offset + num_bytes, + key.offset, &alloc_hint); + BUG_ON(ret); + + for (i = 0; i < nr_extents; i++) { + if (ext_offset >= new_extents[i].num_bytes) { + ext_offset -= new_extents[i].num_bytes; + continue; + } + extent_len = min(new_extents[i].num_bytes - + ext_offset, num_bytes); + + ret = btrfs_insert_empty_item(trans, root, + path, &key, + sizeof(*fi)); + BUG_ON(ret); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extents[i].disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extents[i].disk_num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extents[i].ram_bytes); + + btrfs_set_file_extent_compression(leaf, fi, + new_extents[i].compression); + btrfs_set_file_extent_encryption(leaf, fi, + new_extents[i].encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, + new_extents[i].other_encoding); + + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len); + ext_offset += new_extents[i].offset; + btrfs_set_file_extent_offset(leaf, fi, + ext_offset); + btrfs_mark_buffer_dirty(leaf); + + btrfs_drop_extent_cache(inode, key.offset, + key.offset + extent_len - 1, 0); + + ret = btrfs_inc_extent_ref(trans, root, + new_extents[i].disk_bytenr, + new_extents[i].disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, key.objectid); + BUG_ON(ret); + btrfs_release_path(root, path); + + inode_add_bytes(inode, extent_len); + + ext_offset = 0; + num_bytes -= extent_len; + key.offset += 
extent_len; + + if (num_bytes == 0) + break; + } + BUG_ON(i >= nr_extents); +#endif + } + + if (extent_locked) { + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + extent_locked = 0; + } +skip: + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && + key.offset >= first_pos + extent_key->offset) + break; + + cond_resched(); + } + ret = 0; +out: + btrfs_release_path(root, path); + if (inode) { + mutex_unlock(&inode->i_mutex); + if (extent_locked) { + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + } + iput(inode); + } + return ret; +} + +int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, u64 orig_start) +{ + int level; + int ret; + + BUG_ON(btrfs_header_generation(buf) != trans->transid); + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + level = btrfs_header_level(buf); + if (level == 0) { + struct btrfs_leaf_ref *ref; + struct btrfs_leaf_ref *orig_ref; + + orig_ref = btrfs_lookup_leaf_ref(root, orig_start); + if (!orig_ref) + return -ENOENT; + + ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); + if (!ref) { + btrfs_free_leaf_ref(root, orig_ref); + return -ENOMEM; + } + + ref->nritems = orig_ref->nritems; + memcpy(ref->extents, orig_ref->extents, + sizeof(ref->extents[0]) * ref->nritems); + + btrfs_free_leaf_ref(root, orig_ref); + + ref->root_gen = trans->transid; + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + ret = btrfs_add_leaf_ref(root, ref, 0); + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } + return 0; +} + +static noinline int invalidate_extent_cache(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_block_group_cache *group, + struct btrfs_root *target_root) +{ + struct btrfs_key key; + struct inode *inode = NULL; + struct btrfs_file_extent_item *fi; + u64 num_bytes; + u64 skip_objectid = 0; + u32 nritems; + u32 i; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.objectid == skip_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) + continue; + if (!inode || inode->i_ino != key.objectid) { + iput(inode); + inode = btrfs_ilookup(target_root->fs_info->sb, + key.objectid, target_root, 1); + } + if (!inode) { + skip_objectid = key.objectid; + continue; + } + num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + + lock_extent(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, GFP_NOFS); + btrfs_drop_extent_cache(inode, key.offset, + key.offset + num_bytes - 1, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, GFP_NOFS); + cond_resched(); + } + iput(inode); + return 0; +} + +static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode) +{ + struct btrfs_key key; + struct btrfs_key extent_key; + struct btrfs_file_extent_item *fi; + struct btrfs_leaf_ref *ref; + struct disk_extent *new_extent; + u64 bytenr; + u64 num_bytes; + u32 nritems; + u32 i; + int ext_index; + int nr_extent; + int ret; + + new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); + BUG_ON(!new_extent); 
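+	/*
+	 * the cached leaf ref holds one record per data extent in this
+	 * leaf; it is updated in lockstep with the file extent items
+	 * rewritten below so the cache never goes stale.
+	 */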
+ + ref = btrfs_lookup_leaf_ref(root, leaf->start); + BUG_ON(!ref); + + ext_index = -1; + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + + ext_index++; + if (bytenr >= group->key.objectid + group->key.offset || + bytenr + num_bytes <= group->key.objectid) + continue; + + extent_key.objectid = bytenr; + extent_key.offset = num_bytes; + extent_key.type = BTRFS_EXTENT_ITEM_KEY; + nr_extent = 1; + ret = get_new_locations(reloc_inode, &extent_key, + group->key.objectid, 1, + &new_extent, &nr_extent); + if (ret > 0) + continue; + BUG_ON(ret < 0); + + BUG_ON(ref->extents[ext_index].bytenr != bytenr); + BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); + ref->extents[ext_index].bytenr = new_extent->disk_bytenr; + ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; + + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extent->disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extent->disk_num_bytes); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, + new_extent->disk_bytenr, + new_extent->disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, key.objectid); + BUG_ON(ret); + ret = btrfs_free_extent(trans, root, + bytenr, num_bytes, leaf->start, + btrfs_header_owner(leaf), + btrfs_header_generation(leaf), + key.objectid, 0); + BUG_ON(ret); + cond_resched(); + } + kfree(new_extent); + BUG_ON(ext_index + 1 != ref->nritems); + btrfs_free_leaf_ref(root, ref); + return 0; +} + +int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + int ret; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + root->reloc_root = NULL; + list_add(&reloc_root->dead_list, + &root->fs_info->dead_reloc_roots); + + btrfs_set_root_bytenr(&reloc_root->root_item, + reloc_root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(reloc_root->node)); + memset(&reloc_root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + reloc_root->root_item.drop_level = 0; + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, + &reloc_root->root_item); + BUG_ON(ret); + } + return 0; +} + +int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root *prev_root = NULL; + struct list_head dead_roots; + int ret; + unsigned long nr; + + INIT_LIST_HEAD(&dead_roots); + list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); + + while (!list_empty(&dead_roots)) { + reloc_root = list_entry(dead_roots.prev, + struct btrfs_root, dead_list); + list_del_init(&reloc_root->dead_list); + + BUG_ON(reloc_root->commit_root != NULL); + while (1) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, reloc_root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); + } + + 
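+		/*
+		 * anything other than -EAGAIN ends the retry loop above, so
+		 * the snapshot under this reloc root is gone (or dropping it
+		 * failed fatally).  drop_mutex and the transaction are still
+		 * held; release the root node and delete the root item before
+		 * ending the transaction.
+		 */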
free_extent_buffer(reloc_root->node); + + ret = btrfs_del_root(trans, root->fs_info->tree_root, + &reloc_root->root_key); + BUG_ON(ret); + mutex_unlock(&root->fs_info->drop_mutex); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); + + kfree(prev_root); + prev_root = reloc_root; + } + if (prev_root) { + btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); + kfree(prev_root); + } + return 0; +} + +int btrfs_add_dead_reloc_root(struct btrfs_root *root) +{ + list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); + return 0; +} + +int btrfs_cleanup_reloc_trees(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_trans_handle *trans; + struct btrfs_key location; + int found; + int ret; + + mutex_lock(&root->fs_info->tree_reloc_mutex); + ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); + BUG_ON(ret); + found = !list_empty(&root->fs_info->dead_reloc_roots); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + + if (found) { + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + } + + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + location.offset = (u64)-1; + location.type = BTRFS_ROOT_ITEM_KEY; + + reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + BUG_ON(!reloc_root); + btrfs_orphan_cleanup(reloc_root); + return 0; +} + +static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + BUG_ON(!root->ref_cows); + if (root->reloc_root) + return 0; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + ret = btrfs_copy_root(trans, root, root->commit_root, + &eb, BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.offset = root->root_key.objectid; + root_key.type = BTRFS_ROOT_ITEM_KEY; + + memcpy(root_item, &root->root_item, sizeof(root_item)); + btrfs_set_root_refs(root_item, 0); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(!reloc_root); + reloc_root->last_trans = trans->transid; + reloc_root->commit_root = NULL; + reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; + + root->reloc_root = reloc_root; + return 0; +} + +/* + * Core function of space balance. + * + * The idea is using reloc trees to relocate tree blocks in reference + * counted roots. There is one reloc tree for each subvol, and all + * reloc trees share same root key objectid. Reloc trees are snapshots + * of the latest committed roots of subvols (root->commit_root). + * + * To relocate a tree block referenced by a subvol, there are two steps. + * COW the block through subvol's reloc tree, then update block pointer + * in the subvol to point to the new block. Since all reloc trees share + * same root key objectid, doing special handing for tree blocks owned + * by them is easy. 
Once a tree block has been COWed in one reloc tree, + * we can use the resulting new block directly when the same block is + * required to COW again through other reloc trees. By this way, relocated + * tree blocks are shared between reloc trees, so they are also shared + * between subvols. + */ +static noinline int relocate_one_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *first_key, + struct btrfs_ref_path *ref_path, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb = NULL; + struct btrfs_key *keys; + u64 *nodes; + int level; + int shared_level; + int lowest_level = 0; + int ret; + + if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + lowest_level = ref_path->owner_objectid; + + if (!root->ref_cows) { + path->lowest_level = lowest_level; + ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); + BUG_ON(ret < 0); + path->lowest_level = 0; + btrfs_release_path(root, path); + return 0; + } + + mutex_lock(&root->fs_info->tree_reloc_mutex); + ret = init_reloc_tree(trans, root); + BUG_ON(ret); + reloc_root = root->reloc_root; + + shared_level = ref_path->shared_level; + ref_path->shared_level = BTRFS_MAX_LEVEL - 1; + + keys = ref_path->node_keys; + nodes = ref_path->new_nodes; + memset(&keys[shared_level + 1], 0, + sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); + memset(&nodes[shared_level + 1], 0, + sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); + + if (nodes[lowest_level] == 0) { + path->lowest_level = lowest_level; + ret = btrfs_search_slot(trans, reloc_root, first_key, path, + 0, 1); + BUG_ON(ret); + for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { + eb = path->nodes[level]; + if (!eb || eb == reloc_root->node) + break; + nodes[level] = eb->start; + if (level == 0) + btrfs_item_key_to_cpu(eb, &keys[level], 0); + else + btrfs_node_key_to_cpu(eb, &keys[level], 0); + } + if (nodes[0] && + ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + eb = path->nodes[0]; + ret = replace_extents_in_leaf(trans, reloc_root, eb, + group, reloc_inode); + BUG_ON(ret); + } + btrfs_release_path(reloc_root, path); + } else { + ret = btrfs_merge_path(trans, reloc_root, keys, nodes, + lowest_level); + BUG_ON(ret); + } + + /* + * replace tree blocks in the fs tree with tree blocks in + * the reloc tree. 
+ */ + ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); + BUG_ON(ret < 0); + + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_search_slot(trans, reloc_root, first_key, path, + 0, 0); + BUG_ON(ret); + extent_buffer_get(path->nodes[0]); + eb = path->nodes[0]; + btrfs_release_path(reloc_root, path); + ret = invalidate_extent_cache(reloc_root, eb, group, root); + BUG_ON(ret); + free_extent_buffer(eb); + } + + mutex_unlock(&root->fs_info->tree_reloc_mutex); + path->lowest_level = 0; + return 0; +} + +static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *first_key, + struct btrfs_ref_path *ref_path) +{ + int ret; + + ret = relocate_one_path(trans, root, path, first_key, + ref_path, NULL, NULL); + BUG_ON(ret); + + if (root == root->fs_info->extent_root) + btrfs_extent_post_op(trans, root); + + return 0; +} + +static noinline int del_extent_zero(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key) +{ + int ret; + + ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); + if (ret) + goto out; + ret = btrfs_del_item(trans, extent_root, path); +out: + btrfs_release_path(extent_root, path); + return ret; +} + +static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, + struct btrfs_ref_path *ref_path) +{ + struct btrfs_key root_key; + + root_key.objectid = ref_path->root_objectid; + root_key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(ref_path->root_objectid)) + root_key.offset = 0; + else + root_key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &root_key); +} + +static noinline int relocate_one_extent(struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode, int pass) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *found_root; + struct btrfs_ref_path *ref_path = NULL; + struct disk_extent *new_extents = NULL; + int nr_extents = 0; + int loops; + int ret; + int level; + struct btrfs_key first_key; + u64 prev_block = 0; + + + trans = btrfs_start_transaction(extent_root, 1); + BUG_ON(!trans); + + if (extent_key->objectid == 0) { + ret = del_extent_zero(trans, extent_root, path, extent_key); + goto out; + } + + ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); + if (!ref_path) { + ret = -ENOMEM; + goto out; + } + + for (loops = 0; ; loops++) { + if (loops == 0) { + ret = btrfs_first_ref_path(trans, extent_root, ref_path, + extent_key->objectid); + } else { + ret = btrfs_next_ref_path(trans, extent_root, ref_path); + } + if (ret < 0) + goto out; + if (ret > 0) + break; + + if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || + ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) + continue; + + found_root = read_ref_root(extent_root->fs_info, ref_path); + BUG_ON(!found_root); + /* + * for reference counted tree, only process reference paths + * rooted at the latest committed root. 
+ */ + if (found_root->ref_cows && + ref_path->root_generation != found_root->root_key.offset) + continue; + + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + if (pass == 0) { + /* + * copy data extents to new locations + */ + u64 group_start = group->key.objectid; + ret = relocate_data_extent(reloc_inode, + extent_key, + group_start); + if (ret < 0) + goto out; + break; + } + level = 0; + } else { + level = ref_path->owner_objectid; + } + + if (prev_block != ref_path->nodes[level]) { + struct extent_buffer *eb; + u64 block_start = ref_path->nodes[level]; + u64 block_size = btrfs_level_size(found_root, level); + + eb = read_tree_block(found_root, block_start, + block_size, 0); + btrfs_tree_lock(eb); + BUG_ON(level != btrfs_header_level(eb)); + + if (level == 0) + btrfs_item_key_to_cpu(eb, &first_key, 0); + else + btrfs_node_key_to_cpu(eb, &first_key, 0); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + prev_block = block_start; + } + + btrfs_record_root_in_trans(found_root); + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + /* + * try to update data extent references while + * keeping metadata shared between snapshots. + */ + if (pass == 1) { + ret = relocate_one_path(trans, found_root, + path, &first_key, ref_path, + group, reloc_inode); + if (ret < 0) + goto out; + continue; + } + /* + * use fallback method to process the remaining + * references. + */ + if (!new_extents) { + u64 group_start = group->key.objectid; + new_extents = kmalloc(sizeof(*new_extents), + GFP_NOFS); + nr_extents = 1; + ret = get_new_locations(reloc_inode, + extent_key, + group_start, 1, + &new_extents, + &nr_extents); + if (ret) + goto out; + } + ret = replace_one_extent(trans, found_root, + path, extent_key, + &first_key, ref_path, + new_extents, nr_extents); + } else { + ret = relocate_tree_block(trans, found_root, path, + &first_key, ref_path); + } + if (ret < 0) + goto out; + } + ret = 0; +out: + btrfs_end_transaction(trans, extent_root); + kfree(new_extents); + kfree(ref_path); + return ret; +} + +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) +{ + u64 num_devices; + u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + + num_devices = root->fs_info->fs_devices->rw_devices; + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + return flags; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* turn single device chunks into raid0 */ + return stripped | BTRFS_BLOCK_GROUP_RAID0; + } + return flags; +} + +static int __alloc_chunk_for_shrink(struct btrfs_root *root, + struct btrfs_block_group_cache *shrink_block_group, + int force) +{ + struct btrfs_trans_handle *trans; + u64 new_alloc_flags; + u64 calc; + + spin_lock(&shrink_block_group->lock); + if (btrfs_block_group_used(&shrink_block_group->item) > 0) { + spin_unlock(&shrink_block_group->lock); + + trans = btrfs_start_transaction(root, 1); + spin_lock(&shrink_block_group->lock); + + new_alloc_flags = 
update_block_group_flags(root, + shrink_block_group->flags); + if (new_alloc_flags != shrink_block_group->flags) { + calc = + btrfs_block_group_used(&shrink_block_group->item); + } else { + calc = shrink_block_group->key.offset; + } + spin_unlock(&shrink_block_group->lock); + + do_chunk_alloc(trans, root->fs_info->extent_root, + calc + 2 * 1024 * 1024, new_alloc_flags, force); + + btrfs_end_transaction(trans, root); + } else + spin_unlock(&shrink_block_group->lock); + return 0; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 size) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, size); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); +out: + btrfs_free_path(path); + return ret; +} + +static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key root_key; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + err = btrfs_find_free_objectid(trans, root, objectid, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid, group->key.offset); + BUG_ON(err); + + err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, + group->key.offset, 0, group->key.offset, + 0, 0, 0); + BUG_ON(err); + + inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = objectid; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + BUG_ON(is_bad_inode(inode)); + } else { + BUG_ON(1); + } + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + btrfs_end_transaction(trans, root); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct list_head list; + size_t offset; + int ret; + u64 disk_bytenr; + + INIT_LIST_HEAD(&list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list); + + while (!list_empty(&list)) { + sums = 
list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return 0; +} + +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_fs_info *info = root->fs_info; + struct extent_buffer *leaf; + struct inode *reloc_inode; + struct btrfs_block_group_cache *block_group; + struct btrfs_key key; + u64 skipped; + u64 cur_byte; + u64 total_found; + u32 nritems; + int ret; + int progress; + int pass = 0; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(info, group_start); + BUG_ON(!block_group); + + printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->flags); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + reloc_inode = create_reloc_inode(info, block_group); + BUG_ON(IS_ERR(reloc_inode)); + + __alloc_chunk_for_shrink(root, block_group, 1); + set_block_group_readonly(block_group); + + btrfs_start_delalloc_inodes(info->tree_root); + btrfs_wait_ordered_extents(info->tree_root, 0); +again: + skipped = 0; + total_found = 0; + progress = 0; + key.objectid = block_group->key.objectid; + key.offset = 0; + key.type = 0; + cur_byte = key.objectid; + + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(info->tree_root); + btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); + mutex_unlock(&root->fs_info->cleaner_mutex); + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; +next: + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret == 1) { + ret = 0; + break; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (progress && need_resched()) { + btrfs_release_path(root, path); + cond_resched(); + progress = 0; + continue; + } + progress = 1; + + if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || + key.objectid + key.offset <= cur_byte) { + path->slots[0]++; + goto next; + } + + total_found++; + cur_byte = key.objectid + key.offset; + btrfs_release_path(root, path); + + __alloc_chunk_for_shrink(root, block_group, 0); + ret = relocate_one_extent(root, path, &key, block_group, + reloc_inode, pass); + BUG_ON(ret < 0); + if (ret > 0) + skipped++; + + key.objectid = cur_byte; + key.type = 0; + key.offset = 0; + } + + btrfs_release_path(root, path); + + if (pass == 0) { + btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); + invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); + } + + if (total_found > 0) { + printk(KERN_INFO "btrfs found %llu extents in pass %d\n", + (unsigned long long)total_found, pass); + pass++; + if (total_found == skipped && pass > 2) { + iput(reloc_inode); + reloc_inode = create_reloc_inode(info, block_group); + pass = 0; + } + goto again; + } + + /* delete reloc_inode */ + 
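/* the relocation inode was added to the orphan list by + * create_reloc_inode(), so dropping our reference here lets it be + * cleaned up now that every relocation pass has completed */ +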
iput(reloc_inode); + + /* unpin extents in this range */ + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + + spin_lock(&block_group->lock); + WARN_ON(block_group->pinned > 0); + WARN_ON(block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&block_group->item) > 0); + spin_unlock(&block_group->lock); + put_block_group(block_group); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +{ + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = 0; + goto out; + } + path->slots[0]++; + } + ret = -ENOENT; +out: + return ret; +} + +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + struct rb_node *n; + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + btrfs_remove_free_space_cache(block_group); + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + WARN_ON(atomic_read(&block_group->count) != 1); + kfree(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + return 0; +} + +int btrfs_read_block_groups(struct btrfs_root *root) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *space_info; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + root = info->extent_root; + key.objectid = 0; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + ret = find_first_block_group(root, path, &key); + if (ret > 0) { + ret = 0; + goto error; + } + if (ret != 0) + goto error; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + ret = -ENOMEM; + break; + } + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + mutex_init(&cache->alloc_mutex); + mutex_init(&cache->cache_mutex); + INIT_LIST_HEAD(&cache->list); + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + memcpy(&cache->key, &found_key, sizeof(found_key)); + + key.objectid = found_key.objectid + found_key.offset; + btrfs_release_path(root, path); + cache->flags = btrfs_block_group_flags(&cache->item); + + ret = update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + &space_info); + BUG_ON(ret); + cache->space_info = space_info; + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, 
&space_info->block_groups); + up_write(&space_info->groups_sem); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + set_avail_alloc_bits(root->fs_info, cache->flags); + if (btrfs_chunk_readonly(root, cache->key.objectid)) + set_block_group_readonly(cache); + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size) +{ + int ret; + struct btrfs_root *extent_root; + struct btrfs_block_group_cache *cache; + + extent_root = root->fs_info->extent_root; + + root->fs_info->last_trans_new_blockgroup = trans->transid; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return -ENOMEM; + + cache->key.objectid = chunk_offset; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + mutex_init(&cache->alloc_mutex); + mutex_init(&cache->cache_mutex); + INIT_LIST_HEAD(&cache->list); + + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + &cache->space_info); + BUG_ON(ret); + down_write(&cache->space_info->groups_sem); + list_add_tail(&cache->list, &cache->space_info->block_groups); + up_write(&cache->space_info->groups_sem); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, + sizeof(cache->item)); + BUG_ON(ret); + + finish_current_insert(trans, extent_root, 0); + ret = del_pending_extents(trans, extent_root, 0); + BUG_ON(ret); + set_avail_alloc_bits(extent_root->fs_info, type); + + return 0; +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start) +{ + struct btrfs_path *path; + struct btrfs_block_group_cache *block_group; + struct btrfs_key key; + int ret; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(root->fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + memcpy(&key, &block_group->key, sizeof(key)); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + btrfs_remove_free_space_cache(block_group); + rb_erase(&block_group->cache_node, + &root->fs_info->block_group_cache_tree); + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + spin_lock(&block_group->space_info->lock); + block_group->space_info->total_bytes -= block_group->key.offset; + block_group->space_info->bytes_readonly -= block_group->key.offset; + spin_unlock(&block_group->space_info->lock); + block_group->space_info->full = 0; + + put_block_group(block_group); + put_block_group(block_group); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c new file mode 100644 index 00000000000..e086d407f1f --- /dev/null +++ b/fs/btrfs/extent_io.c @@ -0,0 +1,3717 @@ +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/bio.h> +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include 
<linux/page-flags.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/version.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "extent_io.h" +#include "extent_map.h" +#include "compat.h" +#include "ctree.h" +#include "btrfs_inode.h" + +/* temporary define until extent_map moves out of btrfs */ +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *, struct kmem_cache *, + unsigned long)); + +static struct kmem_cache *extent_state_cache; +static struct kmem_cache *extent_buffer_cache; + +static LIST_HEAD(buffers); +static LIST_HEAD(states); + +#define LEAK_DEBUG 0 +#ifdef LEAK_DEBUG +static DEFINE_SPINLOCK(leak_lock); +#endif + +#define BUFFER_LRU_MAX 64 + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +struct extent_page_data { + struct bio *bio; + struct extent_io_tree *tree; + get_extent_t *get_extent; + + /* tells writepage not to lock the state bits for this range + * it still does the unlocking + */ + int extent_locked; +}; + +int __init extent_io_init(void) +{ + extent_state_cache = btrfs_cache_create("extent_state", + sizeof(struct extent_state), 0, + NULL); + if (!extent_state_cache) + return -ENOMEM; + + extent_buffer_cache = btrfs_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + NULL); + if (!extent_buffer_cache) + goto free_state_cache; + return 0; + +free_state_cache: + kmem_cache_destroy(extent_state_cache); + return -ENOMEM; +} + +void extent_io_exit(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "btrfs state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + (unsigned long long)state->start, + (unsigned long long)state->end, + state->state, state->tree, atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + "refs %d\n", (unsigned long long)eb->start, + eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } + if (extent_state_cache) + kmem_cache_destroy(extent_state_cache); + if (extent_buffer_cache) + kmem_cache_destroy(extent_buffer_cache); +} + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask) +{ + tree->state.rb_node = NULL; + tree->buffer.rb_node = NULL; + tree->ops = NULL; + tree->dirty_bytes = 0; + spin_lock_init(&tree->lock); + spin_lock_init(&tree->buffer_lock); + tree->mapping = mapping; +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; +#ifdef LEAK_DEBUG + unsigned long flags; +#endif + + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + state->private = 0; + state->tree = NULL; +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + return state; +} + +static void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (atomic_dec_and_test(&state->refs)) { +#ifdef LEAK_DEBUG + unsigned long flags; +#endif 
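+ /* a state must already be detached from its io tree by the time the last reference is dropped */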
+ WARN_ON(state->tree); +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_state_cache, state); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset > entry->end) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct tree_entry, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + struct tree_entry *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset > entry->end) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_node *prev = NULL; + struct rb_node *ret; + + ret = __etree_search(tree, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, + u64 offset, struct rb_node *node) +{ + struct rb_root *root = &tree->buffer; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_buffer *eb; + + while (*p) { + parent = *p; + eb = rb_entry(parent, struct extent_buffer, rb_node); + + if (offset < eb->start) + p = &(*p)->rb_left; + else if (offset > eb->start) + p = &(*p)->rb_right; + else + return eb; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct extent_buffer *buffer_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_root *root = &tree->buffer; + struct rb_node *n = root->rb_node; + struct extent_buffer *eb; + + while (n) { + eb = rb_entry(n, struct extent_buffer, rb_node); + if (offset < eb->start) + n = n->rb_left; + else if (offset > eb->start) + n = n->rb_right; + else + return eb; + } + return NULL; +} + +/* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single + * extent in the tree. Extents with EXTENT_IO in their state field + * are not merged because the end_io handlers need to be able to do + * operations on them without sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. 
+ */ +static int merge_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *other; + struct rb_node *other_node; + + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + return 0; + + other_node = rb_prev(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); + } + } + other_node = rb_next(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); + } + } + return 0; +} + +static void set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) +{ + if (tree->ops && tree->ops->set_bit_hook) { + tree->ops->set_bit_hook(tree->mapping->host, state->start, + state->end, state->state, bits); + } +} + +static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) +{ + if (tree->ops && tree->ops->clear_bit_hook) { + tree->ops->clear_bit_hook(tree->mapping->host, state->start, + state->end, state->state, bits); + } +} + +/* + * insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, u64 start, u64 end, + int bits) +{ + struct rb_node *node; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", + (unsigned long long)end, + (unsigned long long)start); + WARN_ON(1); + } + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; + set_state_cb(tree, state, bits); + state->state |= bits; + state->start = start; + state->end = end; + node = tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + printk(KERN_ERR "btrfs found node %llu %llu on insert of " + "%llu %llu\n", (unsigned long long)found->start, + (unsigned long long)found->end, + (unsigned long long)start, (unsigned long long)end); + free_extent_state(state); + return -EEXIST; + } + state->tree = tree; + merge_state(tree, state); + return 0; +} + +/* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. 
+ */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *node; + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + free_extent_state(prealloc); + return -EEXIST; + } + prealloc->tree = tree; + return 0; +} + +/* + * utility function to clear some bits in an extent state struct. + * it will optionally wake up any one waiting on this state (wake == 1), or + * forcibly remove the state from the tree (delete == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) +{ + int ret = state->state & bits; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + WARN_ON(range > tree->dirty_bytes); + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); + state->state &= ~bits; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { + if (state->tree) { + clear_state_cb(tree, state, state->state); + rb_erase(&state->rb_node, &tree->state); + state->tree = NULL; + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + } + return ret; +} + +/* + * clear some bits on a range in the tree. This may require splitting + * or inserting elements in the tree, so the gfp mask is used to + * indicate which allocations or sleeping are allowed. + * + * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove + * the given range from the tree regardless of state (ie for truncate). + * + * the range [start, end] is inclusive. + * + * This takes the tree lock, and returns < 0 on error, > 0 if any of the + * bits were already set, or zero if none of the bits were already set. + */ +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err; + int set = 0; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > end) + goto out; + WARN_ON(state->end < start); + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip + * bits on second half. + * + * If the extent we found extends past our range, we + * just split and search again. It'll get split again + * the next time though. + * + * If the extent we found is inside our range, we clear + * the desired bit on it. 
+ */ + + if (state->start < start) { + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, + wake, delete); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + if (wake) + wake_up(&state->wq); + set |= clear_state_bit(tree, prealloc, bits, + wake, delete); + prealloc = NULL; + goto out; + } + + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, wake, delete); + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return set; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +static int wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); + return 0; +} + +/* + * waits for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. + * The tree lock is taken by this function + */ +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +{ + struct extent_state *state; + struct rb_node *node; + + spin_lock(&tree->lock); +again: + while (1) { + /* + * this search will find all the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + break; + + state = rb_entry(node, struct extent_state, rb_node); + + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + atomic_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } +out: + spin_unlock(&tree->lock); + return 0; +} + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int bits) +{ + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } + set_state_cb(tree, state, bits); + state->state |= bits; +} + +/* + * set some bits on a range in the tree. This may require allocations + * or sleeping, so the gfp mask is used to indicate what is allowed. + * + * If 'exclusive' == 1, this will fail with -EEXIST if some part of the + * range already has the desired bits set. The start of the existing + * range is returned in failed_start in this case. + * + * [start, end] is inclusive + * This takes the tree lock. 
+ */ +static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive, u64 *failed_start, + gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + int set; + u64 last_start; + u64 last_end; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + err = insert_state(tree, prealloc, start, end, bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + + state = rb_entry(node, struct extent_state, rb_node); + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + set = state->state & bits; + if (set && exclusive) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + set_state_bits(tree, state, bits); + start = state->end + 1; + merge_state(tree, state); + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits); + start = state->end + 1; + merge_state(tree, state); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. 
+ */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + err = insert_state(tree, prealloc, start, this_end, + bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + if (err) + goto out; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, bits); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +/* wrappers around set/clear extent bit */ +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + mask); +} + +int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); +} + +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, 0, NULL, + mask); +} + +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, bits, 0, 0, mask); +} + +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_DIRTY, + 0, NULL, mask); +} + +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); +} + +int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); +} + +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + mask); +} + +static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); +} + +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, + mask); +} + +static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); +} + +static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, + 0, NULL, mask); +} + +static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); +} + +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); +} + +/* + * either insert or lock state struct between 
start and end use mask to tell + * us if waiting is desired. + */ +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + int err; + u64 failed_start; + while (1) { + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, + &failed_start, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; + } else { + break; + } + WARN_ON(start > end); + } + return err; +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + int err; + u64 failed_start; + + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, + &failed_start, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, 1, 0, mask); + return 0; + } + return 1; +} + +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); +} + +/* + * helper function to set pages and extents in the tree dirty + */ +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + set_extent_dirty(tree, start, end, GFP_NOFS); + return 0; +} + +/* + * helper function to set both pages and extents in the tree writeback + */ +static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + set_page_writeback(page); + page_cache_release(page); + index++; + } + set_extent_writeback(tree, start, end, GFP_NOFS); + return 0; +} + +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will returned if + * nothing was found after 'start' + */ +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits) +{ + struct rb_node *node; + struct extent_state *state; + + /* + * this search will find all the extents that end after + * our range starts. 
+ */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) + return state; + + node = rb_next(node); + if (!node) + break; + } +out: + return NULL; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 found = 0; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) { + if (!found) + *end = (u64)-1; + goto out; + } + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) + *start = state->start; + found++; + *end = state->end; + cur_start = state->end + 1; + node = rb_next(node); + if (!node) + break; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + } +out: + spin_unlock(&tree->lock); + return found; +} + +static noinline int __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + if (index == locked_page->index && end_index == index) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] != locked_page) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 delalloc_start, + u64 delalloc_end) +{ + unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long start_index = index; + unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long pages_locked = 0; + struct page *pages[16]; + unsigned long nrpages; + int ret; + int i; + + /* the caller is responsible for locking the start index */ + if (index == locked_page->index && index == end_index) + return 0; + + /* skip the page at the start index */ + nrpages = end_index - index + 1; + while (nrpages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nrpages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + ret = -EAGAIN; + goto done; + } + /* now we have an array of pages, lock them all */ + for (i = 0; i < ret; i++) { + /* + * the caller is taking responsibility for + * locked_page + */ + if (pages[i] != locked_page) { + lock_page(pages[i]); + if (!PageDirty(pages[i]) || + pages[i]->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(pages[i]); + page_cache_release(pages[i]); + goto done; + } + } + page_cache_release(pages[i]); + pages_locked++; + } + nrpages -= ret; + index += ret; + cond_resched(); + } + ret = 0; 
+done: + if (ret && pages_locked) { + __unlock_for_delalloc(inode, locked_page, + delalloc_start, + ((u64)(start_index + pages_locked - 1)) << + PAGE_CACHE_SHIFT); + } + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, + u64 *start, u64 *end, + u64 max_bytes) +{ + u64 delalloc_start; + u64 delalloc_end; + u64 found; + int ret; + int loops = 0; + +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes); + if (!found || delalloc_end <= *start) { + *start = delalloc_start; + *end = delalloc_end; + return found; + } + + /* + * start comes from the offset of locked_page. We have to lock + * pages in order, so we can't process delalloc bytes before + * locked_page + */ + if (delalloc_start < *start) + delalloc_start = *start; + + /* + * make sure to limit the number of pages we try to lock down + * if we're looping. + */ + if (delalloc_end + 1 - delalloc_start > max_bytes && loops) + delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; + loops = 1; + goto again; + } else { + found = 0; + goto out_failed; + } + } + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ + lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1); + if (!ret) { + unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + int unlock_pages, + int clear_unlock, + int clear_delalloc, int clear_dirty, + int set_writeback, + int end_writeback) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int clear_bits = 0; + + if (clear_unlock) + clear_bits |= EXTENT_LOCKED; + if (clear_dirty) + clear_bits |= EXTENT_DIRTY; + + if (clear_delalloc) + clear_bits |= EXTENT_DELALLOC; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); + if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } + if (clear_dirty) + clear_page_dirty_for_io(pages[i]); 
+ if (set_writeback) + set_page_writeback(pages[i]); + if (end_writeback) + end_page_writeback(pages[i]); + if (unlock_pages) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +/* + * count the number of bytes in the tree that have a given bit(s) + * set. This can be fairly slow, except for EXTENT_DIRTY which is + * cached. The total number found is returned. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + unsigned long bits) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + int found = 0; + + if (search_end <= cur_start) { + WARN_ON(1); + return 0; + } + + spin_lock(&tree->lock); + if (cur_start == 0 && bits == EXTENT_DIRTY) { + total_bytes = tree->dirty_bytes; + goto out; + } + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > search_end) + break; + if (state->end >= cur_start && (state->state & bits)) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = state->start; + found = 1; + } + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return total_bytes; +} + +#if 0 +/* + * helper function to lock both pages and extents in the tree. + * pages must be locked first. + */ +static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + int err; + + while (index <= end_index) { + page = grab_cache_page(tree->mapping, index); + if (!page) { + err = -ENOMEM; + goto failed; + } + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto failed; + } + index++; + } + lock_extent(tree, start, end, GFP_NOFS); + return 0; + +failed: + /* + * we failed above in getting the page at 'index', so we undo here + * up to but not including the page at 'index' + */ + end_index = index; + index = start >> PAGE_CACHE_SHIFT; + while (index < end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + return err; +} + +/* + * helper function to unlock both pages and extents in the tree. + */ +static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + unlock_extent(tree, start, end, GFP_NOFS); + return 0; +} +#endif + +/* + * set the private field for a given byte offset in the tree. If there isn't + * an extent_state there already, this does nothing. + */ +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. 
+ */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + state->private = private; +out: + spin_unlock(&tree->lock); + return ret; +} + +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + *private = state->private; +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * searches a range in the state tree for a given mask. + * If 'filled' == 1, this returns 1 only if every extent in the tree + * has the bits set. Otherwise, 1 is returned if any bit in the + * range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled) +{ + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); + node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + start = state->end + 1; + if (start > end) + break; + node = rb_next(node); + if (!node) { + if (filled) + bitset = 0; + break; + } + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * helper function to set a given page up to date if all the + * extents in the tree for that page are up to date + */ +static int check_page_uptodate(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) + SetPageUptodate(page); + return 0; +} + +/* + * helper function to unlock a page if all the extents in the tree + * for that page are unlocked + */ +static int check_page_locked(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) + unlock_page(page); + return 0; +} + +/* + * helper function to end page writeback if all the extents + * in the tree for that page are done with writeback + */ +static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) + end_page_writeback(page); + return 0; +} + +/* lots and lots of room for performance fixes in the end_bio funcs */ + +/* + * after a writepage IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. 
+ */ +static void end_bio_extent_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = (err == 0); + continue; + } + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, GFP_ATOMIC); + ClearPageUptodate(page); + SetPageError(page); + } + + clear_extent_writeback(tree, start, end, GFP_ATOMIC); + + if (whole_page) + end_page_writeback(page); + else + check_page_writeback(tree, page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_readpage(struct bio *bio, int err) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + if (err) + uptodate = 0; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { + ret = tree->ops->readpage_end_io_hook(page, start, end, + NULL); + if (ret) + uptodate = 0; + } + if (!uptodate && tree->ops && + tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = + test_bit(BIO_UPTODATE, &bio->bi_flags); + if (err) + uptodate = 0; + continue; + } + } + + if (uptodate) { + set_extent_uptodate(tree, start, end, + GFP_ATOMIC); + } + unlock_extent(tree, start, end, GFP_ATOMIC); + + if (whole_page) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { + if (uptodate) { + check_page_uptodate(tree, page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + check_page_locked(tree, page); + } + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * IO done from prepare_write is pretty simple, we just unlock + * the structs in the extent tree when done, and set the uptodate bits + * as appropriate. 
+ */ +static void end_bio_extent_preparewrite(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + set_extent_uptodate(tree, start, end, GFP_ATOMIC); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + + unlock_extent(tree, start, end, GFP_ATOMIC); + + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +static struct bio * +extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} + +static int submit_one_bio(int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + int ret = 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + u64 end; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + bio->bi_private = NULL; + + bio_get(bio); + + if (tree->ops && tree->ops->submit_bio_hook) + tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + mirror_num, bio_flags); + else + submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + bio_put(bio); + return ret; +} + +static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct page *page, sector_t sector, + size_t size, unsigned long offset, + struct block_device *bdev, + struct bio **bio_ret, + unsigned long max_pages, + bio_end_io_t end_io_func, + int mirror_num, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + int ret = 0; + struct bio *bio; + int nr; + int contig = 0; + int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; + int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; + size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); + + if (bio_ret && *bio_ret) { + bio = *bio_ret; + if (old_compressed) + contig = bio->bi_sector == sector; + else + contig = bio->bi_sector + (bio->bi_size >> 9) == + sector; + + if (prev_bio_flags != bio_flags || !contig || + (tree->ops && tree->ops->merge_bio_hook && + tree->ops->merge_bio_hook(page, offset, page_size, bio, + bio_flags)) || + bio_add_page(bio, page, page_size, offset) < page_size) { + ret = submit_one_bio(rw, bio, mirror_num, + prev_bio_flags); + bio = NULL; + } else { + return 0; + } + } + if (this_compressed) + nr = BIO_MAX_PAGES; + else + nr = bio_get_nr_vecs(bdev); + + bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + + bio_add_page(bio, page, page_size, offset); + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + + if (bio_ret) + *bio_ret = bio; + else + ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + + return ret; +} + +void set_page_extent_mapped(struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } +} + +static void 
set_page_extent_head(struct page *page, unsigned long len) +{ + set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); +} + +/* + * basic readpage implementation. Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + */ +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags) +{ + struct inode *inode = page->mapping->host; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 cur_end; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t page_offset = 0; + size_t iosize; + size_t disk_io_size; + size_t blocksize = inode->i_sb->s_blocksize; + unsigned long this_bio_flag = 0; + + set_page_extent_mapped(page); + + end = page_end; + lock_extent(tree, start, end, GFP_NOFS); + + if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + char *userpage; + size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + iosize = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + while (cur <= end) { + if (cur >= last_byte) { + char *userpage; + iosize = PAGE_CACHE_SIZE - page_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + break; + } + em = get_extent(inode, page, page_offset, cur, + end - cur + 1, 0); + if (IS_ERR(em) || !em) { + SetPageError(page); + unlock_extent(tree, cur, end, GFP_NOFS); + break; + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = EXTENT_BIO_COMPRESSED; + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + cur_end = min(extent_map_end(em) - 1, end); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + if (this_bio_flag & EXTENT_BIO_COMPRESSED) { + disk_io_size = em->block_len; + sector = em->block_start >> 9; + } else { + sector = (em->block_start + extent_offset) >> 9; + disk_io_size = iosize; + } + bdev = em->bdev; + block_start = em->block_start; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + block_start = EXTENT_MAP_HOLE; + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == EXTENT_MAP_HOLE) { + char *userpage; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* we have an inline extent but it didn't get marked up + * to date. 
Error out + */ + if (block_start == EXTENT_MAP_INLINE) { + SetPageError(page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + + ret = 0; + if (tree->ops && tree->ops->readpage_io_hook) { + ret = tree->ops->readpage_io_hook(page, cur, + cur + iosize - 1); + } + if (!ret) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + pnr -= page->index; + ret = submit_extent_page(READ, tree, page, + sector, disk_io_size, page_offset, + bdev, bio, pnr, + end_bio_extent_readpage, mirror_num, + *bio_flags, + this_bio_flag); + nr++; + *bio_flags = this_bio_flag; + } + if (ret) + SetPageError(page); + cur = cur + iosize; + page_offset += iosize; + } + if (!nr) { + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + return 0; +} + +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent) +{ + struct bio *bio = NULL; + unsigned long bio_flags = 0; + int ret; + + ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + &bio_flags); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + struct extent_io_tree *tree = epd->tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 delalloc_start; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 iosize; + u64 unlock_start; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t pg_offset = 0; + size_t blocksize; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + u64 nr_delalloc; + u64 delalloc_end; + int page_started; + int compressed; + unsigned long nr_written = 0; + + WARN_ON(!PageLocked(page)); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage, KM_USER0); + flush_dcache_page(page); + } + pg_offset = 0; + + set_page_extent_mapped(page); + + delalloc_start = start; + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); + delalloc_start = delalloc_end + 1; + } + + /* did the fill delalloc function already unlock and start + * the IO? 
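+ * If so, there is nothing more for this writepage call to submit.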
+ */ + if (page_started) { + ret = 0; + goto update_nr_written; + } + } + lock_extent(tree, start, page_end, GFP_NOFS); + + unlock_start = start; + + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { + unlock_extent(tree, start, page_end, GFP_NOFS); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + ret = 0; + goto update_nr_written; + } + } + + nr_written++; + + end = page_end; + if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) + printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); + + if (last_byte <= start) { + clear_extent_dirty(tree, start, page_end, GFP_NOFS); + unlock_extent(tree, start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); + unlock_start = page_end + 1; + goto done; + } + + set_extent_uptodate(tree, start, page_end, GFP_NOFS); + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { + clear_extent_dirty(tree, cur, page_end, GFP_NOFS); + unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); + unlock_start = page_end + 1; + break; + } + em = epd->get_extent(inode, page, pg_offset, cur, + end - cur + 1, 1); + if (IS_ERR(em) || !em) { + SetPageError(page); + break; + } + + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + free_extent_map(em); + em = NULL; + + /* + * compressed and inline extents are written through other + * paths in the FS + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { + clear_extent_dirty(tree, cur, + cur + iosize - 1, GFP_NOFS); + + unlock_extent(tree, unlock_start, cur + iosize - 1, + GFP_NOFS); + + /* + * end_io notification does not happen here for + * compressed extents + */ + if (!compressed && tree->ops && + tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + cur + iosize - 1, + NULL, 1); + else if (compressed) { + /* we don't want to end_page_writeback on + * a compressed extent. 
this happens + * elsewhere + */ + nr++; + } + + cur += iosize; + pg_offset += iosize; + unlock_start = cur; + continue; + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, + EXTENT_DIRTY, 0)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); + } else { + ret = 0; + } + if (ret) { + SetPageError(page); + } else { + unsigned long max_nr = end_index + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + printk(KERN_ERR "btrfs warning page %lu not " + "writeback, cur %llu end %llu\n", + page->index, (unsigned long long)cur, + (unsigned long long)end); + } + + ret = submit_extent_page(WRITE, tree, page, sector, + iosize, pg_offset, bdev, + &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0); + if (ret) + SetPageError(page); + } + cur = cur + iosize; + pg_offset += iosize; + nr++; + } +done: + if (nr == 0) { + /* make sure the mapping tag for page dirty gets cleared */ + set_page_writeback(page); + end_page_writeback(page); + } + if (unlock_start <= page_end) + unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + unlock_page(page); + +update_nr_written: + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; + return 0; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. 
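+ * (A local variant of write_cache_pages() that adds a bio flush callback and
+ * an optional per-page lock hook.)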
+ */ +static int extent_write_cache_pages(struct extent_io_tree *tree, + struct address_space *mapping, + struct writeback_control *wbc, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + if (tree->ops && tree->ops->write_cache_pages_lock_hook) + tree->ops->write_cache_pages_lock_hook(page); + else + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(page)) + flush_fn(data); + wait_on_page_writeback(page); + } + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (ret || wbc->nr_to_write <= 0) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + return ret; +} + +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + if (epd->bio) { + submit_one_bio(WRITE, epd->bio, 0, 0); + epd->bio = NULL; + } +} + +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret; + struct address_space *mapping = page->mapping; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + }; + struct writeback_control wbc_writepages = { + .bdi = wbc->bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = 64, + .range_start = page_offset(page) + PAGE_CACHE_SIZE, + .range_end = (loff_t)-1, + }; + + + ret = __extent_writepage(page, wbc, &epd); + + extent_write_cache_pages(tree, mapping, &wbc_writepages, + __extent_writepage, &epd, flush_write_bio); + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t 
*get_extent, + int mode) +{ + int ret = 0; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 1, + }; + struct writeback_control wbc_writepages = { + .bdi = inode->i_mapping->backing_dev_info, + .sync_mode = mode, + .older_than_this = NULL, + .nr_to_write = nr_pages * 2, + .range_start = start, + .range_end = end + 1, + }; + + while (start <= end) { + page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + if (clear_page_dirty_for_io(page)) + ret = __extent_writepage(page, &wbc_writepages, &epd); + else { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + start + PAGE_CACHE_SIZE - 1, + NULL, 1); + unlock_page(page); + } + page_cache_release(page); + start += PAGE_CACHE_SIZE; + } + + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret = 0; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + }; + + ret = extent_write_cache_pages(tree, mapping, wbc, + __extent_writepage, &epd, + flush_write_bio); + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent) +{ + struct bio *bio = NULL; + unsigned page_idx; + struct pagevec pvec; + unsigned long bio_flags = 0; + + pagevec_init(&pvec, 0); + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (!add_to_page_cache(page, mapping, + page->index, GFP_KERNEL)) { + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + __extent_read_full_page(tree, page, get_extent, + &bio, 0, &bio_flags); + } + page_cache_release(page); + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + BUG_ON(!list_empty(pages)); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return 0; +} + +/* + * basic invalidatepage code, this waits on any locked or writeback + * ranges corresponding to the page, and then deletes any extent state + * records from the tree + */ +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset) +{ + u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 end = start + PAGE_CACHE_SIZE - 1; + size_t blocksize = page->mapping->host->i_sb->s_blocksize; + + start += (offset + blocksize - 1) & ~(blocksize - 1); + if (start > end) + return 0; + + lock_extent(tree, start, end, GFP_NOFS); + wait_on_extent_writeback(tree, start, end); + clear_extent_bit(tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, + 1, 1, GFP_NOFS); + return 0; +} + +/* + * simple commit_write call, set_range_dirty is used to mark both + * the pages and the extent records as dirty + */ +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned 
to) +{ + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + set_page_extent_mapped(page); + set_page_dirty(page); + + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} + +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent) +{ + u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 block_start; + u64 orig_block_start; + u64 block_end; + u64 cur_end; + struct extent_map *em; + unsigned blocksize = 1 << inode->i_blkbits; + size_t page_offset = 0; + size_t block_off_start; + size_t block_off_end; + int err = 0; + int iocount = 0; + int ret = 0; + int isnew; + + set_page_extent_mapped(page); + + block_start = (page_start + from) & ~((u64)blocksize - 1); + block_end = (page_start + to - 1) | (blocksize - 1); + orig_block_start = block_start; + + lock_extent(tree, page_start, page_end, GFP_NOFS); + while (block_start <= block_end) { + em = get_extent(inode, page, page_offset, block_start, + block_end - block_start + 1, 1); + if (IS_ERR(em) || !em) + goto err; + + cur_end = min(block_end, extent_map_end(em) - 1); + block_off_start = block_start & (PAGE_CACHE_SIZE - 1); + block_off_end = block_off_start + blocksize; + isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); + + if (!PageUptodate(page) && isnew && + (block_off_end > to || block_off_start < from)) { + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + if (block_off_end > to) + memset(kaddr + to, 0, block_off_end - to); + if (block_off_start < from) + memset(kaddr + block_off_start, 0, + from - block_off_start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + if ((em->block_start != EXTENT_MAP_HOLE && + em->block_start != EXTENT_MAP_INLINE) && + !isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, + EXTENT_UPTODATE, 1)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; + sector = (em->block_start + extent_offset) >> 9; + iosize = (cur_end - block_start + blocksize) & + ~((u64)blocksize - 1); + /* + * we've already got the extent locked, but we + * need to split the state such that our end_bio + * handler can clear the lock. + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, + EXTENT_LOCKED, 0, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + NULL, 1, + end_bio_extent_preparewrite, 0, + 0, 0); + iocount++; + block_start = block_start + iosize; + } else { + set_extent_uptodate(tree, block_start, cur_end, + GFP_NOFS); + unlock_extent(tree, block_start, cur_end, GFP_NOFS); + block_start = cur_end + 1; + } + page_offset = block_start & (PAGE_CACHE_SIZE - 1); + free_extent_map(em); + } + if (iocount) { + wait_extent_bit(tree, orig_block_start, + block_end, EXTENT_LOCKED); + } + check_page_uptodate(tree, page); +err: + /* FIXME, zero out newly allocated blocks on error */ + return err; +} + +/* + * a helper for releasepage, this tests for areas of the page that + * are locked or under IO and drops the related state bits if it is safe + * to drop the page. 
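+ * Returns 1 if the state could be dropped, 0 if the range is still busy
+ * (locked, under writeback or ordered).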
+ */ +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret = 1; + + if (test_range_bit(tree, start, end, + EXTENT_IOBITS | EXTENT_ORDERED, 0)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; + clear_extent_bit(tree, start, end, EXTENT_UPTODATE, + 1, 1, mask); + } + return ret; +} + +/* + * a helper for releasepage. As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + struct extent_map *em; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + + if ((mask & __GFP_WAIT) && + page->mapping->host->i_size > 16 * 1024 * 1024) { + u64 len; + while (start <= end) { + len = end - start + 1; + spin_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em || IS_ERR(em)) { + spin_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { + spin_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED | EXTENT_WRITEBACK | + EXTENT_ORDERED, + 0)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); + spin_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); + } + } + return try_release_extent_state(map, tree, page, mask); +} + +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent) +{ + struct inode *inode = mapping->host; + u64 start = iblock << inode->i_blkbits; + sector_t sector = 0; + size_t blksize = (1 << inode->i_blkbits); + struct extent_map *em; + + lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + GFP_NOFS); + em = get_extent(inode, NULL, 0, start, blksize, 0); + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + GFP_NOFS); + if (!em || IS_ERR(em)) + return 0; + + if (em->block_start > EXTENT_MAP_LAST_BYTE) + goto out; + + sector = (em->block_start + start - em->start) >> inode->i_blkbits; +out: + free_extent_map(em); + return sector; +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + struct page *p; + struct address_space *mapping; + + if (i == 0) + return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + mapping = eb->first_page->mapping; + if (!mapping) + return NULL; + + /* + * extent_buffer_page is only called after pinning the page + * by increasing the reference count. So we know the page must + * be in the radix tree. 
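+ * A plain radix tree lookup under rcu_read_lock() is therefore sufficient.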
+ */ + rcu_read_lock(); + p = radix_tree_lookup(&mapping->page_tree, i); + rcu_read_unlock(); + + return p; +} + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, + unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb = NULL; +#ifdef LEAK_DEBUG + unsigned long flags; +#endif + + eb = kmem_cache_zalloc(extent_buffer_cache, mask); + eb->start = start; + eb->len = len; + mutex_init(&eb->mutex); +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&eb->leak_list, &buffers); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&eb->refs, 1); + + return eb; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#ifdef LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_buffer_cache, eb); +} + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask) +{ + unsigned long num_pages = num_extent_pages(start, len); + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct extent_buffer *exists = NULL; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 1; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (eb) { + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + mark_page_accessed(eb->first_page); + return eb; + } + spin_unlock(&tree->buffer_lock); + + eb = __alloc_extent_buffer(tree, start, len, mask); + if (!eb) + return NULL; + + if (page0) { + eb->first_page = page0; + i = 1; + index++; + page_cache_get(page0); + mark_page_accessed(page0); + set_page_extent_mapped(page0); + set_page_extent_head(page0, len); + uptodate = PageUptodate(page0); + } else { + i = 0; + } + for (; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); + if (!p) { + WARN_ON(1); + goto free_eb; + } + set_page_extent_mapped(p); + mark_page_accessed(p); + if (i == 0) { + eb->first_page = p; + set_page_extent_head(p, len); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } + if (!PageUptodate(p)) + uptodate = 0; + unlock_page(p); + } + if (uptodate) + eb->flags |= EXTENT_UPTODATE; + eb->flags |= EXTENT_BUFFER_FILLED; + + spin_lock(&tree->buffer_lock); + exists = buffer_tree_insert(tree, start, &eb->rb_node); + if (exists) { + /* add one reference for the caller */ + atomic_inc(&exists->refs); + spin_unlock(&tree->buffer_lock); + goto free_eb; + } + spin_unlock(&tree->buffer_lock); + + /* add one reference for the tree */ + atomic_inc(&eb->refs); + return eb; + +free_eb: + if (!atomic_dec_and_test(&eb->refs)) + return exists; + for (index = 1; index < i; index++) + page_cache_release(extent_buffer_page(eb, index)); + page_cache_release(extent_buffer_page(eb, 0)); + __free_extent_buffer(eb); + return exists; +} + +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (eb) + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + + if (eb) + mark_page_accessed(eb->first_page); + + return eb; +} + +void 
free_extent_buffer(struct extent_buffer *eb) +{ + if (!eb) + return; + + if (!atomic_dec_and_test(&eb->refs)) + return; + + WARN_ON(1); +} + +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + int set; + unsigned long i; + unsigned long num_pages; + struct page *page; + + u64 start = eb->start; + u64 end = start + eb->len - 1; + + set = clear_extent_dirty(tree, start, end, GFP_NOFS); + num_pages = num_extent_pages(eb->start, eb->len); + + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!set && !PageDirty(page)) + continue; + + lock_page(page); + if (i == 0) + set_page_extent_head(page, eb->len); + else + set_page_private(page, EXTENT_PAGE_PRIVATE); + + /* + * if we're on the last page or the first page and the + * block isn't aligned on a page boundary, do extra checks + * to make sure we don't clean page that is partially dirty + */ + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + start = (u64)page->index << PAGE_CACHE_SHIFT; + end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, + EXTENT_DIRTY, 0)) { + unlock_page(page); + continue; + } + } + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + if (!PageDirty(page)) { + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&page->mapping->tree_lock); + unlock_page(page); + } + return 0; +} + +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + return wait_on_extent_writeback(tree, eb->start, + eb->start + eb->len - 1); +} + +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *page = extent_buffer_page(eb, i); + /* writepage may need to do something special for the + * first page, we have to make sure page->private is + * properly set. releasepage may drop page->private + * on us if the page isn't already dirty. 
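+ * The page lock is taken below while page->private is fixed up.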
+ */ + lock_page(page); + if (i == 0) { + set_page_extent_head(page, eb->len); + } else if (PagePrivate(page) && + page->private != EXTENT_PAGE_PRIVATE) { + set_page_extent_mapped(page); + } + __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_extent_dirty(tree, page_offset(page), + page_offset(page) + PAGE_CACHE_SIZE - 1, + GFP_NOFS); + unlock_page(page); + } + return 0; +} + +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + eb->flags &= ~EXTENT_UPTODATE; + + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (page) + ClearPageUptodate(page); + } + return 0; +} + +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } + SetPageUptodate(page); + } + return 0; +} + +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct page *page; + int ret; + int pg_uptodate = 1; + int uptodate; + unsigned long index; + + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); + if (ret) + return 1; + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + page = find_get_page(tree->mapping, index); + uptodate = PageUptodate(page); + page_cache_release(page); + if (!uptodate) { + pg_uptodate = 0; + break; + } + start += PAGE_CACHE_SIZE; + } + return pg_uptodate; +} + +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + int ret = 0; + unsigned long num_pages; + unsigned long i; + struct page *page; + int pg_uptodate = 1; + + if (eb->flags & EXTENT_UPTODATE) + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1); + if (ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageUptodate(page)) { + pg_uptodate = 0; + break; + } + } + return pg_uptodate; +} + +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, + u64 start, int wait, + get_extent_t *get_extent, int mirror_num) +{ + unsigned long i; + unsigned long start_i; + struct page *page; + int err; + int ret = 0; + int locked_pages = 0; + int all_uptodate = 1; + int inc_all_pages = 0; + unsigned long num_pages; + struct bio *bio = NULL; + unsigned long bio_flags = 0; + + if (eb->flags & EXTENT_UPTODATE) + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1)) { + return 0; + } + + if (start) { + WARN_ON(start < eb->start); + start_i = (start >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT); + } else { + start_i = 0; + } + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!wait) { + if (!trylock_page(page)) + goto unlock_exit; + } else { + lock_page(page); + } + locked_pages++; + if (!PageUptodate(page)) + 
all_uptodate = 0; + } + if (all_uptodate) { + if (start_i == 0) + eb->flags |= EXTENT_UPTODATE; + goto unlock_exit; + } + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (inc_all_pages) + page_cache_get(page); + if (!PageUptodate(page)) { + if (start_i == 0) + inc_all_pages = 1; + ClearPageError(page); + err = __extent_read_full_page(tree, page, + get_extent, &bio, + mirror_num, &bio_flags); + if (err) + ret = err; + } else { + unlock_page(page); + } + } + + if (bio) + submit_one_bio(READ, bio, mirror_num, bio_flags); + + if (ret || !wait) + return ret; + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + wait_on_page_locked(page); + if (!PageUptodate(page)) + ret = -EIO; + } + + if (!ret) + eb->flags |= EXTENT_UPTODATE; + return ret; + +unlock_exit: + i = start_i; + while (locked_pages > 0) { + page = extent_buffer_page(eb, i); + i++; + unlock_page(page); + locked_pages--; + } + return ret; +} + +void read_extent_buffer(struct extent_buffer *eb, void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(dst, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER1); + + dst += cur; + len -= cur; + offset = 0; + i++; + } +} + +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + size_t offset = start & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct page *p; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long end_i = (start_offset + start + min_len - 1) >> + PAGE_CACHE_SHIFT; + + if (i != end_i) + return -EINVAL; + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + } + + if (start + min_len > eb->len) { + printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", (unsigned long long)eb->start, + eb->len, start, min_len); + WARN_ON(1); + } + + p = extent_buffer_page(eb, i); + kaddr = kmap_atomic(p, km); + *token = kaddr; + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + int err; + int save = 0; + if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, km); + eb->map_token = NULL; + save = 1; + WARN_ON(!mutex_is_locked(&eb->mutex)); + } + err = map_private_extent_buffer(eb, start, min_len, token, map, + map_start, map_len, km); + if (!err && save) { + eb->map_token = *token; + eb->kaddr = *map; + eb->map_start = *map_start; + eb->map_len = *map_len; + } + return err; +} + +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) +{ + kunmap_atomic(token, km); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void 
*ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + ret = memcmp(ptr, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER0); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + } + return ret; +} + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(kaddr + offset, src, cur); + kunmap_atomic(kaddr, KM_USER1); + + src += cur; + len -= cur; + offset = 0; + i++; + } +} + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, c, cur); + kunmap_atomic(kaddr, KM_USER0); + + len -= cur; + offset = 0; + i++; + } +} + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(dst, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + kunmap_atomic(kaddr, KM_USER0); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} + +static void move_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + if (dst_page == src_page) { + memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); + } else { + char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *p = dst_kaddr + dst_off + len; + char *s = 
src_kaddr + src_off + len; + + while (len--) + *--p = *--s; + + kunmap_atomic(src_kaddr, KM_USER1); + } + kunmap_atomic(dst_kaddr, KM_USER0); +} + +static void copy_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *src_kaddr; + + if (dst_page != src_page) + src_kaddr = kmap_atomic(src_page, KM_USER1); + else + src_kaddr = dst_kaddr; + + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + kunmap_atomic(dst_kaddr, KM_USER0); + if (dst_page != src_page) + kunmap_atomic(src_kaddr, KM_USER1); +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu dst len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu dst len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + + while (len > 0) { + dst_off_in_page = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min_t(unsigned long, cur, + (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + + copy_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset < src_offset) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while (len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = (start_offset + dst_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + cur = min_t(unsigned long, len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); + move_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur; + src_end -= cur; + len -= cur; + } +} + +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +{ + u64 start = page_offset(page); + struct extent_buffer 
*eb; + int ret = 1; + unsigned long i; + unsigned long num_pages; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (!eb) + goto out; + + if (atomic_read(&eb->refs) > 1) { + ret = 0; + goto out; + } + /* at this point we can safely release the extent buffer */ + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) + page_cache_release(extent_buffer_page(eb, i)); + rb_erase(&eb->rb_node, &tree->buffer); + __free_extent_buffer(eb); +out: + spin_unlock(&tree->buffer_lock); + return ret; +} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h new file mode 100644 index 00000000000..c5b483a7913 --- /dev/null +++ b/fs/btrfs/extent_io.h @@ -0,0 +1,269 @@ +#ifndef __EXTENTIO__ +#define __EXTENTIO__ + +#include <linux/rbtree.h> + +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) +#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) +#define EXTENT_DEFRAG (1 << 6) +#define EXTENT_DEFRAG_DONE (1 << 7) +#define EXTENT_BUFFER_FILLED (1 << 8) +#define EXTENT_ORDERED (1 << 9) +#define EXTENT_ORDERED_METADATA (1 << 10) +#define EXTENT_BOUNDARY (1 << 11) +#define EXTENT_NODATASUM (1 << 12) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + +/* flags for bio submission */ +#define EXTENT_BIO_COMPRESSED 1 + +/* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. + */ +#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 + +struct extent_state; + +typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags); +struct extent_io_ops { + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); + int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + int (*writepage_io_hook)(struct page *page, u64 start, u64 end); + extent_submit_bio_hook_t *submit_bio_hook; + int (*merge_bio_hook)(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); + int (*readpage_io_hook)(struct page *page, u64 start, u64 end); + int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state); + int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); + int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); + int (*write_cache_pages_lock_hook)(struct page *page); +}; + +struct extent_io_tree { + struct rb_root state; + struct rb_root buffer; + struct address_space *mapping; + u64 dirty_bytes; + spinlock_t lock; + spinlock_t buffer_lock; + struct extent_io_ops *ops; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; + + /* for use by the FS */ + u64 private; + + struct list_head leak_list; +}; + +struct extent_buffer { + u64 start; + unsigned long len; + char 
*map_token; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + struct page *first_page; + atomic_t refs; + int flags; + struct list_head leak_list; + struct rb_node rb_node; + struct mutex mutex; +}; + +struct extent_map_tree; + +static inline struct extent_state *extent_state_next(struct extent_state *state) +{ + struct rb_node *node; + node = rb_next(&state->rb_node); + if (!node) + return NULL; + return rb_entry(node, struct extent_state, rb_node); +} + +typedef struct extent_map *(get_extent_t)(struct inode *inode, + struct page *page, + size_t page_offset, + u64 start, u64 len, + int create); + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask); +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent); +int __init extent_io_init(void); +void extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, unsigned long bits); + +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled); +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, gfp_t mask); +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask); +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits); +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode); +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head 
*pages, unsigned nr_pages, + get_extent_t get_extent); +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent); +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to); +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent); +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); +void set_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask); +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask); +void free_extent_buffer(struct extent_buffer *eb); +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, u64 start, int wait, + get_extent_t *get_extent, int mirror_num); + +static inline void extent_buffer_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->refs); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len); +void read_extent_buffer(struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +void write_extent_buffer(struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len); +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb); +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end); +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); +int release_extent_buffer_tail_pages(struct extent_buffer *eb); +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, 
+ int unlock_page, + int clear_unlock, + int clear_delalloc, int clear_dirty, + int set_writeback, + int end_writeback); +#endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 00000000000..4a83e33ada3 --- /dev/null +++ b/fs/btrfs/extent_map.c @@ -0,0 +1,351 @@ +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/version.h> +#include <linux/hardirq.h> +#include "extent_map.h" + +/* temporary define until extent_map moves out of btrfs */ +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *, struct kmem_cache *, + unsigned long)); + +static struct kmem_cache *extent_map_cache; + +int __init extent_map_init(void) +{ + extent_map_cache = btrfs_cache_create("extent_map", + sizeof(struct extent_map), 0, + NULL); + if (!extent_map_cache) + return -ENOMEM; + return 0; +} + +void extent_map_exit(void) +{ + if (extent_map_cache) + kmem_cache_destroy(extent_map_cache); +} + +/** + * extent_map_tree_init - initialize extent map tree + * @tree: tree to initialize + * @mask: flags for memory allocations during tree operations + * + * Initialize the extent tree @tree. Should be called for each new inode + * or other user of the extent_map interface. + */ +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) +{ + tree->map.rb_node = NULL; + spin_lock_init(&tree->lock); +} +EXPORT_SYMBOL(extent_map_tree_init); + +/** + * alloc_extent_map - allocate new extent map structure + * @mask: memory allocation flags + * + * Allocate a new extent_map structure. The new structure is + * returned with a reference count of one and needs to be + * freed using free_extent_map() + */ +struct extent_map *alloc_extent_map(gfp_t mask) +{ + struct extent_map *em; + em = kmem_cache_alloc(extent_map_cache, mask); + if (!em || IS_ERR(em)) + return em; + em->in_tree = 0; + em->flags = 0; + atomic_set(&em->refs, 1); + return em; +} +EXPORT_SYMBOL(alloc_extent_map); + +/** + * free_extent_map - drop reference count of an extent_map + * @em: extent map beeing releasead + * + * Drops the reference out on @em by one and free the structure + * if the reference count hits zero. + */ +void free_extent_map(struct extent_map *em) +{ + if (!em) + return; + WARN_ON(atomic_read(&em->refs) == 0); + if (atomic_dec_and_test(&em->refs)) { + WARN_ON(em->in_tree); + kmem_cache_free(extent_map_cache, em); + } +} +EXPORT_SYMBOL(free_extent_map); + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_map *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct extent_map, rb_node); + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset >= extent_map_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct extent_map, rb_node); + entry->in_tree = 1; + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * search through the tree for an extent_map with a given offset. 
If + * it can't be found, try to find some neighboring extents + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct extent_map, rb_node); + prev = n; + prev_entry = entry; + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + n = n->rb_left; + else if (offset >= extent_map_end(entry)) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +/* + * look for an offset in the tree, and if it can't be found, return + * the first offset we can find smaller than 'offset'. + */ +static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) +{ + struct rb_node *prev; + struct rb_node *ret; + ret = __tree_search(root, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +/* check to see if two extent_map structs are adjacent and safe to merge */ +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ + if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) + return 0; + + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->bdev == next->bdev && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } + return 0; +} + +/** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in + * @em: map to insert + * + * Insert @em into @tree or perform a simple forward/backward merge with + * existing mappings. The extent_map struct passed in will be inserted + * into the tree directly, with an additional reference taken, or a + * reference dropped if the merge attempt was sucessfull. 
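+ *
+ * Editorial usage sketch (not part of the original patch; names are the
+ * ones used in this file): callers must hold tree->lock, e.g.
+ *
+ *     spin_lock(&tree->lock);
+ *     ret = add_extent_mapping(tree, em);
+ *     spin_unlock(&tree->lock);
+ *
+ * A return of -EEXIST means an overlapping mapping already exists and the
+ * caller still owns its original reference on 'em'.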
+ */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *exist; + + exist = lookup_extent_mapping(tree, em->start, em->len); + if (exist) { + free_extent_map(exist); + ret = -EEXIST; + goto out; + } + assert_spin_locked(&tree->lock); + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; + free_extent_map(merge); + goto out; + } + atomic_inc(&em->refs); + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } +out: + return ret; +} +EXPORT_SYMBOL(add_extent_mapping); + +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +/** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + u64 end = range_end(start, len); + + assert_spin_locked(&tree->lock); + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} +EXPORT_SYMBOL(lookup_extent_mapping); + +/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed + * + * Removes @em from @tree. 
No reference counts are dropped, and no checks + * are done to see if the range is in use + */ +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + assert_spin_locked(&tree->lock); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +} +EXPORT_SYMBOL(remove_extent_mapping); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h new file mode 100644 index 00000000000..fb6eeef06bb --- /dev/null +++ b/fs/btrfs/extent_map.h @@ -0,0 +1,62 @@ +#ifndef __EXTENTMAP__ +#define __EXTENTMAP__ + +#include <linux/rbtree.h> + +#define EXTENT_MAP_LAST_BYTE (u64)-4 +#define EXTENT_MAP_HOLE (u64)-3 +#define EXTENT_MAP_INLINE (u64)-2 +#define EXTENT_MAP_DELALLOC (u64)-1 + +/* bits for the flags field */ +#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ +#define EXTENT_FLAG_COMPRESSED 1 +#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ +#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ + +struct extent_map { + struct rb_node rb_node; + + /* all of these are in bytes */ + u64 start; + u64 len; + u64 orig_start; + u64 block_start; + u64 block_len; + unsigned long flags; + struct block_device *bdev; + atomic_t refs; + int in_tree; +}; + +struct extent_map_tree { + struct rb_root map; + spinlock_t lock; +}; + +static inline u64 extent_map_end(struct extent_map *em) +{ + if (em->start + em->len < em->start) + return (u64)-1; + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} + +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em); +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); + +struct extent_map *alloc_extent_map(gfp_t mask); +void free_extent_map(struct extent_map *em); +int __init extent_map_init(void); +void extent_map_exit(void); +#endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c new file mode 100644 index 00000000000..964652435fd --- /dev/null +++ b/fs/btrfs/file-item.c @@ -0,0 +1,831 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" + +#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) * 2) / \ + size) - 1)) + +#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ + sizeof(struct btrfs_ordered_sum)) / \ + sizeof(struct btrfs_sector_sum) * \ + (r)->sectorsize - (r)->sectorsize) + +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding) +{ + int ret = 0; + struct btrfs_file_extent_item *item; + struct btrfs_key file_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + BUG_ON(!path); + file_key.objectid = objectid; + file_key.offset = pos; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + sizeof(*item)); + if (ret < 0) + goto out; + BUG_ON(ret); + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, compression); + btrfs_set_file_extent_encryption(leaf, item, encryption); + btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) +{ + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_csum_item *item; + struct extent_buffer *leaf; + u64 csum_offset = 0; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int csums_in_item; + + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); + if (ret < 0) + goto fail; + leaf = path->nodes[0]; + if (ret > 0) { + ret = 1; + if (path->slots[0] == 0) + goto fail; + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) + goto fail; + + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset >= csums_in_item) { + ret = -EFBIG; + goto fail; + } + } + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + return item; +fail: + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); +} + + +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 offset, int mod) +{ + int ret; + struct btrfs_key file_key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + + file_key.objectid = objectid; + file_key.offset = offset; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); + return ret; +} + + +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + u32 sum; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + u64 offset; + u64 item_start_offset = 0; + u64 item_last_offset = 0; + u64 disk_bytenr; + u32 diff; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int ret; + struct btrfs_path *path; + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + + path = btrfs_alloc_path(); + if (bio->bi_size > PAGE_CACHE_SIZE * 8) + path->reada = 2; + + WARN_ON(bio->bi_vcnt <= 0); + + disk_bytenr = (u64)bio->bi_sector << 9; + while (bio_index < bio->bi_vcnt) { + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); + if (ret == 0) + goto found; + + if (!item || disk_bytenr < item_start_offset || + disk_bytenr >= item_last_offset) { + struct btrfs_key found_key; + u32 item_size; + + if (item) + btrfs_release_path(root, path); + item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, + path, disk_bytenr, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + if (ret == -ENOENT || ret == -EFBIG) + ret = 0; + sum = 0; + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + set_extent_bits(io_tree, offset, + offset + bvec->bv_len - 1, + EXTENT_NODATASUM, GFP_NOFS); + } else { + printk(KERN_INFO "btrfs no csum found " + "for inode %lu start %llu\n", + inode->i_ino, + (unsigned long long)offset); + } + item = NULL; + btrfs_release_path(root, path); + goto found; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + item_start_offset = found_key.offset; + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + item_last_offset = item_start_offset + + (item_size / csum_size) * + root->sectorsize; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + } + /* + * this byte range must be able to fit inside + * a single leaf so it will also fit inside a u32 + */ + diff = disk_bytenr - item_start_offset; + diff = diff / root->sectorsize; + diff = diff * csum_size; + + read_extent_buffer(path->nodes[0], &sum, + ((unsigned long)item) + diff, + csum_size); +found: + if (dst) + *dst++ = sum; + else + set_state_private(io_tree, offset, sum); + disk_bytenr += bvec->bv_len; + bio_index++; + bvec++; + } + btrfs_free_path(path); + return 0; +} + +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_csum_item *item; + unsigned long offset; + int ret; + size_t size; + u64 csum_end; + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = start; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == 
BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + if (offset * csum_size < + btrfs_item_size_nr(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) + break; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + size = btrfs_item_size_nr(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * root->sectorsize; + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + size = min_t(size_t, csum_end - start, + MAX_ORDERED_SUM_BYTES(root)); + sums = kzalloc(btrfs_ordered_sum_size(root, size), + GFP_NOFS); + BUG_ON(!sums); + + sector_sum = sums->sums; + sums->bytenr = start; + sums->len = size; + + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + offset *= csum_size; + + while (size > 0) { + read_extent_buffer(path->nodes[0], + §or_sum->sum, + ((unsigned long)item) + + offset, csum_size); + sector_sum->bytenr = start; + + size -= root->sectorsize; + start += root->sectorsize; + offset += csum_size; + sector_sum++; + } + list_add_tail(&sums->list, list); + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + char *data; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + unsigned long total_bytes = 0; + unsigned long this_sum_bytes = 0; + u64 offset; + u64 disk_bytenr; + + WARN_ON(bio->bi_vcnt <= 0); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + if (!sums) + return -ENOMEM; + + sector_sum = sums->sums; + disk_bytenr = (u64)bio->bi_sector << 9; + sums->len = bio->bi_size; + INIT_LIST_HEAD(&sums->list); + + if (contig) + offset = file_start; + else + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + + while (bio_index < bio->bi_vcnt) { + if (!contig) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + if (!contig && (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset)) { + unsigned long bytes_left; + sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + bytes_left = bio->bi_size - total_bytes; + + sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), + GFP_NOFS); + BUG_ON(!sums); + sector_sum = sums->sums; + sums->len = bytes_left; + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + } + + data = kmap_atomic(bvec->bv_page, KM_USER0); + sector_sum->sum = ~(u32)0; + sector_sum->sum = btrfs_csum_data(root, + data + bvec->bv_offset, + sector_sum->sum, + bvec->bv_len); + kunmap_atomic(data, KM_USER0); + 
btrfs_csum_final(sector_sum->sum, + (char *)§or_sum->sum); + sector_sum->bytenr = disk_bytenr; + + sector_sum++; + bio_index++; + total_bytes += bvec->bv_len; + this_sum_bytes += bvec->bv_len; + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bvec++; + } + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + +/* + * helper function for csum removal, this expects the + * key to describe the csum pointed to by the path, and it expects + * the csum to overlap the range [bytenr, len] + * + * The csum should not be entirely contained in the range and the + * range should not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the + * overlap, and fixes up the key as required. + */ +static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) +{ + struct extent_buffer *leaf; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + u64 csum_end; + u64 end_byte = bytenr + len; + u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; + int ret; + + leaf = path->nodes[0]; + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= root->fs_info->sb->s_blocksize_bits; + csum_end += key->offset; + + if (key->offset < bytenr && csum_end <= end_byte) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * A simple truncate off the end of the item + */ + u32 new_size = (bytenr - key->offset) >> blocksize_bits; + new_size *= csum_size; + ret = btrfs_truncate_item(trans, root, path, new_size, 1); + BUG_ON(ret); + } else if (key->offset >= bytenr && csum_end > end_byte && + end_byte > key->offset) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * we need to truncate from the beginning of the csum + */ + u32 new_size = (csum_end - end_byte) >> blocksize_bits; + new_size *= csum_size; + + ret = btrfs_truncate_item(trans, root, path, new_size, 0); + BUG_ON(ret); + + key->offset = end_byte; + ret = btrfs_set_item_key_safe(trans, root, path, key); + BUG_ON(ret); + } else { + BUG(); + } + return 0; +} + +/* + * deletes the csum items from the csum tree for a given + * range of bytes. 
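+ *
+ * Editorial note (not part of the original patch): the expected caller is
+ * the data-extent freeing path, roughly
+ *
+ *     ret = btrfs_del_csums(trans, root, disk_bytenr, num_bytes);
+ *
+ * where disk_bytenr/num_bytes describe the extent being freed.  Any root
+ * may be passed; the csum root is looked up from root->fs_info below.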
+ */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + u64 end_byte = bytenr + len; + u64 csum_end; + struct extent_buffer *leaf; + int ret; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int blocksize_bits = root->fs_info->sb->s_blocksize_bits; + + root = root->fs_info->csum_root; + + path = btrfs_alloc_path(); + + while (1) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = end_byte - 1; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) { + break; + } + + if (key.offset >= end_byte) + break; + + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + + /* this csum ends before we start, we're done */ + if (csum_end <= bytenr) + break; + + /* delete the entire item, it is inside our range */ + if (key.offset >= bytenr && csum_end <= end_byte) { + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + if (key.offset == bytenr) + break; + } else if (key.offset < bytenr && csum_end > end_byte) { + unsigned long offset; + unsigned long shift_len; + unsigned long item_offset; + /* + * [ bytenr - len ] + * [csum ] + * + * Our bytes are in the middle of the csum, + * we need to split this item and insert a new one. + * + * But we can't drop the path because the + * csum could change, get removed, extended etc. + * + * The trick here is the max size of a csum item leaves + * enough room in the tree block for a single + * item header. So, we split the item in place, + * adding a new header pointing to the existing + * bytes. Then we loop around again and we have + * a nicely formed csum item that we can neatly + * truncate. 
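+ *
+ * (Editorial note, not part of the original patch: the spare room relied
+ * on here comes from MAX_CSUM_ITEMS above, which sizes a full csum item
+ * smaller than the leaf data area.)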
+ */ + offset = (bytenr - key.offset) >> blocksize_bits; + offset *= csum_size; + + shift_len = (len >> blocksize_bits) * csum_size; + + item_offset = btrfs_item_ptr_offset(leaf, + path->slots[0]); + + memset_extent_buffer(leaf, 0, item_offset + offset, + shift_len); + key.offset = bytenr; + + /* + * btrfs_split_item returns -EAGAIN when the + * item changed size or key + */ + ret = btrfs_split_item(trans, root, path, &key, offset); + BUG_ON(ret && ret != -EAGAIN); + + key.offset = end_byte - 1; + } else { + ret = truncate_one_csum(trans, root, path, + &key, bytenr, len); + BUG_ON(ret); + if (key.offset < bytenr) + break; + } + btrfs_release_path(root, path); + } +out: + btrfs_free_path(path); + return 0; +} + +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) +{ + u64 bytenr; + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + u64 next_offset; + u64 total_bytes = 0; + int found_next; + struct btrfs_path *path; + struct btrfs_csum_item *item; + struct btrfs_csum_item *item_end; + struct extent_buffer *leaf = NULL; + u64 csum_offset; + struct btrfs_sector_sum *sector_sum; + u32 nritems; + u32 ins_size; + char *eb_map; + char *eb_token; + unsigned long map_len; + unsigned long map_start; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + sector_sum = sums->sums; +again: + next_offset = (u64)-1; + found_next = 0; + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = sector_sum->bytenr; + bytenr = sector_sum->bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + + item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + if (!IS_ERR(item)) { + leaf = path->nodes[0]; + ret = 0; + goto found; + } + ret = PTR_ERR(item); + if (ret == -EFBIG) { + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(root, csum_size)) { + /* already at max size, make a new one */ + goto insert; + } + } else { + int slot = path->slots[0] + 1; + /* we didn't find a csum item, insert one */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret == 1) + found_next = 1; + if (ret != 0) + goto insert; + slot = 0; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) { + found_next = 1; + goto insert; + } + next_offset = found_key.offset; + found_next = 1; + goto insert; + } + + /* + * at this point, we know the tree has an item, but it isn't big + * enough yet to put our csum in. 
Grow it + */ + btrfs_release_path(root, path); + ret = btrfs_search_slot(trans, root, &file_key, path, + csum_size, 1); + if (ret < 0) + goto fail_unlock; + + if (ret > 0) { + if (path->slots[0] == 0) + goto insert; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || + found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { + goto insert; + } + + if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + csum_size) { + u32 diff = (csum_offset + 1) * csum_size; + + /* + * is the item big enough already? we dropped our lock + * before and need to recheck + */ + if (diff < btrfs_item_size_nr(leaf, path->slots[0])) + goto csum; + + diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + if (diff != csum_size) + goto insert; + + ret = btrfs_extend_item(trans, root, path, diff); + BUG_ON(ret); + goto csum; + } + +insert: + btrfs_release_path(root, path); + csum_offset = 0; + if (found_next) { + u64 tmp = total_bytes + root->sectorsize; + u64 next_sector = sector_sum->bytenr; + struct btrfs_sector_sum *next = sector_sum + 1; + + while (tmp < sums->len) { + if (next_sector + root->sectorsize != next->bytenr) + break; + tmp += root->sectorsize; + next_sector = next->bytenr; + next++; + } + tmp = min(tmp, next_offset - file_key.offset); + tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = max((u64)1, tmp); + tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); + ins_size = csum_size * tmp; + } else { + ins_size = csum_size; + } + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + ins_size); + if (ret < 0) + goto fail_unlock; + if (ret != 0) { + WARN_ON(1); + goto fail_unlock; + } +csum: + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + ret = 0; + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); +found: + item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); + eb_token = NULL; + cond_resched(); +next_sector: + + if (!eb_token || + (unsigned long)item + csum_size >= map_start + map_len) { + int err; + + if (eb_token) + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + err = map_private_extent_buffer(leaf, (unsigned long)item, + csum_size, + &eb_token, &eb_map, + &map_start, &map_len, KM_USER1); + if (err) + eb_token = NULL; + } + if (eb_token) { + memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), + §or_sum->sum, csum_size); + } else { + write_extent_buffer(leaf, §or_sum->sum, + (unsigned long)item, csum_size); + } + + total_bytes += root->sectorsize; + sector_sum++; + if (total_bytes < sums->len) { + item = (struct btrfs_csum_item *)((char *)item + + csum_size); + if (item < item_end && bytenr + PAGE_CACHE_SIZE == + sector_sum->bytenr) { + bytenr = sector_sum->bytenr; + goto next_sector; + } + } + if (eb_token) { + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + } + btrfs_mark_buffer_dirty(path->nodes[0]); + cond_resched(); + if (total_bytes < sums->len) { + btrfs_release_path(root, path); + goto again; + } +out: + btrfs_free_path(path); + return ret; + +fail_unlock: + goto out; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c new file mode 100644 index 
00000000000..90268334145 --- /dev/null +++ b/fs/btrfs/file.c @@ -0,0 +1,1288 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/version.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "tree-log.h" +#include "locking.h" +#include "compat.h" + + +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. + */ +static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, + int write_bytes, + struct page **prepared_pages, + const char __user *buf) +{ + long page_fault = 0; + int i; + int offset = pos & (PAGE_CACHE_SIZE - 1); + + for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + size_t count = min_t(size_t, + PAGE_CACHE_SIZE - offset, write_bytes); + struct page *page = prepared_pages[i]; + fault_in_pages_readable(buf, count); + + /* Copy data from userspace to the current page */ + kmap(page); + page_fault = __copy_from_user(page_address(page) + offset, + buf, count); + /* Flush processor's dcache for this page */ + flush_dcache_page(page); + kunmap(page); + buf += count; + write_bytes -= count; + + if (page_fault) + break; + } + return page_fault ? -EFAULT : 0; +} + +/* + * unlocks pages after btrfs_file_write is done with them + */ +static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) +{ + size_t i; + for (i = 0; i < num_pages; i++) { + if (!pages[i]) + break; + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here + */ + ClearPageChecked(pages[i]); + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } +} + +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. 
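+ *
+ * Editorial note (not part of the original patch): in the write path in
+ * this file, btrfs_file_write() locks the pages with prepare_pages(),
+ * copies the user data in with btrfs_copy_from_user(), and then calls this
+ * helper to set the delalloc bits and, when the write extends the file,
+ * update i_size.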
+ */ +static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct file *file, + struct page **pages, + size_t num_pages, + loff_t pos, + size_t write_bytes) +{ + int err = 0; + int i; + struct inode *inode = fdentry(file)->d_inode; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 hint_byte; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; + u64 end_pos = pos + write_bytes; + loff_t isize = i_size_read(inode); + + start_pos = pos & ~((u64)root->sectorsize - 1); + num_bytes = (write_bytes + pos - start_pos + + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; + + lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + trans = btrfs_join_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out_unlock; + } + btrfs_set_trans_block_group(trans, inode); + hint_byte = 0; + + set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); + + /* check for reserved extents on each page, we don't want + * to reset the delalloc bit on things that already have + * extents reserved. + */ + btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + ClearPageChecked(p); + set_page_dirty(p); + } + if (end_pos > isize) { + i_size_write(inode, end_pos); + btrfs_update_inode(trans, root, inode); + } + err = btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + return err; +} + +/* + * this drops all the extents in the cache that intersect the range + * [start, end]. Existing extents are split as required. + */ +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) +{ + struct extent_map *em; + struct extent_map *split = NULL; + struct extent_map *split2 = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 len = end - start + 1; + int ret; + int testend = 1; + unsigned long flags; + int compressed = 0; + + WARN_ON(end < start); + if (end == (u64)-1) { + len = (u64)-1; + testend = 0; + } + while (1) { + if (!split) + split = alloc_extent_map(GFP_NOFS); + if (!split2) + split2 = alloc_extent_map(GFP_NOFS); + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + spin_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + spin_unlock(&em_tree->lock); + if (em->start <= start && + (!testend || em->start + em->len >= start + len)) { + free_extent_map(em); + break; + } + if (start < em->start) { + len = em->start - start; + } else { + len = start + len - (em->start + em->len); + start = em->start + em->len; + } + free_extent_map(em); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + remove_extent_mapping(em_tree, em); + + if (em->block_start < EXTENT_MAP_LAST_BYTE && + em->start < start) { + split->start = em->start; + split->len = start - em->start; + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + + split->bdev = em->bdev; + split->flags = flags; + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em->block_start < EXTENT_MAP_LAST_BYTE && + testend && em->start + 
em->len > start + len) { + u64 diff = start + len - em->start; + + split->start = start + len; + split->len = em->start + em->len - (start + len); + split->bdev = em->bdev; + split->flags = flags; + + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + diff; + split->orig_start = split->start; + } + + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = NULL; + } + spin_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); + /* once for the tree*/ + free_extent_map(em); + } + if (split) + free_extent_map(split); + if (split2) + free_extent_map(split2); + return 0; +} + +int btrfs_check_file(struct btrfs_root *root, struct inode *inode) +{ + return 0; +#if 0 + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + u64 last_offset = 0; + int nritems; + int slot; + int found_type; + int ret; + int err = 0; + u64 extent_end = 0; + + path = btrfs_alloc_path(); + ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, + last_offset, 0); + while (1) { + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + nritems = btrfs_header_nritems(path->nodes[0]); + } + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != inode->i_ino) + break; + if (found_key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + if (found_key.offset < last_offset) { + WARN_ON(1); + btrfs_print_leaf(root, leaf); + printk(KERN_ERR "inode %lu found offset %llu " + "expected %llu\n", inode->i_ino, + (unsigned long long)found_key.offset, + (unsigned long long)last_offset); + err = 1; + goto out; + } + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, extent); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + struct btrfs_item *item; + item = btrfs_item_nr(leaf, slot); + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, extent); + extent_end = (extent_end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + last_offset = extent_end; + path->slots[0]++; + } + if (0 && last_offset < inode->i_size) { + WARN_ON(1); + btrfs_print_leaf(root, leaf); + printk(KERN_ERR "inode %lu found offset %llu size %llu\n", + inode->i_ino, (unsigned long long)last_offset, + (unsigned long long)inode->i_size); + err = 1; + + } +out: + btrfs_free_path(path); + return err; +#endif +} + +/* + * this is very complex, but the basic idea is to drop all extents + * in the range start - end. hint_block is filled in with a block number + * that would be a good hint to the block allocator for this file. + * + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. + * + * inline_limit is used to tell this code which offsets in the file to keep + * if they contain inline extents. 
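+ *
+ * Editorial usage sketch (not part of the original patch; variable names
+ * are illustrative): callers typically clear the range before writing a
+ * replacement extent,
+ *
+ *     ret = btrfs_drop_extents(trans, root, inode, start, end,
+ *                              inline_limit, &hint_byte);
+ *
+ * and then feed hint_byte back to the block allocator as a starting point.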
+ */ +noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 inline_limit, u64 *hint_byte) +{ + u64 extent_end = 0; + u64 locked_end = end; + u64 search_start = start; + u64 leaf_start; + u64 ram_bytes = 0; + u64 orig_parent = 0; + u64 disk_bytenr = 0; + u8 compression; + u8 encryption; + u16 other_encoding = 0; + u64 root_gen; + u64 root_owner; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_file_extent_item old; + int keep; + int slot; + int bookend; + int found_type = 0; + int found_extent; + int found_inline; + int recow; + int ret; + + inline_limit = 0; + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + while (1) { + recow = 0; + btrfs_release_path(root, path); + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + search_start, -1); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) { + ret = 0; + goto out; + } + path->slots[0]--; + } +next_slot: + keep = 0; + bookend = 0; + found_extent = 0; + found_inline = 0; + leaf_start = 0; + root_gen = 0; + root_owner = 0; + compression = 0; + encryption = 0; + extent = NULL; + leaf = path->nodes[0]; + slot = path->slots[0]; + ret = 0; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY && + key.offset >= end) { + goto out; + } + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != inode->i_ino) { + goto out; + } + if (recow) { + search_start = max(key.offset, start); + continue; + } + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + compression = btrfs_file_extent_compression(leaf, + extent); + encryption = btrfs_file_extent_encryption(leaf, + extent); + other_encoding = btrfs_file_extent_other_encoding(leaf, + extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = + btrfs_file_extent_disk_bytenr(leaf, + extent); + if (extent_end) + *hint_byte = extent_end; + + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, extent); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, + extent); + found_extent = 1; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + found_inline = 1; + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, extent); + } + } else { + extent_end = search_start; + } + + /* we found nothing we can drop */ + if ((!found_extent && !found_inline) || + search_start >= extent_end) { + int nextret; + u32 nritems; + nritems = btrfs_header_nritems(leaf); + if (slot >= nritems - 1) { + nextret = btrfs_next_leaf(root, path); + if (nextret) + goto out; + recow = 1; + } else { + path->slots[0]++; + } + goto next_slot; + } + + if (end <= extent_end && start >= key.offset && found_inline) + *hint_byte = EXTENT_MAP_INLINE; + + if (found_extent) { + read_extent_buffer(leaf, &old, (unsigned long)extent, + sizeof(old)); + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); + leaf_start = leaf->start; + } + + if (end < extent_end && end >= key.offset) { + bookend = 1; + if (found_inline && start <= key.offset) + keep = 1; + } + + if (bookend && found_extent) { + if (locked_end < extent_end) { + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, + 
GFP_NOFS); + if (!ret) { + btrfs_release_path(root, path); + lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, + GFP_NOFS); + locked_end = extent_end; + continue; + } + locked_end = extent_end; + } + orig_parent = path->nodes[0]->start; + disk_bytenr = le64_to_cpu(old.disk_bytenr); + if (disk_bytenr != 0) { + ret = btrfs_inc_extent_ref(trans, root, + disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + } + } + + if (found_inline) { + u64 mask = root->sectorsize - 1; + search_start = (extent_end + mask) & ~mask; + } else + search_start = extent_end; + + /* truncate existing extent */ + if (start > key.offset) { + u64 new_num; + u64 old_num; + keep = 1; + WARN_ON(start & (root->sectorsize - 1)); + if (found_extent) { + new_num = start - key.offset; + old_num = btrfs_file_extent_num_bytes(leaf, + extent); + *hint_byte = + btrfs_file_extent_disk_bytenr(leaf, + extent); + if (btrfs_file_extent_disk_bytenr(leaf, + extent)) { + inode_sub_bytes(inode, old_num - + new_num); + } + btrfs_set_file_extent_num_bytes(leaf, + extent, new_num); + btrfs_mark_buffer_dirty(leaf); + } else if (key.offset < inline_limit && + (end > extent_end) && + (inline_limit < extent_end)) { + u32 new_size; + new_size = btrfs_file_extent_calc_inline_size( + inline_limit - key.offset); + inode_sub_bytes(inode, extent_end - + inline_limit); + btrfs_set_file_extent_ram_bytes(leaf, extent, + new_size); + if (!compression && !encryption) { + btrfs_truncate_item(trans, root, path, + new_size, 1); + } + } + } + /* delete the entire extent */ + if (!keep) { + if (found_inline) + inode_sub_bytes(inode, extent_end - + key.offset); + ret = btrfs_del_item(trans, root, path); + /* TODO update progress marker and return */ + BUG_ON(ret); + extent = NULL; + btrfs_release_path(root, path); + /* the extent will be freed later */ + } + if (bookend && found_inline && start <= key.offset) { + u32 new_size; + new_size = btrfs_file_extent_calc_inline_size( + extent_end - end); + inode_sub_bytes(inode, end - key.offset); + btrfs_set_file_extent_ram_bytes(leaf, extent, + new_size); + if (!compression && !encryption) + ret = btrfs_truncate_item(trans, root, path, + new_size, 0); + BUG_ON(ret); + } + /* create bookend, splitting the extent in two */ + if (bookend && found_extent) { + struct btrfs_key ins; + ins.objectid = inode->i_ino; + ins.offset = end; + btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); + + btrfs_release_path(root, path); + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + write_extent_buffer(leaf, &old, + (unsigned long)extent, sizeof(old)); + + btrfs_set_file_extent_compression(leaf, extent, + compression); + btrfs_set_file_extent_encryption(leaf, extent, + encryption); + btrfs_set_file_extent_other_encoding(leaf, extent, + other_encoding); + btrfs_set_file_extent_offset(leaf, extent, + le64_to_cpu(old.offset) + end - key.offset); + WARN_ON(le64_to_cpu(old.num_bytes) < + (extent_end - end)); + btrfs_set_file_extent_num_bytes(leaf, extent, + extent_end - end); + + /* + * set the ram bytes to the size of the full extent + * before splitting. 
This is a worst case flag, + * but its the best we can do because we don't know + * how splitting affects compression + */ + btrfs_set_file_extent_ram_bytes(leaf, extent, + ram_bytes); + btrfs_set_file_extent_type(leaf, extent, found_type); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + if (disk_bytenr != 0) { + ret = btrfs_update_extent_ref(trans, root, + disk_bytenr, orig_parent, + leaf->start, + root->root_key.objectid, + trans->transid, ins.objectid); + + BUG_ON(ret); + } + btrfs_release_path(root, path); + if (disk_bytenr != 0) + inode_add_bytes(inode, extent_end - end); + } + + if (found_extent && !keep) { + u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr); + + if (old_disk_bytenr != 0) { + inode_sub_bytes(inode, + le64_to_cpu(old.num_bytes)); + ret = btrfs_free_extent(trans, root, + old_disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + leaf_start, root_owner, + root_gen, key.objectid, 0); + BUG_ON(ret); + *hint_byte = old_disk_bytenr; + } + } + + if (search_start >= end) { + ret = 0; + goto out; + } + } +out: + btrfs_free_path(path); + if (locked_end > end) { + unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, + GFP_NOFS); + } + btrfs_check_file(root, inode); + return ret; +} + +static int extent_mergeable(struct extent_buffer *leaf, int slot, + u64 objectid, u64 bytenr, u64 *start, u64 *end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if ((*start && *start != key.offset) || (*end && *end != extent_end)) + return 0; + + *start = key.offset; + *end = extent_end; + return 1; +} + +/* + * Mark extent in the range start - end as written. + * + * This changes extent type from 'pre-allocated' to 'regular'. If only + * part of extent is marked as written, the extent will be split into + * two or three. 
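+ *
+ * Editorial note (not part of the original patch): the [start, end) range
+ * must fall entirely inside one BTRFS_FILE_EXTENT_PREALLOC item; the
+ * typical caller is the ordered-IO completion path after data has been
+ * written into a preallocated region, e.g.
+ *
+ *     ret = btrfs_mark_extent_written(trans, root, inode,
+ *                                     file_offset, file_offset + len);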
+ */ +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 bytenr; + u64 num_bytes; + u64 extent_end; + u64 extent_offset; + u64 other_start; + u64 other_end; + u64 split = start; + u64 locked_end = end; + u64 orig_parent; + int extent_type; + int split_end = 1; + int ret; + + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + key.objectid = inode->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + if (split == start) + key.offset = split; + else + key.offset = split - 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + BUG_ON(key.objectid != inode->i_ino || + key.type != BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC); + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + BUG_ON(key.offset > start || extent_end < end); + + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + + if (key.offset == start) + split = end; + + if (key.offset == start && extent_end == end) { + int del_nr = 0; + int del_slot = 0; + u64 leaf_owner = btrfs_header_owner(leaf); + u64 leaf_gen = btrfs_header_generation(leaf); + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); + BUG_ON(ret); + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); + BUG_ON(ret); + } + split_end = 0; + if (del_nr == 0) { + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + goto done; + } + + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + goto done; + } else if (split == start) { + if (locked_end < extent_end) { + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, GFP_NOFS); + if (!ret) { + btrfs_release_path(root, path); + lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, GFP_NOFS); + locked_end = extent_end; + goto again; + } + locked_end = extent_end; + } + btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); + extent_offset += split - key.offset; + } else { + BUG_ON(key.offset != start); + btrfs_set_file_extent_offset(leaf, fi, extent_offset + + split - key.offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); + key.offset = split; + 
btrfs_set_item_key_safe(trans, root, path, &key); + extent_end = split; + } + + if (extent_end == end) { + split_end = 0; + extent_type = BTRFS_FILE_EXTENT_REG; + } + if (extent_end == end && split == start) { + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + path->slots[0]++; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + key.offset = split; + btrfs_set_item_key_safe(trans, root, path, &key); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - split); + goto done; + } + } + if (extent_end == end && split == end) { + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino, + bytenr, &other_start, &other_end)) { + path->slots[0]--; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - + other_start); + goto done; + } + } + + btrfs_mark_buffer_dirty(leaf); + + orig_parent = leaf->start; + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + btrfs_release_path(root, path); + + key.offset = start; + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi)); + BUG_ON(ret); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_compression(leaf, fi, 0); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); + + if (orig_parent != leaf->start) { + ret = btrfs_update_extent_ref(trans, root, bytenr, + orig_parent, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + } +done: + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); + if (split_end && split == start) { + split = end; + goto again; + } + if (locked_end > end) { + unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, + GFP_NOFS); + } + btrfs_free_path(path); + return 0; +} + +/* + * this gets pages into the page cache and locks them down, it also properly + * waits for data=ordered extents to finish before allowing the pages to be + * modified. 
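+ *
+ * Editorial note (not part of the original patch): btrfs_file_write()
+ * below calls this once per batch of at most nrptrs pages, before copying
+ * the user data into them with btrfs_copy_from_user().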
+ */ +static noinline int prepare_pages(struct btrfs_root *root, struct file *file, + struct page **pages, size_t num_pages, + loff_t pos, unsigned long first_index, + unsigned long last_index, size_t write_bytes) +{ + int i; + unsigned long index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = fdentry(file)->d_inode; + int err = 0; + u64 start_pos; + u64 last_pos; + + start_pos = pos & ~((u64)root->sectorsize - 1); + last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + + if (start_pos > inode->i_size) { + err = btrfs_cont_expand(inode, start_pos); + if (err) + return err; + } + + memset(pages, 0, num_pages * sizeof(struct page *)); +again: + for (i = 0; i < num_pages; i++) { + pages[i] = grab_cache_page(inode->i_mapping, index + i); + if (!pages[i]) { + err = -ENOMEM; + BUG_ON(1); + } + wait_on_page_writeback(pages[i]); + } + if (start_pos < inode->i_size) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + last_pos - 1); + if (ordered && + ordered->file_offset + ordered->len > start_pos && + ordered->file_offset < last_pos) { + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos); + goto again; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, + GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + } + for (i = 0; i < num_pages; i++) { + clear_page_dirty_for_io(pages[i]); + set_page_extent_mapped(pages[i]); + WARN_ON(!PageLocked(pages[i])); + } + return 0; +} + +static ssize_t btrfs_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos; + loff_t start_pos; + ssize_t num_written = 0; + ssize_t err = 0; + int ret = 0; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page **pages = NULL; + int nrptrs; + struct page *pinned[2]; + unsigned long first_index; + unsigned long last_index; + int will_write; + + will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) || + (file->f_flags & O_DIRECT)); + + nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, + PAGE_CACHE_SIZE / (sizeof(struct page *))); + pinned[0] = NULL; + pinned[1] = NULL; + + pos = *ppos; + start_pos = pos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out_nolock; + if (count == 0) + goto out_nolock; + + err = file_remove_suid(file); + if (err) + goto out_nolock; + file_update_time(file); + + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->sequence++; + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + count) >> PAGE_CACHE_SHIFT; + + /* + * there are lots of better ways to do this, but this code + * makes sure the first and last page in the file range are + * up to date and ready for cow + */ + if ((pos & (PAGE_CACHE_SIZE - 1))) { + pinned[0] = grab_cache_page(inode->i_mapping, first_index); + if (!PageUptodate(pinned[0])) { + ret = btrfs_readpage(NULL, pinned[0]); + BUG_ON(ret); + 
wait_on_page_locked(pinned[0]); + } else { + unlock_page(pinned[0]); + } + } + if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + pinned[1] = grab_cache_page(inode->i_mapping, last_index); + if (!PageUptodate(pinned[1])) { + ret = btrfs_readpage(NULL, pinned[1]); + BUG_ON(ret); + wait_on_page_locked(pinned[1]); + } else { + unlock_page(pinned[1]); + } + } + + while (count > 0) { + size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t write_bytes = min(count, nrptrs * + (size_t)PAGE_CACHE_SIZE - + offset); + size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + WARN_ON(num_pages > nrptrs); + memset(pages, 0, sizeof(struct page *) * nrptrs); + + ret = btrfs_check_free_space(root, write_bytes, 0); + if (ret) + goto out; + + ret = prepare_pages(root, file, pages, num_pages, + pos, first_index, last_index, + write_bytes); + if (ret) + goto out; + + ret = btrfs_copy_from_user(pos, num_pages, + write_bytes, pages, buf); + if (ret) { + btrfs_drop_pages(pages, num_pages); + goto out; + } + + ret = dirty_and_release_pages(NULL, root, file, pages, + num_pages, pos, write_bytes); + btrfs_drop_pages(pages, num_pages); + if (ret) + goto out; + + if (will_write) { + btrfs_fdatawrite_range(inode->i_mapping, pos, + pos + write_bytes - 1, + WB_SYNC_NONE); + } else { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + num_pages); + if (num_pages < + (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); + } + + buf += write_bytes; + count -= write_bytes; + pos += write_bytes; + num_written += write_bytes; + + cond_resched(); + } +out: + mutex_unlock(&inode->i_mutex); + +out_nolock: + kfree(pages); + if (pinned[0]) + page_cache_release(pinned[0]); + if (pinned[1]) + page_cache_release(pinned[1]); + *ppos = pos; + + if (num_written > 0 && will_write) { + struct btrfs_trans_handle *trans; + + err = btrfs_wait_ordered_range(inode, start_pos, num_written); + if (err) + num_written = err; + + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_log_dentry_safe(trans, root, + file->f_dentry); + if (ret == 0) { + btrfs_sync_log(trans, root); + btrfs_end_transaction(trans, root); + } else { + btrfs_commit_transaction(trans, root); + } + } + if (file->f_flags & O_DIRECT) { + invalidate_mapping_pages(inode->i_mapping, + start_pos >> PAGE_CACHE_SHIFT, + (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); + } + } + current->backing_dev_info = NULL; + return num_written ? num_written : err; +} + +int btrfs_release_file(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + btrfs_ioctl_trans_end(filp); + return 0; +} + +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates are + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. 
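For context, the path documented above is what an ordinary synchronous update hits from user space. A minimal caller sketch (plain POSIX; the file name is illustrative and nothing btrfs-specific is assumed):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/somefile", O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	write(fd, "hello", 5);	/* dirties the inode and creates an ordered extent */
	fsync(fd);		/* reaches btrfs_sync_file() below, which prefers
				 * the tree log over a full transaction commit */
	close(fd);
	return 0;
}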
+ */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + struct btrfs_trans_handle *trans; + + /* + * check the transaction that last modified this inode + * and see if its already been committed + */ + if (!BTRFS_I(inode)->last_trans) + goto out; + + mutex_lock(&root->fs_info->trans_mutex); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { + BTRFS_I(inode)->last_trans = 0; + mutex_unlock(&root->fs_info->trans_mutex); + goto out; + } + mutex_unlock(&root->fs_info->trans_mutex); + + root->fs_info->tree_log_batch++; + filemap_fdatawrite(inode->i_mapping); + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->fs_info->tree_log_batch++; + + /* + * ok we haven't committed the transaction yet, lets do a commit + */ + if (file->private_data) + btrfs_ioctl_trans_end(file); + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + if (ret < 0) + goto out; + + /* we've logged all the items and now have a consistent + * version of the file in the log. It is possible that + * someone will come in and modify the file, but that's + * fine because the log is consistent on disk, and we + * have references to all of the file's extents + * + * It is possible that someone will come in and log the + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. + */ + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + + if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); + } else { + btrfs_sync_log(trans, root); + ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&file->f_dentry->d_inode->i_mutex); +out: + return ret > 0 ? EIO : ret; +} + +static struct vm_operations_struct btrfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = btrfs_page_mkwrite, +}; + +static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &btrfs_file_vm_ops; + file_accessed(filp); + return 0; +} + +struct file_operations btrfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .write = btrfs_file_write, + .mmap = btrfs_file_mmap, + .open = generic_file_open, + .release = btrfs_release_file, + .fsync = btrfs_sync_file, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif +}; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c new file mode 100644 index 00000000000..d1e5f0e84c5 --- /dev/null +++ b/fs/btrfs/free-space-cache.c @@ -0,0 +1,495 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/sched.h> +#include "ctree.h" + +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, offset_index); + + if (offset < info->offset) + p = &(*p)->rb_left; + else if (offset > info->offset) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +static int tree_insert_bytes(struct rb_root *root, u64 bytes, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, bytes_index); + + if (bytes < info->bytes) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +/* + * searches the tree for the given offset. If contains is set we will return + * the free space that contains the given offset. If contains is not set we + * will return the free space that starts at or after the given offset and is + * at least bytes long. + */ +static struct btrfs_free_space *tree_search_offset(struct rb_root *root, + u64 offset, u64 bytes, + int contains) +{ + struct rb_node *n = root->rb_node; + struct btrfs_free_space *entry, *ret = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_free_space, offset_index); + + if (offset < entry->offset) { + if (!contains && + (!ret || entry->offset < ret->offset) && + (bytes <= entry->bytes)) + ret = entry; + n = n->rb_left; + } else if (offset > entry->offset) { + if ((entry->offset + entry->bytes - 1) >= offset && + bytes <= entry->bytes) { + ret = entry; + break; + } + n = n->rb_right; + } else { + if (bytes > entry->bytes) { + n = n->rb_right; + continue; + } + ret = entry; + break; + } + } + + return ret; +} + +/* + * return a chunk at least bytes size, as close to offset that we can get. + */ +static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, + u64 offset, u64 bytes) +{ + struct rb_node *n = root->rb_node; + struct btrfs_free_space *entry, *ret = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_free_space, bytes_index); + + if (bytes < entry->bytes) { + /* + * We prefer to get a hole size as close to the size we + * are asking for so we don't take small slivers out of + * huge holes, but we also want to get as close to the + * offset as possible so we don't have a whole lot of + * fragmentation. + */ + if (offset <= entry->offset) { + if (!ret) + ret = entry; + else if (entry->bytes < ret->bytes) + ret = entry; + else if (entry->offset < ret->offset) + ret = entry; + } + n = n->rb_left; + } else if (bytes > entry->bytes) { + n = n->rb_right; + } else { + /* + * Ok we may have multiple chunks of the wanted size, + * so we don't want to take the first one we find, we + * want to take the one closest to our given offset, so + * keep searching just in case theres a better match. 
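(Reader's note on why there are two trees: the same btrfs_free_space entry is linked into both indexes by link_free_space() below. The offset-indexed tree is what lets __btrfs_add_free_space() find and merge the neighbours of a newly freed range, while this bytes-indexed tree serves allocation, answering "a hole of at least N bytes, preferably near this offset" as described above.)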
+ */ + n = n->rb_right; + if (offset > entry->offset) + continue; + else if (!ret || entry->offset < ret->offset) + ret = entry; + } + } + + return ret; +} + +static void unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + rb_erase(&info->offset_index, &block_group->free_space_offset); + rb_erase(&info->bytes_index, &block_group->free_space_bytes); +} + +static int link_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + int ret = 0; + + + ret = tree_insert_offset(&block_group->free_space_offset, info->offset, + &info->offset_index); + if (ret) + return ret; + + ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, + &info->bytes_index); + if (ret) + return ret; + + return ret; +} + +static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *right_info; + struct btrfs_free_space *left_info; + struct btrfs_free_space *info = NULL; + struct btrfs_free_space *alloc_info; + int ret = 0; + + alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!alloc_info) + return -ENOMEM; + + /* + * first we want to see if there is free space adjacent to the range we + * are adding, if there is remove that struct and add a new one to + * cover the entire range + */ + right_info = tree_search_offset(&block_group->free_space_offset, + offset+bytes, 0, 1); + left_info = tree_search_offset(&block_group->free_space_offset, + offset-1, 0, 1); + + if (right_info && right_info->offset == offset+bytes) { + unlink_free_space(block_group, right_info); + info = right_info; + info->offset = offset; + info->bytes += bytes; + } else if (right_info && right_info->offset != offset+bytes) { + printk(KERN_ERR "btrfs adding space in the middle of an " + "existing free space area. existing: " + "offset=%llu, bytes=%llu. new: offset=%llu, " + "bytes=%llu\n", (unsigned long long)right_info->offset, + (unsigned long long)right_info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); + BUG(); + } + + if (left_info) { + unlink_free_space(block_group, left_info); + + if (unlikely((left_info->offset + left_info->bytes) != + offset)) { + printk(KERN_ERR "btrfs free space to the left " + "of new free space isn't " + "quite right. existing: offset=%llu, " + "bytes=%llu. 
new: offset=%llu, bytes=%llu\n", + (unsigned long long)left_info->offset, + (unsigned long long)left_info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); + BUG(); + } + + if (info) { + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); + } else { + info = left_info; + info->bytes += bytes; + } + } + + if (info) { + ret = link_free_space(block_group, info); + if (!ret) + info = NULL; + goto out; + } + + info = alloc_info; + alloc_info = NULL; + info->offset = offset; + info->bytes = bytes; + + ret = link_free_space(block_group, info); + if (ret) + kfree(info); +out: + if (ret) { + printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); + if (ret == -EEXIST) + BUG(); + } + + kfree(alloc_info); + + return ret; +} + +static int +__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = tree_search_offset(&block_group->free_space_offset, offset, 0, + 1); + + if (info && info->offset == offset) { + if (info->bytes < bytes) { + printk(KERN_ERR "Found free space at %llu, size %llu," + "trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); + WARN_ON(1); + ret = -EINVAL; + goto out; + } + unlink_free_space(block_group, info); + + if (info->bytes == bytes) { + kfree(info); + goto out; + } + + info->offset += bytes; + info->bytes -= bytes; + + ret = link_free_space(block_group, info); + BUG_ON(ret); + } else if (info && info->offset < offset && + info->offset + info->bytes >= offset + bytes) { + u64 old_start = info->offset; + /* + * we're freeing space in the middle of the info, + * this can happen during tree log replay + * + * first unlink the old info and then + * insert it again after the hole we're creating + */ + unlink_free_space(block_group, info); + if (offset + bytes < info->offset + info->bytes) { + u64 old_end = info->offset + info->bytes; + + info->offset = offset + bytes; + info->bytes = old_end - info->offset; + ret = link_free_space(block_group, info); + BUG_ON(ret); + } else { + /* the hole we're creating ends at the end + * of the info struct, just free the info + */ + kfree(info); + } + + /* step two, insert a new info struct to cover anything + * before the hole + */ + ret = __btrfs_add_free_space(block_group, old_start, + offset - old_start); + BUG_ON(ret); + } else { + WARN_ON(1); + } +out: + return ret; +} + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + struct btrfs_free_space *sp; + + mutex_lock(&block_group->alloc_mutex); + ret = __btrfs_add_free_space(block_group, offset, bytes); + sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); + BUG_ON(!sp); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + struct btrfs_free_space *sp; + + ret = __btrfs_add_free_space(block_group, offset, bytes); + sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); + BUG_ON(!sp); + + return ret; +} + +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret = 0; + + mutex_lock(&block_group->alloc_mutex); + ret = __btrfs_remove_free_space(block_group, offset, bytes); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +int btrfs_remove_free_space_lock(struct 
btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + + ret = __btrfs_remove_free_space(block_group, offset, bytes); + + return ret; +} + +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int count = 0; + + for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes) + count++; + } + printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" + "\n", count); +} + +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *n; + u64 ret = 0; + + for (n = rb_first(&block_group->free_space_offset); n; + n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + ret += info->bytes; + } + + return ret; +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + mutex_lock(&block_group->alloc_mutex); + while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, bytes_index); + unlink_free_space(block_group, info); + kfree(info); + if (need_resched()) { + mutex_unlock(&block_group->alloc_mutex); + cond_resched(); + mutex_lock(&block_group->alloc_mutex); + } + } + mutex_unlock(&block_group->alloc_mutex); +} + +#if 0 +static struct btrfs_free_space *btrfs_find_free_space_offset(struct + btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + mutex_lock(&block_group->alloc_mutex); + ret = tree_search_offset(&block_group->free_space_offset, offset, + bytes, 0); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +static struct btrfs_free_space *btrfs_find_free_space_bytes(struct + btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + mutex_lock(&block_group->alloc_mutex); + + ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} +#endif + +struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret = NULL; + + ret = tree_search_offset(&block_group->free_space_offset, offset, + bytes, 0); + if (!ret) + ret = tree_search_bytes(&block_group->free_space_bytes, + offset, bytes); + + return ret; +} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h new file mode 100644 index 00000000000..2a020b27676 --- /dev/null +++ b/fs/btrfs/hash.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
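Two things worth noting about the helpers above: the *_lock variants (btrfs_add_free_space_lock(), btrfs_remove_free_space_lock()) skip taking block_group->alloc_mutex and so assume the caller already holds it, while the plain variants take it themselves. And the overall shape of the cache, merging adjacent ranges on free and doing a best-effort "big enough" lookup on allocate, can be illustrated with a deliberately simplified, self-contained model over a flat array; this is not the kernel code and ignores its rb-trees and exact tie-breaking:

#include <stdio.h>

struct hole { unsigned long long offset, bytes; };

static int add_free(struct hole *h, int n, unsigned long long off,
		    unsigned long long bytes)
{
	int i;

	for (i = 0; i < n; i++) {
		if (h[i].offset + h[i].bytes == off) {
			/* new range starts right after an existing hole */
			h[i].bytes += bytes;
			return n;
		}
		if (off + bytes == h[i].offset) {
			/* new range ends right where an existing hole starts */
			h[i].offset = off;
			h[i].bytes += bytes;
			return n;
		}
	}
	h[n].offset = off;	/* no adjacent hole: add a new entry */
	h[n].bytes = bytes;
	return n + 1;
}

static struct hole *find_free(struct hole *h, int n, unsigned long long bytes)
{
	struct hole *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (h[i].bytes < bytes)
			continue;
		if (!best || h[i].bytes < best->bytes ||
		    (h[i].bytes == best->bytes && h[i].offset < best->offset))
			best = &h[i];
	}
	return best;
}

int main(void)
{
	struct hole holes[8];
	struct hole *h;
	int n = 0;

	n = add_free(holes, n, 0, 4096);
	n = add_free(holes, n, 4096, 4096);		/* merges into [0, 8192) */
	n = add_free(holes, n, 1024 * 1024, 65536);

	h = find_free(holes, n, 8192);
	if (h)
		printf("hole at %llu, %llu bytes\n", h->offset, h->bytes);
	return 0;
}

In the real code the bytes-indexed rb-tree replaces the linear scan in find_free(), and a freed range can be merged with both neighbours at once.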
+ */ + +#ifndef __HASH__ +#define __HASH__ + +#include "crc32c.h" +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return btrfs_crc32c((u32)~1, name, len); +} +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c new file mode 100644 index 00000000000..3d46fa1f29a --- /dev/null +++ b/fs/btrfs/inode-item.c @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static int find_name_in_backref(struct btrfs_path *path, const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); + len = btrfs_inode_ref_name_len(leaf, ref); + name_ptr = (unsigned long)(ref + 1); + cur_offset += len + sizeof(*ref); + if (len != name_len) + continue; + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { + *ref_ret = ref; + return 1; + } + } + return 0; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + unsigned long item_start; + u32 item_size; + u32 sub_item_len; + int ret; + int del_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = -ENOENT; + goto out; + } else if (ret < 0) { + goto out; + } + if (!find_name_in_backref(path, name, name_len, &ref)) { + ret = -ENOENT; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); + + if (del_len == item_size) { + ret = btrfs_del_item(trans, root, path); + goto out; + } + ptr = (unsigned long)ref; + sub_item_len = name_len + sizeof(*ref); + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_size - (ptr + sub_item_len - item_start)); + ret = btrfs_truncate_item(trans, root, path, + item_size - sub_item_len, 1); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, 
u64 index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + unsigned long ptr; + int ret; + int ins_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + u32 old_size; + + if (find_name_in_backref(path, name, name_len, &ref)) + goto out; + + old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ret = btrfs_extend_item(trans, root, path, ins_len); + BUG_ON(ret); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + } + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid) +{ + struct btrfs_key key; + int ret; + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); + if (ret == 0 && objectid > root->highest_inode) + root->highest_inode = objectid; + return ret; +} + +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod) +{ + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); + if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && + location->offset == (u64)-1 && path->slots[0] != 0) { + slot = path->slots[0] - 1; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid == location->objectid && + btrfs_key_type(&found_key) == btrfs_key_type(location)) { + path->slots[0]--; + return 0; + } + } + return ret; +} diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c new file mode 100644 index 00000000000..2aa79873eb4 --- /dev/null +++ b/fs/btrfs/inode-map.c @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
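A note on the item layout the functions above manipulate: all back references for an inode from one parent directory live in a single INODE_REF item, stored as consecutive variable-length records, each a struct btrfs_inode_ref header (holding the directory index and name length) immediately followed by the name bytes. find_name_in_backref() walks those records by advancing cur_offset by sizeof(*ref) plus the name length; btrfs_insert_inode_ref() grows an existing item with btrfs_extend_item() and appends a record when the key already exists, and btrfs_del_inode_ref() memmove()s the tail of the item down and shrinks it with btrfs_truncate_item(). For example (sizes illustrative), an inode linked as "a" and then "bbb" from the same directory ends up with one item holding two records, and deleting "a" slides the "bbb" record to the front of the item before truncating it.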
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = found_key.objectid; + } else { + *objectid = BTRFS_FIRST_FREE_OBJECTID; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * walks the btree of allocated inodes and find a hole. + */ +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 dirid, u64 *objectid) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + int slot = 0; + u64 last_ino = 0; + int start_found; + struct extent_buffer *l; + struct btrfs_key search_key; + u64 search_start = dirid; + + mutex_lock(&root->objectid_mutex); + if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && + root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { + *objectid = ++root->last_inode_alloc; + mutex_unlock(&root->objectid_mutex); + return 0; + } + path = btrfs_alloc_path(); + BUG_ON(!path); + search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_key.objectid = search_start; + search_key.type = 0; + search_key.offset = 0; + + btrfs_init_path(path); + start_found = 0; + ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + if (!start_found) { + *objectid = search_start; + start_found = 1; + goto found; + } + *objectid = last_ino > search_start ? + last_ino : search_start; + goto found; + } + btrfs_item_key_to_cpu(l, &key, slot); + if (key.objectid >= search_start) { + if (start_found) { + if (last_ino < search_start) + last_ino = search_start; + if (key.objectid > last_ino) { + *objectid = last_ino; + goto found; + } + } else if (key.objectid > search_start) { + *objectid = search_start; + goto found; + } + } + if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) + break; + + start_found = 1; + last_ino = key.objectid + 1; + path->slots[0]++; + } + BUG_ON(1); +found: + btrfs_release_path(root, path); + btrfs_free_path(path); + BUG_ON(*objectid < search_start); + mutex_unlock(&root->objectid_mutex); + return 0; +error: + btrfs_release_path(root, path); + btrfs_free_path(path); + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c new file mode 100644 index 00000000000..8adfe059ab4 --- /dev/null +++ b/fs/btrfs/inode.c @@ -0,0 +1,5035 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
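btrfs_find_free_objectid() above is essentially "find the first gap in a sorted sequence of used ids at or after a starting point", with the sequence coming from btree leaves. A self-contained model of that loop over a plain sorted array (names and numbers here are illustrative, not from the patch):

#include <stdio.h>

static unsigned long long find_free_id(const unsigned long long *ids, int n,
				       unsigned long long start)
{
	unsigned long long candidate = start;
	int i;

	for (i = 0; i < n; i++) {
		if (ids[i] < candidate)
			continue;		/* below the range we care about */
		if (ids[i] > candidate)
			break;			/* gap found at 'candidate' */
		candidate = ids[i] + 1;		/* id in use, try the next one */
	}
	return candidate;
}

int main(void)
{
	unsigned long long ids[] = { 256, 257, 258, 260, 261 };

	/* prints 259, the first unused id at or above 257 */
	printf("%llu\n", find_free_id(ids, 5, 257));
	return 0;
}

The kernel version additionally caches its position in root->last_inode_alloc, so the common case hands out the next id without touching the tree at all.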
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/bit_spinlock.h> +#include <linux/version.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/falloc.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "ordered-data.h" +#include "xattr.h" +#include "tree-log.h" +#include "ref-cache.h" +#include "compression.h" + +struct btrfs_iget_args { + u64 ino; + struct btrfs_root *root; +}; + +static struct inode_operations btrfs_dir_inode_operations; +static struct inode_operations btrfs_symlink_inode_operations; +static struct inode_operations btrfs_dir_ro_inode_operations; +static struct inode_operations btrfs_special_inode_operations; +static struct inode_operations btrfs_file_inode_operations; +static struct address_space_operations btrfs_aops; +static struct address_space_operations btrfs_symlink_aops; +static struct file_operations btrfs_dir_file_operations; +static struct extent_io_ops btrfs_extent_io_ops; + +static struct kmem_cache *btrfs_inode_cachep; +struct kmem_cache *btrfs_trans_handle_cachep; +struct kmem_cache *btrfs_transaction_cachep; +struct kmem_cache *btrfs_bit_radix_cachep; +struct kmem_cache *btrfs_path_cachep; + +#define S_SHIFT 12 +static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, +}; + +static void btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); + +/* + * a very lame attempt at stopping writes when the FS is 85% full. There + * are countless ways this is incorrect, but it is better than nothing. + */ +int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, + int for_del) +{ + u64 total; + u64 used; + u64 thresh; + int ret = 0; + + spin_lock(&root->fs_info->delalloc_lock); + total = btrfs_super_total_bytes(&root->fs_info->super_copy); + used = btrfs_super_bytes_used(&root->fs_info->super_copy); + if (for_del) + thresh = total * 90; + else + thresh = total * 85; + + do_div(thresh, 100); + + if (used + root->fs_info->delalloc_bytes + num_required > thresh) + ret = -ENOSPC; + spin_unlock(&root->fs_info->delalloc_lock); + return ret; +} + +/* + * this does all the hard work for inserting an inline extent into + * the btree. 
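A quick worked instance of the 85% check in btrfs_check_free_space() above, with made-up round numbers (the kernel works in bytes taken from the superblock and uses do_div() for the division):

#include <stdio.h>

int main(void)
{
	/* pretend a 100 GiB filesystem, counted in GiB for readability */
	unsigned long long total = 100, used = 80, delalloc = 2, needed = 5;
	unsigned long long thresh = total * 85 / 100;	/* 85 */

	if (used + delalloc + needed > thresh)
		printf("ENOSPC: %llu > %llu\n", used + delalloc + needed, thresh);
	else
		printf("ok\n");
	return 0;
}

For deletions (for_del) the threshold is relaxed to 90%, presumably so that unlinks can still make progress on a nearly full filesystem.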
The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, size_t size, size_t compressed_size, + struct page **compressed_pages) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int err = 0; + int ret; + size_t cur_size = size; + size_t datasize; + unsigned long offset; + int use_compress = 0; + + if (compressed_size && compressed_pages) { + use_compress = 1; + cur_size = compressed_size; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + btrfs_set_trans_block_group(trans, inode); + + key.objectid = inode->i_ino; + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(cur_size); + + inode_add_bytes(inode, size); + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (use_compress) { + struct page *cpage; + int i = 0; + while (compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min_t(unsigned long, compressed_size, + PAGE_CACHE_SIZE); + + kaddr = kmap(cpage); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap(cpage); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + btrfs_set_file_extent_compression(leaf, ei, + BTRFS_COMPRESS_ZLIB); + } else { + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_atomic(page, KM_USER0); + offset = start & (PAGE_CACHE_SIZE - 1); + write_extent_buffer(leaf, kaddr + offset, ptr, size); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + BTRFS_I(inode)->disk_i_size = inode->i_size; + btrfs_update_inode(trans, root, inode); + return 0; +fail: + btrfs_free_path(path); + return err; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. 
+ */ +static int cow_file_range_inline(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end, + size_t compressed_size, + struct page **compressed_pages) +{ + u64 isize = i_size_read(inode); + u64 actual_end = min(end + 1, isize); + u64 inline_len = actual_end - start; + u64 aligned_end = (end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + u64 hint_byte; + u64 data_len = inline_len; + int ret; + + if (compressed_size) + data_len = compressed_size; + + if (start > 0 || + actual_end >= PAGE_CACHE_SIZE || + data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + (!compressed_size && + (actual_end & (root->sectorsize - 1)) == 0) || + end + 1 < isize || + data_len > root->fs_info->max_inline) { + return 1; + } + + ret = btrfs_drop_extents(trans, root, inode, start, + aligned_end, start, &hint_byte); + BUG_ON(ret); + + if (isize > actual_end) + inline_len = min_t(u64, isize, actual_end); + ret = insert_inline_extent(trans, root, inode, start, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, start, aligned_end, 0); + return 0; +} + +struct async_extent { + u64 start; + u64 ram_size; + u64 compressed_size; + struct page **pages; + unsigned long nr_pages; + struct list_head list; +}; + +struct async_cow { + struct inode *inode; + struct btrfs_root *root; + struct page *locked_page; + u64 start; + u64 end; + struct list_head extents; + struct btrfs_work work; +}; + +static noinline int add_async_extent(struct async_cow *cow, + u64 start, u64 ram_size, + u64 compressed_size, + struct page **pages, + unsigned long nr_pages) +{ + struct async_extent *async_extent; + + async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + async_extent->start = start; + async_extent->ram_size = ram_size; + async_extent->compressed_size = compressed_size; + async_extent->pages = pages; + async_extent->nr_pages = nr_pages; + list_add_tail(&async_extent->list, &cow->extents); + return 0; +} + +/* + * we create compressed extents in two phases. The first + * phase compresses a range of pages that have already been + * locked (both pages and state bits are locked). + * + * This is done inside an ordered work queue, and the compression + * is spread across many cpus. The actual IO submission is step + * two, and the ordered work queue takes care of making sure that + * happens in the same order things were put onto the queue by + * writepages and friends. + * + * If this code finds it can't get good compression, it puts an + * entry onto the work queue to write the uncompressed bytes. This + * makes sure that both compressed inodes and uncompressed inodes + * are written in the same order that pdflush sent them down. 
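(Reader's note on the ordered work queue mentioned above: each unit of work is a struct btrfs_work with three hooks, as wired up later in cow_file_range_async(). work.func, here the compression step, may run concurrently across worker threads; work.ordered_func, here the submission step, is invoked strictly in the order the items were queued; and work.ordered_free releases the item afterwards. That split is what lets the CPU-heavy compression run in parallel while the resulting IO is still submitted in the order writepages produced it.)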
+ */ +static noinline int compress_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, + struct async_cow *async_cow, + int *num_added) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 num_bytes; + u64 orig_start; + u64 disk_num_bytes; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + int ret = 0; + struct page **pages = NULL; + unsigned long nr_pages; + unsigned long nr_pages_ret = 0; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned long max_compressed = 128 * 1024; + unsigned long max_uncompressed = 128 * 1024; + int i; + int will_compress; + + orig_start = start; + + actual_end = min_t(u64, isize, end + 1); +again: + will_compress = 0; + nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; + nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + + total_compressed = actual_end - start; + + /* we want to make sure that amount of ram required to uncompress + * an extent is reasonable, so we limit the total size in ram + * of a compressed extent to 128k. This is a crucial number + * because it also controls how easily we can spread reads across + * cpus for decompression. + * + * We also want to make sure the amount of IO required to do + * a random read is reasonably small, so we limit the size of + * a compressed extent to 128k. + */ + total_compressed = min(total_compressed, max_uncompressed); + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + total_in = 0; + ret = 0; + + /* + * we do compression for mount -o compress and when the + * inode has not been flagged as nocompress. This flag can + * change at any time if we discover bad compression ratios. + */ + if (!btrfs_test_flag(inode, NOCOMPRESS) && + btrfs_test_opt(root, COMPRESS)) { + WARN_ON(pages); + pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + + ret = btrfs_zlib_compress_pages(inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); + + if (!ret) { + unsigned long offset = total_compressed & + (PAGE_CACHE_SIZE - 1); + struct page *page = pages[nr_pages_ret - 1]; + char *kaddr; + + /* zero the tail end of the last page, we might be + * sending it down to disk + */ + if (offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, + PAGE_CACHE_SIZE - offset); + kunmap_atomic(kaddr, KM_USER0); + } + will_compress = 1; + } + } + if (start == 0) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + /* lets try to make an inline extent */ + if (ret || total_in < (actual_end - start)) { + /* we didn't compress the entire range, try + * to make an uncompressed inline extent. + */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + } else { + /* try making a compressed inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, + total_compressed, pages); + } + btrfs_end_transaction(trans, root); + if (ret == 0) { + /* + * inline extent creation worked, we don't need + * to create any more async work items. Unlock + * and free up our temp pages. 
+ */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, 1, 0, + 0, 1, 1, 1); + ret = 0; + goto free_pages_out; + } + } + + if (will_compress) { + /* + * we aren't doing an inline extent round the compressed size + * up to a block size boundary so the allocator does sane + * things + */ + total_compressed = (total_compressed + blocksize - 1) & + ~(blocksize - 1); + + /* + * one last check to make sure the compression is really a + * win, compare the page count read with the blocks on disk + */ + total_in = (total_in + PAGE_CACHE_SIZE - 1) & + ~(PAGE_CACHE_SIZE - 1); + if (total_compressed >= total_in) { + will_compress = 0; + } else { + disk_num_bytes = total_compressed; + num_bytes = total_in; + } + } + if (!will_compress && pages) { + /* + * the compression code ran but failed to make things smaller, + * free any pages it allocated and our page pointer array + */ + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + pages = NULL; + total_compressed = 0; + nr_pages_ret = 0; + + /* flag the file so we don't compress in the future */ + btrfs_set_flag(inode, NOCOMPRESS); + } + if (will_compress) { + *num_added += 1; + + /* the async work queues will take care of doing actual + * allocation on disk for these compressed pages, + * and will submit them to the elevator. + */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret); + + if (start + num_bytes < end && start + num_bytes < actual_end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + } else { + /* + * No compression, but we still need to write the pages in + * the file we've been given so far. redirty the locked + * page if it corresponds to our extent and set things up + * for the async work queue to run cow_file_range to do + * the normal delalloc dance + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) { + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + } + add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + *num_added += 1; + } + +out: + return 0; + +free_pages_out: + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + + goto out; +} + +/* + * phase two of compressed writeback. This is the ordered portion + * of the code, which only gets called in the order the work was + * queued. We walk all the async extents created by compress_file_range + * and send them down to the disk. + */ +static noinline int submit_compressed_extents(struct inode *inode, + struct async_cow *async_cow) +{ + struct async_extent *async_extent; + u64 alloc_hint = 0; + struct btrfs_trans_handle *trans; + struct btrfs_key ins; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree; + int ret; + + if (list_empty(&async_cow->extents)) + return 0; + + trans = btrfs_join_transaction(root, 1); + + while (!list_empty(&async_cow->extents)) { + async_extent = list_entry(async_cow->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + + io_tree = &BTRFS_I(inode)->io_tree; + + /* did the compression code fall back to uncompressed IO? 
*/ + if (!async_extent->pages) { + int page_started = 0; + unsigned long nr_written = 0; + + lock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + + /* allocate blocks */ + cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); + + /* + * if page_started, cow_file_range inserted an + * inline extent and took care of all the unlocking + * and IO for us. Otherwise, we need to submit + * all those pages down to the drive. + */ + if (!page_started) + extent_write_locked_range(io_tree, + inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + btrfs_get_extent, + WB_SYNC_ALL); + kfree(async_extent); + cond_resched(); + continue; + } + + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1, + GFP_NOFS); + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + + ret = btrfs_reserve_extent(trans, root, + async_extent->compressed_size, + async_extent->compressed_size, + 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + em = alloc_extent_map(GFP_NOFS); + em->start = async_extent->start; + em->len = async_extent->ram_size; + em->orig_start = em->start; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + } + + ret = btrfs_add_ordered_extent(inode, async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED); + BUG_ON(ret); + + btrfs_end_transaction(trans, root); + + /* + * clear dirty, set writeback and unlock the pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, 1, 1, 0, 1, 1, 0); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, + async_extent->ram_size, + ins.objectid, + ins.offset, async_extent->pages, + async_extent->nr_pages); + + BUG_ON(ret); + trans = btrfs_join_transaction(root, 1); + alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + cond_resched(); + } + + btrfs_end_transaction(trans, root); + return 0; +} + +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code. The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. 
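(A note on the size arithmetic used in both compress_file_range() above and cow_file_range() below: start and end are inclusive byte offsets, so num_bytes = (end - start + blocksize) & ~(blocksize - 1) is the range length rounded up to a block boundary. For example, with a 4096-byte sectorsize, start = 0 and end = 5000, i.e. 5001 bytes, gives (5000 + 4096) & ~4095 = 8192.)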
+ */ +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 alloc_hint = 0; + u64 num_bytes; + unsigned long ram_size; + u64 disk_num_bytes; + u64 cur_alloc_size; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + struct btrfs_key ins; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 0; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + actual_end = min_t(u64, isize, end + 1); + + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + ret = 0; + + if (start == 0) { + /* lets try to make an inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, 1, 1, + 1, 1, 1, 1); + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; + ret = 0; + goto out; + } + } + + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->orig_start = em->start; + + ram_size = ins.offset; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, + start + ram_size - 1, 0); + } + + cur_alloc_size = ins.offset; + ret = btrfs_add_ordered_extent(inode, start, ins.objectid, + ram_size, cur_alloc_size, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, start, + cur_alloc_size); + BUG_ON(ret); + } + + if (disk_num_bytes < cur_alloc_size) + break; + + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits + */ + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, + locked_page, unlock, 1, + 1, 0, 0, 0); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; + start += cur_alloc_size; + } +out: + ret = 0; + btrfs_end_transaction(trans, root); + + return ret; +} + +/* + * work queue call back to started compression on a file and pages + */ +static noinline void async_cow_start(struct btrfs_work *work) +{ + struct async_cow *async_cow; + int num_added = 0; + async_cow = container_of(work, struct async_cow, work); + + compress_file_range(async_cow->inode, async_cow->locked_page, + async_cow->start, async_cow->end, async_cow, + &num_added); + if (num_added == 0) + async_cow->inode = NULL; +} + +/* + * work queue 
call back to submit previously compressed pages + */ +static noinline void async_cow_submit(struct btrfs_work *work) +{ + struct async_cow *async_cow; + struct btrfs_root *root; + unsigned long nr_pages; + + async_cow = container_of(work, struct async_cow, work); + + root = async_cow->root; + nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); + + if (atomic_read(&root->fs_info->async_delalloc_pages) < + 5 * 1042 * 1024 && + waitqueue_active(&root->fs_info->async_submit_wait)) + wake_up(&root->fs_info->async_submit_wait); + + if (async_cow->inode) + submit_compressed_extents(async_cow->inode, async_cow); +} + +static noinline void async_cow_free(struct btrfs_work *work) +{ + struct async_cow *async_cow; + async_cow = container_of(work, struct async_cow, work); + kfree(async_cow); +} + +static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + struct async_cow *async_cow; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr_pages; + u64 cur_end; + int limit = 10 * 1024 * 1042; + + if (!btrfs_test_opt(root, COMPRESS)) { + return cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); + } + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | + EXTENT_DELALLOC, 1, 0, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + async_cow->inode = inode; + async_cow->root = root; + async_cow->locked_page = locked_page; + async_cow->start = start; + + if (btrfs_test_flag(inode, NOCOMPRESS)) + cur_end = end; + else + cur_end = min(end, start + 512 * 1024 - 1); + + async_cow->end = cur_end; + INIT_LIST_HEAD(&async_cow->extents); + + async_cow->work.func = async_cow_start; + async_cow->work.ordered_func = async_cow_submit; + async_cow->work.ordered_free = async_cow_free; + async_cow->work.flags = 0; + + nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); + + btrfs_queue_worker(&root->fs_info->delalloc_workers, + &async_cow->work); + + if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) < + limit)); + } + + while (atomic_read(&root->fs_info->async_submit_draining) && + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) == + 0)); + } + + *nr_written += nr_pages; + start = cur_end + 1; + } + *page_started = 1; + return 0; +} + +static noinline int csum_exist_in_range(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + int ret; + struct btrfs_ordered_sum *sums; + LIST_HEAD(list); + + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, + bytenr + num_bytes - 1, &list); + if (ret == 0 && list_empty(&list)) + return 0; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + return 1; +} + +/* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. 
+ * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ +static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, int force, + unsigned long *nr_written) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key found_key; + u64 cow_start; + u64 cur_offset; + u64 extent_end; + u64 disk_bytenr; + u64 num_bytes; + int extent_type; + int ret; + int type; + int nocow; + int check_prev = 1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + cow_start = (u64)-1; + cur_offset = start; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + cur_offset, 0); + BUG_ON(ret < 0); + if (ret > 0 && path->slots[0] > 0 && check_prev) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0] - 1); + if (found_key.objectid == inode->i_ino && + found_key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + check_prev = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + BUG_ON(1); + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + nocow = 0; + disk_bytenr = 0; + num_bytes = 0; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid > inode->i_ino || + found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; + goto out_check; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (disk_bytenr == 0) + goto out_check; + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out_check; + if (extent_type == BTRFS_FILE_EXTENT_REG && !force) + goto out_check; + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out_check; + if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + disk_bytenr)) + goto out_check; + disk_bytenr += btrfs_file_extent_offset(leaf, fi); + disk_bytenr += cur_offset - found_key.offset; + num_bytes = min(end + 1, extent_end) - cur_offset; + /* + * force cow if csum exists in the range. + * this ensure that csum for a given extent are + * either valid or do not exist. 
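+ * (a nocow write records no csums of its own, so letting it land on
+ * top of old csums would leave stale sums behind for this range)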
+ */ + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out_check; + nocow = 1; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, fi); + extent_end = ALIGN(extent_end, root->sectorsize); + } else { + BUG_ON(1); + } +out_check: + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (!nocow) { + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = extent_end; + if (cur_offset > end) + break; + path->slots[0]++; + goto next_slot; + } + + btrfs_release_path(root, path); + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, + found_key.offset - 1, page_started, + nr_written, 1); + BUG_ON(ret); + cow_start = (u64)-1; + } + + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + struct extent_map *em; + struct extent_map_tree *em_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(GFP_NOFS); + em->start = cur_offset; + em->orig_start = em->start; + em->len = num_bytes; + em->block_len = num_bytes; + em->block_start = disk_bytenr; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + type = BTRFS_ORDERED_PREALLOC; + } else { + type = BTRFS_ORDERED_NOCOW; + } + + ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, + num_bytes, num_bytes, type); + BUG_ON(ret); + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + cur_offset, cur_offset + num_bytes - 1, + locked_page, 1, 1, 1, 0, 0, 0); + cur_offset = extent_end; + if (cur_offset > end) + break; + } + btrfs_release_path(root, path); + + if (cur_offset <= end && cow_start == (u64)-1) + cow_start = cur_offset; + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); + BUG_ON(ret); + } + + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_free_path(path); + return 0; +} + +/* + * extent_io.c call back to do delayed allocation processing + */ +static int run_delalloc_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + + if (btrfs_test_flag(inode, NODATACOW)) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 1, nr_written); + else if (btrfs_test_flag(inode, PREALLOC)) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 0, nr_written); + else + ret = cow_file_range_async(inode, locked_page, start, end, + page_started, nr_written); + + return ret; +} + +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done. 
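+ *
+ * Both the per-inode and the per-fs delalloc byte counts are updated
+ * under fs_info->delalloc_lock, and the inode is added to
+ * fs_info->delalloc_inodes the first time it gains delalloc bytes.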
+ */ +static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += end - start + 1; + root->fs_info->delalloc_bytes += end - start + 1; + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + spin_lock(&root->fs_info->delalloc_lock); + if (end - start + 1 > root->fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs warning: delalloc account " + "%llu %llu\n", + (unsigned long long)end - start + 1, + (unsigned long long) + root->fs_info->delalloc_bytes); + root->fs_info->delalloc_bytes = 0; + BTRFS_I(inode)->delalloc_bytes = 0; + } else { + root->fs_info->delalloc_bytes -= end - start + 1; + BTRFS_I(inode)->delalloc_bytes -= end - start + 1; + } + if (BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */ +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct btrfs_mapping_tree *map_tree; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + int ret; + + if (bio_flags & EXTENT_BIO_COMPRESSED) + return 0; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + ret = btrfs_map_block(map_tree, READ, logical, + &map_length, NULL, 0); + + if (map_length < length + size) + return 1; + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + BUG_ON(ret); + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. 
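+ *
+ * This is the "done" half of the pair: the checksumming was already done
+ * by __btrfs_submit_bio_start, so all that is left here is to map and
+ * submit the bio.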
+ * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + return btrfs_map_bio(root, rw, bio, mirror_num, 1); +} + +/* + * extent_io.c submission hook. This does the right thing for csum calculation + * on write, or reading the csums from the tree before a read + */ +static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + int skip_sum; + + skip_sum = btrfs_test_flag(inode, NODATASUM); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + if (!(rw & (1 << BIO_RW))) { + if (bio_flags & EXTENT_BIO_COMPRESSED) { + return btrfs_submit_compressed_read(inode, bio, + mirror_num, bio_flags); + } else if (!skip_sum) + btrfs_lookup_bio_sums(root, inode, bio, NULL); + goto mapit; + } else if (!skip_sum) { + /* csum items have already been cloned */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + goto mapit; + /* we're doing a write, do the async checksumming */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, + bio_flags, __btrfs_submit_bio_start, + __btrfs_submit_bio_done); + } + +mapit: + return btrfs_map_bio(root, rw, bio, mirror_num, 0); +} + +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. + */ +static noinline int add_pending_csums(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_offset, + struct list_head *list) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + btrfs_set_trans_block_group(trans, inode); + list_for_each(cur, list) { + sum = list_entry(cur, struct btrfs_ordered_sum, list); + btrfs_csum_file_blocks(trans, + BTRFS_I(inode)->root->fs_info->csum_root, sum); + } + return 0; +} + +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) +{ + if ((end & (PAGE_CACHE_SIZE - 1)) == 0) + WARN_ON(1); + return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, + GFP_NOFS); +} + +/* see btrfs_writepage_start_hook for details on why this is required */ +struct btrfs_writepage_fixup { + struct page *page; + struct btrfs_work work; +}; + +static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +{ + struct btrfs_writepage_fixup *fixup; + struct btrfs_ordered_extent *ordered; + struct page *page; + struct inode *inode; + u64 page_start; + u64 page_end; + + fixup = container_of(work, struct btrfs_writepage_fixup, work); + page = fixup->page; +again: + lock_page(page); + if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + ClearPageChecked(page); + goto out_page; + } + + inode = page->mapping->host; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + + /* already ordered? 
We're done */ + if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_ORDERED, 0)) { + goto out; + } + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(&BTRFS_I(inode)->io_tree, page_start, + page_end, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ClearPageChecked(page); +out: + unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); +out_page: + unlock_page(page); + page_cache_release(page); +} + +/* + * There are a few paths in the higher layers of the kernel that directly + * set the page dirty bit without asking the filesystem if it is a + * good idea. This causes problems because we want to make sure COW + * properly happens and the data=ordered rules are followed. + * + * In our case any range that doesn't have the ORDERED bit set + * hasn't been properly setup for IO. We kick off an async process + * to fix it up. The async helper will wait for ordered extents, set + * the delalloc bit and make it safe to write the page. + */ +static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +{ + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, + EXTENT_ORDERED, 0); + if (ret) + return 0; + + if (PageChecked(page)) + return -EAGAIN; + + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + return -EAGAIN; + + SetPageChecked(page); + page_cache_get(page); + fixup->work.func = btrfs_writepage_fixup_worker; + fixup->page = page; + btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + return -EAGAIN; +} + +static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_pos, + u64 disk_bytenr, u64 disk_num_bytes, + u64 num_bytes, u64 ram_bytes, + u8 compression, u8 encryption, + u16 other_encoding, int extent_type) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u64 hint; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + ret = btrfs_drop_extents(trans, root, inode, file_pos, + file_pos + num_bytes, file_pos, &hint); + BUG_ON(ret); + + ins.objectid = inode->i_ino; + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); + BUG_ON(ret); + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); + btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ret = 
btrfs_alloc_reserved_extent(trans, root, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino, &ins); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. + */ +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered_extent; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int compressed = 0; + int ret; + + ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); + if (!ret) + return 0; + + trans = btrfs_join_transaction(root, 1); + + ordered_extent = btrfs_lookup_ordered_extent(inode, start); + BUG_ON(!ordered_extent); + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) + goto nocow; + + lock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + compressed = 1; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + BUG_ON(compressed); + ret = btrfs_mark_extent_written(trans, root, inode, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len); + BUG_ON(ret); + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered_extent->file_offset, + ordered_extent->start, + ordered_extent->disk_len, + ordered_extent->len, + ordered_extent->len, + compressed, 0, 0, + BTRFS_FILE_EXTENT_REG); + BUG_ON(ret); + } + unlock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); +nocow: + add_pending_csums(trans, inode, ordered_extent->file_offset, + &ordered_extent->list); + + mutex_lock(&BTRFS_I(inode)->extent_mutex); + btrfs_ordered_update_i_size(inode, ordered_extent); + btrfs_update_inode(trans, root, inode); + btrfs_remove_ordered_extent(inode, ordered_extent); + mutex_unlock(&BTRFS_I(inode)->extent_mutex); + + /* once for us */ + btrfs_put_ordered_extent(ordered_extent); + /* once for the tree */ + btrfs_put_ordered_extent(ordered_extent); + + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) +{ + return btrfs_finish_ordered_io(page->mapping->host, start, end); +} + +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. 
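+ *
+ * The record keeps the logical address, length and the last mirror that
+ * was attempted, so each retry can be steered at the next copy until
+ * btrfs_num_copies() mirrors have been exhausted.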
+ */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int last_mirror; +}; + +static int btrfs_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int rw; + u64 logical; + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kmalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->last_mirror = 0; + failrec->bio_flags = 0; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + spin_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + } + failrec->logical = logical; + free_extent_map(em); + set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | + EXTENT_DIRTY, GFP_NOFS); + set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + failrec->last_mirror++; + if (!state) { + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + } + if (!state || failrec->last_mirror > num_copies) { + set_state_private(failure_tree, failrec->start, 0); + clear_extent_bits(failure_tree, failrec->start, + failrec->start + failrec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + kfree(failrec); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = failed_bio->bi_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + if (failed_bio->bi_rw & (1 << BIO_RW)) + rw = WRITE; + else + rw = READ; + + BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, + failrec->last_mirror, + failrec->bio_flags); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int btrfs_clean_io_failures(struct inode *inode, u64 start) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failure; + int ret; + + private = 0; + if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY)) { + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, + start, &private_failure); + if (ret == 0) { + failure = (struct io_failure_record *)(unsigned long) + private_failure; + set_state_private(&BTRFS_I(inode)->io_failure_tree, + failure->start, 0); + 
clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, + failure->start, + failure->start + failure->len - 1, + EXTENT_DIRTY | EXTENT_LOCKED, + GFP_NOFS); + kfree(failure); + } + } + return 0; +} + +/* + * when reads are done, we need to check csums to verify the data is correct + * if there's a match, we allow the bio to finish. If not, we go through + * the io_failure_record routines to find good copies + */ +static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + struct inode *inode = page->mapping->host; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + char *kaddr; + u64 private = ~(u32)0; + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + u32 csum = ~(u32)0; + + if (PageChecked(page)) { + ClearPageChecked(page); + goto good; + } + if (btrfs_test_flag(inode, NODATASUM)) + return 0; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; + } + + if (state && state->start == start) { + private = state->private; + ret = 0; + } else { + ret = get_state_private(io_tree, start, &private); + } + kaddr = kmap_atomic(page, KM_USER0); + if (ret) + goto zeroit; + + csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); + btrfs_csum_final(csum, (char *)&csum); + if (csum != private) + goto zeroit; + + kunmap_atomic(kaddr, KM_USER0); +good: + /* if the io failure tree for this inode is non-empty, + * check to see if we've recovered from a failed IO + */ + btrfs_clean_io_failures(inode, start); + return 0; + +zeroit: + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + memset(kaddr + offset, 1, end - start + 1); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + if (private == 0) + return 0; + return -EIO; +} + +/* + * This creates an orphan entry for the given inode in case something goes + * wrong in the middle of an unlink/truncate. + */ +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->list_lock); + + /* already on the orphan list, we're good */ + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->list_lock); + return 0; + } + + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + + spin_unlock(&root->list_lock); + + /* + * insert an orphan item to track this unlinked/truncated file + */ + ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * We have done the truncate/delete so we can go ahead and remove the orphan + * item for this particular inode. + */ +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->list_lock); + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->list_lock); + return 0; + } + + list_del_init(&BTRFS_I(inode)->i_orphan); + if (!trans) { + spin_unlock(&root->list_lock); + return 0; + } + + spin_unlock(&root->list_lock); + + ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * this cleans up any orphans that may be left on the list from the last use + * of this root. 
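+ *
+ * Orphan items are keyed as (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY,
+ * inode number).  Each inode found is either truncated (it still has
+ * links) or fully deleted by the final iput.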
+ */ +void btrfs_orphan_cleanup(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_key key, found_key; + struct btrfs_trans_handle *trans; + struct inode *inode; + int ret = 0, nr_unlink = 0, nr_truncate = 0; + + path = btrfs_alloc_path(); + if (!path) + return; + path->reada = -1; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = (u64)-1; + + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + printk(KERN_ERR "Error searching slot for orphan: %d" + "\n", ret); + break; + } + + /* + * if ret == 0 means we found what we were searching for, which + * is weird, but possible, so only screw with path if we didnt + * find the key and see if we have stuff that matches + */ + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + /* pull out the item */ + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* make sure the item matches what we want */ + if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) + break; + if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + break; + + /* release the path since we're done with it */ + btrfs_release_path(root, path); + + /* + * this is where we are basically btrfs_lookup, without the + * crossing root thing. we store the inode number in the + * offset of the orphan item. + */ + inode = btrfs_iget_locked(root->fs_info->sb, + found_key.offset, root); + if (!inode) + break; + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + + /* have to set the location manually */ + BTRFS_I(inode)->location.objectid = inode->i_ino; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->list_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->list_lock); + + /* + * if this is a bad inode, means we actually succeeded in + * removing the inode, but not the orphan record, which means + * we need to manually delete the orphan since iput will just + * do a destroy_inode + */ + if (is_bad_inode(inode)) { + trans = btrfs_start_transaction(root, 1); + btrfs_orphan_del(trans, inode); + btrfs_end_transaction(trans, root); + iput(inode); + continue; + } + + /* if we have links, this was a truncate, lets do that */ + if (inode->i_nlink) { + nr_truncate++; + btrfs_truncate(inode); + } else { + nr_unlink++; + } + + /* this will do delete_inode and everything for us */ + iput(inode); + } + + if (nr_unlink) + printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + if (nr_truncate) + printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + + btrfs_free_path(path); +} + +/* + * read an inode from the btree into the in-memory inode + */ +void btrfs_read_locked_inode(struct inode *inode) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key location; + u64 alloc_group_block; + u32 rdev; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); + if (ret) 
+ goto make_bad; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + inode->i_uid = btrfs_inode_uid(leaf, inode_item); + inode->i_gid = btrfs_inode_gid(leaf, inode_item); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); + BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + + alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, + alloc_group_block, 0); + btrfs_free_path(path); + inode_item = NULL; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + break; + case S_IFDIR: + inode->i_fop = &btrfs_dir_file_operations; + if (root == root->fs_info->tree_root) + inode->i_op = &btrfs_dir_ro_inode_operations; + else + inode->i_op = &btrfs_dir_inode_operations; + break; + case S_IFLNK: + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + break; + default: + init_special_inode(inode, inode->i_mode, rdev); + break; + } + return; + +make_bad: + btrfs_free_path(path); + make_bad_inode(inode); +} + +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode) +{ + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, 
BTRFS_I(inode)->sequence); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); + btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); +} + +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto failed; + } + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + fill_inode_item(trans, leaf, inode_item, inode); + btrfs_mark_buffer_dirty(leaf); + btrfs_set_inode_last_trans(trans, inode); + ret = 0; +failed: + btrfs_free_path(path); + return ret; +} + + +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. It remove a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto err; + btrfs_release_path(root, path); + + ret = btrfs_del_inode_ref(trans, root, name, name_len, + inode->i_ino, + dir->i_ino, &index); + if (ret) { + printk(KERN_INFO "btrfs failed to delete reference to %.*s, " + "inode %lu parent %lu\n", name_len, name, + inode->i_ino, dir->i_ino); + goto err; + } + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + index, name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + ret = btrfs_delete_one_dir_name(trans, root, path, di); + btrfs_release_path(root, path); + + ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, + inode, dir->i_ino); + BUG_ON(ret != 0 && ret != -ENOENT); + if (ret != -ENOENT) + BTRFS_I(dir)->log_dirty_trans = trans->transid; + + ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, + dir, index); + BUG_ON(ret); +err: + btrfs_free_path(path); + if (ret) + goto out; + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + btrfs_update_inode(trans, root, dir); + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + dir->i_sb->s_dirt = 1; +out: + return ret; +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + root = BTRFS_I(dir)->root; + + ret = btrfs_check_free_space(root, 1, 1); + if (ret) + goto fail; + + trans = 
btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, dir); + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + + if (inode->i_nlink == 0) + ret = btrfs_orphan_add(trans, inode); + + nr = trans->blocks_used; + + btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + int ret; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + + /* + * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir + * the root of a subvolume or snapshot + */ + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || + inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + return -ENOTEMPTY; + } + + ret = btrfs_check_free_space(root, 1, 1); + if (ret) + goto fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_orphan_add(trans, inode); + if (err) + goto fail_trans; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); + +fail_trans: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_btree_balance_dirty(root, nr); + + if (ret && !err) + err = ret; + return err; +} + +#if 0 +/* + * when truncating bytes in a file, it is possible to avoid reading + * the leaves that contain only checksum items. This can be the + * majority of the IO required to delete a large file, but it must + * be done carefully. + * + * The keys in the level just above the leaves are checked to make sure + * the lowest key in a given leaf is a csum key, and starts at an offset + * after the new size. + * + * Then the key for the next leaf is checked to make sure it also has + * a checksum item for the same file. If it does, we know our target leaf + * contains only checksum items, and it can be safely freed without reading + * it. + * + * This is just an optimization targeted at large files. It may do + * nothing. It will return 0 unless things went badly. + */ +static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *inode, u64 new_size) +{ + struct btrfs_key key; + int ret; + int nritems; + struct btrfs_key found_key; + struct btrfs_key other_key; + struct btrfs_leaf_ref *ref; + u64 leaf_gen; + u64 leaf_start; + + path->lowest_level = 1; + key.objectid = inode->i_ino; + key.type = BTRFS_CSUM_ITEM_KEY; + key.offset = new_size; +again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (path->nodes[1] == NULL) { + ret = 0; + goto out; + } + ret = 0; + btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); + nritems = btrfs_header_nritems(path->nodes[1]); + + if (!nritems) + goto out; + + if (path->slots[1] >= nritems) + goto next_node; + + /* did we find a key greater than anything we want to delete? */ + if (found_key.objectid > inode->i_ino || + (found_key.objectid == inode->i_ino && found_key.type > key.type)) + goto out; + + /* we check the next key in the node to make sure the leave contains + * only checksum items. 
This comparison doesn't work if our + * leaf is the last one in the node + */ + if (path->slots[1] + 1 >= nritems) { +next_node: + /* search forward from the last key in the node, this + * will bring us into the next node in the tree + */ + btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); + + /* unlikely, but we inc below, so check to be safe */ + if (found_key.offset == (u64)-1) + goto out; + + /* search_forward needs a path with locks held, do the + * search again for the original key. It is possible + * this will race with a balance and return a path that + * we could modify, but this drop is just an optimization + * and is allowed to miss some leaves. + */ + btrfs_release_path(root, path); + found_key.offset++; + + /* setup a max key for search_forward */ + other_key.offset = (u64)-1; + other_key.type = key.type; + other_key.objectid = key.objectid; + + path->keep_locks = 1; + ret = btrfs_search_forward(root, &found_key, &other_key, + path, 0, 0); + path->keep_locks = 0; + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + key.offset = found_key.offset; + btrfs_release_path(root, path); + cond_resched(); + goto again; + } + + /* we know there's one more slot after us in the tree, + * read that key so we can verify it is also a checksum item + */ + btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); + + if (found_key.objectid < inode->i_ino) + goto next_key; + + if (found_key.type != key.type || found_key.offset < new_size) + goto next_key; + + /* + * if the key for the next leaf isn't a csum key from this objectid, + * we can't be sure there aren't good items inside this leaf. + * Bail out + */ + if (other_key.objectid != inode->i_ino || other_key.type != key.type) + goto out; + + leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); + leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); + /* + * it is safe to delete this leaf, it contains only + * csum items from this inode at an offset >= new_size + */ + ret = btrfs_del_leaf(trans, root, path, leaf_start); + BUG_ON(ret); + + if (root->ref_cows && leaf_gen < trans->transid) { + ref = btrfs_alloc_leaf_ref(root, 0); + if (ref) { + ref->root_gen = root->root_key.offset; + ref->bytenr = leaf_start; + ref->owner = 0; + ref->generation = leaf_gen; + ref->nritems = 0; + + ret = btrfs_add_leaf_ref(root, ref, 0); + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } else { + WARN_ON(1); + } + } +next_key: + btrfs_release_path(root, path); + + if (other_key.objectid == inode->i_ino && + other_key.type == key.type && other_key.offset > key.offset) { + key.offset = other_key.offset; + cond_resched(); + goto again; + } + ret = 0; +out: + /* fixup any changes we've made to the path */ + path->lowest_level = 0; + path->keep_locks = 0; + btrfs_release_path(root, path); + return ret; +} + +#endif + +/* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find + * any higher than new_size + * + * csum items that cross the new i_size are truncated to the new size + * as well. + * + * min_type is the minimum key type to truncate down to. If set to 0, this + * will kill all the items on this inode, including the INODE_ITEM_KEY. 
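+ *
+ * For example, btrfs_delete_inode() below ends up calling
+ *   btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
+ * to drop every remaining item that belongs to the inode.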
+ */ +noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + u32 found_type; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + u64 extent_start = 0; + u64 extent_num_bytes = 0; + u64 item_end = 0; + u64 root_gen = 0; + u64 root_owner = 0; + int found_extent; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int encoding; + u64 mask = root->sectorsize - 1; + + if (root->ref_cows) + btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + path = btrfs_alloc_path(); + path->reada = -1; + BUG_ON(!path); + + /* FIXME, add redo link to tree so we don't leak on crash */ + key.objectid = inode->i_ino; + key.offset = (u64)-1; + key.type = (u8)-1; + + btrfs_init_path(path); + +search_again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto error; + + if (ret > 0) { + /* there are no items in the tree for us to truncate, we're + * done + */ + if (path->slots[0] == 0) { + ret = 0; + goto error; + } + path->slots[0]--; + } + + while (1) { + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + encoding = 0; + + if (found_key.objectid != inode->i_ino) + break; + + if (found_type < min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + encoding = btrfs_file_extent_compression(leaf, fi); + encoding |= btrfs_file_extent_encryption(leaf, fi); + encoding |= btrfs_file_extent_other_encoding(leaf, fi); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + item_end += btrfs_file_extent_inline_len(leaf, + fi); + } + item_end--; + } + if (item_end < new_size) { + if (found_type == BTRFS_DIR_ITEM_KEY) + found_type = BTRFS_INODE_ITEM_KEY; + else if (found_type == BTRFS_EXTENT_ITEM_KEY) + found_type = BTRFS_EXTENT_DATA_KEY; + else if (found_type == BTRFS_EXTENT_DATA_KEY) + found_type = BTRFS_XATTR_ITEM_KEY; + else if (found_type == BTRFS_XATTR_ITEM_KEY) + found_type = BTRFS_INODE_REF_KEY; + else if (found_type) + found_type--; + else + break; + btrfs_set_key_type(&key, found_type); + goto next; + } + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + found_extent = 0; + + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item && !encoding) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = new_size - + found_key.offset + root->sectorsize - 1; + extent_num_bytes = extent_num_bytes & + ~((u64)root->sectorsize - 1); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - + extent_num_bytes); + if (root->ref_cows && extent_start != 0) + inode_sub_bytes(inode, num_dec); + btrfs_mark_buffer_dirty(leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, + fi); + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if 
(extent_start != 0) { + found_extent = 1; + if (root->ref_cows) + inode_sub_bytes(inode, num_dec); + } + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); + } + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * we can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { + u32 size = new_size - found_key.offset; + + if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + new_size); + } + size = + btrfs_file_extent_calc_inline_size(size); + ret = btrfs_truncate_item(trans, root, path, + size, 1); + BUG_ON(ret); + } else if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + found_key.offset); + } + } +delete: + if (del_item) { + if (!pending_del_nr) { + /* no pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (pending_del_nr && + path->slots[0] + 1 == pending_del_slot) { + /* hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } else { + BUG(); + } + } else { + break; + } + if (found_extent) { + ret = btrfs_free_extent(trans, root, extent_start, + extent_num_bytes, + leaf->start, root_owner, + root_gen, inode->i_ino, 0); + BUG_ON(ret); + } +next: + if (path->slots[0] == 0) { + if (pending_del_nr) + goto del_pending; + btrfs_release_path(root, path); + goto search_again; + } + + path->slots[0]--; + if (pending_del_nr && + path->slots[0] + 1 != pending_del_slot) { + struct btrfs_key debug; +del_pending: + btrfs_item_key_to_cpu(path->nodes[0], &debug, + pending_del_slot); + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + BUG_ON(ret); + pending_del_nr = 0; + btrfs_release_path(root, path); + goto search_again; + } + } + ret = 0; +error: + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + } + btrfs_free_path(path); + inode->i_sb->s_dirt = 1; + return ret; +} + +/* + * taken from block_truncate_page, but does cow as it zeros out + * any bytes left in the last page in the file. 
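+ *
+ * The page is read in if it is not uptodate, any pending ordered extent
+ * covering it is waited out, the range is marked delalloc and the tail
+ * of the page from 'from' onward is zeroed before the page is redirtied.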
+ */ +static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + char *kaddr; + u32 blocksize = root->sectorsize; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + struct page *page; + int ret = 0; + u64 page_start; + u64 page_end; + + if ((offset & (blocksize - 1)) == 0) + goto out; + + ret = -ENOMEM; +again: + page = grab_cache_page(mapping, index); + if (!page) + goto out; + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if (!PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +int btrfs_cont_expand(struct inode *inode, loff_t size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + u64 mask = root->sectorsize - 1; + u64 hole_start = (inode->i_size + mask) & ~mask; + u64 block_end = (size + mask) & ~mask; + u64 last_byte; + u64 cur_offset; + u64 hole_size; + int err; + + if (size <= hole_start) + return 0; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + return err; + + btrfs_truncate_page(inode->i_mapping, inode->i_size); + + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, hole_start, + block_end - hole_start); + lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, hole_start); + if (!ordered) + break; + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + btrfs_put_ordered_extent(ordered); + } + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + cur_offset = hole_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + block_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), block_end); + last_byte = (last_byte + mask) & ~mask; + if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + u64 hint_byte = 0; + hole_size = last_byte - cur_offset; + err = btrfs_drop_extents(trans, root, inode, + cur_offset, + cur_offset + hole_size, + cur_offset, &hint_byte); + if (err) + break; + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); + } + 
free_extent_map(em); + cur_offset = last_byte; + if (err || cur_offset >= block_end) + break; + } + + btrfs_end_transaction(trans, root); + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + return err; +} + +static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { + err = btrfs_cont_expand(inode, attr->ia_size); + if (err) + return err; + } + + err = inode_setattr(inode, attr); + + if (!err && ((attr->ia_valid & ATTR_MODE))) + err = btrfs_acl_chmod(inode); + return err; +} + +void btrfs_delete_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr; + int ret; + + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + btrfs_wait_ordered_range(inode, 0, (u64)-1); + + btrfs_i_size_write(inode, 0); + trans = btrfs_join_transaction(root, 1); + + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); + if (ret) { + btrfs_orphan_del(NULL, inode); + goto no_delete_lock; + } + + btrfs_orphan_del(trans, inode); + + nr = trans->blocks_used; + clear_inode(inode); + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + return; + +no_delete_lock: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +no_delete: + clear_inode(inode); +} + +/* + * this returns the key found in the dir entry in the location pointer. + * If no dir entries were found, location->objectid is 0. + */ +static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, + struct btrfs_key *location) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret = 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, + namelen, 0); + if (IS_ERR(di)) + ret = PTR_ERR(di); + + if (!di || IS_ERR(di)) + goto out_err; + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); +out: + btrfs_free_path(path); + return ret; +out_err: + location->objectid = 0; + goto out; +} + +/* + * when we hit a tree root in a directory, the btrfs part of the inode + * needs to be changed to reflect the root directory of the tree root. This + * is kind of like crossing a mount point. 
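+ *
+ * Concretely, a BTRFS_ROOT_ITEM_KEY location is rewritten into an
+ * INODE_ITEM_KEY that points at the subvolume root's dirid, and *sub_root
+ * is switched to the freshly read fs root.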
+ */ +static int fixup_tree_root_location(struct btrfs_root *root, + struct btrfs_key *location, + struct btrfs_root **sub_root, + struct dentry *dentry) +{ + struct btrfs_root_item *ri; + + if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) + return 0; + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return 0; + + *sub_root = btrfs_read_fs_root(root->fs_info, location, + dentry->d_name.name, + dentry->d_name.len); + if (IS_ERR(*sub_root)) + return PTR_ERR(*sub_root); + + ri = &(*sub_root)->root_item; + location->objectid = btrfs_root_dirid(ri); + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + location->offset = 0; + + return 0; +} + +static noinline void init_btrfs_i(struct inode *inode) +{ + struct btrfs_inode *bi = BTRFS_I(inode); + + bi->i_acl = NULL; + bi->i_default_acl = NULL; + + bi->generation = 0; + bi->sequence = 0; + bi->last_trans = 0; + bi->logged_trans = 0; + bi->delalloc_bytes = 0; + bi->disk_i_size = 0; + bi->flags = 0; + bi->index_cnt = (u64)-1; + bi->log_dirty_trans = 0; + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, + inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, + inode->i_mapping, GFP_NOFS); + INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); + btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); + mutex_init(&BTRFS_I(inode)->extent_mutex); + mutex_init(&BTRFS_I(inode)->log_mutex); +} + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ + struct btrfs_iget_args *args = p; + inode->i_ino = args->ino; + init_btrfs_i(inode); + BTRFS_I(inode)->root = args->root; + return 0; +} + +static int btrfs_find_actor(struct inode *inode, void *opaque) +{ + struct btrfs_iget_args *args = opaque; + return args->ino == inode->i_ino && + args->root == BTRFS_I(inode)->root; +} + +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + struct btrfs_root *root, int wait) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + if (wait) { + inode = ilookup5(s, objectid, btrfs_find_actor, + (void *)&args); + } else { + inode = ilookup5_nowait(s, objectid, btrfs_find_actor, + (void *)&args); + } + return inode; +} + +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + inode = iget5_locked(s, objectid, btrfs_find_actor, + btrfs_init_locked_inode, + (void *)&args); + return inode; +} + +/* Get an inode object given its location and corresponding root. 
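+ * A cached inode is returned as is; a freshly allocated one is filled in
+ * by btrfs_read_locked_inode() before it is unlocked.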
+ * Returns in *is_new if the inode was read from disk + */ +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *is_new) +{ + struct inode *inode; + + inode = btrfs_iget_locked(s, location->objectid, root); + if (!inode) + return ERR_PTR(-EACCES); + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + if (is_new) + *is_new = 1; + } else { + if (is_new) + *is_new = 0; + } + + return inode; +} + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct btrfs_inode *bi = BTRFS_I(dir); + struct btrfs_root *root = bi->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; + int ret, new; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ret = btrfs_inode_by_name(dir, dentry, &location); + + if (ret < 0) + return ERR_PTR(ret); + + inode = NULL; + if (location.objectid) { + ret = fixup_tree_root_location(root, &location, &sub_root, + dentry); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return inode; +} + +static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_splice_alias(inode, dentry); +} + +static unsigned char btrfs_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int btrfs_real_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + int advance; + unsigned char d_type; + int over = 0; + u32 di_cur; + u32 di_total; + u32 di_len; + int key_type = BTRFS_DIR_INDEX_KEY; + char tmp_name[32]; + char *name_ptr; + int name_len; + + /* FIXME, use a real flag for deciding about the key type */ + if (root->fs_info->tree_root == root) + key_type = BTRFS_DIR_ITEM_KEY; + + /* special case for "." 
*/ + if (filp->f_pos == 0) { + over = filldir(dirent, ".", 1, + 1, inode->i_ino, + DT_DIR); + if (over) + return 0; + filp->f_pos = 1; + } + /* special case for .., just use the back ref */ + if (filp->f_pos == 1) { + u64 pino = parent_ino(filp->f_path.dentry); + over = filldir(dirent, "..", 2, + 2, pino, DT_DIR); + if (over) + return 0; + filp->f_pos = 2; + } + path = btrfs_alloc_path(); + path->reada = 2; + + btrfs_set_key_type(&key, key_type); + key.offset = filp->f_pos; + key.objectid = inode->i_ino; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + advance = 0; + + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (advance || slot >= nritems) { + if (slot >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } else { + slot++; + path->slots[0]++; + } + } + + advance = 1; + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != key_type) + break; + if (found_key.offset < filp->f_pos) + continue; + + filp->f_pos = found_key.offset; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + di_cur = 0; + di_total = btrfs_item_size(leaf, item); + + while (di_cur < di_total) { + struct btrfs_key location; + + name_len = btrfs_dir_name_len(leaf, di); + if (name_len <= sizeof(tmp_name)) { + name_ptr = tmp_name; + } else { + name_ptr = kmalloc(name_len, GFP_NOFS); + if (!name_ptr) { + ret = -ENOMEM; + goto err; + } + } + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + + d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + btrfs_dir_item_key_to_cpu(leaf, di, &location); + + /* is this a reference to our own snapshot? If so + * skip it + */ + if (location.type == BTRFS_ROOT_ITEM_KEY && + location.objectid == root->root_key.objectid) { + over = 0; + goto skip; + } + over = filldir(dirent, name_ptr, name_len, + found_key.offset, location.objectid, + d_type); + +skip: + if (name_ptr != tmp_name) + kfree(name_ptr); + + if (over) + goto nopos; + di_len = btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di) + sizeof(*di); + di_cur += di_len; + di = (struct btrfs_dir_item *)((char *)di + di_len); + } + } + + /* Reached end of directory/root. Bump pos past the last item. */ + if (key_type == BTRFS_DIR_INDEX_KEY) + filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); + else + filp->f_pos++; +nopos: + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +int btrfs_write_inode(struct inode *inode, int wait) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (root->fs_info->btree_inode == inode) + return 0; + + if (wait) { + trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_commit_transaction(trans, root); + } + return ret; +} + +/* + * This is somewhat expensive, updating the tree every time the + * inode changes. But, it is most likely to find the inode in cache. + * FIXME, needs more benchmarking...there are no reasons other than performance + * to keep or drop this code. 
+ */ +void btrfs_dirty_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); +} + +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */ +static int btrfs_set_inode_index_count(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + /* + * MAGIC NUMBER EXPLANATION: + * since we search a directory based on f_pos we have to start at 2 + * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody + * else has to start at 2 + */ + if (path->slots[0] == 0) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != inode->i_ino || + btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + BTRFS_I(inode)->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to find a free sequence number in a given directory. This current + * code is very simple, later versions will do smarter things in the btree + */ +int btrfs_set_inode_index(struct inode *dir, u64 *index) +{ + int ret = 0; + + if (BTRFS_I(dir)->index_cnt == (u64)-1) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + + *index = BTRFS_I(dir)->index_cnt; + BTRFS_I(dir)->index_cnt++; + + return ret; +} + +static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + const char *name, int name_len, + u64 ref_objectid, u64 objectid, + u64 alloc_hint, int mode, u64 *index) +{ + struct inode *inode; + struct btrfs_inode_item *inode_item; + struct btrfs_key *location; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_key key[2]; + u32 sizes[2]; + unsigned long ptr; + int ret; + int owner; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + inode = new_inode(root->fs_info->sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (dir) { + ret = btrfs_set_inode_index(dir, index); + if (ret) + return ERR_PTR(ret); + } + /* + * index_cnt is ignored for everything but a dir, + * btrfs_get_inode_index_count has an explanation for the magic + * number + */ + init_btrfs_i(inode); + BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->generation = trans->transid; + + if (mode & S_IFDIR) + owner = 0; + else + owner = 1; + BTRFS_I(inode)->block_group = + btrfs_find_block_group(root, 0, alloc_hint, owner); + if ((mode & S_IFREG)) { + if (btrfs_test_opt(root, NODATASUM)) + btrfs_set_flag(inode, NODATASUM); + if (btrfs_test_opt(root, NODATACOW)) + btrfs_set_flag(inode, NODATACOW); + } + + key[0].objectid = objectid; + btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); + key[0].offset = 0; + + key[1].objectid = objectid; + 
btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[0] = sizeof(struct btrfs_inode_item); + sizes[1] = name_len + sizeof(*ref); + + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + if (ret != 0) + goto fail; + + if (objectid > root->highest_inode) + root->highest_inode = objectid; + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_mode = mode; + inode->i_ino = objectid; + inode_set_bytes(inode, 0); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode); + + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + insert_inode_hash(inode); + return inode; +fail: + if (dir) + BTRFS_I(dir)->index_cnt--; + btrfs_free_path(path); + return ERR_PTR(ret); +} + +static inline u8 btrfs_inode_type(struct inode *inode) +{ + return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; +} + +/* + * utility function to add 'inode' into 'parent_inode' with + * a give name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */ +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) +{ + int ret; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode->i_ino, + &key, btrfs_inode_type(inode), + index); + if (ret == 0) { + if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, + name, name_len, + inode->i_ino, + parent_inode->i_ino, + index); + } + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + } + return ret; +} + +static int btrfs_add_nondir(struct btrfs_trans_handle *trans, + struct dentry *dentry, struct inode *inode, + int backref, u64 index) +{ + int err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, backref, index); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + if (err > 0) + err = -EEXIST; + return err; +} + +static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + unsigned long nr = 0; + u64 index = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + 
err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, mode, &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + btrfs_update_inode(trans, root, inode); + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + unsigned long nr = 0; + u64 objectid; + u64 index = 0; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, + objectid, BTRFS_I(dir)->block_group, mode, + &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = old_dentry->d_inode; + u64 index; + unsigned long nr = 0; + int err; + int drop_inode = 0; + + if (inode->i_nlink == 0) + return -ENOENT; + + btrfs_inc_nlink(inode); + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, dir); + atomic_inc(&inode->i_count); + + err = btrfs_add_nondir(trans, dentry, inode, 1, index); + + if (err) + drop_inode = 1; + + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, dir); + err = btrfs_update_inode(trans, root, inode); + + if (err) + drop_inode = 1; + + nr = 
trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + int err = 0; + int drop_on_err = 0; + u64 objectid = 0; + u64 index = 0; + unsigned long nr = 1; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto out_unlock; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_unlock; + } + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, S_IFDIR | mode, + &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_fail; + } + + drop_on_err = 1; + + err = btrfs_init_acl(inode, dir); + if (err) + goto out_fail; + + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + btrfs_set_trans_block_group(trans, inode); + + btrfs_i_size_write(inode, 0); + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_fail; + + err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, 0, index); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); + drop_on_err = 0; + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + +out_fail: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + +out_unlock: + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); + return err; +} + +/* helper for btfs_get_extent. Given an existing extent in the tree, + * and an extent that you want to insert, deal with overlap and insert + * the new extent into the tree. 
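+ * The new extent is clipped to [map_start, map_start + map_len) before it is inserted.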
+ */ +static int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start, u64 map_len) +{ + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + start_diff = map_start - em->start; + em->start = map_start; + em->len = map_len; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len -= start_diff; + } + return add_extent_mapping(em_tree, em); +} + +static noinline int uncompress_inline(struct btrfs_path *path, + struct inode *inode, struct page *page, + size_t pg_offset, u64 extent_offset, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + + WARN_ON(pg_offset != 0); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = btrfs_file_extent_inline_item_len(leaf, + btrfs_item_nr(leaf, path->slots[0])); + tmp = kmalloc(inline_size, GFP_NOFS); + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + ret = btrfs_zlib_decompress(tmp, page, extent_offset, + inline_size, max_size); + if (ret) { + char *kaddr = kmap_atomic(page, KM_USER0); + unsigned long copy_size = min_t(u64, + PAGE_CACHE_SIZE - pg_offset, + max_size - extent_offset); + memset(kaddr + pg_offset, 0, copy_size); + kunmap_atomic(kaddr, KM_USER0); + } + kfree(tmp); + return 0; +} + +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the in-ram + * representation. This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. 
+ */ + +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + int ret; + int err = 0; + u64 bytenr; + u64 extent_start = 0; + u64 extent_end = 0; + u64 objectid = inode->i_ino; + u32 found_type; + struct btrfs_path *path = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *item; + struct extent_buffer *leaf; + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_trans_handle *trans = NULL; + int compressed; + +again: + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) + free_extent_map(em); + else if (em->block_start == EXTENT_MAP_INLINE && page) + free_extent_map(em); + else + goto out; + } + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->start = EXTENT_MAP_HOLE; + em->orig_start = EXTENT_MAP_HOLE; + em->len = (u64)-1; + em->block_len = (u64)-1; + + if (!path) { + path = btrfs_alloc_path(); + BUG_ON(!path); + } + + ret = btrfs_lookup_file_extent(trans, root, path, + objectid, start, trans != NULL); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret != 0) { + if (path->slots[0] == 0) + goto not_found; + path->slots[0]--; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + /* are we inside the extent that was found? */ + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + if (found_key.objectid != objectid || + found_type != BTRFS_EXTENT_DATA_KEY) { + goto not_found; + } + + found_type = btrfs_file_extent_type(leaf, item); + extent_start = found_key.offset; + compressed = btrfs_file_extent_compression(leaf, item); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, item); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, item); + extent_end = (extent_start + size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + + if (start >= extent_end) { + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + goto not_found; + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != objectid || + found_key.type != BTRFS_EXTENT_DATA_KEY) + goto not_found; + if (start + len <= found_key.offset) + goto not_found; + em->start = start; + em->len = found_key.offset - start; + goto not_found_em; + } + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, item); + bytenr = btrfs_file_extent_disk_bytenr(leaf, item); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + goto insert; + } + if (compressed) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->block_start = bytenr; + em->block_len = 
btrfs_file_extent_disk_num_bytes(leaf, + item); + } else { + bytenr += btrfs_file_extent_offset(leaf, item); + em->block_start = bytenr; + em->block_len = em->len; + if (found_type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + goto insert; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + unsigned long ptr; + char *map; + size_t size; + size_t extent_offset; + size_t copy_size; + + em->block_start = EXTENT_MAP_INLINE; + if (!page || create) { + em->start = extent_start; + em->len = extent_end - extent_start; + goto out; + } + + size = btrfs_file_extent_inline_len(leaf, item); + extent_offset = page_offset(page) + pg_offset - extent_start; + copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, + size - extent_offset); + em->start = extent_start + extent_offset; + em->len = (copy_size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + em->orig_start = EXTENT_MAP_INLINE; + if (compressed) + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + ptr = btrfs_file_extent_inline_start(item) + extent_offset; + if (create == 0 && !PageUptodate(page)) { + if (btrfs_file_extent_compression(leaf, item) == + BTRFS_COMPRESS_ZLIB) { + ret = uncompress_inline(path, inode, page, + pg_offset, + extent_offset, item); + BUG_ON(ret); + } else { + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + } + flush_dcache_page(page); + } else if (create && PageUptodate(page)) { + if (!trans) { + kunmap(page); + free_extent_map(em); + em = NULL; + btrfs_release_path(root, path); + trans = btrfs_join_transaction(root, 1); + goto again; + } + map = kmap(page); + write_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + btrfs_mark_buffer_dirty(leaf); + } + set_extent_uptodate(io_tree, em->start, + extent_map_end(em) - 1, GFP_NOFS); + goto insert; + } else { + printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); + WARN_ON(1); + } +not_found: + em->start = start; + em->len = len; +not_found_em: + em->block_start = EXTENT_MAP_HOLE; + set_bit(EXTENT_FLAG_VACANCY, &em->flags); +insert: + btrfs_release_path(root, path); + if (em->start > start || extent_map_end(em) <= start) { + printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " + "[%llu %llu]\n", (unsigned long long)em->start, + (unsigned long long)em->len, + (unsigned long long)start, + (unsigned long long)len); + err = -EIO; + goto out; + } + + err = 0; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. 
It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = lookup_extent_mapping(em_tree, start, len); + if (existing && (existing->start > start || + existing->start + existing->len <= start)) { + free_extent_map(existing); + existing = NULL; + } + if (!existing) { + existing = lookup_extent_mapping(em_tree, em->start, + em->len); + if (existing) { + err = merge_extent_mapping(em_tree, existing, + em, start, + root->sectorsize); + free_extent_map(existing); + if (err) { + free_extent_map(em); + em = NULL; + } + } else { + err = -EIO; + free_extent_map(em); + em = NULL; + } + } else { + free_extent_map(em); + em = existing; + err = 0; + } + } + spin_unlock(&em_tree->lock); +out: + if (path) + btrfs_free_path(path); + if (trans) { + ret = btrfs_end_transaction(trans, root); + if (!err) + err = ret; + } + if (err) { + free_extent_map(em); + WARN_ON(1); + return ERR_PTR(err); + } + return em; +} + +static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + return -EINVAL; +} + +static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) +{ + return extent_bmap(mapping, iblock, btrfs_get_extent); +} + +int btrfs_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btrfs_get_extent); +} + +static int btrfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_write_full_page(tree, page, btrfs_get_extent, wbc); +} + +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_writepages(tree, mapping, btrfs_get_extent, wbc); +} + +static int +btrfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_readpages(tree, mapping, pages, nr_pages, + btrfs_get_extent); +} +static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(map, tree, page, gfp_flags); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + return ret; +} + +static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + if (PageWriteback(page) || PageDirty(page)) + return 0; + return __btrfs_releasepage(page, gfp_flags); +} + +static void btrfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + struct btrfs_ordered_extent *ordered; + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + wait_on_page_writeback(page); + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } + + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); + if (ordered) { + /* + * IO on 
this page will never be started, so we need + * to account for any ordered extents now + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED, 1, 0, GFP_NOFS); + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + btrfs_put_ordered_extent(ordered); + lock_extent(tree, page_start, page_end, GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_ORDERED, + 1, 1, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * vmtruncate() writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct inode *inode = fdentry(vma->vm_file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + char *kaddr; + unsigned long zero_start; + loff_t size; + int ret; + u64 page_start; + u64 page_end; + + ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); + if (ret) + goto out; + + ret = -EINVAL; +again: + lock_page(page); + size = i_size_read(inode); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if ((page->mapping != inode->i_mapping) || + (page_start >= size)) { + /* page got truncated out from underneath us */ + goto out_unlock; + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); + + /* + * we can't set the delalloc bits if there are pending ordered + * extents. 
Drop our locks and wait for them to finish + */ + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = 0; + + /* page is wholly or partially inside EOF */ + if (page_start + PAGE_CACHE_SIZE > size) + zero_start = size & ~PAGE_CACHE_MASK; + else + zero_start = PAGE_CACHE_SIZE; + + if (zero_start != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +out_unlock: + unlock_page(page); +out: + return ret; +} + +static void btrfs_truncate(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 mask = root->sectorsize - 1; + + if (!S_ISREG(inode->i_mode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + btrfs_truncate_page(inode->i_mapping, inode->i_size); + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + btrfs_i_size_write(inode, inode->i_size); + + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; + /* FIXME, add redo link to tree so we don't leak on crash */ + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, + BTRFS_EXTENT_DATA_KEY); + btrfs_update_inode(trans, root, inode); + + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + +out: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); +} + +/* + * create a new subvolume directory/inode (helper for the ioctl). + */ +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, struct dentry *dentry, + u64 new_dirid, u64 alloc_hint) +{ + struct inode *inode; + int error; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, + new_dirid, alloc_hint, S_IFDIR | 0700, &index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + + error = btrfs_update_inode(trans, new_root, inode); + if (error) + return error; + + d_instantiate(dentry, inode); + return 0; +} + +/* helper function for file defrag and space balancing. 
This + * forces readahead on a given range of bytes in an inode + */ +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index) +{ + pgoff_t req_size = last_index - offset + 1; + + page_cache_sync_readahead(mapping, ra, file, offset, req_size); + return offset + req_size; +} + +struct inode *btrfs_alloc_inode(struct super_block *sb) +{ + struct btrfs_inode *ei; + + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->last_trans = 0; + ei->logged_trans = 0; + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + ei->i_acl = BTRFS_ACL_NOT_CACHED; + ei->i_default_acl = BTRFS_ACL_NOT_CACHED; + INIT_LIST_HEAD(&ei->i_orphan); + return &ei->vfs_inode; +} + +void btrfs_destroy_inode(struct inode *inode) +{ + struct btrfs_ordered_extent *ordered; + WARN_ON(!list_empty(&inode->i_dentry)); + WARN_ON(inode->i_data.nrpages); + + if (BTRFS_I(inode)->i_acl && + BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(BTRFS_I(inode)->i_acl); + if (BTRFS_I(inode)->i_default_acl && + BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(BTRFS_I(inode)->i_default_acl); + + spin_lock(&BTRFS_I(inode)->root->list_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" + " list\n", inode->i_ino); + dump_stack(); + } + spin_unlock(&BTRFS_I(inode)->root->list_lock); + + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + if (!ordered) + break; + else { + printk(KERN_ERR "btrfs found ordered " + "extent %llu %llu on inode cleanup\n", + (unsigned long long)ordered->file_offset, + (unsigned long long)ordered->len); + btrfs_remove_ordered_extent(inode, ordered); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + +static void init_once(void *foo) +{ + struct btrfs_inode *ei = (struct btrfs_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void btrfs_destroy_cachep(void) +{ + if (btrfs_inode_cachep) + kmem_cache_destroy(btrfs_inode_cachep); + if (btrfs_trans_handle_cachep) + kmem_cache_destroy(btrfs_trans_handle_cachep); + if (btrfs_transaction_cachep) + kmem_cache_destroy(btrfs_transaction_cachep); + if (btrfs_bit_radix_cachep) + kmem_cache_destroy(btrfs_bit_radix_cachep); + if (btrfs_path_cachep) + kmem_cache_destroy(btrfs_path_cachep); +} + +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *)) +{ + return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD | extra_flags), ctor); +} + +int btrfs_init_cachep(void) +{ + btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), + 0, init_once); + if (!btrfs_inode_cachep) + goto fail; + btrfs_trans_handle_cachep = + btrfs_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), + 0, NULL); + if (!btrfs_trans_handle_cachep) + goto fail; + btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), + 0, NULL); + if (!btrfs_transaction_cachep) + goto fail; + btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), + 0, NULL); + if (!btrfs_path_cachep) + goto fail; + btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, + SLAB_DESTROY_BY_RCU, NULL); + if 
(!btrfs_bit_radix_cachep) + goto fail; + return 0; +fail: + btrfs_destroy_cachep(); + return -ENOMEM; +} + +static int btrfs_getattr(struct vfsmount *mnt, + struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; + stat->blksize = PAGE_CACHE_SIZE; + stat->blocks = (inode_get_bytes(inode) + + BTRFS_I(inode)->delalloc_bytes) >> 9; + return 0; +} + +static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; + int ret; + + /* we're not allowed to rename between subvolumes */ + if (BTRFS_I(old_inode)->root->root_key.objectid != + BTRFS_I(new_dir)->root->root_key.objectid) + return -EXDEV; + + if (S_ISDIR(old_inode->i_mode) && new_inode && + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { + return -ENOTEMPTY; + } + + /* to rename a snapshot or subvolume, we need to juggle the + * backrefs. This isn't coded yet + */ + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return -EXDEV; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto out_unlock; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, new_dir); + + btrfs_inc_nlink(old_dentry->d_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + + ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (ret) + goto out_fail; + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; + ret = btrfs_unlink_inode(trans, root, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (ret) + goto out_fail; + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); + if (ret) + goto out_fail; + } + + } + ret = btrfs_set_inode_index(new_dir, &index); + if (ret) + goto out_fail; + + ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, + old_inode, new_dentry->d_name.name, + new_dentry->d_name.len, 1, index); + if (ret) + goto out_fail; + +out_fail: + btrfs_end_transaction_throttle(trans, root); +out_unlock: + return ret; +} + +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. 
+ */ +int btrfs_start_delalloc_inodes(struct btrfs_root *root) +{ + struct list_head *head = &root->fs_info->delalloc_inodes; + struct btrfs_inode *binode; + struct inode *inode; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(head)) { + binode = list_entry(head->next, struct btrfs_inode, + delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (!inode) + list_del_init(&binode->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + if (inode) { + filemap_flush(inode->i_mapping); + iput(inode); + } + cond_resched(); + spin_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + return 0; +} + +static int btrfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + u64 index = 0 ; + int name_len; + int datasize; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + struct extent_buffer *leaf; + unsigned long nr = 0; + + name_len = strlen(symname) + 1; + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto out_fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, + &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + if (drop_inode) + goto out_unlock; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = inode->i_ino; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(name_len); + err = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (err) { + drop_inode = 1; + goto out_unlock; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, 
trans->transid); + btrfs_set_file_extent_type(leaf, ei, + BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + + ptr = btrfs_file_extent_inline_start(ei); + write_extent_buffer(leaf, symname, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode_set_bytes(inode, name_len); + btrfs_i_size_write(inode, name_len - 1); + err = btrfs_update_inode(trans, root, inode); + if (err) + drop_inode = 1; + +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +out_fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int prealloc_file_range(struct inode *inode, u64 start, u64 end, + u64 alloc_hint, int mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; + u64 alloc_size; + u64 cur_offset = start; + u64 num_bytes = end - start; + int ret = 0; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + if (ret) { + WARN_ON(1); + goto out; + } + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, + ins.offset, 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); + num_bytes -= ins.offset; + cur_offset += ins.offset; + alloc_hint = ins.objectid + ins.offset; + } +out: + if (cur_offset > start) { + inode->i_ctime = CURRENT_TIME; + btrfs_set_flag(inode, PREALLOC); + if (!(mode & FALLOC_FL_KEEP_SIZE) && + cur_offset > i_size_read(inode)) + btrfs_i_size_write(inode, cur_offset); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + + btrfs_end_transaction(trans, root); + return ret; +} + +static long btrfs_fallocate(struct inode *inode, int mode, + loff_t offset, loff_t len) +{ + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + mutex_lock(&inode->i_mutex); + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, + alloc_end - 1, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + alloc_start, alloc_end - 1, GFP_NOFS); + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); 
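+	/* round last_byte up to a sector boundary so holes are preallocated in whole sectors */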
+ last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE) { + ret = prealloc_file_range(inode, cur_offset, + last_byte, alloc_hint, mode); + if (ret < 0) { + free_extent_map(em); + break; + } + } + if (em->block_start <= EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + GFP_NOFS); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int btrfs_set_page_dirty(struct page *page) +{ + return __set_page_dirty_nobuffers(page); +} + +static int btrfs_permission(struct inode *inode, int mask) +{ + if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) + return -EACCES; + return generic_permission(inode, mask, btrfs_check_acl); +} + +static struct inode_operations btrfs_dir_inode_operations = { + .getattr = btrfs_getattr, + .lookup = btrfs_lookup, + .create = btrfs_create, + .unlink = btrfs_unlink, + .link = btrfs_link, + .mkdir = btrfs_mkdir, + .rmdir = btrfs_rmdir, + .rename = btrfs_rename, + .symlink = btrfs_symlink, + .setattr = btrfs_setattr, + .mknod = btrfs_mknod, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, +}; +static struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, +}; +static struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = btrfs_real_readdir, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif + .release = btrfs_release_file, + .fsync = btrfs_sync_file, +}; + +static struct extent_io_ops btrfs_extent_io_ops = { + .fill_delalloc = run_delalloc_range, + .submit_bio_hook = btrfs_submit_bio_hook, + .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .writepage_end_io_hook = btrfs_writepage_end_io_hook, + .writepage_start_hook = btrfs_writepage_start_hook, + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, +}; + +static struct address_space_operations btrfs_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .writepages = btrfs_writepages, + .readpages = btrfs_readpages, + .sync_page = block_sync_page, + .bmap = btrfs_bmap, + .direct_IO = btrfs_direct_IO, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, + .set_page_dirty = btrfs_set_page_dirty, +}; + +static struct address_space_operations btrfs_symlink_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, +}; + +static struct inode_operations btrfs_file_inode_operations = { + .truncate = btrfs_truncate, + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, + .fallocate = btrfs_fallocate, +}; +static struct inode_operations btrfs_special_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, +}; +static 
struct inode_operations btrfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .permission = btrfs_permission, +}; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 00000000000..c2aa33e3feb --- /dev/null +++ b/fs/btrfs/ioctl.c @@ -0,0 +1,1132 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/bit_spinlock.h> +#include <linux/security.h> +#include <linux/version.h> +#include <linux/xattr.h> +#include <linux/vmalloc.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "locking.h" + + + +static noinline int create_subvol(struct btrfs_root *root, + struct dentry *dentry, + char *name, int namelen) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + struct inode *dir; + int ret; + int err; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; + unsigned long nr = 1; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_commit; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, + 0, &objectid); + if (ret) + goto fail; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + objectid, trans->transid, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, objectid); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, 
leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_used(&root_item, 0); + btrfs_set_root_last_snapshot(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, new_dirid); + + key.objectid = objectid; + key.offset = 1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + &root_item); + if (ret) + goto fail; + + /* + * insert the directory item + */ + key.offset = (u64)-1; + dir = dentry->d_parent->d_inode; + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + + ret = btrfs_insert_dir_item(trans, root, + name, namelen, dir->i_ino, &key, + BTRFS_FT_DIR, index); + if (ret) + goto fail; + + btrfs_i_size_write(dir, dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + /* add the backref first */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + objectid, BTRFS_ROOT_BACKREF_KEY, + root->root_key.objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + + /* now add the forward ref */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + root->root_key.objectid, BTRFS_ROOT_REF_KEY, + objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto fail_commit; + + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(!new_root); + + trans = btrfs_start_transaction(new_root, 1); + BUG_ON(!trans); + + ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, + BTRFS_I(dir)->block_group); + if (ret) + goto fail; + +fail: + nr = trans->blocks_used; + err = btrfs_commit_transaction(trans, new_root); + if (err && !ret) + ret = err; +fail_commit: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen) +{ + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; + int ret = 0; + int err; + unsigned long nr = 0; + + if (!root->ref_cows) + return -EINVAL; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_unlock; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; + goto fail_unlock; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); + goto fail_unlock; + } + memcpy(pending_snapshot->name, name, namelen); + pending_snapshot->name[namelen] = '\0'; + pending_snapshot->dentry = dentry; + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + pending_snapshot->root = root; + list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); + err = btrfs_commit_transaction(trans, root); + +fail_unlock: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +/* copy of may_create in fs/namei.c() */ +static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* + * Create a new subvolume below @parent. 
This is largely modeled after + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. + */ +static noinline int btrfs_mksubvol(struct path *parent, char *name, + int mode, int namelen, + struct btrfs_root *snap_src) +{ + struct dentry *dentry; + int error; + + mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = -EEXIST; + if (dentry->d_inode) + goto out_dput; + + if (!IS_POSIXACL(parent->dentry->d_inode)) + mode &= ~current->fs->umask; + + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + + error = btrfs_may_create(parent->dentry->d_inode, dentry); + if (error) + goto out_drop_write; + + /* + * Actually perform the low-level subvolume creation after all + * this VFS fuzz. + * + * Eventually we want to pass in an inode under which we create this + * subvolume, but for now all are under the filesystem root. + * + * Also we should pass on the mode eventually to allow creating new + * subvolume with specific mode bits. + */ + if (snap_src) { + struct dentry *dir = dentry->d_parent; + struct dentry *test = dir->d_parent; + struct btrfs_path *path = btrfs_alloc_path(); + int ret; + u64 test_oid; + u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; + + test_oid = snap_src->root_key.objectid; + + ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, + path, parent_oid, test_oid); + if (ret == 0) + goto create; + btrfs_release_path(snap_src->fs_info->tree_root, path); + + /* we need to make sure we aren't creating a directory loop + * by taking a snapshot of something that has our current + * subvol in its directory tree. So, this loops through + * the dentries and checks the forward refs for each subvolume + * to see if is references the subvolume where we are + * placing this new snapshot. 
+ */ + while (1) { + if (!test || + dir == snap_src->fs_info->sb->s_root || + test == snap_src->fs_info->sb->s_root || + test->d_inode->i_sb != snap_src->fs_info->sb) { + break; + } + if (S_ISLNK(test->d_inode->i_mode)) { + printk(KERN_INFO "Btrfs symlink in snapshot " + "path, failed\n"); + error = -EMLINK; + btrfs_free_path(path); + goto out_drop_write; + } + test_oid = + BTRFS_I(test->d_inode)->root->root_key.objectid; + ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, + path, test_oid, parent_oid); + if (ret == 0) { + printk(KERN_INFO "Btrfs snapshot creation " + "failed, looping\n"); + error = -EMLINK; + btrfs_free_path(path); + goto out_drop_write; + } + btrfs_release_path(snap_src->fs_info->tree_root, path); + test = test->d_parent; + } +create: + btrfs_free_path(path); + error = create_snapshot(snap_src, dentry, name, namelen); + } else { + error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, + dentry, name, namelen); + } + if (error) + goto out_drop_write; + + fsnotify_mkdir(parent->dentry->d_inode, dentry); +out_drop_write: + mnt_drop_write(parent->mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&parent->dentry->d_inode->i_mutex); + return error; +} + + +static int btrfs_defrag_file(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct page *page; + unsigned long last_index; + unsigned long ra_pages = root->fs_info->bdi.ra_pages; + unsigned long total_read = 0; + u64 page_start; + u64 page_end; + unsigned long i; + int ret; + + ret = btrfs_check_free_space(root, inode->i_size, 0); + if (ret) + return -ENOSPC; + + mutex_lock(&inode->i_mutex); + last_index = inode->i_size >> PAGE_CACHE_SHIFT; + for (i = 0; i <= last_index; i++) { + if (total_read % ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, + min(last_index, i + ra_pages - 1)); + } + total_read++; +again: + page = grab_cache_page(inode->i_mapping, i); + if (!page) + goto out_unlock; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + goto out_unlock; + } + } + + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + /* + * this makes sure page_mkwrite is called on the + * page if it is dirtied again later + */ + clear_page_dirty_for_io(page); + + btrfs_set_extent_delalloc(inode, page_start, page_end); + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); + } + +out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +/* + * Called inside transaction, so use GFP_NOFS + */ + +static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) +{ + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = 
NULL; + char *sizestr; + char *devstr = NULL; + int ret = 0; + int namelen; + int mod = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + + mutex_lock(&root->fs_info->volume_mutex); + sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + char *end; + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + devid = simple_strtoull(devstr, &end, 10); + printk(KERN_INFO "resizing devid %llu\n", devid); + } + device = btrfs_find_device(root, devid, NULL, NULL); + if (!device) { + printk(KERN_INFO "resizer unable to find device %llu\n", devid); + ret = -EINVAL; + goto out_unlock; + } + if (!strcmp(sizestr, "max")) + new_size = device->bdev->bd_inode->i_size; + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = btrfs_parse_size(sizestr); + if (new_size == 0) { + ret = -EINVAL; + goto out_unlock; + } + } + + old_size = device->total_bytes; + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_unlock; + } + new_size = old_size - new_size; + } else if (mod > 0) { + new_size = old_size + new_size; + } + + if (new_size < 256 * 1024 * 1024) { + ret = -EINVAL; + goto out_unlock; + } + if (new_size > device->bdev->bd_inode->i_size) { + ret = -EFBIG; + goto out_unlock; + } + + do_div(new_size, root->sectorsize); + new_size *= root->sectorsize; + + printk(KERN_INFO "new size for %s is %llu\n", + device->name, (unsigned long long)new_size); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_shrink_device(device, new_size); + } + +out_unlock: + mutex_unlock(&root->fs_info->volume_mutex); +out: + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct file *src_file; + u64 root_dirid; + int namelen; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/')) { + ret = -EINVAL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, + di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, + path, root_dirid, + vol_args->name, namelen, 0); + btrfs_free_path(path); + + if (di && !IS_ERR(di)) { + ret = -EEXIST; + goto out; + } + + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + if (subvol) { + ret = btrfs_mksubvol(&file->f_path, vol_args->name, + file->f_path.dentry->d_inode->i_mode, + namelen, NULL); + } else { + struct inode *src_inode; + src_file = fget(vol_args->fd); + if (!src_file) { + ret = -EINVAL; + goto out; + } + + 
src_inode = src_file->f_path.dentry->d_inode; + if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { + printk(KERN_INFO "btrfs: Snapshot src from " + "another FS\n"); + ret = -EINVAL; + fput(src_file); + goto out; + } + ret = btrfs_mksubvol(&file->f_path, vol_args->name, + file->f_path.dentry->d_inode->i_mode, + namelen, BTRFS_I(src_inode)->root); + fput(src_file); + } + +out: + kfree(vol_args); + return ret; +} + +static int btrfs_ioctl_defrag(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + btrfs_defrag_root(root, 0); + btrfs_defrag_root(root->fs_info->extent_root, 0); + break; + case S_IFREG: + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EINVAL; + goto out; + } + btrfs_defrag_file(file); + break; + } +out: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct extent_buffer *leaf; + char *buf; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + u64 hint_byte; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * - allow ranges within the same file to be cloned (provided + * they don't overlap)? 
+ */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE)) + return -EINVAL; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + src_file = fget(srcfd); + if (!src_file) { + ret = -EBADF; + goto out_drop_write; + } + src = src_file->f_dentry->d_inode; + + ret = -EINVAL; + if (src == inode) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) + goto out_fput; + + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + goto out_fput; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + goto out_fput; + } + path->reada = 2; + + if (inode < src) { + mutex_lock(&inode->i_mutex); + mutex_lock(&src->i_mutex); + } else { + mutex_lock(&src->i_mutex); + mutex_lock(&inode->i_mutex); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off >= src->i_size || off + len > src->i_size) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ((src->i_size + bs-1) & ~(bs-1)) + - off; + + /* verify the end result is block aligned */ + if ((off & (bs-1)) || + ((off + len) & (bs-1))) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, off+len); + if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) + break; + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(src, off, off+len); + } + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + /* punch hole in destination first */ + btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + + /* clone data */ + key.objectid = src->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + while (1) { + /* + * note the key will change type as we walk through the + * tree. 
+ */ + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != src->i_ino) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG) { + disko = btrfs_file_extent_disk_bytenr(leaf, + extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, + extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, + extent); + } + btrfs_release_path(root, path); + + if (key.offset + datal < off || + key.offset >= off+len) + goto next; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = inode->i_ino; + new_key.offset = key.offset + destoff - off; + + if (type == BTRFS_FILE_EXTENT_REG) { + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + if (key.offset + datao + datal + key.offset > + off + len) + datal = off + len - key.offset - datao; + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; + + btrfs_set_file_extent_offset(leaf, extent, + datao); + btrfs_set_file_extent_num_bytes(leaf, extent, + datal); + if (disko) { + inode_add_bytes(inode, datal); + ret = btrfs_inc_extent_ref(trans, root, + disko, diskl, leaf->start, + root->root_key.objectid, + trans->transid, + inode->i_ino); + BUG_ON(ret); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } + + if (key.offset + datal > off+len) + trim = key.offset + datal - (off+len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + goto out; + } + size -= skip + trim; + datal -= skip + trim; + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + if (skip) { + u32 start = + btrfs_file_extent_calc_inline_size(0); + memmove(buf+start, buf+start+skip, + datal); + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + inode_add_bytes(inode, datal); + } + + btrfs_mark_buffer_dirty(leaf); + } + +next: + btrfs_release_path(root, path); + key.offset++; + } + ret = 0; +out: + btrfs_release_path(root, path); + if (ret == 0) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (destoff 
+ olen > inode->i_size) + btrfs_i_size_write(inode, destoff + olen); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, root, inode); + } + btrfs_end_transaction(trans, root); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ret) + vmtruncate(inode, 0); +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + vfree(buf); + btrfs_free_path(path); +out_fput: + fput(src_file); +out_drop_write: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_clone_range_args args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +static long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (file->private_data) { + ret = -EINPROGRESS; + goto out; + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + + trans = btrfs_start_ioctl_transaction(root, 0); + if (trans) + file->private_data = trans; + else + ret = -ENOMEM; + /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ +out: + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. 
+ */ +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + trans = file->private_data; + if (!trans) { + ret = -EINVAL; + goto out; + } + btrfs_end_transaction(trans, root); + file->private_data = NULL; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + + mnt_drop_write(file->f_path.mnt); + +out: + return ret; +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(root, argp); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, argp); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(root, argp); + case BTRFS_IOC_BALANCE: + return btrfs_balance(root->fs_info->dev_root); + case BTRFS_IOC_CLONE: + return btrfs_ioctl_clone(file, arg, 0, 0, 0); + case BTRFS_IOC_CLONE_RANGE: + return btrfs_ioctl_clone_range(file, argp); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + } + + return -ENOTTY; +} diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 00000000000..78049ea208d --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __IOCTL_ +#define __IOCTL_ +#include <linux/ioctl.h> + +#define BTRFS_IOCTL_MAGIC 0x94 +#define BTRFS_VOL_NAME_MAX 255 +#define BTRFS_PATH_NAME_MAX 3072 + +struct btrfs_ioctl_vol_args { + __s64 fd; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + +#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ + struct btrfs_ioctl_vol_args) +/* trans start and trans end are dangerous, and only for + * use by applications that know how to avoid the + * resulting deadlocks + */ +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) + +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) +#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ + struct btrfs_ioctl_vol_args) +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + +#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_clone_range_args) + +#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) + +#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c new file mode 100644 index 00000000000..39bae7761db --- /dev/null +++ b/fs/btrfs/locking.c @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> +#include <linux/page-flags.h> +#include <asm/bug.h> +#include "ctree.h" +#include "extent_io.h" +#include "locking.h" + +/* + * locks the per buffer mutex in an extent buffer. This uses adaptive locks + * and the spin is not tuned very extensively. The spinning does make a big + * difference in almost every workload, but spinning for the right amount of + * time needs some help. + * + * In general, we want to spin as long as the lock holder is doing btree + * searches, and we should give up if they are in more expensive code. 
+ */ + +int btrfs_tree_lock(struct extent_buffer *eb) +{ + int i; + + if (mutex_trylock(&eb->mutex)) + return 0; + for (i = 0; i < 512; i++) { + cpu_relax(); + if (mutex_trylock(&eb->mutex)) + return 0; + } + cpu_relax(); + mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); + return 0; +} + +int btrfs_try_tree_lock(struct extent_buffer *eb) +{ + return mutex_trylock(&eb->mutex); +} + +int btrfs_tree_unlock(struct extent_buffer *eb) +{ + mutex_unlock(&eb->mutex); + return 0; +} + +int btrfs_tree_locked(struct extent_buffer *eb) +{ + return mutex_is_locked(&eb->mutex); +} + +/* + * btrfs_search_slot uses this to decide if it should drop its locks + * before doing something expensive like allocating free blocks for cow. + */ +int btrfs_path_lock_waiting(struct btrfs_path *path, int level) +{ + int i; + struct extent_buffer *eb; + for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { + eb = path->nodes[i]; + if (!eb) + break; + smp_mb(); + if (!list_empty(&eb->mutex.wait_list)) + return 1; + } + return 0; +} + diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h new file mode 100644 index 00000000000..bc1faef1251 --- /dev/null +++ b/fs/btrfs/locking.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_LOCKING_ +#define __BTRFS_LOCKING_ + +int btrfs_tree_lock(struct extent_buffer *eb); +int btrfs_tree_unlock(struct extent_buffer *eb); +int btrfs_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_lock(struct extent_buffer *eb); +int btrfs_path_lock_waiting(struct btrfs_path *path, int level); +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c new file mode 100644 index 00000000000..a2094017027 --- /dev/null +++ b/fs/btrfs/ordered-data.c @@ -0,0 +1,730 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "ctree.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "extent_io.h" + +static u64 entry_end(struct btrfs_ordered_extent *entry) +{ + if (entry->file_offset + entry->len < entry->file_offset) + return (u64)-1; + return entry->file_offset + entry->len; +} + +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ +static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_ordered_extent *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) + p = &(*p)->rb_left; + else if (file_offset >= entry_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, + struct rb_node **prev_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *test; + struct btrfs_ordered_extent *entry; + struct btrfs_ordered_extent *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); + prev = n; + prev_entry = entry; + + if (file_offset < entry->file_offset) + n = n->rb_left; + else if (file_offset >= entry_end(entry)) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + + while (prev && file_offset >= entry_end(prev_entry)) { + test = rb_next(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + if (file_offset < entry_end(prev_entry)) + break; + + prev = test; + } + if (prev) + prev_entry = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + while (prev && file_offset < entry_end(prev_entry)) { + test = rb_prev(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + prev = test; + } + *prev_ret = prev; + return NULL; +} + +/* + * helper to check if a given offset is inside a given entry + */ +static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) +{ + if (file_offset < entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ +static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + u64 file_offset) +{ + struct rb_root *root = &tree->tree; + struct rb_node *prev; + struct rb_node *ret; + struct btrfs_ordered_extent *entry; + + if (tree->last) { + entry = rb_entry(tree->last, struct btrfs_ordered_extent, + rb_node); + if (offset_in_entry(entry, file_offset)) + return tree->last; + } + ret = __tree_search(root, file_offset, &prev); + if (!ret) + ret = prev; + if (ret) + tree->last = ret; + return ret; +} + +/* allocate and add a new ordered_extent into the per-inode tree. 
+ * file_offset is the logical offset in the file + * + * start is the disk block number of an extent already reserved in the + * extent allocation tree + * + * len is the length of the extent + * + * This also sets the EXTENT_ORDERED bit on the range in the inode. + * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + + tree = &BTRFS_I(inode)->ordered_tree; + entry = kzalloc(sizeof(*entry), GFP_NOFS); + if (!entry) + return -ENOMEM; + + mutex_lock(&tree->mutex); + entry->file_offset = file_offset; + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; + entry->inode = inode; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); + + /* one ref for the tree */ + atomic_set(&entry->refs, 1); + init_waitqueue_head(&entry->wait); + INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->root_extent_list); + + node = tree_insert(&tree->tree, file_offset, + &entry->rb_node); + BUG_ON(node); + + set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, + entry_end(entry) - 1, GFP_NOFS); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + mutex_unlock(&tree->mutex); + BUG_ON(node); + return 0; +} + +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. If the list covers more than one + * ordered extent, it is split across multiples. + */ +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) +{ + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + list_add_tail(&sum->list, &entry->list); + mutex_unlock(&tree->mutex); + return 0; +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO should not span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + */ +int btrfs_dec_test_ordered_pending(struct inode *inode, + u64 file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, + GFP_NOFS); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) { + ret = 1; + goto out; + } + + ret = test_range_bit(io_tree, entry->file_offset, + entry->file_offset + entry->len - 1, + EXTENT_ORDERED, 0); + if (ret == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); +out: + mutex_unlock(&tree->mutex); + return ret == 0; +} + +/* + * used to drop a reference on an ordered extent. 
This will free + * the extent if the last reference is dropped + */ +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + if (atomic_dec_and_test(&entry->refs)) { + while (!list_empty(&entry->list)) { + cur = entry->list.next; + sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } + kfree(entry); + } + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * but, anyone waiting on this extent is woken up. + */ +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = &entry->rb_node; + rb_erase(node, &tree->tree); + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + mutex_unlock(&tree->mutex); + wake_up(&entry->wait); + return 0; +} + +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) +{ + struct list_head splice; + struct list_head *cur; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + cur = splice.next; + ordered = list_entry(cur, struct btrfs_ordered_extent, + root_extent_list); + if (nocow_only && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + list_move(&ordered->root_extent_list, + &root->fs_info->ordered_extents); + cond_resched_lock(&root->fs_info->ordered_extent_lock); + continue; + } + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + iput(inode); + } else { + btrfs_put_ordered_extent(ordered); + } + + spin_lock(&root->fs_info->ordered_extent_lock); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + return 0; +} + +/* + * Used to start IO or wait for a given ordered extent to finish. + * + * If wait is one, this effectively waits on page writeback for all the pages + * in the extent, and it waits on the io completion code to insert + * metadata into the btree corresponding to the extent + */ +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, + int wait) +{ + u64 start = entry->file_offset; + u64 end = start + entry->len - 1; + + /* + * pages in the range can be dirty, clean or writeback. We + * start IO on any dirty ones so the wait doesn't stall waiting + * for pdflush to find them + */ + btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); + if (wait) { + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &entry->flags)); + } +} + +/* + * Used to wait on ordered extents across a large range of bytes. 
+ */ +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +{ + u64 end; + u64 orig_end; + u64 wait_end; + struct btrfs_ordered_extent *ordered; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); + } else { + orig_end = start + len - 1; + if (orig_end > INT_LIMIT(loff_t)) + orig_end = INT_LIMIT(loff_t); + } + wait_end = orig_end; +again: + /* start IO across the range first to instantiate any delalloc + * extents + */ + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + + /* The compression code will leave pages locked but return from + * writepage without setting the page writeback. Starting again + * with WB_SYNC_ALL will end up waiting for the IO to actually start. + */ + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + + btrfs_wait_on_page_writeback_range(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + orig_end >> PAGE_CACHE_SHIFT); + + end = orig_end; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) + break; + if (ordered->file_offset > orig_end) { + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered->file_offset + ordered->len < start) { + btrfs_put_ordered_extent(ordered); + break; + } + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); + if (end == 0 || end == start) + break; + end--; + } + if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { + schedule_timeout(1); + goto again; + } + return 0; +} + +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) + entry = NULL; + if (entry) + atomic_inc(&entry->refs); +out: + mutex_unlock(&tree->mutex); + return entry; +} + +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + atomic_inc(&entry->refs); +out: + mutex_unlock(&tree->mutex); + return entry; +} + +/* + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. 
+ */ +int btrfs_ordered_update_i_size(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 disk_i_size; + u64 new_i_size; + u64 i_size_test; + struct rb_node *node; + struct btrfs_ordered_extent *test; + + mutex_lock(&tree->mutex); + disk_i_size = BTRFS_I(inode)->disk_i_size; + + /* + * if the disk i_size is already at the inode->i_size, or + * this ordered extent is inside the disk i_size, we're done + */ + if (disk_i_size >= inode->i_size || + ordered->file_offset + ordered->len <= disk_i_size) { + goto out; + } + + /* + * we can't update the disk_isize if there are delalloc bytes + * between disk_i_size and this ordered extent + */ + if (test_range_bit(io_tree, disk_i_size, + ordered->file_offset + ordered->len - 1, + EXTENT_DELALLOC, 0)) { + goto out; + } + /* + * walk backward from this ordered extent to disk_i_size. + * if we find an ordered extent then we can't update disk i_size + * yet + */ + node = &ordered->rb_node; + while (1) { + node = rb_prev(node); + if (!node) + break; + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset + test->len <= disk_i_size) + break; + if (test->file_offset >= inode->i_size) + break; + if (test->file_offset >= disk_i_size) + goto out; + } + new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); + + /* + * at this point, we know we can safely update i_size to at least + * the offset from this ordered extent. But, we need to + * walk forward and see if ios from higher up in the file have + * finished. + */ + node = rb_next(&ordered->rb_node); + i_size_test = 0; + if (node) { + /* + * do we have an area where IO might have finished + * between our ordered extent and the next one. + */ + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset > entry_end(ordered)) + i_size_test = test->file_offset; + } else { + i_size_test = i_size_read(inode); + } + + /* + * i_size_test is the end of a region after this ordered + * extent where there are no ordered extents. As long as there + * are no delalloc bytes in this area, it is safe to update + * disk_i_size to the end of the region. + */ + if (i_size_test > entry_end(ordered) && + !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, + EXTENT_DELALLOC, 0)) { + new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + } + BTRFS_I(inode)->disk_i_size = new_i_size; +out: + mutex_unlock(&tree->mutex); + return 0; +} + +/* + * search the ordered extents for one corresponding to 'offset' and + * try to find a checksum. 
This is used because we allow pages to + * be reclaimed before their checksum is actually put into the btree + */ +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum) +{ + struct btrfs_ordered_sum *ordered_sum; + struct btrfs_sector_sum *sector_sums; + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct list_head *cur; + unsigned long num_sectors; + unsigned long i; + u32 sectorsize = BTRFS_I(inode)->root->sectorsize; + int ret = 1; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + if (!ordered) + return 1; + + mutex_lock(&tree->mutex); + list_for_each_prev(cur, &ordered->list) { + ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list); + if (disk_bytenr >= ordered_sum->bytenr) { + num_sectors = ordered_sum->len / sectorsize; + sector_sums = ordered_sum->sums; + for (i = 0; i < num_sectors; i++) { + if (sector_sums[i].bytenr == disk_bytenr) { + *sum = sector_sums[i].sum; + ret = 0; + goto out; + } + } + } + } +out: + mutex_unlock(&tree->mutex); + btrfs_put_ordered_extent(ordered); + return ret; +} + + +/** + * taken from mm/filemap.c because it isn't exported + * + * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * @mapping: address space structure to write + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) + * @sync_mode: enable synchronous operation + * + * Start writeback against all of a mapping's dirty pages that lie + * within the byte offsets <start, end> inclusive. + * + * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as + * opposed to a regular memory cleansing writeback. The difference between + * these two operations is that if a dirty page/buffer is encountered, it must + * be waited upon, and not just skipped over. 
+ */ +int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) +{ + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = mapping->nrpages * 2, + .range_start = start, + .range_end = end, + .for_writepages = 1, + }; + return btrfs_writepages(mapping, &wbc); +} + +/** + * taken from mm/filemap.c because it isn't exported + * + * wait_on_page_writeback_range - wait for writeback to complete + * @mapping: target address_space + * @start: beginning page index + * @end: ending page index + * + * Wait for writeback to complete against pages indexed by start->end + * inclusive + */ +int btrfs_wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + int nr_pages; + int ret = 0; + pgoff_t index; + + if (end < start) + return 0; + + pagevec_init(&pvec, 0); + index = start; + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + pagevec_release(&pvec); + cond_resched(); + } + + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + + return ret; +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h new file mode 100644 index 00000000000..ab66d5e8d6d --- /dev/null +++ b/fs/btrfs/ordered-data.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ORDERED_DATA__ +#define __BTRFS_ORDERED_DATA__ + +/* one of these per inode */ +struct btrfs_ordered_inode_tree { + struct mutex mutex; + struct rb_root tree; + struct rb_node *last; +}; + +/* + * these are used to collect checksums done just before bios submission. + * They are attached via a list into the ordered extent, and + * checksum items are inserted into the tree after all the blocks in + * the ordered extent are on disk + */ +struct btrfs_sector_sum { + /* bytenr on disk */ + u64 bytenr; + u32 sum; +}; + +struct btrfs_ordered_sum { + /* bytenr is the start of this extent on disk */ + u64 bytenr; + + /* + * this is the length in bytes covered by the sums array below. + */ + unsigned long len; + struct list_head list; + /* last field is a variable length array of btrfs_sector_sums */ + struct btrfs_sector_sum sums[]; +}; + +/* + * bits for the flags field: + * + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. + * It is used to make sure metadata is inserted into the tree only once + * per extent. 
+ * + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the + * rbtree, just before waking any waiters. It is used to indicate the + * IO is done and any metadata is inserted into the tree. + */ +#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ + +#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ + +#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ + +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ + +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ + +struct btrfs_ordered_extent { + /* logical offset in the file */ + u64 file_offset; + + /* disk byte number */ + u64 start; + + /* ram length of the extent in bytes */ + u64 len; + + /* extent length on disk */ + u64 disk_len; + + /* flags (described above) */ + unsigned long flags; + + /* reference count */ + atomic_t refs; + + /* the inode we belong to */ + struct inode *inode; + + /* list of checksums for insertion when the extent io is done */ + struct list_head list; + + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ + wait_queue_head_t wait; + + /* our friendly rbtree entry */ + struct rb_node rb_node; + + /* a per root list of all the pending ordered extents */ + struct list_head root_extent_list; +}; + + +/* + * calculates the total size you need to allocate for an ordered sum + * structure spanning 'bytes' in the file + */ +static inline int btrfs_ordered_sum_size(struct btrfs_root *root, + unsigned long bytes) +{ + unsigned long num_sectors = (bytes + root->sectorsize - 1) / + root->sectorsize; + num_sectors++; + return sizeof(struct btrfs_ordered_sum) + + num_sectors * sizeof(struct btrfs_sector_sum); +} + +static inline void +btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) +{ + mutex_init(&t->mutex); + t->tree.rb_node = NULL; + t->last = NULL; +} + +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry); +int btrfs_dec_test_ordered_pending(struct inode *inode, + u64 file_offset, u64 io_size); +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int tyep); +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset); +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, int wait); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +int btrfs_ordered_update_i_size(struct inode *inode, + struct btrfs_ordered_extent *ordered); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); +int btrfs_wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end); +int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); +#endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c new file mode 100644 index 00000000000..3c0d52af4f8 --- /dev/null +++ b/fs/btrfs/orphan.c @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_free_path(path); + return ret; +} + +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c new file mode 100644 index 00000000000..5f8f218c100 --- /dev/null +++ b/fs/btrfs/print-tree.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +{ + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int i; + printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " + "num_stripes %d\n", + (unsigned long long)btrfs_chunk_length(eb, chunk), + (unsigned long long)btrfs_chunk_owner(eb, chunk), + (unsigned long long)btrfs_chunk_type(eb, chunk), + num_stripes); + for (i = 0 ; i < num_stripes ; i++) { + printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, + (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), + (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + } +} +static void print_dev_item(struct extent_buffer *eb, + struct btrfs_dev_item *dev_item) +{ + printk(KERN_INFO "\t\tdev item devid %llu " + "total_bytes %llu bytes used %llu\n", + (unsigned long long)btrfs_device_id(eb, dev_item), + (unsigned long long)btrfs_device_total_bytes(eb, dev_item), + (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); +} +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) +{ + int i; + u32 nr = btrfs_header_nritems(l); + struct btrfs_item *item; + struct btrfs_extent_item *ei; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; + struct btrfs_block_group_item *bi; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_extent_ref *ref; + struct btrfs_dev_extent *dev_extent; + u32 type; + + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", + (unsigned long long)btrfs_header_bytenr(l), nr, + btrfs_leaf_free_space(root, l)); + for (i = 0 ; i < nr ; i++) { + item = btrfs_item_nr(l, i); + btrfs_item_key_to_cpu(l, &key, i); + type = btrfs_key_type(&key); + printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " + "itemsize %d\n", + i, + (unsigned long long)key.objectid, type, + (unsigned long long)key.offset, + btrfs_item_offset(l, item), btrfs_item_size(l, item)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); + printk(KERN_INFO "\t\tinode generation %llu size %llu " + "mode %o\n", + (unsigned long long) + btrfs_inode_generation(l, ii), + (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_mode(l, ii)); + break; + case BTRFS_DIR_ITEM_KEY: + di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(l, di, &found_key); + printk(KERN_INFO "\t\tdir oid %llu type %u\n", + (unsigned long long)found_key.objectid, + btrfs_dir_type(l, di)); + break; + case BTRFS_ROOT_ITEM_KEY: + ri = btrfs_item_ptr(l, i, struct btrfs_root_item); + printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", + (unsigned long long) + btrfs_disk_root_bytenr(l, ri), + btrfs_disk_root_refs(l, ri)); + break; + case BTRFS_EXTENT_ITEM_KEY: + ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); + printk(KERN_INFO "\t\textent data refs %u\n", + btrfs_extent_refs(l, ei)); + break; + case BTRFS_EXTENT_REF_KEY: + ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); + printk(KERN_INFO "\t\textent back ref root %llu " + "gen %llu owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root(l, ref), + (unsigned long long)btrfs_ref_generation(l, ref), + (unsigned long long)btrfs_ref_objectid(l, ref), + (unsigned long)btrfs_ref_num_refs(l, ref)); + break; + + case BTRFS_EXTENT_DATA_KEY: + fi = btrfs_item_ptr(l, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(l, fi) == + 
BTRFS_FILE_EXTENT_INLINE) { + printk(KERN_INFO "\t\tinline extent data " + "size %u\n", + btrfs_file_extent_inline_len(l, fi)); + break; + } + printk(KERN_INFO "\t\textent data disk bytenr %llu " + "nr %llu\n", + (unsigned long long) + btrfs_file_extent_disk_bytenr(l, fi), + (unsigned long long) + btrfs_file_extent_disk_num_bytes(l, fi)); + printk(KERN_INFO "\t\textent data offset %llu " + "nr %llu ram %llu\n", + (unsigned long long) + btrfs_file_extent_offset(l, fi), + (unsigned long long) + btrfs_file_extent_num_bytes(l, fi), + (unsigned long long) + btrfs_file_extent_ram_bytes(l, fi)); + break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + bi = btrfs_item_ptr(l, i, + struct btrfs_block_group_item); + printk(KERN_INFO "\t\tblock group used %llu\n", + (unsigned long long) + btrfs_disk_block_group_used(l, bi)); + break; + case BTRFS_CHUNK_ITEM_KEY: + print_chunk(l, btrfs_item_ptr(l, i, + struct btrfs_chunk)); + break; + case BTRFS_DEV_ITEM_KEY: + print_dev_item(l, btrfs_item_ptr(l, i, + struct btrfs_dev_item)); + break; + case BTRFS_DEV_EXTENT_KEY: + dev_extent = btrfs_item_ptr(l, i, + struct btrfs_dev_extent); + printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" + "\t\tchunk objectid %llu chunk offset %llu " + "length %llu\n", + (unsigned long long) + btrfs_dev_extent_chunk_tree(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_objectid(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_offset(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_length(l, dev_extent)); + }; + } +} + +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) +{ + int i; u32 nr; + struct btrfs_key key; + int level; + + if (!c) + return; + nr = btrfs_header_nritems(c); + level = btrfs_header_level(c); + if (level == 0) { + btrfs_print_leaf(root, c); + return; + } + printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", + (unsigned long long)btrfs_header_bytenr(c), + btrfs_header_level(c), nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + for (i = 0; i < nr; i++) { + btrfs_node_key_to_cpu(c, &key, i); + printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", + i, + (unsigned long long)key.objectid, + key.type, + (unsigned long long)key.offset, + (unsigned long long)btrfs_node_blockptr(c, i)); + } + for (i = 0; i < nr; i++) { + struct extent_buffer *next = read_tree_block(root, + btrfs_node_blockptr(c, i), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(c, i)); + if (btrfs_is_leaf(next) && + btrfs_header_level(c) != 1) + BUG(); + if (btrfs_header_level(next) != + btrfs_header_level(c) - 1) + BUG(); + btrfs_print_tree(root, next); + free_extent_buffer(next); + } +} diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h new file mode 100644 index 00000000000..da75efe534d --- /dev/null +++ b/fs/btrfs/print-tree.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __PRINT_TREE_ +#define __PRINT_TREE_ +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); +#endif diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c new file mode 100644 index 00000000000..6f0acc4c9ea --- /dev/null +++ b/fs/btrfs/ref-cache.c @@ -0,0 +1,230 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "ref-cache.h" +#include "transaction.h" + +/* + * leaf refs are used to cache the information about which extents + * a given leaf has references on. This allows us to process that leaf + * in btrfs_drop_snapshot without needing to read it back from disk. + */ + +/* + * kmalloc a leaf reference struct and update the counters for the + * total ref cache size + */ +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, + int nr_extents) +{ + struct btrfs_leaf_ref *ref; + size_t size = btrfs_leaf_ref_size(nr_extents); + + ref = kmalloc(size, GFP_NOFS); + if (ref) { + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size += size; + spin_unlock(&root->fs_info->ref_cache_lock); + + memset(ref, 0, sizeof(*ref)); + atomic_set(&ref->usage, 1); + INIT_LIST_HEAD(&ref->list); + } + return ref; +} + +/* + * free a leaf reference struct and update the counters for the + * total ref cache size + */ +void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + if (!ref) + return; + WARN_ON(atomic_read(&ref->usage) == 0); + if (atomic_dec_and_test(&ref->usage)) { + size_t size = btrfs_leaf_ref_size(ref->nritems); + + BUG_ON(ref->in_tree); + kfree(ref); + + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size -= size; + spin_unlock(&root->fs_info->ref_cache_lock); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_leaf_ref *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct btrfs_leaf_ref *entry; + + while (n) { + entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); + WARN_ON(!entry->in_tree); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 
max_root_gen, + int shared) +{ + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (shared) + tree = &root->fs_info->shared_ref_tree; + if (!tree) + return 0; + + spin_lock(&tree->lock); + while (!list_empty(&tree->list)) { + ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list); + BUG_ON(ref->tree != tree); + if (ref->root_gen > max_root_gen) + break; + if (!xchg(&ref->in_tree, 0)) { + cond_resched_lock(&tree->lock); + continue; + } + + rb_erase(&ref->rb_node, &tree->root); + list_del_init(&ref->list); + + spin_unlock(&tree->lock); + btrfs_free_leaf_ref(root, ref); + cond_resched(); + spin_lock(&tree->lock); + } + spin_unlock(&tree->lock); + return 0; +} + +/* + * find the leaf ref for a given extent. This returns the ref struct with + * a usage reference incremented + */ +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + u64 bytenr) +{ + struct rb_node *rb; + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; +again: + if (tree) { + spin_lock(&tree->lock); + rb = tree_search(&tree->root, bytenr); + if (rb) + ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node); + if (ref) + atomic_inc(&ref->usage); + spin_unlock(&tree->lock); + if (ref) + return ref; + } + if (tree != &root->fs_info->shared_ref_tree) { + tree = &root->fs_info->shared_ref_tree; + goto again; + } + return NULL; +} + +/* + * add a fully filled in leaf ref struct + * remove all the refs older than a given root generation + */ +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, + int shared) +{ + int ret = 0; + struct rb_node *rb; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (shared) + tree = &root->fs_info->shared_ref_tree; + + spin_lock(&tree->lock); + rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node); + if (rb) { + ret = -EEXIST; + } else { + atomic_inc(&ref->usage); + ref->tree = tree; + ref->in_tree = 1; + list_add_tail(&ref->list, &tree->list); + } + spin_unlock(&tree->lock); + return ret; +} + +/* + * remove a single leaf ref from the tree. This drops the ref held by the tree + * only + */ +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + struct btrfs_leaf_ref_tree *tree; + + if (!xchg(&ref->in_tree, 0)) + return 0; + + tree = ref->tree; + spin_lock(&tree->lock); + + rb_erase(&ref->rb_node, &tree->root); + list_del_init(&ref->list); + + spin_unlock(&tree->lock); + + btrfs_free_leaf_ref(root, ref); + return 0; +} diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h new file mode 100644 index 00000000000..16f3183d7c5 --- /dev/null +++ b/fs/btrfs/ref-cache.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#ifndef __REFCACHE__ +#define __REFCACHE__ + +struct btrfs_extent_info { + /* bytenr and num_bytes find the extent in the extent allocation tree */ + u64 bytenr; + u64 num_bytes; + + /* objectid and offset find the back reference for the file */ + u64 objectid; + u64 offset; +}; + +struct btrfs_leaf_ref { + struct rb_node rb_node; + struct btrfs_leaf_ref_tree *tree; + int in_tree; + atomic_t usage; + + u64 root_gen; + u64 bytenr; + u64 owner; + u64 generation; + int nritems; + + struct list_head list; + struct btrfs_extent_info extents[]; +}; + +static inline size_t btrfs_leaf_ref_size(int nr_extents) +{ + return sizeof(struct btrfs_leaf_ref) + + sizeof(struct btrfs_extent_info) * nr_extents; +} + +static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) +{ + tree->root.rb_node = NULL; + INIT_LIST_HEAD(&tree->list); + spin_lock_init(&tree->lock); +} + +static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree) +{ + return RB_EMPTY_ROOT(&tree->root); +} + +void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree); +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, + int nr_extents); +void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + u64 bytenr); +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, + int shared); +int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, + int shared); +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); + +#endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c new file mode 100644 index 00000000000..b48650de447 --- /dev/null +++ b/fs/btrfs/root-tree.c @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + +/* + * search forward for a root, starting with objectid 'search_start' + * if a root key is found, the objectid we find is filled into 'found_objectid' + * and 0 is returned. < 0 is returned on error, 1 if there is nothing + * left in the tree. 
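+ * + * the search is always done in the tree of tree roots, the 'root' that + * is passed in is only used to find fs_info.  Keys that are not root + * items are skipped by bumping the key offset and searching again.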
+ */ +int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + int ret; + + root = root->fs_info->tree_root; + search_key.objectid = search_start; + search_key.type = (u8)-1; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) { + ret = 1; + goto out; + } + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]); + if (search_key.type != BTRFS_ROOT_ITEM_KEY) { + search_key.offset++; + btrfs_release_path(root, path); + goto again; + } + ret = 0; + *found_objectid = search_key.objectid; + +out: + btrfs_free_path(path); + return ret; +} + +/* + * lookup the root with the highest offset for a given objectid. The key we do + * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 + * on error. + */ +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + struct btrfs_root_item *item, struct btrfs_key *key) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + struct btrfs_key found_key; + struct extent_buffer *l; + int ret; + int slot; + + search_key.objectid = objectid; + search_key.type = BTRFS_ROOT_ITEM_KEY; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + + BUG_ON(ret == 0); + l = path->nodes[0]; + BUG_ON(path->slots[0] == 0); + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); + if (found_key.objectid != objectid) { + ret = 1; + goto out; + } + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), + sizeof(*item)); + memcpy(key, &found_key, sizeof(found_key)); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * copy the data in 'item' into the btree + */ +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + struct btrfs_path *path; + struct extent_buffer *l; + int ret; + int slot; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + (unsigned long long)key->objectid, key->type, + (unsigned long long)key->offset); + BUG_ON(1); + } + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + write_extent_buffer(l, item, ptr, sizeof(*item)); + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + int ret; + ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); + return ret; +} + +/* + * at mount time we want to find all the old transaction snapshots that were in + * the process of being deleted if we crashed. This is any root item with an + * offset lower than the latest root. They need to be queued for deletion to + * finish what was happening when we crashed. 
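+ * + * root items that still have a non-zero reference count are skipped. + * Reloc tree roots (objectid BTRFS_TREE_RELOC_OBJECTID) are queued on + * the dead reloc list, everything else goes on the regular dead root + * list.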
+ */ +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, + struct btrfs_root *latest) +{ + struct btrfs_root *dead_root; + struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = 0; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) + goto next; + + if (key.objectid < objectid) + goto next; + + if (key.objectid > objectid) + break; + + ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); + if (btrfs_disk_root_refs(leaf, ri) != 0) + goto next; + + memcpy(&found_key, &key, sizeof(key)); + key.offset++; + btrfs_release_path(root, path); + dead_root = + btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &found_key); + if (IS_ERR(dead_root)) { + ret = PTR_ERR(dead_root); + goto err; + } + + if (objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_add_dead_reloc_root(dead_root); + else + ret = btrfs_add_dead_root(dead_root, latest); + if (ret) + goto err; + goto again; +next: + slot++; + path->slots[0]++; + } + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +/* drop the root item for 'key' from 'root' */ +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +{ + struct btrfs_path *path; + int ret; + u32 refs; + struct btrfs_root_item *ri; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, -1, 1); + if (ret < 0) + goto out; + + BUG_ON(ret != 0); + leaf = path->nodes[0]; + ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); + + refs = btrfs_disk_root_refs(leaf, ri); + BUG_ON(refs != 0); + ret = btrfs_del_item(trans, root, path); +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + return ret; +} + +#if 0 /* this will get used when snapshot deletion is implemented */ +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + + key.objectid = root_id; + key.type = type; + key.offset = ref_id; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, tree_root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return ret; +} +#endif + +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id) +{ + struct btrfs_key key; + int ret; + + key.objectid = root_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = ref_id; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + return ret; +} + + +/* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. + * + * The dirid, sequence, name and name_len refer to the directory entry + * that is referencing the root. 
+ * + * For a forward ref, the root_id is the id of the tree referencing + * the root and ref_id is the id of the subvol or snapshot. + * + * For a back ref the root_id is the id of the subvol or snapshot and + * ref_id is the id of the tree referencing it. + */ +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id, + u64 dirid, u64 sequence, + const char *name, int name_len) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + + + path = btrfs_alloc_path(); + + key.objectid = root_id; + key.type = type; + key.offset = ref_id; + + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + btrfs_set_root_ref_dirid(leaf, ref, dirid); + btrfs_set_root_ref_sequence(leaf, ref, sequence); + btrfs_set_root_ref_name_len(leaf, ref, name_len); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c new file mode 100644 index 00000000000..c0f7ecaf1e7 --- /dev/null +++ b/fs/btrfs/struct-funcs.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/highmem.h> + +/* this is some deeply nasty code. ctree.h has a different + * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef + * + * The end result is that anyone who #includes ctree.h gets a + * declaration for the btrfs_set_foo functions and btrfs_foo functions + * + * This file declares the macros and then #includes ctree.h, which results + * in cpp creating the function here based on the template below. + * + * These setget functions do all the extent_buffer related mapping + * required to efficiently read and write specific fields in the extent + * buffers. Every pointer to metadata items in btrfs is really just + * an unsigned long offset into the extent buffer which has been + * cast to a specific type. This gives us all the gcc type checking. + * + * The extent buffer api is used to do all the kmapping and page + * spanning work required to get extent buffers in highmem and have + * a metadata blocksize different from the page size. + * + * The macro starts with a simple function prototype declaration so that + * sparse won't complain about it being static. 
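+ * + * As one illustration (taking one of the BTRFS_SETGET_FUNCS invocations + * in ctree.h): BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, + * size, 64) expands here into u64 btrfs_inode_size(struct extent_buffer + * *eb, struct btrfs_inode_item *s) and the matching + * btrfs_set_inode_size() setter; print-tree.c uses the generated getter + * when it dumps inode items.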
+ */ + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + return le##bits##_to_cpu(p->member); \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + u##bits res; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits leres; \ + read_eb_member(eb, s, type, member, &leres); \ + return le##bits##_to_cpu(leres); \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + res = le##bits##_to_cpu(p->member); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + return res; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + p->member = cpu_to_le##bits(val); \ + return; \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits val2; \ + val2 = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val2); \ + return; \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + p->member = cpu_to_le##bits(val); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + } \ +} + +#include "ctree.h" + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(nr); + if (eb->map_token && ptr >= eb->map_start && + ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { + memcpy(disk_key, eb->kaddr + ptr - eb->map_start, + sizeof(*disk_key)); + return; + } else if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, KM_USER1); + eb->map_token = NULL; + } + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c new file mode 100644 index 00000000000..b4c101d9322 --- /dev/null +++ b/fs/btrfs/super.c @@ -0,0 +1,720 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/parser.h> +#include <linux/ctype.h> +#include <linux/namei.h> +#include <linux/miscdevice.h> +#include <linux/version.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "xattr.h" +#include "volumes.h" +#include "version.h" +#include "export.h" +#include "compression.h" + +#define BTRFS_SUPER_MAGIC 0x9123683E + +static struct super_operations btrfs_super_ops; + +static void btrfs_put_super(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = close_ctree(root); + sb->s_fs_info = NULL; +} + +enum { + Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, + Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, +}; + +static match_table_t tokens = { + {Opt_degraded, "degraded"}, + {Opt_subvol, "subvol=%s"}, + {Opt_device, "device=%s"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_max_extent, "max_extent=%s"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_alloc_start, "alloc_start=%s"}, + {Opt_thread_pool, "thread_pool=%d"}, + {Opt_compress, "compress"}, + {Opt_ssd, "ssd"}, + {Opt_noacl, "noacl"}, + {Opt_err, NULL}, +}; + +u64 btrfs_parse_size(char *str) +{ + u64 res; + int mult = 1; + char *end; + char last; + + res = simple_strtoul(str, &end, 10); + + last = end[0]; + if (isalpha(last)) { + last = tolower(last); + switch (last) { + case 'g': + mult *= 1024; + case 'm': + mult *= 1024; + case 'k': + mult *= 1024; + } + res = res * mult; + } + return res; +} + +/* + * Regular mount options parser. Everything that is needed only when + * reading in a new superblock is parsed here. 
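+ * + * The size style options (max_extent, max_inline, alloc_start) are run + * through btrfs_parse_size() above, so both plain byte counts and + * k/m/g suffixed values (max_inline=8k for example) are accepted.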
+ */ +int btrfs_parse_options(struct btrfs_root *root, char *options) +{ + struct btrfs_fs_info *info = root->fs_info; + substring_t args[MAX_OPT_ARGS]; + char *p, *num; + int intarg; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + options = kstrdup(options, GFP_NOFS); + if (!options) + return -ENOMEM; + + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_degraded: + printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_set_opt(info->mount_opt, DEGRADED); + break; + case Opt_subvol: + case Opt_device: + /* + * These are parsed by btrfs_parse_early_options + * and can be happily ignored here. + */ + break; + case Opt_nodatasum: + printk(KERN_INFO "btrfs: setting nodatacsum\n"); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_nodatacow: + printk(KERN_INFO "btrfs: setting nodatacow\n"); + btrfs_set_opt(info->mount_opt, NODATACOW); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_compress: + printk(KERN_INFO "btrfs: use compression\n"); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; + case Opt_ssd: + printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + break; + case Opt_nobarrier: + printk(KERN_INFO "btrfs: turning off barriers\n"); + btrfs_set_opt(info->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->thread_pool_size = intarg; + printk(KERN_INFO "btrfs: thread pool %d\n", + info->thread_pool_size); + } + break; + case Opt_max_extent: + num = match_strdup(&args[0]); + if (num) { + info->max_extent = btrfs_parse_size(num); + kfree(num); + + info->max_extent = max_t(u64, + info->max_extent, root->sectorsize); + printk(KERN_INFO "btrfs: max_extent at %llu\n", + info->max_extent); + } + break; + case Opt_max_inline: + num = match_strdup(&args[0]); + if (num) { + info->max_inline = btrfs_parse_size(num); + kfree(num); + + if (info->max_inline) { + info->max_inline = max_t(u64, + info->max_inline, + root->sectorsize); + } + printk(KERN_INFO "btrfs: max_inline at %llu\n", + info->max_inline); + } + break; + case Opt_alloc_start: + num = match_strdup(&args[0]); + if (num) { + info->alloc_start = btrfs_parse_size(num); + kfree(num); + printk(KERN_INFO + "btrfs: allocations start at %llu\n", + info->alloc_start); + } + break; + case Opt_noacl: + root->fs_info->sb->s_flags &= ~MS_POSIXACL; + break; + default: + break; + } + } + kfree(options); + return 0; +} + +/* + * Parse mount options that are required early in the mount process. + * + * All other options will be parsed on much later in the mount process and + * only when we need to allocate a new super block. 
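+ * + * Only 'subvol=' and 'device=' are acted on here; every other token + * falls through to the default case and is picked up later by + * btrfs_parse_options().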
+ */ +static int btrfs_parse_early_options(const char *options, fmode_t flags, + void *holder, char **subvol_name, + struct btrfs_fs_devices **fs_devices) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *p; + int error = 0; + + if (!options) + goto out; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_subvol: + *subvol_name = match_strdup(&args[0]); + break; + case Opt_device: + error = btrfs_scan_one_device(match_strdup(&args[0]), + flags, holder, fs_devices); + if (error) + goto out_free_opts; + break; + default: + break; + } + } + + out_free_opts: + kfree(opts); + out: + /* + * If no subvolume name is specified we use the default one. Allocate + * a copy of the string "." here so that code later in the + * mount path doesn't care if it's the default volume or another one. + */ + if (!*subvol_name) { + *subvol_name = kstrdup(".", GFP_KERNEL); + if (!*subvol_name) + return -ENOMEM; + } + return error; +} + +static int btrfs_fill_super(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + void *data, int silent) +{ + struct inode *inode; + struct dentry *root_dentry; + struct btrfs_super_block *disk_super; + struct btrfs_root *tree_root; + struct btrfs_inode *bi; + int err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_super_ops; + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; + sb->s_flags |= MS_POSIXACL; + + tree_root = open_ctree(sb, fs_devices, (char *)data); + + if (IS_ERR(tree_root)) { + printk("btrfs: open_ctree failed\n"); + return PTR_ERR(tree_root); + } + sb->s_fs_info = tree_root; + disk_super = &tree_root->fs_info->super_copy; + inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, + tree_root->fs_info->fs_root); + bi = BTRFS_I(inode); + bi->location.objectid = inode->i_ino; + bi->location.offset = 0; + bi->root = tree_root->fs_info->fs_root; + + btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); + + if (!inode) { + err = -ENOMEM; + goto fail_close; + } + if (inode->i_state & I_NEW) { + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + + root_dentry = d_alloc_root(inode); + if (!root_dentry) { + iput(inode); + err = -ENOMEM; + goto fail_close; + } +#if 0 + /* this does the super kobj at the same time */ + err = btrfs_sysfs_add_super(tree_root->fs_info); + if (err) + goto fail_close; +#endif + + sb->s_root = root_dentry; + + save_mount_options(sb, data); + return 0; + +fail_close: + close_ctree(tree_root); + return err; +} + +int btrfs_sync_fs(struct super_block *sb, int wait) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + int ret; + root = btrfs_sb(sb); + + if (sb->s_flags & MS_RDONLY) + return 0; + + sb->s_dirt = 0; + if (!wait) { + filemap_flush(root->fs_info->btree_inode->i_mapping); + return 0; + } + + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); + + btrfs_clean_old_snapshots(root); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + sb->s_dirt = 0; + return ret; +} + +static void btrfs_write_super(struct super_block *sb) +{ + sb->s_dirt = 0; +} + +static int btrfs_test_super(struct super_block *s, void *data) +{ + struct btrfs_fs_devices *test_fs_devices = data; + struct 
btrfs_root *root = btrfs_sb(s); + + return root->fs_info->fs_devices == test_fs_devices; +} + +/* + * Find a superblock for the given device / mount point. + * + * Note: This is based on get_sb_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. + */ +static int btrfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + char *subvol_name = NULL; + struct block_device *bdev = NULL; + struct super_block *s; + struct dentry *root; + struct btrfs_fs_devices *fs_devices = NULL; + fmode_t mode = FMODE_READ; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_early_options(data, mode, fs_type, + &subvol_name, &fs_devices); + if (error) + return error; + + error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); + if (error) + goto error_free_subvol_name; + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_free_subvol_name; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } + + bdev = fs_devices->latest_bdev; + s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); + if (IS_ERR(s)) + goto error_s; + + if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + up_write(&s->s_umount); + deactivate_super(s); + error = -EBUSY; + goto error_close_devices; + } + + btrfs_close_devices(fs_devices); + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + error = btrfs_fill_super(s, fs_devices, data, + flags & MS_SILENT ? 1 : 0); + if (error) { + up_write(&s->s_umount); + deactivate_super(s); + goto error_free_subvol_name; + } + + btrfs_sb(s)->fs_info->bdev_holder = fs_type; + s->s_flags |= MS_ACTIVE; + } + + if (!strcmp(subvol_name, ".")) + root = dget(s->s_root); + else { + mutex_lock(&s->s_root->d_inode->i_mutex); + root = lookup_one_len(subvol_name, s->s_root, + strlen(subvol_name)); + mutex_unlock(&s->s_root->d_inode->i_mutex); + + if (IS_ERR(root)) { + up_write(&s->s_umount); + deactivate_super(s); + error = PTR_ERR(root); + goto error_free_subvol_name; + } + if (!root->d_inode) { + dput(root); + up_write(&s->s_umount); + deactivate_super(s); + error = -ENXIO; + goto error_free_subvol_name; + } + } + + mnt->mnt_sb = s; + mnt->mnt_root = root; + + kfree(subvol_name); + return 0; + +error_s: + error = PTR_ERR(s); +error_close_devices: + btrfs_close_devices(fs_devices); +error_free_subvol_name: + kfree(subvol_name); + return error; +} + +static int btrfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + + if (*flags & MS_RDONLY) { + sb->s_flags |= MS_RDONLY; + + ret = btrfs_commit_super(root); + WARN_ON(ret); + } else { + if (root->fs_info->fs_devices->rw_devices == 0) + return -EACCES; + + if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + return -EINVAL; + + ret = btrfs_cleanup_reloc_trees(root); + WARN_ON(ret); + + ret = btrfs_cleanup_fs_roots(root->fs_info); + WARN_ON(ret); + + sb->s_flags &= ~MS_RDONLY; + } + + return 0; +} + +static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + int bits = dentry->d_sb->s_blocksize_bits; + __be32 *fsid = (__be32 *)root->fs_info->fsid; + + buf->f_namelen = 
BTRFS_NAME_LEN; + buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; + buf->f_bfree = buf->f_blocks - + (btrfs_super_bytes_used(disk_super) >> bits); + buf->f_bavail = buf->f_bfree; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_type = BTRFS_SUPER_MAGIC; + + /* We treat it as constant endianness (it doesn't matter _which_) + because we want the fsid to come out the same whether mounted + on a big-endian or little-endian host */ + buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); + buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); + /* Mask in the root object ID too, to disambiguate subvols */ + buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .get_sb = btrfs_get_sb, + .kill_sb = kill_anon_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * used by btrfsctl to scan devices when no FS is mounted + */ +static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct btrfs_ioctl_vol_args *vol; + struct btrfs_fs_devices *fs_devices; + int ret = 0; + int len; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol = kmalloc(sizeof(*vol), GFP_KERNEL); + if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { + ret = -EFAULT; + goto out; + } + len = strnlen(vol->name, BTRFS_PATH_NAME_MAX); + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + break; + } +out: + kfree(vol); + return ret; +} + +static void btrfs_write_super_lockfs(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + mutex_lock(&root->fs_info->cleaner_mutex); +} + +static void btrfs_unlockfs(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_unlock(&root->fs_info->cleaner_mutex); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); +} + +static struct super_operations btrfs_super_ops = { + .delete_inode = btrfs_delete_inode, + .put_super = btrfs_put_super, + .write_super = btrfs_write_super, + .sync_fs = btrfs_sync_fs, + .show_options = generic_show_options, + .write_inode = btrfs_write_inode, + .dirty_inode = btrfs_dirty_inode, + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_destroy_inode, + .statfs = btrfs_statfs, + .remount_fs = btrfs_remount, + .write_super_lockfs = btrfs_write_super_lockfs, + .unlockfs = btrfs_unlockfs, +}; + +static const struct file_operations btrfs_ctl_fops = { + .unlocked_ioctl = btrfs_control_ioctl, + .compat_ioctl = btrfs_control_ioctl, + .owner = THIS_MODULE, +}; + +static struct miscdevice btrfs_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "btrfs-control", + .fops = &btrfs_ctl_fops +}; + +static int btrfs_interface_init(void) +{ + return misc_register(&btrfs_misc); +} + +static void btrfs_interface_exit(void) +{ + if (misc_deregister(&btrfs_misc) < 0) + printk(KERN_INFO "misc_deregister failed for control device"); +} + +static int __init init_btrfs_fs(void) +{ + int err; + + err = btrfs_init_sysfs(); + if (err) + return err; + + err = btrfs_init_cachep(); + if (err) + goto free_sysfs; + + err = extent_io_init(); + if (err) + goto free_cachep; + + err = extent_map_init(); + if (err) + goto free_extent_io; + + err = btrfs_interface_init(); + if (err) + goto free_extent_map; + + err = register_filesystem(&btrfs_fs_type); 
+ if (err) + goto unregister_ioctl; + + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); + return 0; + +unregister_ioctl: + btrfs_interface_exit(); +free_extent_map: + extent_map_exit(); +free_extent_io: + extent_io_exit(); +free_cachep: + btrfs_destroy_cachep(); +free_sysfs: + btrfs_exit_sysfs(); + return err; +} + +static void __exit exit_btrfs_fs(void) +{ + btrfs_destroy_cachep(); + extent_map_exit(); + extent_io_exit(); + btrfs_interface_exit(); + unregister_filesystem(&btrfs_fs_type); + btrfs_exit_sysfs(); + btrfs_cleanup_fs_uuids(); + btrfs_zlib_exit(); +} + +module_init(init_btrfs_fs) +module_exit(exit_btrfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c new file mode 100644 index 00000000000..a240b6fa81d --- /dev/null +++ b/fs/btrfs/sysfs.c @@ -0,0 +1,269 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/kobject.h> + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_used(&root->root_item)); +} + +static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_limit(&root->root_item)); +} + +static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) +{ + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); +} + +static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); +} + +static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); +} + +/* this is for root attrs (subvols/snapshots) */ +struct btrfs_root_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_root *, char *); + ssize_t (*store)(struct btrfs_root *, const char *, size_t); +}; + +#define ROOT_ATTR(name, mode, show, store) \ +static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ + show, store) + +ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); +ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); + +static struct attribute *btrfs_root_attrs[] = { + &btrfs_root_attr_blocks_used.attr, + &btrfs_root_attr_block_limit.attr, + NULL, +}; + +/* this is for super attrs (actual full fs) */ +struct btrfs_super_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_fs_info *, char *); + ssize_t (*store)(struct 
btrfs_fs_info *, const char *, size_t); +}; + +#define SUPER_ATTR(name, mode, show, store) \ +static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ + show, store) + +SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); +SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); +SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); + +static struct attribute *btrfs_super_attrs[] = { + &btrfs_super_attr_blocks_used.attr, + &btrfs_super_attr_total_blocks.attr, + &btrfs_super_attr_blocksize.attr, + NULL, +}; + +static ssize_t btrfs_super_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->show ? a->show(fs, buf) : 0; +} + +static ssize_t btrfs_super_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->store ? a->store(fs, buf, len) : 0; +} + +static ssize_t btrfs_root_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + + return a->show ? a->show(root, buf) : 0; +} + +static ssize_t btrfs_root_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + return a->store ? 
a->store(root, buf, len) : 0; +} + +static void btrfs_super_release(struct kobject *kobj) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + complete(&fs->kobj_unregister); +} + +static void btrfs_root_release(struct kobject *kobj) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + complete(&root->kobj_unregister); +} + +static struct sysfs_ops btrfs_super_attr_ops = { + .show = btrfs_super_attr_show, + .store = btrfs_super_attr_store, +}; + +static struct sysfs_ops btrfs_root_attr_ops = { + .show = btrfs_root_attr_show, + .store = btrfs_root_attr_store, +}; + +static struct kobj_type btrfs_root_ktype = { + .default_attrs = btrfs_root_attrs, + .sysfs_ops = &btrfs_root_attr_ops, + .release = btrfs_root_release, +}; + +static struct kobj_type btrfs_super_ktype = { + .default_attrs = btrfs_super_attrs, + .sysfs_ops = &btrfs_super_attr_ops, + .release = btrfs_super_release, +}; + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs) +{ + int error; + char *name; + char c; + int len = strlen(fs->sb->s_id) + 1; + int i; + + name = kmalloc(len, GFP_NOFS); + if (!name) { + error = -ENOMEM; + goto fail; + } + + for (i = 0; i < len; i++) { + c = fs->sb->s_id[i]; + if (c == '/' || c == '\\') + c = '!'; + name[i] = c; + } + name[len] = '\0'; + + fs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype, + NULL, "%s", name); + kfree(name); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for super failed\n"); + return error; +} + +int btrfs_sysfs_add_root(struct btrfs_root *root) +{ + int error; + + error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype, + &root->fs_info->super_kobj, + "%s", root->name); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for root failed\n"); + return error; +} + +void btrfs_sysfs_del_root(struct btrfs_root *root) +{ + kobject_put(&root->root_kobj); + wait_for_completion(&root->kobj_unregister); +} + +void btrfs_sysfs_del_super(struct btrfs_fs_info *fs) +{ + kobject_put(&fs->super_kobj); + wait_for_completion(&fs->kobj_unregister); +} + +int btrfs_init_sysfs(void) +{ + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); + if (!btrfs_kset) + return -ENOMEM; + return 0; +} + +void btrfs_exit_sysfs(void) +{ + kset_unregister(btrfs_kset); +} + diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c new file mode 100644 index 00000000000..8a08f944334 --- /dev/null +++ b/fs/btrfs/transaction.c @@ -0,0 +1,1097 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> +#include <linux/blkdev.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "locking.h" +#include "ref-cache.h" +#include "tree-log.h" + +#define BTRFS_ROOT_TRANS_TAG 0 + +static noinline void put_transaction(struct btrfs_transaction *transaction) +{ + WARN_ON(transaction->use_count == 0); + transaction->use_count--; + if (transaction->use_count == 0) { + list_del_init(&transaction->list); + memset(transaction, 0, sizeof(*transaction)); + kmem_cache_free(btrfs_transaction_cachep, transaction); + } +} + +/* + * either allocate a new transaction or hop into the existing one + */ +static noinline int join_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + cur_trans = root->fs_info->running_transaction; + if (!cur_trans) { + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, + GFP_NOFS); + BUG_ON(!cur_trans); + root->fs_info->generation++; + root->fs_info->last_alloc = 0; + root->fs_info->last_data_alloc = 0; + cur_trans->num_writers = 1; + cur_trans->num_joined = 0; + cur_trans->transid = root->fs_info->generation; + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->in_commit = 0; + cur_trans->blocked = 0; + cur_trans->use_count = 1; + cur_trans->commit_done = 0; + cur_trans->start_time = get_seconds(); + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping, + GFP_NOFS); + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = cur_trans; + spin_unlock(&root->fs_info->new_trans_lock); + } else { + cur_trans->num_writers++; + cur_trans->num_joined++; + } + + return 0; +} + +/* + * this does all the record keeping required to make sure that a reference + * counted root is properly recorded in a given transaction. 
This is required + * to make sure the old root from before we joined the transaction is deleted + * when the transaction commits + */ +noinline int btrfs_record_root_in_trans(struct btrfs_root *root) +{ + struct btrfs_dirty_root *dirty; + u64 running_trans_id = root->fs_info->running_transaction->transid; + if (root->ref_cows && root->last_trans < running_trans_id) { + WARN_ON(root == root->fs_info->extent_root); + if (root->root_item.refs != 0) { + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + BUG_ON(!dirty); + dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); + BUG_ON(!dirty->root); + dirty->latest_root = root; + INIT_LIST_HEAD(&dirty->list); + + root->commit_root = btrfs_root_node(root); + + memcpy(dirty->root, root, sizeof(*root)); + spin_lock_init(&dirty->root->node_lock); + spin_lock_init(&dirty->root->list_lock); + mutex_init(&dirty->root->objectid_mutex); + mutex_init(&dirty->root->log_mutex); + INIT_LIST_HEAD(&dirty->root->dead_list); + dirty->root->node = root->commit_root; + dirty->root->commit_root = NULL; + + spin_lock(&root->list_lock); + list_add(&dirty->root->dead_list, &root->dead_list); + spin_unlock(&root->list_lock); + + root->dirty_root = dirty; + } else { + WARN_ON(1); + } + root->last_trans = running_trans_id; + } + return 0; +} + +/* wait for commit against the current transaction to become unblocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ +static void wait_current_trans(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + + cur_trans = root->fs_info->running_transaction; + if (cur_trans && cur_trans->blocked) { + DEFINE_WAIT(wait); + cur_trans->use_count++; + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (cur_trans->blocked) { + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&root->fs_info->transaction_wait, + &wait); + } else { + finish_wait(&root->fs_info->transaction_wait, + &wait); + break; + } + } + put_transaction(cur_trans); + } +} + +static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + int num_blocks, int wait) +{ + struct btrfs_trans_handle *h = + kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + int ret; + + mutex_lock(&root->fs_info->trans_mutex); + if (!root->fs_info->log_root_recovering && + ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) + wait_current_trans(root); + ret = join_transaction(root); + BUG_ON(ret); + + btrfs_record_root_in_trans(root); + h->transid = root->fs_info->running_transaction->transid; + h->transaction = root->fs_info->running_transaction; + h->blocks_reserved = num_blocks; + h->blocks_used = 0; + h->block_group = 0; + h->alloc_exclude_nr = 0; + h->alloc_exclude_start = 0; + root->fs_info->running_transaction->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + return h; +} + +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, num_blocks, 1); +} +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, num_blocks, 0); +} + +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, + int num_blocks) +{ + return start_transaction(r, num_blocks, 2); +} + +/* wait 
for a transaction commit to be fully complete */ +static noinline int wait_for_commit(struct btrfs_root *root, + struct btrfs_transaction *commit) +{ + DEFINE_WAIT(wait); + mutex_lock(&root->fs_info->trans_mutex); + while (!commit->commit_done) { + prepare_to_wait(&commit->commit_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (commit->commit_done) + break; + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + } + mutex_unlock(&root->fs_info->trans_mutex); + finish_wait(&commit->commit_wait, &wait); + return 0; +} + +/* + * rate limit against the drop_snapshot code. This helps to slow down new + * operations if the drop_snapshot code isn't able to keep up. + */ +static void throttle_on_drops(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + int harder_count = 0; + +harder: + if (atomic_read(&info->throttles)) { + DEFINE_WAIT(wait); + int thr; + thr = atomic_read(&info->throttle_gen); + + do { + prepare_to_wait(&info->transaction_throttle, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&info->throttles)) { + finish_wait(&info->transaction_throttle, &wait); + break; + } + schedule(); + finish_wait(&info->transaction_throttle, &wait); + } while (thr == atomic_read(&info->throttle_gen)); + harder_count++; + + if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 && + harder_count < 2) + goto harder; + + if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 && + harder_count < 10) + goto harder; + + if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 && + harder_count < 20) + goto harder; + } +} + +void btrfs_throttle(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->trans_mutex); + if (!root->fs_info->open_ioctl_trans) + wait_current_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); + + throttle_on_drops(root); +} + +static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int throttle) +{ + struct btrfs_transaction *cur_trans; + struct btrfs_fs_info *info = root->fs_info; + + mutex_lock(&info->trans_mutex); + cur_trans = info->running_transaction; + WARN_ON(cur_trans != trans->transaction); + WARN_ON(cur_trans->num_writers < 1); + cur_trans->num_writers--; + + if (waitqueue_active(&cur_trans->writer_wait)) + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + mutex_unlock(&info->trans_mutex); + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (throttle) + throttle_on_drops(root); + + return 0; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 0); +} + +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 1); +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. 
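A minimal sketch of how a caller typically brackets a metadata change with the handle API above (btrfs_start_transaction/btrfs_end_transaction); the function name example_update_inode is hypothetical and btrfs_update_inode() merely stands in for any tree modification, so this is illustrative rather than part of the patch:

/*
 * typical caller pattern: take a handle, modify the trees, drop the handle
 */
static int example_update_inode(struct btrfs_root *root, struct inode *inode)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/* may wait for a blocked commit, then joins or starts a transaction */
	trans = btrfs_start_transaction(root, 1);
	ret = btrfs_update_inode(trans, root, inode);	/* any tree modification */
	/* drops num_writers and wakes anyone waiting to commit */
	btrfs_end_transaction(trans, root);
	return ret;
}

A caller that needs the change durable hands the same handle to btrfs_commit_transaction(trans, root) instead of ending it; commit consumes the handle and waits for the whole transaction to reach disk. The extent_io bits mentioned just above are what the next helper, btrfs_write_and_wait_marked_extents(), operates on.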
This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + while (start <= end) { + cond_resched(); + + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + + btree_lock_page_hook(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + if (PageWriteback(page)) { + if (PageDirty(page)) + wait_on_page_writeback(page); + else { + unlock_page(page); + page_cache_release(page); + continue; + } + } + err = write_one_page(page, 0); + if (err) + werr = err; + page_cache_release(page); + } + } + while (1) { + ret = find_first_extent_bit(dirty_pages, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + if (PageDirty(page)) { + btree_lock_page_hook(page); + wait_on_page_writeback(page); + err = write_one_page(page, 0); + if (err) + werr = err; + } + wait_on_page_writeback(page); + page_cache_release(page); + cond_resched(); + } + } + if (err) + werr = err; + return werr; +} + +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans || !trans->transaction) { + struct inode *btree_inode; + btree_inode = root->fs_info->btree_inode; + return filemap_write_and_wait(btree_inode->i_mapping); + } + return btrfs_write_and_wait_marked_extents(root, + &trans->transaction->dirty_pages); +} + +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. + * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. 
+ */ +static int update_cowonly_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + u64 old_root_bytenr; + struct btrfs_root *tree_root = root->fs_info->tree_root; + + btrfs_extent_post_op(trans, root); + btrfs_write_dirty_block_groups(trans, root); + btrfs_extent_post_op(trans, root); + + while (1) { + old_root_bytenr = btrfs_root_bytenr(&root->root_item); + if (old_root_bytenr == root->node->start) + break; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, trans->transid); + + btrfs_extent_post_op(trans, root); + + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + BUG_ON(ret); + btrfs_write_dirty_block_groups(trans, root); + btrfs_extent_post_op(trans, root); + } + return 0; +} + +/* + * update all the cowonly tree roots on disk + */ +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *next; + struct extent_buffer *eb; + + btrfs_extent_post_op(trans, fs_info->tree_root); + + eb = btrfs_lock_root_node(fs_info->tree_root); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + btrfs_extent_post_op(trans, fs_info->tree_root); + + while (!list_empty(&fs_info->dirty_cowonly_roots)) { + next = fs_info->dirty_cowonly_roots.next; + list_del_init(next); + root = list_entry(next, struct btrfs_root, dirty_list); + + update_cowonly_root(trans, root); + } + return 0; +} + +/* + * dead roots are old snapshots that need to be deleted. This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) +{ + struct btrfs_dirty_root *dirty; + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + if (!dirty) + return -ENOMEM; + dirty->root = root; + dirty->latest_root = latest; + + mutex_lock(&root->fs_info->trans_mutex); + list_add(&dirty->list, &latest->fs_info->dead_roots); + mutex_unlock(&root->fs_info->trans_mutex); + return 0; +} + +/* + * at transaction commit time we need to schedule the old roots for + * deletion via btrfs_drop_snapshot. 
This runs through all the + * reference counted roots that were modified in the current + * transaction and puts them into the drop list + */ +static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, + struct radix_tree_root *radix, + struct list_head *list) +{ + struct btrfs_dirty_root *dirty; + struct btrfs_root *gang[8]; + struct btrfs_root *root; + int i; + int ret; + int err = 0; + u32 refs; + + while (1) { + ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + root = gang[i]; + radix_tree_tag_clear(radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + BUG_ON(!root->ref_tree); + dirty = root->dirty_root; + + btrfs_free_log(trans, root); + btrfs_free_reloc_root(trans, root); + + if (root->commit_root == root->node) { + WARN_ON(root->node->start != + btrfs_root_bytenr(&root->root_item)); + + free_extent_buffer(root->commit_root); + root->commit_root = NULL; + root->dirty_root = NULL; + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + spin_unlock(&root->list_lock); + + kfree(dirty->root); + kfree(dirty); + + /* make sure to update the root on disk + * so we get any updates to the block used + * counts + */ + err = btrfs_update_root(trans, + root->fs_info->tree_root, + &root->root_key, + &root->root_item); + continue; + } + + memset(&root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root->root_item.drop_level = 0; + root->commit_root = NULL; + root->dirty_root = NULL; + root->root_key.offset = root->fs_info->generation; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, + root->root_key.offset); + + err = btrfs_insert_root(trans, root->fs_info->tree_root, + &root->root_key, + &root->root_item); + if (err) + break; + + refs = btrfs_root_refs(&dirty->root->root_item); + btrfs_set_root_refs(&dirty->root->root_item, refs - 1); + err = btrfs_update_root(trans, root->fs_info->tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + + BUG_ON(err); + if (refs == 1) { + list_add(&dirty->list, list); + } else { + WARN_ON(1); + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + } + } + } + return err; +} + +/* + * defrag a given btree. If cacheonly == 1, this won't read from the disk, + * otherwise every leaf in the btree is read and defragged. 
+ */ +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +{ + struct btrfs_fs_info *info = root->fs_info; + int ret; + struct btrfs_trans_handle *trans; + unsigned long nr; + + smp_mb(); + if (root->defrag_running) + return 0; + trans = btrfs_start_transaction(root, 1); + while (1) { + root->defrag_running = 1; + ret = btrfs_defrag_leaves(trans, root, cacheonly); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(info->tree_root, nr); + cond_resched(); + + trans = btrfs_start_transaction(root, 1); + if (root->fs_info->closing || ret != -EAGAIN) + break; + } + root->defrag_running = 0; + smp_mb(); + btrfs_end_transaction(trans, root); + return 0; +} + +/* + * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on + * all of them + */ +static noinline int drop_dirty_roots(struct btrfs_root *tree_root, + struct list_head *list) +{ + struct btrfs_dirty_root *dirty; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 num_bytes; + u64 bytes_used; + u64 max_useless; + int ret = 0; + int err; + + while (!list_empty(list)) { + struct btrfs_root *root; + + dirty = list_entry(list->prev, struct btrfs_dirty_root, list); + list_del_init(&dirty->list); + + num_bytes = btrfs_root_used(&dirty->root->root_item); + root = dirty->latest_root; + atomic_inc(&root->fs_info->throttles); + + while (1) { + trans = btrfs_start_transaction(tree_root, 1); + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, dirty->root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + err = btrfs_update_root(trans, + tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + if (err) + ret = err; + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); + } + BUG_ON(ret); + atomic_dec(&root->fs_info->throttles); + wake_up(&root->fs_info->transaction_throttle); + + num_bytes -= btrfs_root_used(&dirty->root->root_item); + bytes_used = btrfs_root_used(&root->root_item); + if (num_bytes) { + btrfs_record_root_in_trans(root); + btrfs_set_root_used(&root->root_item, + bytes_used - num_bytes); + } + + ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); + if (ret) { + BUG(); + break; + } + mutex_unlock(&root->fs_info->drop_mutex); + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + if (!list_empty(&root->dead_list)) { + struct btrfs_root *oldest; + oldest = list_entry(root->dead_list.prev, + struct btrfs_root, dead_list); + max_useless = oldest->root_key.offset - 1; + } else { + max_useless = root->root_key.offset - 1; + } + spin_unlock(&root->list_lock); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + ret = btrfs_remove_leaf_refs(root, max_useless, 0); + BUG_ON(ret); + + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); + } + return ret; +} + +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. 
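create_pending_snapshot() below is only the commit-time half of snapshot creation; the request is queued earlier by the snapshot ioctl, outside this file. A hedged sketch of that producer side, with the function name example_queue_snapshot and the calling convention assumed (the real ioctl path also copies the name and does its own error handling):

/*
 * queue a snapshot request so the next commit picks it up via
 * create_pending_snapshots(); 'name' must be a kmalloc'ed,
 * NUL-terminated copy, freed later by finish_pending_snapshots()
 */
static int example_queue_snapshot(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct dentry *dentry, char *name)
{
	struct btrfs_pending_snapshot *pending;

	pending = kzalloc(sizeof(*pending), GFP_NOFS);
	if (!pending)
		return -ENOMEM;

	pending->name = name;
	pending->dentry = dentry;
	pending->root = root;		/* the subvolume being snapshotted */
	list_add(&pending->list, &trans->transaction->pending_snapshots);
	/* nothing happens until this transaction commits */
	return 0;
}

create_pending_snapshot(), below, is the commit-time consumer of that list.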
This does the actual creation + */ +static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_key key; + struct btrfs_root_item *new_root_item; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root = pending->root; + struct extent_buffer *tmp; + struct extent_buffer *old; + int ret; + u64 objectid; + + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); + if (!new_root_item) { + ret = -ENOMEM; + goto fail; + } + ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); + if (ret) + goto fail; + + btrfs_record_root_in_trans(root); + btrfs_set_root_last_snapshot(&root->root_item, trans->transid); + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + + key.objectid = objectid; + key.offset = trans->transid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + old = btrfs_lock_root_node(root); + btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); + + btrfs_copy_root(trans, root, old, &tmp, objectid); + btrfs_tree_unlock(old); + free_extent_buffer(old); + + btrfs_set_root_bytenr(new_root_item, tmp->start); + btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); + btrfs_set_root_generation(new_root_item, trans->transid); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + new_root_item); + btrfs_tree_unlock(tmp); + free_extent_buffer(tmp); + if (ret) + goto fail; + + key.offset = (u64)-1; + memcpy(&pending->root_key, &key, sizeof(key)); +fail: + kfree(new_root_item); + return ret; +} + +static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + int ret; + int namelen; + u64 index = 0; + struct btrfs_trans_handle *trans; + struct inode *parent_inode; + struct inode *inode; + struct btrfs_root *parent_root; + + parent_inode = pending->dentry->d_parent->d_inode; + parent_root = BTRFS_I(parent_inode)->root; + trans = btrfs_join_transaction(parent_root, 1); + + /* + * insert the directory item + */ + namelen = strlen(pending->name); + ret = btrfs_set_inode_index(parent_inode, &index); + ret = btrfs_insert_dir_item(trans, parent_root, + pending->name, namelen, + parent_inode->i_ino, + &pending->root_key, BTRFS_FT_DIR, index); + + if (ret) + goto fail; + + btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + + /* add the backref first */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + pending->root_key.objectid, + BTRFS_ROOT_BACKREF_KEY, + parent_root->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + BUG_ON(ret); + + /* now add the forward ref */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + parent_root->root_key.objectid, + BTRFS_ROOT_REF_KEY, + pending->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + inode = btrfs_lookup_dentry(parent_inode, pending->dentry); + d_instantiate(pending->dentry, inode); +fail: + btrfs_end_transaction(trans, fs_info->fs_root); + return ret; +} + +/* + * create all the snapshots we've scheduled for creation + */ +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + struct list_head *cur; + int ret; + + list_for_each(cur, head) { + pending = list_entry(cur, 
struct btrfs_pending_snapshot, list); + ret = create_pending_snapshot(trans, fs_info, pending); + BUG_ON(ret); + } + return 0; +} + +static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + int ret; + + while (!list_empty(head)) { + pending = list_entry(head->next, + struct btrfs_pending_snapshot, list); + ret = finish_pending_snapshot(fs_info, pending); + BUG_ON(ret); + list_del(&pending->list); + kfree(pending->name); + kfree(pending); + } + return 0; +} + +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + unsigned long joined = 0; + unsigned long timeout = 1; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct list_head dirty_fs_roots; + struct extent_io_tree *pinned_copy; + DEFINE_WAIT(wait); + int ret; + + INIT_LIST_HEAD(&dirty_fs_roots); + mutex_lock(&root->fs_info->trans_mutex); + if (trans->transaction->in_commit) { + cur_trans = trans->transaction; + trans->transaction->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + btrfs_end_transaction(trans, root); + + ret = wait_for_commit(root, cur_trans); + BUG_ON(ret); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(cur_trans); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; + } + + pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); + if (!pinned_copy) + return -ENOMEM; + + extent_io_tree_init(pinned_copy, + root->fs_info->btree_inode->i_mapping, GFP_NOFS); + + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + cur_trans = trans->transaction; + if (cur_trans->list.prev != &root->fs_info->trans_list) { + prev_trans = list_entry(cur_trans->list.prev, + struct btrfs_transaction, list); + if (!prev_trans->commit_done) { + prev_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + wait_for_commit(root, prev_trans); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(prev_trans); + } + } + + do { + int snap_pending = 0; + joined = cur_trans->num_joined; + if (!list_empty(&trans->transaction->pending_snapshots)) + snap_pending = 1; + + WARN_ON(cur_trans != trans->transaction); + prepare_to_wait(&cur_trans->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (cur_trans->num_writers > 1) + timeout = MAX_SCHEDULE_TIMEOUT; + else + timeout = 1; + + mutex_unlock(&root->fs_info->trans_mutex); + + if (snap_pending) { + ret = btrfs_wait_ordered_extents(root, 1); + BUG_ON(ret); + } + + schedule_timeout(timeout); + + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&cur_trans->writer_wait, &wait); + } while (cur_trans->num_writers > 1 || + (cur_trans->num_joined != joined)); + + ret = create_pending_snapshots(trans, root->fs_info); + BUG_ON(ret); + + WARN_ON(cur_trans != trans->transaction); + + /* btrfs_commit_tree_roots is responsible for getting the + * various roots consistent with each other. Every pointer + * in the tree of tree roots has to point to the most up to date + * root for every subvolume and other tree. So, we have to keep + * the tree logging code from jumping in and changing any + * of the trees. + * + * At this point in the commit, there can't be any tree-log + * writers, but a little lower down we drop the trans mutex + * and let new people in. 
By holding the tree_log_mutex + * from now until after the super is written, we avoid races + * with the tree-log code. + */ + mutex_lock(&root->fs_info->tree_log_mutex); + /* + * keep tree reloc code from adding new reloc trees + */ + mutex_lock(&root->fs_info->tree_reloc_mutex); + + + ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, + &dirty_fs_roots); + BUG_ON(ret); + + /* add_dirty_roots gets rid of all the tree log roots, it is now + * safe to free the root of tree log roots + */ + btrfs_free_log_root_tree(trans, root->fs_info); + + ret = btrfs_commit_tree_roots(trans, root); + BUG_ON(ret); + + cur_trans = root->fs_info->running_transaction; + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->new_trans_lock); + btrfs_set_super_generation(&root->fs_info->super_copy, + cur_trans->transid); + btrfs_set_super_root(&root->fs_info->super_copy, + root->fs_info->tree_root->node->start); + btrfs_set_super_root_level(&root->fs_info->super_copy, + btrfs_header_level(root->fs_info->tree_root->node)); + + btrfs_set_super_chunk_root(&root->fs_info->super_copy, + chunk_root->node->start); + btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, + btrfs_header_level(chunk_root->node)); + btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, + btrfs_header_generation(chunk_root->node)); + + if (!root->fs_info->log_root_recovering) { + btrfs_set_super_log_root(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + } + + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + + btrfs_copy_pinned(root, pinned_copy); + + trans->transaction->blocked = 0; + wake_up(&root->fs_info->transaction_throttle); + wake_up(&root->fs_info->transaction_wait); + + mutex_unlock(&root->fs_info->trans_mutex); + ret = btrfs_write_and_wait_transaction(trans, root); + BUG_ON(ret); + write_ctree_super(trans, root, 0); + + /* + * the super is written, we can safely allow the tree-loggers + * to go about their business + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + + btrfs_finish_extent_commit(trans, root, pinned_copy); + kfree(pinned_copy); + + btrfs_drop_dead_reloc_roots(root); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + + /* do the directory inserts of any pending snapshot creations */ + finish_pending_snapshots(trans, root->fs_info); + + mutex_lock(&root->fs_info->trans_mutex); + + cur_trans->commit_done = 1; + root->fs_info->last_trans_committed = cur_trans->transid; + wake_up(&cur_trans->commit_wait); + + put_transaction(cur_trans); + put_transaction(cur_trans); + + list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); + if (root->fs_info->closing) + list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); + + mutex_unlock(&root->fs_info->trans_mutex); + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (root->fs_info->closing) + drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); + return ret; +} + +/* + * interface function to delete all the snapshots we have scheduled for deletion + */ +int btrfs_clean_old_snapshots(struct btrfs_root *root) +{ + struct list_head dirty_roots; + INIT_LIST_HEAD(&dirty_roots); +again: + mutex_lock(&root->fs_info->trans_mutex); + list_splice_init(&root->fs_info->dead_roots, &dirty_roots); + mutex_unlock(&root->fs_info->trans_mutex); + + if (!list_empty(&dirty_roots)) { + drop_dirty_roots(root, &dirty_roots); + goto again; + } + return 0; +} diff --git 
a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h new file mode 100644 index 00000000000..ea292117f88 --- /dev/null +++ b/fs/btrfs/transaction.h @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TRANSACTION__ +#define __BTRFS_TRANSACTION__ +#include "btrfs_inode.h" + +struct btrfs_transaction { + u64 transid; + unsigned long num_writers; + unsigned long num_joined; + int in_commit; + int use_count; + int commit_done; + int blocked; + struct list_head list; + struct extent_io_tree dirty_pages; + unsigned long start_time; + wait_queue_head_t writer_wait; + wait_queue_head_t commit_wait; + struct list_head pending_snapshots; +}; + +struct btrfs_trans_handle { + u64 transid; + unsigned long blocks_reserved; + unsigned long blocks_used; + struct btrfs_transaction *transaction; + u64 block_group; + u64 alloc_exclude_start; + u64 alloc_exclude_nr; +}; + +struct btrfs_pending_snapshot { + struct dentry *dentry; + struct btrfs_root *root; + char *name; + struct btrfs_key root_key; + struct list_head list; +}; + +struct btrfs_dirty_root { + struct list_head list; + struct btrfs_root *root; + struct btrfs_root *latest_root; +}; + +static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + trans->block_group = BTRFS_I(inode)->block_group; +} + +static inline void btrfs_update_inode_block_group( + struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->block_group = trans->block_group; +} + +static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->last_trans = trans->transaction->transid; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, + int num_blocks); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, + int num_blocks); +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_clean_old_snapshots(struct btrfs_root *root); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +void btrfs_throttle(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_root *root); +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); +#endif diff --git 
a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c new file mode 100644 index 00000000000..3e8358c3616 --- /dev/null +++ b/fs/btrfs/tree-defrag.c @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" + +/* defrag all the leaves in a given btree. If cache_only == 1, don't read + * things from disk, otherwise read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int orig_level; + int is_extent = 0; + int next_key_ret = 0; + u64 last_ret = 0; + u64 min_trans = 0; + + if (cache_only) + goto out; + + if (root->fs_info->extent_root == root) { + /* + * there's recursion here right now in the tree locking, + * we can't defrag the extent root without deadlock + */ + goto out; + } + + if (root->ref_cows == 0 && !is_extent) + goto out; + + if (btrfs_test_opt(root, SSD)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + orig_level = level; + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + if (cache_only) + min_trans = root->defrag_trans_start; + + ret = btrfs_search_forward(root, &key, NULL, path, + cache_only, min_trans); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + min_trans); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + cache_only, &last_ret, + &root->defrag_progress); + WARN_ON(ret && ret != -EAGAIN); + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } + + btrfs_release_path(root, path); + if (is_extent) + btrfs_extent_post_op(trans, root); +out: + if (path) + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + 
goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) { + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + root->defrag_trans_start = trans->transid; + } + return ret; +} diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 00000000000..d81cda2e077 --- /dev/null +++ b/fs/btrfs/tree-log.c @@ -0,0 +1,2898 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "compat.h" +#include "tree-log.h" + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +#define LOG_INODE_ALL 0 +#define LOG_INODE_EXISTS 1 + +/* + * stages for the tree walking. The first + * stage (0) is to only pin down the blocks we find + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. + * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +#define LOG_WALK_PIN_ONLY 0 +#define LOG_WALK_REPLAY_INODES 1 +#define LOG_WALK_REPLAY_ALL 2 + +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +static int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree an 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction. Those items are copied into + * a special tree (one per subvolume root), that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree. Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed. + * + * The log tree is read three times, once to pin down all the extents it is + * using in ram and once, once to create all the inodes logged in the tree + * and once to do all the other items. + */ + +/* + * btrfs_add_log_tree adds a new per-subvolume log tree into the + * tree of log tree roots. 
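Before the helper that builds the log root, a sketch of the basic bracket that the logging entry points later in this file put around copying one inode into its log tree; example_log_one_inode is hypothetical, but start_log_trans(), __btrfs_log_inode() and end_log_trans() are the helpers defined below:

/*
 * illustrative bracket around logging a single inode
 */
static int example_log_one_inode(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode)
{
	int ret;

	/* creates the log root on first use and takes a tree_log_writers ref */
	start_log_trans(trans, root);
	ret = __btrfs_log_inode(trans, root, inode, LOG_INODE_ALL);
	/* drops the writer ref and wakes anyone waiting to sync the log */
	end_log_trans(root);
	return ret;
}

btrfs_add_log_tree(), next, is what start_log_trans() uses to create that per-subvolume log root.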
This must be called with a tree log transaction + * running (see start_log_trans). + */ +static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + int ret; + u64 objectid = root->root_key.objectid; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + BTRFS_TREE_LOG_OBJECTID, + trans->transid, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + return ret; + } + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 0); + btrfs_set_root_used(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, 0); + + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, + &root_item); + if (ret) + goto fail; + + new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, + &key); + BUG_ON(!new_root); + + WARN_ON(root->log_root); + root->log_root = new_root; + + /* + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). + */ + new_root->ref_cows = 0; + new_root->last_trans = trans->transid; + + /* + * we need to make sure the root block for this new tree + * is marked as dirty in the dirty_log_pages tree. This + * is how it gets flushed down to disk at tree log commit time. 
+ * + * the tree logging mutex keeps others from coming in and changing + * the new_root->node, so we can safely access it here + */ + set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start, + new_root->node->start + new_root->node->len - 1, + GFP_NOFS); + +fail: + return ret; +} + +/* + * start a sub transaction and setup the log tree + * this increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); + BUG_ON(ret); + } + if (!root->log_root) { + ret = btrfs_add_log_tree(trans, root); + BUG_ON(ret); + } + atomic_inc(&root->fs_info->tree_log_writers); + root->fs_info->tree_log_batch++; + mutex_unlock(&root->fs_info->tree_log_mutex); + return 0; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + smp_mb(); + if (!root->log_root) + return -ENOENT; + + mutex_lock(&root->fs_info->tree_log_mutex); + if (root->log_root) { + ret = 0; + atomic_inc(&root->fs_info->tree_log_writers); + root->fs_info->tree_log_batch++; + } + mutex_unlock(&root->fs_info->tree_log_mutex); + return ret; +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +static int end_log_trans(struct btrfs_root *root) +{ + atomic_dec(&root->fs_info->tree_log_writers); + smp_mb(); + if (waitqueue_active(&root->fs_info->tree_log_wait)) + wake_up(&root->fs_info->tree_log_wait); + return 0; +} + + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree. The stage field tells us which part + * of the log tree processing we are currently doing. The others + * are state fields used for that specific part + */ +struct walk_control { + /* should we free the extent on disk when done? This is used + * at transaction commit time while freeing a log tree + */ + int free; + + /* should we write out the extent buffer? This is used + * while flushing the log tree to disk during a sync + */ + int write; + + /* should we wait for the extent buffer io to finish? Also used + * while flushing the log tree to disk for a sync + */ + int wait; + + /* pin only walk, we record which extents on disk belong to the + * log trees + */ + int pin; + + /* what stage of the replay code we're currently in */ + int stage; + + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + + /* the trans handle for the current replay */ + struct btrfs_trans_handle *trans; + + /* the function that gets used to process blocks we find in the + * tree. 
Note the extent_buffer might not be up to date when it is + * passed in, and it must be checked or read if you need the data + * inside it + */ + int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + if (wc->pin) { + mutex_lock(&log->fs_info->pinned_mutex); + btrfs_update_pinned_extents(log->fs_info->extent_root, + eb->start, eb->len, 1); + mutex_unlock(&log->fs_info->pinned_mutex); + } + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) + btrfs_write_tree_block(eb); + if (wc->wait) + btrfs_wait_tree_block_writeback(eb); + } + return 0; +} + +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static noinline int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size; + u64 saved_i_size = 0; + int save_old_i_size = 0; + unsigned long src_ptr; + unsigned long dst_ptr; + int overwrite_root = 0; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + + item_size = btrfs_item_size_nr(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* look for the key in the destination tree */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *src_copy; + char *dst_copy; + u32 dst_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; + + if (item_size == 0) { + btrfs_release_path(root, path); + return 0; + } + dst_copy = kmalloc(item_size, GFP_NOFS); + src_copy = kmalloc(item_size, GFP_NOFS); + + read_extent_buffer(eb, src_copy, src_ptr, item_size); + + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, + item_size); + ret = memcmp(dst_copy, src_copy, item_size); + + kfree(dst_copy); + kfree(src_copy); + /* + * they have the same contents, just return, this saves + * us from cowing blocks in the destination tree and doing + * extra writes that may not have been done by a previous + * sync + */ + if (ret == 0) { + btrfs_release_path(root, path); + return 0; + } + + } +insert: + btrfs_release_path(root, path); + /* try to insert the key into the destination tree */ + ret = btrfs_insert_empty_item(trans, root, path, + key, item_size); + + /* make sure any existing item is the correct size */ + if (ret == -EEXIST) { + u32 found_size; + found_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (found_size > item_size) { + btrfs_truncate_item(trans, root, path, item_size, 1); + } else if (found_size < item_size) { + ret = btrfs_extend_item(trans, root, path, + item_size - found_size); + BUG_ON(ret); + } + } else if (ret) { + BUG(); + } + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + + /* don't 
overwrite an existing inode if the generation number + * was logged as zero. This is done when the tree logging code + * is just logging an inode to make sure it exists after recovery. + * + * Also, don't overwrite i_size on directories during replay. + * log replay inserts and removes directory items based on the + * state of the tree found in the subvolume, and i_size is modified + * as it goes + */ + if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + struct btrfs_inode_item *src_item; + struct btrfs_inode_item *dst_item; + + src_item = (struct btrfs_inode_item *)src_ptr; + dst_item = (struct btrfs_inode_item *)dst_ptr; + + if (btrfs_inode_generation(eb, src_item) == 0) + goto no_copy; + + if (overwrite_root && + S_ISDIR(btrfs_inode_mode(eb, src_item)) && + S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + save_old_i_size = 1; + saved_i_size = btrfs_inode_size(path->nodes[0], + dst_item); + } + } + + copy_extent_buffer(path->nodes[0], eb, dst_ptr, + src_ptr, item_size); + + if (save_old_i_size) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + } + + /* make sure the generation is filled in */ + if (key->type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { + btrfs_set_inode_generation(path->nodes[0], dst_item, + trans->transid); + } + } +no_copy: + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(root, path); + return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be called for subvolume roots and not for the log + */ +static noinline struct inode *read_one_inode(struct btrfs_root *root, + u64 objectid) +{ + struct inode *inode; + inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = objectid; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + + } + if (is_bad_inode(inode)) { + iput(inode); + inode = NULL; + } + return inode; +} + +/* replays a single extent in 'eb' at 'slot' with 'key' into the + * subvolume 'root'. path is released on entry and should be released + * on exit. + * + * extents in the log tree have not been allocated out of the extent + * tree yet. So, this completes the allocation, taking a reference + * as required if the extent already exists or creating a new extent + * if it isn't in the extent allocation tree yet. + * + * The extent is inserted into the file, dropping any existing extents + * from the file that overlap the new one. 
+ */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int found_type; + u64 mask = root->sectorsize - 1; + u64 extent_end; + u64 alloc_hint; + u64 start = key->offset; + u64 saved_nbytes; + struct btrfs_file_extent_item *item; + struct inode *inode = NULL; + unsigned long size; + int ret = 0; + + item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) + extent_end = start + btrfs_file_extent_num_bytes(eb, item); + else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_inline_len(eb, item); + extent_end = (start + size + mask) & ~mask; + } else { + ret = 0; + goto out; + } + + inode = read_one_inode(root, key->objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + /* + * first check to see if we already have this extent in the + * file. This must be done before the btrfs_drop_extents run + * so we don't try to drop this extent. + */ + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + start, 0); + + if (ret == 0 && + (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct btrfs_file_extent_item cmp1; + struct btrfs_file_extent_item cmp2; + struct btrfs_file_extent_item *existing; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + existing = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + read_extent_buffer(eb, &cmp1, (unsigned long)item, + sizeof(cmp1)); + read_extent_buffer(leaf, &cmp2, (unsigned long)existing, + sizeof(cmp2)); + + /* + * we already have a pointer to this exact extent, + * we don't have to do anything + */ + if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + btrfs_release_path(root, path); + goto out; + } + } + btrfs_release_path(root, path); + + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, root, inode, + start, extent_end, start, &alloc_hint); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + unsigned long dest_offset; + struct btrfs_key ins; + + ret = btrfs_insert_empty_item(trans, root, path, key, + sizeof(*item)); + BUG_ON(ret); + dest_offset = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + copy_extent_buffer(path->nodes[0], eb, dest_offset, + (unsigned long)item, sizeof(*item)); + + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + + if (ins.objectid > 0) { + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + /* + * is this extent already allocated in the extent + * allocation tree? 
If so, just add a reference + */ + ret = btrfs_lookup_extent(root, ins.objectid, + ins.offset); + if (ret == 0) { + ret = btrfs_inc_extent_ref(trans, root, + ins.objectid, ins.offset, + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid); + } else { + /* + * insert the extent pointer in the extent + * allocation tree + */ + ret = btrfs_alloc_logged_extent(trans, root, + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid, + &ins); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + + btrfs_file_extent_num_bytes(eb, item); + } + + ret = btrfs_lookup_csums_range(root->log_root, + csum_start, csum_end - 1, + &ordered_sums); + BUG_ON(ret); + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, + root->fs_info->csum_root, + sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + } else { + btrfs_release_path(root, path); + } + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + } + + inode_set_bytes(inode, saved_nbytes); + btrfs_update_inode(trans, root, inode); +out: + if (inode) + iput(inode); + return ret; +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. 
+ * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *dir, + struct btrfs_dir_item *di) +{ + struct inode *inode; + char *name; + int name_len; + struct extent_buffer *leaf; + struct btrfs_key location; + int ret; + + leaf = path->nodes[0]; + + btrfs_dir_item_key_to_cpu(leaf, di, &location); + name_len = btrfs_dir_name_len(leaf, di); + name = kmalloc(name_len, GFP_NOFS); + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); + btrfs_release_path(root, path); + + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + BUG_ON(ret); + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + BUG_ON(ret); + kfree(name); + + iput(inode); + return ret; +} + +/* + * helper function to see if a given name and sequence number found + * in an inode back reference are already in a directory and correctly + * point to this inode + */ +static noinline int inode_in_dir(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 objectid, u64 index, + const char *name, int name_len) +{ + struct btrfs_dir_item *di; + struct btrfs_key location; + int match = 0; + + di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, + index, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + match = 1; +out: + btrfs_release_path(root, path); + return match; +} + +/* + * helper function to check a log tree for a named back reference in + * an inode. This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, + struct btrfs_key *key, + char *name, int namelen) +{ + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long ptr_end; + unsigned long name_ptr; + int found_name_len; + int item_size; + int ret; + int match = 0; + + path = btrfs_alloc_path(); + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); + if (ret != 0) + goto out; + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + ref = (struct btrfs_inode_ref *)ptr; + found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); + if (found_name_len == namelen) { + name_ptr = (unsigned long)(ref + 1); + ret = memcmp_extent_buffer(path->nodes[0], name, + name_ptr, namelen); + if (ret == 0) { + match = 1; + goto out; + } + } + ptr = (unsigned long)(ref + 1) + found_name_len; + } +out: + btrfs_free_path(path); + return match; +} + + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. 
+ * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct inode *dir; + int ret; + struct btrfs_key location; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *inode; + char *name; + int namelen; + unsigned long ref_ptr; + unsigned long ref_end; + + location.objectid = key->objectid; + location.type = BTRFS_INODE_ITEM_KEY; + location.offset = 0; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, key->offset); + if (!dir) + return -ENOENT; + + inode = read_one_inode(root, key->objectid); + BUG_ON(!dir); + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + +again: + ref = (struct btrfs_inode_ref *)ref_ptr; + + namelen = btrfs_inode_ref_name_len(eb, ref); + name = kmalloc(namelen, GFP_NOFS); + BUG_ON(!name); + + read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + + /* if we already have a perfect match, we're done */ + if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen)) { + goto out; + } + + /* + * look for a conflicting back reference in the metadata. + * if we find one we have to unlink that name of the file + * before we add our new link. Later on, we overwrite any + * existing back reference, and we don't want to create + * dangling pointers in the directory. + */ +conflict_again: + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *victim_name; + int victim_name_len; + struct btrfs_inode_ref *victim_ref; + unsigned long ptr; + unsigned long ptr_end; + struct extent_buffer *leaf = path->nodes[0]; + + /* are we trying to overwrite a back ref for the root directory + * if so, just jump out, we're done + */ + if (key->objectid == key->offset) + goto out_nowrite; + + /* check all the names in this back reference to see + * if they are in the log. 
if so, we allow them to stay + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + while (ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, + victim_ref); + victim_name = kmalloc(victim_name_len, GFP_NOFS); + BUG_ON(!victim_name); + + read_extent_buffer(leaf, victim_name, + (unsigned long)(victim_ref + 1), + victim_name_len); + + if (!backref_in_log(log, key, victim_name, + victim_name_len)) { + btrfs_inc_nlink(inode); + btrfs_release_path(root, path); + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); + kfree(victim_name); + btrfs_release_path(root, path); + goto conflict_again; + } + kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + } + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + + /* look for a conflicting name */ + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, + btrfs_inode_ref_index(eb, ref)); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + +out: + ref_ptr = (unsigned long)(ref + 1) + namelen; + kfree(name); + if (ref_ptr < ref_end) + goto again; + + /* finally write the back reference in the inode */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + +out_nowrite: + btrfs_release_path(root, path); + iput(dir); + iput(inode); + return 0; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. 
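+ *
+ * For illustration only (a rough sketch, not extra behaviour): the count
+ * is taken from the BTRFS_INODE_REF_KEY items alone, roughly
+ *
+ *	nlink = 0;
+ *	for each inode ref item of this inode
+ *		for each name packed into that item
+ *			nlink++;
+ *	if (nlink != inode->i_nlink)
+ *		write the corrected count back into the inode item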
+ */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + u64 nlink = 0; + unsigned long ptr; + unsigned long ptr_end; + int name_len; + + key.objectid = inode->i_ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != inode->i_ino || + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + name_len = btrfs_inode_ref_name_len(path->nodes[0], + ref); + ptr = (unsigned long)(ref + 1) + name_len; + nlink++; + } + + if (key.offset == 0) + break; + key.offset--; + btrfs_release_path(root, path); + } + btrfs_free_path(path); + if (nlink != inode->i_nlink) { + inode->i_nlink = nlink; + btrfs_update_inode(trans, root, inode); + } + BTRFS_I(inode)->index_cnt = (u64)-1; + + return 0; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct inode *inode; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + break; + + if (ret == 1) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_release_path(root, path); + inode = read_one_inode(root, key.offset); + BUG_ON(!inode); + + ret = fixup_inode_link_count(trans, root, inode); + BUG_ON(ret); + + iput(inode); + + if (key.offset == 0) + break; + key.offset--; + } + btrfs_release_path(root, path); + return 0; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done. The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_key key; + int ret = 0; + struct inode *inode; + + inode = read_one_inode(root, objectid); + BUG_ON(!inode); + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = objectid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_release_path(root, path); + if (ret == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(trans, root, inode); + } else if (ret == -EEXIST) { + ret = 0; + } else { + BUG(); + } + iput(inode); + + return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist. 
This means an fsync on a directory + * does not implicitly fsync all the new files in it + */ +static noinline int insert_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 index, + char *name, int name_len, u8 type, + struct btrfs_key *location) +{ + struct inode *inode; + struct inode *dir; + int ret; + + inode = read_one_inode(root, location->objectid); + if (!inode) + return -ENOENT; + + dir = read_one_inode(root, dirid); + if (!dir) { + iput(inode); + return -EIO; + } + ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); + + /* FIXME, put inode into FIXUP list */ + + iput(inode); + iput(dir); + return ret; +} + +/* + * take a single entry in a log directory item and replay it into + * the subvolume. + * + * if a conflicting item exists in the subdirectory already, + * the inode it points to is unlinked and put into the link count + * fix up tree. + * + * If a name from the log points to a file or directory that does + * not exist in the FS, it is skipped. fsyncs on directories + * do not force down inodes inside that directory, just changes to the + * names or unlinks in a directory. + */ +static noinline int replay_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, + struct btrfs_dir_item *di, + struct btrfs_key *key) +{ + char *name; + int name_len; + struct btrfs_dir_item *dst_di; + struct btrfs_key found_key; + struct btrfs_key log_key; + struct inode *dir; + u8 log_type; + int exists; + int ret; + + dir = read_one_inode(root, key->objectid); + BUG_ON(!dir); + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + log_type = btrfs_dir_type(eb, di); + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + + btrfs_dir_item_key_to_cpu(eb, di, &log_key); + exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); + if (exists == 0) + exists = 1; + else + exists = 0; + btrfs_release_path(root, path); + + if (key->type == BTRFS_DIR_ITEM_KEY) { + dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + } else if (key->type == BTRFS_DIR_INDEX_KEY) { + dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, + key->offset, name, + name_len, 1); + } else { + BUG(); + } + if (!dst_di || IS_ERR(dst_di)) { + /* we need a sequence number to insert, so we only + * do inserts for the BTRFS_DIR_INDEX_KEY types + */ + if (key->type != BTRFS_DIR_INDEX_KEY) + goto out; + goto insert; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* the existing item matches the logged item */ + if (found_key.objectid == log_key.objectid && + found_key.type == log_key.type && + found_key.offset == log_key.offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + goto out; + } + + /* + * don't drop the conflicting directory entry if the inode + * for the new entry doesn't exist + */ + if (!exists) + goto out; + + ret = drop_one_dir_item(trans, root, path, dir, dst_di); + BUG_ON(ret); + + if (key->type == BTRFS_DIR_INDEX_KEY) + goto insert; +out: + btrfs_release_path(root, path); + kfree(name); + iput(dir); + return 0; + +insert: + btrfs_release_path(root, path); + ret = insert_one_name(trans, root, path, key->objectid, key->offset, + name, name_len, log_type, &log_key); + + if (ret && ret != -ENOENT) + BUG(); + goto out; +} + +/* + * find all the names in a directory item and reconcile them into + * the subvolume. 
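+ * (Illustrative layout note: every name in the item is a struct
+ * btrfs_dir_item header followed by the name bytes, so the loop below
+ * advances with roughly
+ *	di = (struct btrfs_dir_item *)ptr;
+ *	ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(eb, di);
+ * until it reaches the end of the item.)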
Only BTRFS_DIR_ITEM_KEY types will have more than + * one name in a directory item, but the same code gets used for + * both directory index types + */ +static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size = btrfs_item_size_nr(eb, slot); + struct btrfs_dir_item *di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + name_len = btrfs_dir_name_len(eb, di); + ret = replay_one_name(trans, root, path, eb, di, key); + BUG_ON(ret); + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + return 0; +} + +/* + * directory replay has two parts. There are the standard directory + * items in the log copied from the subvolume, and range items + * created in the log while the subvolume was logged. + * + * The range items tell us which parts of the key space the log + * is authoritative for. During replay, if a key in the subvolume + * directory is in a logged range item, but not actually in the log + * that means it was deleted from the directory before the fsync + * and should be removed. + */ +static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, int key_type, + u64 *start_ret, u64 *end_ret) +{ + struct btrfs_key key; + u64 found_end; + struct btrfs_dir_log_item *item; + int ret; + int nritems; + + if (*start_ret == (u64)-1) + return 1; + + key.objectid = dirid; + key.type = key_type; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto next; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + + if (*start_ret >= key.offset && *start_ret <= found_end) { + ret = 0; + *start_ret = key.offset; + *end_ret = found_end; + goto out; + } + ret = 1; +next: + /* check the next slot in the tree to see if it is a valid item */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } else { + path->slots[0]++; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto out; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + *start_ret = key.offset; + *end_ret = found_end; + ret = 0; +out: + btrfs_release_path(root, path); + return ret; +} + +/* + * this looks for a given directory item in the log. 
If the directory + * item is not in the log, the item is removed and the inode it points + * to is unlinked + */ +static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct btrfs_path *log_path, + struct inode *dir, + struct btrfs_key *dir_key) +{ + int ret; + struct extent_buffer *eb; + int slot; + u32 item_size; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + char *name; + struct inode *inode; + struct btrfs_key location; + +again: + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + log_di = NULL; + if (dir_key->type == BTRFS_DIR_ITEM_KEY) { + log_di = btrfs_lookup_dir_item(trans, log, log_path, + dir_key->objectid, + name, name_len, 0); + } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { + log_di = btrfs_lookup_dir_index_item(trans, log, + log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + } + if (!log_di || IS_ERR(log_di)) { + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(root, path); + btrfs_release_path(log, log_path); + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + ret = link_to_fixup_dir(trans, root, + path, location.objectid); + BUG_ON(ret); + btrfs_inc_nlink(inode); + ret = btrfs_unlink_inode(trans, root, dir, inode, + name, name_len); + BUG_ON(ret); + kfree(name); + iput(inode); + + /* there might still be more names under this key + * check and repeat if required + */ + ret = btrfs_search_slot(NULL, root, dir_key, path, + 0, 0); + if (ret == 0) + goto again; + ret = 0; + goto out; + } + btrfs_release_path(log, log_path); + kfree(name); + + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + ret = 0; +out: + btrfs_release_path(root, path); + btrfs_release_path(log, log_path); + return ret; +} + +/* + * deletion replay happens before we copy any new directory items + * out of the log or out of backreferences from inodes. It + * scans the log to find ranges of keys that log is authoritative for, + * and then scans the directory to find items in those ranges that are + * not present in the log. + * + * Anything we don't find in the log is unlinked and removed from the + * directory. 
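+ *
+ * A rough sketch of the flow, for illustration only:
+ *
+ *	while (find_dir_range() hands back an authoritative [start, end])
+ *		for each dir item of this directory with an offset in range
+ *			check_item_in_log();	unlinks it if it is missing
+ *
+ * and the whole pass runs twice, once for the DIR_LOG_ITEM ranges and
+ * once for the DIR_LOG_INDEX ranges.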
+ */ +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid) +{ + u64 range_start; + u64 range_end; + int key_type = BTRFS_DIR_LOG_ITEM_KEY; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; + struct btrfs_path *log_path; + struct inode *dir; + + dir_key.objectid = dirid; + dir_key.type = BTRFS_DIR_ITEM_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + dir = read_one_inode(root, dirid); + /* it isn't an error if the inode isn't there, that can happen + * because we replay the deletes before we copy in the inode item + * from the log + */ + if (!dir) { + btrfs_free_path(log_path); + return 0; + } +again: + range_start = 0; + range_end = 0; + while (1) { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + + dir_key.offset = range_start; + while (1) { + int nritems; + ret = btrfs_search_slot(NULL, root, &dir_key, path, + 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || + found_key.type != dir_key.type) + goto next_type; + + if (found_key.offset > range_end) + break; + + ret = check_item_in_log(trans, root, log, path, + log_path, dir, &found_key); + BUG_ON(ret); + if (found_key.offset == (u64)-1) + break; + dir_key.offset = found_key.offset + 1; + } + btrfs_release_path(root, path); + if (range_end == (u64)-1) + break; + range_start = range_end + 1; + } + +next_type: + ret = 0; + if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { + key_type = BTRFS_DIR_LOG_INDEX_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; + btrfs_release_path(root, path); + goto again; + } +out: + btrfs_release_path(root, path); + btrfs_free_path(log_path); + iput(dir); + return ret; +} + +/* + * the process_func used to replay items from the log tree. This + * gets called in two different stages. The first stage just looks + * for inodes and makes sure they are all copied into the subvolume. + * + * The second stage copies all the other item types from the log into + * the subvolume. The two stage approach is slower, but gets rid of + * lots of complexity around inodes referencing other inodes that exist + * only in the log (references come from either directory items or inode + * back refs). 
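+ *
+ * As a rough illustration, the per-key dispatch below amounts to:
+ *
+ *	LOG_WALK_REPLAY_INODES: inode items only (plus dir delete replay
+ *				and EOF truncation for the inodes found)
+ *	LOG_WALK_REPLAY_ALL:	xattrs, inode refs, file extents and
+ *				directory items as well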
+ */ +static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + int nritems; + struct btrfs_path *path; + struct btrfs_root *root = wc->replay_dest; + struct btrfs_key key; + u32 item_size; + int level; + int i; + int ret; + + btrfs_read_buffer(eb, gen); + + level = btrfs_header_level(eb); + + if (level != 0) + return 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + nritems = btrfs_header_nritems(eb); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + item_size = btrfs_item_size_nr(eb, i); + + /* inode keys are done during the first stage */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + struct inode *inode; + struct btrfs_inode_item *inode_item; + u32 mode; + + inode_item = btrfs_item_ptr(eb, i, + struct btrfs_inode_item); + mode = btrfs_inode_mode(eb, inode_item); + if (S_ISDIR(mode)) { + ret = replay_dir_deletes(wc->trans, + root, log, path, key.objectid); + BUG_ON(ret); + } + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + + /* for regular files, truncate away + * extents past the new EOF + */ + if (S_ISREG(mode)) { + inode = read_one_inode(root, + key.objectid); + BUG_ON(!inode); + + ret = btrfs_truncate_inode_items(wc->trans, + root, inode, inode->i_size, + BTRFS_EXTENT_DATA_KEY); + BUG_ON(ret); + iput(inode); + } + ret = link_to_fixup_dir(wc->trans, root, + path, key.objectid); + BUG_ON(ret); + } + if (wc->stage < LOG_WALK_REPLAY_ALL) + continue; + + /* these keys are simply copied */ + if (key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_INODE_REF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + BUG_ON(ret && ret != -ENOENT); + } else if (key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_DIR_ITEM_KEY || + key.type == BTRFS_DIR_INDEX_KEY) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } + } + btrfs_free_path(path); + return 0; +} + +static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 root_gen; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u32 blocksize; + int ret = 0; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + while (*level > 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + + wc->process_func(root, next, wc, ptr_gen); + + if (*level == 1) { + path->slots[*level]++; + if (wc->free) { + btrfs_read_buffer(next, ptr_gen); + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + ret = btrfs_drop_leaf_ref(trans, root, next); + 
BUG_ON(ret); + + WARN_ON(root_owner != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(next); + continue; + } + btrfs_read_buffer(next, ptr_gen); + + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + bytenr = path->nodes[*level]->start; + + blocksize = btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + + if (wc->free) { + next = path->nodes[*level]; + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, next); + BUG_ON(ret); + } + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + + cond_resched(); + return 0; +} + +static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 root_gen; + int i; + int slot; + int ret; + + for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + node = path->nodes[i]; + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[*level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, + next); + BUG_ON(ret); + } + + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + path->nodes[*level]->start, + path->nodes[*level]->len); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. 
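+ *
+ * Illustrative usage (see btrfs_free_log() below for a real caller):
+ *
+ *	struct walk_control wc = {
+ *		.free = 1,
+ *		.process_func = process_one_buffer
+ *	};
+ *	ret = walk_log_tree(trans, log, &wc);
+ *
+ * where .free asks the walk to release each log block after processing.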
+ */ +static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + level = btrfs_header_level(log->node); + orig_level = level; + path->nodes[level] = log->node; + extent_buffer_get(log->node); + path->slots[level] = 0; + + while (1) { + wret = walk_down_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + } + + /* was the root node processed? if not, catch it here */ + if (path->nodes[orig_level]) { + wc->process_func(log, path->nodes[orig_level], wc, + btrfs_header_generation(path->nodes[orig_level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[orig_level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, log, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (orig_level == 0) { + ret = btrfs_drop_leaf_ref(trans, log, + next); + BUG_ON(ret); + } + WARN_ON(log->root_key.objectid != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(log, next->start, + next->len); + BUG_ON(ret); + } + } + + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } + btrfs_free_path(path); + if (wc->free) + free_extent_buffer(log->node); + return ret; +} + +static int wait_log_commit(struct btrfs_root *log) +{ + DEFINE_WAIT(wait); + u64 transid = log->fs_info->tree_log_transid; + + do { + prepare_to_wait(&log->fs_info->tree_log_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_commit)) + schedule(); + finish_wait(&log->fs_info->tree_log_wait, &wait); + mutex_lock(&log->fs_info->tree_log_mutex); + } while (transid == log->fs_info->tree_log_transid && + atomic_read(&log->fs_info->tree_log_commit)); + return 0; +} + +/* + * btrfs_sync_log does sends a given tree log down to the disk and + * updates the super blocks to record it. 
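+ * A typical fsync path, sketched only for illustration, looks like
+ *
+ *	ret = btrfs_log_dentry_safe(trans, root, dentry);
+ *	if (ret == 0)
+ *		btrfs_sync_log(trans, root);
+ *	else
+ *		btrfs_commit_transaction(trans, root);
+ *
+ * with btrfs_sync_log() being the call that pushes the log tree out.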
When this call is done, + * you know that any inodes previously logged are safely on disk + */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + unsigned long batch; + struct btrfs_root *log = root->log_root; + + mutex_lock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_commit)) { + wait_log_commit(log); + goto out; + } + atomic_set(&log->fs_info->tree_log_commit, 1); + + while (1) { + batch = log->fs_info->tree_log_batch; + mutex_unlock(&log->fs_info->tree_log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&log->fs_info->tree_log_mutex); + + while (atomic_read(&log->fs_info->tree_log_writers)) { + DEFINE_WAIT(wait); + prepare_to_wait(&log->fs_info->tree_log_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_writers)) + schedule(); + mutex_lock(&log->fs_info->tree_log_mutex); + finish_wait(&log->fs_info->tree_log_wait, &wait); + } + if (batch == log->fs_info->tree_log_batch) + break; + } + + ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); + BUG_ON(ret); + ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, + &root->fs_info->log_root_tree->dirty_log_pages); + BUG_ON(ret); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log->fs_info->log_root_tree->node->start); + btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_header_level(log->fs_info->log_root_tree->node)); + + write_ctree_super(trans, log->fs_info->tree_root, 2); + log->fs_info->tree_log_transid++; + log->fs_info->tree_log_batch = 0; + atomic_set(&log->fs_info->tree_log_commit, 0); + smp_mb(); + if (waitqueue_active(&log->fs_info->tree_log_wait)) + wake_up(&log->fs_info->tree_log_wait); +out: + mutex_unlock(&log->fs_info->tree_log_mutex); + return 0; +} + +/* * free all the extents used by the tree log. 
This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+	int ret;
+	struct btrfs_root *log;
+	u64 start;
+	u64 end;
+	struct walk_control wc = {
+		.free = 1,
+		.process_func = process_one_buffer
+	};
+
+	if (!root->log_root || root->fs_info->log_root_recovering)
+		return 0;
+
+	log = root->log_root;
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	while (1) {
+		ret = find_first_extent_bit(&log->dirty_log_pages,
+				    0, &start, &end, EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		clear_extent_dirty(&log->dirty_log_pages,
+				   start, end, GFP_NOFS);
+	}
+
+	log = root->log_root;
+	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+			     &log->root_key);
+	BUG_ON(ret);
+	root->log_root = NULL;
+	kfree(log);
+	return 0;
+}
+
+/*
+ * helper function to update the item for a given subvolume's log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *log)
+{
+	u64 bytenr = btrfs_root_bytenr(&log->root_item);
+	int ret;
+
+	if (log->node->start == bytenr)
+		return 0;
+
+	btrfs_set_root_bytenr(&log->root_item, log->node->start);
+	btrfs_set_root_generation(&log->root_item, trans->transid);
+	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
+	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	BUG_ON(ret);
+	return ret;
+}
+
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist.  But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked.  Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimization allows us to avoid relogging the entire inode
+ * or the entire directory.
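+ *
+ * For illustration, the unlink path is expected to call both helpers:
+ *
+ *	btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
+ *	btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir->i_ino);
+ *
+ * so that neither the directory entry nor the inode back reference for
+ * the removed name survives in the log.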
+ */ +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index) +{ + struct btrfs_root *log; + struct btrfs_dir_item *di; + struct btrfs_path *path; + int ret; + int bytes_del = 0; + + if (BTRFS_I(dir)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + + mutex_lock(&BTRFS_I(dir)->log_mutex); + + log = root->log_root; + path = btrfs_alloc_path(); + di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, + name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + btrfs_release_path(log, path); + di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, + index, name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + + /* update the directory size in the log to reflect the names + * we have removed + */ + if (bytes_del) { + struct btrfs_key key; + + key.objectid = dir->i_ino; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; + btrfs_release_path(log, path); + + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret == 0) { + struct btrfs_inode_item *item; + u64 i_size; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + i_size = btrfs_inode_size(path->nodes[0], item); + if (i_size > bytes_del) + i_size -= bytes_del; + else + i_size = 0; + btrfs_set_inode_size(path->nodes[0], item, i_size); + btrfs_mark_buffer_dirty(path->nodes[0]); + } else + ret = 0; + btrfs_release_path(log, path); + } + + btrfs_free_path(path); + mutex_unlock(&BTRFS_I(dir)->log_mutex); + end_log_trans(root); + + return 0; +} + +/* see comments for btrfs_del_dir_entries_in_log */ +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid) +{ + struct btrfs_root *log; + u64 index; + int ret; + + if (BTRFS_I(inode)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + log = root->log_root; + mutex_lock(&BTRFS_I(inode)->log_mutex); + + ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, + dirid, &index); + mutex_unlock(&BTRFS_I(inode)->log_mutex); + end_log_trans(root); + + return ret; +} + +/* + * creates a range item in the log for 'dirid'. first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. + */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + int key_type, u64 dirid, + u64 first_offset, u64 last_offset) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_log_item *item; + + key.objectid = dirid; + key.offset = first_offset; + if (key_type == BTRFS_DIR_ITEM_KEY) + key.type = BTRFS_DIR_LOG_ITEM_KEY; + else + key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + BUG_ON(ret); + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + btrfs_set_dir_log_end(path->nodes[0], item, last_offset); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(log, path); + return 0; +} + +/* + * log all the items included in the current transaction for a given + * directory. 
This also creates the range items in the log tree required + * to replay anything deleted before the fsync + */ +static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, int key_type, + u64 min_offset, u64 *last_offset_ret) +{ + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src; + int ret; + int i; + int nritems; + u64 first_offset = min_offset; + u64 last_offset = (u64)-1; + + log = root->log_root; + max_key.objectid = inode->i_ino; + max_key.offset = (u64)-1; + max_key.type = key_type; + + min_key.objectid = inode->i_ino; + min_key.type = key_type; + min_key.offset = min_offset; + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + + /* + * we didn't find anything from this transaction, see if there + * is anything at all + */ + if (ret != 0 || min_key.objectid != inode->i_ino || + min_key.type != key_type) { + min_key.objectid = inode->i_ino; + min_key.type = key_type; + min_key.offset = (u64)-1; + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret < 0) { + btrfs_release_path(root, path); + return ret; + } + ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. + * otherwise, there are no items in this directory after + * *min_offset, and we create a range to indicate that. + */ + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); + if (key_type == tmp.type) + first_offset = max(min_offset, tmp.offset) + 1; + } + goto done; + } + + /* go backward to find any previous key */ + ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (key_type == tmp.type) { + first_offset = tmp.offset; + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + } + } + btrfs_release_path(root, path); + + /* find the first key from this transaction again */ + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret != 0) { + WARN_ON(1); + goto done; + } + + /* + * we have a block from this transaction, log every item in it + * from our directory + */ + while (1) { + struct btrfs_key tmp; + src = path->nodes[0]; + nritems = btrfs_header_nritems(src); + for (i = path->slots[0]; i < nritems; i++) { + btrfs_item_key_to_cpu(src, &min_key, i); + + if (min_key.objectid != inode->i_ino || + min_key.type != key_type) + goto done; + ret = overwrite_item(trans, log, dst_path, src, i, + &min_key); + BUG_ON(ret); + } + path->slots[0] = nritems; + + /* + * look ahead to the next item and see if it is also + * from this directory and from this transaction + */ + ret = btrfs_next_leaf(root, path); + if (ret == 1) { + last_offset = (u64)-1; + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (tmp.objectid != inode->i_ino || tmp.type != key_type) { + last_offset = (u64)-1; + goto done; + } + if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + + BUG_ON(ret); + last_offset = tmp.offset; + goto done; + } + } +done: + *last_offset_ret = last_offset; + 
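+	/*
+	 * note: the range item written below is what deletion replay
+	 * later uses to decide that a name missing from the log inside
+	 * [first_offset, last_offset] was really deleted
+	 */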
btrfs_release_path(root, path); + btrfs_release_path(log, dst_path); + + /* insert the log range keys to indicate where the log is valid */ + ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, + first_offset, last_offset); + BUG_ON(ret); + return 0; +} + +/* + * logging directories is very similar to logging inodes, We find all the items + * from the current transaction and write them to the log. + * + * The recovery code scans the directory in the subvolume, and if it finds a + * key in the range logged that is not present in the log tree, then it means + * that dir entry was unlinked during the transaction. + * + * In order for that scan to work, we must include one key smaller than + * the smallest logged by this transaction and one key larger than the largest + * key logged by this transaction. + */ +static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + u64 min_key; + u64 max_key; + int ret; + int key_type = BTRFS_DIR_ITEM_KEY; + +again: + min_key = 0; + max_key = 0; + while (1) { + ret = log_dir_items(trans, root, inode, path, + dst_path, key_type, min_key, + &max_key); + BUG_ON(ret); + if (max_key == (u64)-1) + break; + min_key = max_key + 1; + } + + if (key_type == BTRFS_DIR_ITEM_KEY) { + key_type = BTRFS_DIR_INDEX_KEY; + goto again; + } + return 0; +} + +/* + * a helper function to drop items from the log before we relog an + * inode. max_key_type indicates the highest item type to remove. + * This cannot be run for file data extents because it does not + * free the extents they point to. + */ +static int drop_objectid_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 objectid, int max_key_type) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + + key.objectid = objectid; + key.type = max_key_type; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + + if (ret != 1) + break; + + if (path->slots[0] == 0) + break; + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + if (found_key.objectid != objectid) + break; + + ret = btrfs_del_item(trans, log, path); + BUG_ON(ret); + btrfs_release_path(log, path); + } + btrfs_release_path(log, path); + return 0; +} + +static noinline int copy_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *dst_path, + struct extent_buffer *src, + int start_slot, int nr, int inode_only) +{ + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_file_extent_item *extent; + struct btrfs_inode_item *inode_item; + int ret; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + char *ins_data; + int i; + struct list_head ordered_sums; + + INIT_LIST_HEAD(&ordered_sums); + + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + + nr * sizeof(u32), GFP_NOFS); + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + + for (i = 0; i < nr; i++) { + ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + } + ret = btrfs_insert_empty_items(trans, log, dst_path, + ins_keys, ins_sizes, nr); + BUG_ON(ret); + + for (i = 0; i < nr; i++) { + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], + dst_path->slots[0]); + + src_offset = btrfs_item_ptr_offset(src, start_slot + i); + + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 
+ src_offset, ins_sizes[i]); + + if (inode_only == LOG_INODE_EXISTS && + ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(dst_path->nodes[0], + dst_path->slots[0], + struct btrfs_inode_item); + btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); + + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_inode_generation(dst_path->nodes[0], + inode_item, 0); + } + /* take a reference on file data extents so that truncates + * or deletes of this inode don't have to relog the inode + * again + */ + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + int found_type; + extent = btrfs_item_ptr(src, start_slot + i, + struct btrfs_file_extent_item); + + found_type = btrfs_file_extent_type(src, extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 ds = btrfs_file_extent_disk_bytenr(src, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(src, + extent); + u64 cs = btrfs_file_extent_offset(src, extent); + u64 cl = btrfs_file_extent_num_bytes(src, + extent);; + if (btrfs_file_extent_compression(src, + extent)) { + cs = 0; + cl = dl; + } + /* ds == 0 is a hole */ + if (ds != 0) { + ret = btrfs_inc_extent_ref(trans, log, + ds, dl, + dst_path->nodes[0]->start, + BTRFS_TREE_LOG_OBJECTID, + trans->transid, + ins_keys[i].objectid); + BUG_ON(ret); + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); + } + } + } + dst_path->slots[0]++; + } + + btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_release_path(log, dst_path); + kfree(ins_data); + + /* + * we have to do this after the loop above to avoid changing the + * log tree while trying to change the log tree. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, log, sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + return 0; +} + +/* log a single inode in the tree log. + * At least one parent directory for this inode must exist in the tree + * or be logged already. + * + * Any items from this inode changed by the current transaction are copied + * to the log tree. An extra reference is taken on any extents in this + * file, allowing us to avoid a whole pile of corner cases around logging + * blocks that have been removed from the tree. + * + * See LOG_INODE_ALL and related defines for a description of what inode_only + * does. + * + * This handles both files and directories. 
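+ *
+ * Rough illustration of the two modes: LOG_INODE_ALL copies every item
+ * of the inode changed in this transaction (and, for directories, the
+ * directory items as well), while LOG_INODE_EXISTS copies just enough,
+ * with the inode generation forced to 0 in copy_items(), for replay to
+ * learn that the inode exists.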
+ */ +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + struct btrfs_path *path; + struct btrfs_path *dst_path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src = NULL; + u32 size; + int ret; + int nritems; + int ins_start_slot = 0; + int ins_nr; + + log = root->log_root; + + path = btrfs_alloc_path(); + dst_path = btrfs_alloc_path(); + + min_key.objectid = inode->i_ino; + min_key.type = BTRFS_INODE_ITEM_KEY; + min_key.offset = 0; + + max_key.objectid = inode->i_ino; + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + /* + * if this inode has already been logged and we're in inode_only + * mode, we don't want to delete the things that have already + * been written to the log. + * + * But, if the inode has been through an inode_only log, + * the logged_trans field is not set. This allows us to catch + * any new names for this inode in the backrefs by logging it + * again + */ + if (inode_only == LOG_INODE_EXISTS && + BTRFS_I(inode)->logged_trans == trans->transid) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + goto out; + } + mutex_lock(&BTRFS_I(inode)->log_mutex); + + /* + * a brute force approach to making sure we get the most uptodate + * copies of everything. + */ + if (S_ISDIR(inode->i_mode)) { + int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, + inode->i_ino, max_key_type); + } else { + ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + } + BUG_ON(ret); + path->keep_locks = 1; + + while (1) { + ins_nr = 0; + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + if (ret != 0) + break; +again: + /* note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key.objectid != inode->i_ino) + break; + if (min_key.type > max_key.type) + break; + + src = path->nodes[0]; + size = btrfs_item_size_nr(src, path->slots[0]); + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, log, dst_path, src, ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 0; + } + btrfs_release_path(root, path); + + if (min_key.offset < (u64)-1) + min_key.offset++; + else if (min_key.type < (u8)-1) + min_key.type++; + else if (min_key.objectid < (u64)-1) + min_key.objectid++; + else + break; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 0; + } + WARN_ON(ins_nr); + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { + btrfs_release_path(root, path); + btrfs_release_path(log, dst_path); + BTRFS_I(inode)->log_dirty_trans = 0; + ret = log_directory_changes(trans, root, inode, path, dst_path); + BUG_ON(ret); + } + BTRFS_I(inode)->logged_trans = 
trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + btrfs_free_path(path); + btrfs_free_path(dst_path); + + mutex_lock(&root->fs_info->tree_log_mutex); + ret = update_log_root(trans, log); + BUG_ON(ret); + mutex_unlock(&root->fs_info->tree_log_mutex); +out: + return 0; +} + +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + int ret; + + start_log_trans(trans, root); + ret = __btrfs_log_inode(trans, root, inode, inode_only); + end_log_trans(root); + return ret; +} + +/* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */ +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + int inode_only = LOG_INODE_ALL; + struct super_block *sb; + int ret; + + start_log_trans(trans, root); + sb = dentry->d_inode->i_sb; + while (1) { + ret = __btrfs_log_inode(trans, root, dentry->d_inode, + inode_only); + BUG_ON(ret); + inode_only = LOG_INODE_EXISTS; + + dentry = dentry->d_parent; + if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) + break; + + if (BTRFS_I(dentry->d_inode)->generation <= + root->fs_info->last_trans_committed) + break; + } + end_log_trans(root); + return 0; +} + +/* + * it is not safe to log dentry if the chunk root has added new + * chunks. This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk. + */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + u64 gen; + gen = root->fs_info->last_trans_new_blockgroup; + if (gen > root->fs_info->last_trans_committed) + return 1; + else + return btrfs_log_dentry(trans, root, dentry); +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ + int ret; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; + u64 highest_inode; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, + }; + + fs_info->log_root_recovering = 1; + path = btrfs_alloc_path(); + BUG_ON(!path); + + trans = btrfs_start_transaction(fs_info->tree_root, 1); + + wc.trans = trans; + wc.pin = 1; + + walk_log_tree(trans, log_root_tree, &wc); + +again: + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = (u64)-1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + while (1) { + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + btrfs_release_path(log_root_tree, path); + if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) + break; + + log = btrfs_read_fs_root_no_radix(log_root_tree, + &found_key); + BUG_ON(!log); + + + tmp_key.objectid = found_key.offset; + tmp_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_key.offset = (u64)-1; + + wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + BUG_ON(!wc.replay_dest); + + wc.replay_dest->log_root = log; + 
btrfs_record_root_in_trans(wc.replay_dest); + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + if (wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, + path); + BUG_ON(ret); + } + ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); + if (ret == 0) { + wc.replay_dest->highest_inode = highest_inode; + wc.replay_dest->last_inode_alloc = highest_inode; + } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; + free_extent_buffer(log->node); + kfree(log); + + if (found_key.offset == 0) + break; + } + btrfs_release_path(log_root_tree, path); + + /* step one is to pin it all, step two is to replay just inodes */ + if (wc.pin) { + wc.pin = 0; + wc.process_func = replay_one_buffer; + wc.stage = LOG_WALK_REPLAY_INODES; + goto again; + } + /* step three is to replay everything */ + if (wc.stage < LOG_WALK_REPLAY_ALL) { + wc.stage++; + goto again; + } + + btrfs_free_path(path); + + free_extent_buffer(log_root_tree->node); + log_root_tree->log_root = NULL; + fs_info->log_root_recovering = 0; + + /* step 4: commit the transaction, which also unpins the blocks */ + btrfs_commit_transaction(trans, fs_info->tree_root); + + kfree(log_root_tree); + return 0; +} diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 00000000000..b9409b32ed0 --- /dev/null +++ b/fs/btrfs/tree-log.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __TREE_LOG_ +#define __TREE_LOG_ + +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index); +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid); +#endif diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h new file mode 100644 index 00000000000..9bf3946d5ef --- /dev/null +++ b/fs/btrfs/version.h @@ -0,0 +1,4 @@ +#ifndef __BTRFS_VERSION_H +#define __BTRFS_VERSION_H +#define BTRFS_BUILD_VERSION "Btrfs" +#endif diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh new file mode 100644 index 00000000000..1ca1952fd91 --- /dev/null +++ b/fs/btrfs/version.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# determine-version -- report a useful version for releases +# +# Copyright 2008, Aron Griffis <agriffis@n01se.net> +# Copyright 2008, Oracle +# Released under the GNU GPLv2 + +v="v0.16" + +which git &> /dev/null +if [ $? == 0 ]; then + git branch >& /dev/null + if [ $? == 0 ]; then + if head=`git rev-parse --verify HEAD 2>/dev/null`; then + if tag=`git describe --tags 2>/dev/null`; then + v="$tag" + fi + + # Are there uncommitted changes? + git update-index --refresh --unmerged > /dev/null + if git diff-index --name-only HEAD | \ + grep -v "^scripts/package" \ + | read dummy; then + v="$v"-dirty + fi + fi + fi +fi + +echo "#ifndef __BUILD_VERSION" > .build-version.h +echo "#define __BUILD_VERSION" >> .build-version.h +echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h +echo "#endif" >> .build-version.h + +diff -q version.h .build-version.h >& /dev/null + +if [ $? == 0 ]; then + rm .build-version.h + exit 0 +fi + +mv .build-version.h version.h diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c new file mode 100644 index 00000000000..b187b537888 --- /dev/null +++ b/fs/btrfs/volumes.c @@ -0,0 +1,3218 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#include <linux/sched.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/version.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +static int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +static int btrfs_relocate_sys_chunks(struct btrfs_root *root); + +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +static DEFINE_MUTEX(uuid_mutex); +static LIST_HEAD(fs_uuids); + +void btrfs_lock_volumes(void) +{ + mutex_lock(&uuid_mutex); +} + +void btrfs_unlock_volumes(void) +{ + mutex_unlock(&uuid_mutex); +} + +static void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + WARN_ON(fs_devices->opened); + while (!list_empty(&fs_devices->devices)) { + device = list_entry(fs_devices->devices.next, + struct btrfs_device, dev_list); + list_del(&device->dev_list); + kfree(device->name); + kfree(device); + } + kfree(fs_devices); +} + +int btrfs_cleanup_fs_uuids(void) +{ + struct btrfs_fs_devices *fs_devices; + + while (!list_empty(&fs_uuids)) { + fs_devices = list_entry(fs_uuids.next, + struct btrfs_fs_devices, list); + list_del(&fs_devices->list); + free_fs_devices(fs_devices); + } + return 0; +} + +static noinline struct btrfs_device *__find_device(struct list_head *head, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + struct list_head *cur; + + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (dev->devid == devid && + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { + return dev; + } + } + return NULL; +} + +static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +{ + struct list_head *cur; + struct btrfs_fs_devices *fs_devices; + + list_for_each(cur, &fs_uuids) { + fs_devices = list_entry(cur, struct btrfs_fs_devices, list); + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + return NULL; +} + +/* + * we try to collect pending bios for a device so we don't get a large + * number of procs sending bios down to the same device. This greatly + * improves the schedulers ability to collect and merge the bios. + * + * But, it also turns into a long list of bios to process and that is sure + * to eventually make the worker thread block. The solution here is to + * make some progress and then put this work struct back at the end of + * the list if the block device is congested. This way, multiple devices + * can make progress from a single worker thread. 
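+ *
+ * Roughly, for illustration: the worker takes the whole pending list
+ * under device->io_lock, submits the bios one at a time, and when the
+ * backing device reports congestion it splices whatever is left back
+ * onto the device and btrfs_requeue_work()s itself so another pass can
+ * finish the job.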
+ */ +static noinline int run_scheduled_bios(struct btrfs_device *device) +{ + struct bio *pending; + struct backing_dev_info *bdi; + struct btrfs_fs_info *fs_info; + struct bio *tail; + struct bio *cur; + int again = 0; + unsigned long num_run = 0; + unsigned long limit; + + bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; + fs_info = device->dev_root->fs_info; + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + +loop: + spin_lock(&device->io_lock); + + /* take all the bios off the list at once and process them + * later on (without the lock held). But, remember the + * tail and other pointers so the bios can be properly reinserted + * into the list if we hit congestion + */ + pending = device->pending_bios; + tail = device->pending_bio_tail; + WARN_ON(pending && !tail); + device->pending_bios = NULL; + device->pending_bio_tail = NULL; + + /* + * if pending was null this time around, no bios need processing + * at all and we can stop. Otherwise it'll loop back up again + * and do an additional check so no bios are missed. + * + * device->running_pending is used to synchronize with the + * schedule_bio code. + */ + if (pending) { + again = 1; + device->running_pending = 1; + } else { + again = 0; + device->running_pending = 0; + } + spin_unlock(&device->io_lock); + + while (pending) { + cur = pending; + pending = pending->bi_next; + cur->bi_next = NULL; + atomic_dec(&fs_info->nr_async_bios); + + if (atomic_read(&fs_info->nr_async_bios) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + BUG_ON(atomic_read(&cur->bi_cnt) == 0); + bio_get(cur); + submit_bio(cur->bi_rw, cur); + bio_put(cur); + num_run++; + + /* + * we made progress, there is more work to do and the bdi + * is now congested. 
Back off and let other work structs + * run instead + */ + if (pending && bdi_write_congested(bdi) && + fs_info->fs_devices->open_devices > 1) { + struct bio *old_head; + + spin_lock(&device->io_lock); + + old_head = device->pending_bios; + device->pending_bios = pending; + if (device->pending_bio_tail) + tail->bi_next = old_head; + else + device->pending_bio_tail = tail; + + spin_unlock(&device->io_lock); + btrfs_requeue_work(&device->work); + goto done; + } + } + if (again) + goto loop; +done: + return 0; +} + +static void pending_bios_fn(struct btrfs_work *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, work); + run_scheduled_bios(device); +} + +static noinline int device_list_add(const char *path, + struct btrfs_super_block *disk_super, + u64 devid, struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices; + u64 found_transid = btrfs_super_generation(disk_super); + + fs_devices = find_fsid(disk_super->fsid); + if (!fs_devices) { + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return -ENOMEM; + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + list_add(&fs_devices->list, &fs_uuids); + memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + device = NULL; + } else { + device = __find_device(&fs_devices->devices, devid, + disk_super->dev_item.uuid); + } + if (!device) { + if (fs_devices->opened) + return -EBUSY; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + return -ENOMEM; + } + device->devid = devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE); + device->barriers = 1; + spin_lock_init(&device->io_lock); + device->name = kstrdup(path, GFP_NOFS); + if (!device->name) { + kfree(device); + return -ENOMEM; + } + INIT_LIST_HEAD(&device->dev_alloc_list); + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + + if (found_transid > fs_devices->latest_trans) { + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + } + *fs_devices_ret = fs_devices; + return 0; +} + +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) +{ + struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device; + struct btrfs_device *orig_dev; + + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + INIT_LIST_HEAD(&fs_devices->list); + fs_devices->latest_devid = orig->latest_devid; + fs_devices->latest_trans = orig->latest_trans; + memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + + list_for_each_entry(orig_dev, &orig->devices, dev_list) { + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); + if (!device->name) + goto error; + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); + device->barriers = 1; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_list); + INIT_LIST_HEAD(&device->dev_alloc_list); + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + 
fs_devices->num_devices++; + } + return fs_devices; +error: + free_fs_devices(fs_devices); + return ERR_PTR(-ENOMEM); +} + +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +{ + struct list_head *tmp; + struct list_head *cur; + struct btrfs_device *device; + + mutex_lock(&uuid_mutex); +again: + list_for_each_safe(cur, tmp, &fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->in_fs_metadata) + continue; + + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + device->bdev = NULL; + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + device->writeable = 0; + fs_devices->rw_devices--; + } + list_del_init(&device->dev_list); + fs_devices->num_devices--; + kfree(device->name); + kfree(device); + } + + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } + + mutex_unlock(&uuid_mutex); + return 0; +} + +static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct list_head *cur; + struct btrfs_device *device; + + if (--fs_devices->opened > 0) + return 0; + + list_for_each(cur, &fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + device->bdev = NULL; + device->writeable = 0; + device->in_fs_metadata = 0; + } + WARN_ON(fs_devices->open_devices); + WARN_ON(fs_devices->rw_devices); + fs_devices->opened = 0; + fs_devices->seeding = 0; + + return 0; +} + +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_fs_devices *seed_devices = NULL; + int ret; + + mutex_lock(&uuid_mutex); + ret = __btrfs_close_devices(fs_devices); + if (!fs_devices->opened) { + seed_devices = fs_devices->seed; + fs_devices->seed = NULL; + } + mutex_unlock(&uuid_mutex); + + while (seed_devices) { + fs_devices = seed_devices; + seed_devices = fs_devices->seed; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } + return ret; +} + +static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + struct block_device *bdev; + struct list_head *head = &fs_devices->devices; + struct list_head *cur; + struct btrfs_device *device; + struct block_device *latest_bdev = NULL; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 latest_devid = 0; + u64 latest_transid = 0; + u64 devid; + int seeding = 1; + int ret = 0; + + list_for_each(cur, head) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev) + continue; + if (!device->name) + continue; + + bdev = open_bdev_exclusive(device->name, flags, holder); + if (IS_ERR(bdev)) { + printk(KERN_INFO "open %s failed\n", device->name); + goto error; + } + set_blocksize(bdev, 4096); + + bh = btrfs_read_dev_super(bdev); + if (!bh) + goto error_close; + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + if (!latest_transid || device->generation > latest_transid) { + latest_devid = devid; + latest_transid = device->generation; + latest_bdev = bdev; + } + + if (btrfs_super_flags(disk_super) & 
BTRFS_SUPER_FLAG_SEEDING) { + device->writeable = 0; + } else { + device->writeable = !bdev_read_only(bdev); + seeding = 0; + } + + device->bdev = bdev; + device->in_fs_metadata = 0; + device->mode = flags; + + fs_devices->open_devices++; + if (device->writeable) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, + &fs_devices->alloc_list); + } + continue; + +error_brelse: + brelse(bh); +error_close: + close_bdev_exclusive(bdev, FMODE_READ); +error: + continue; + } + if (fs_devices->open_devices == 0) { + ret = -EIO; + goto out; + } + fs_devices->seeding = seeding; + fs_devices->opened = 1; + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + fs_devices->total_rw_bytes = 0; +out: + return ret; +} + +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + int ret; + + mutex_lock(&uuid_mutex); + if (fs_devices->opened) { + fs_devices->opened++; + ret = 0; + } else { + ret = __btrfs_open_devices(fs_devices, flags, holder); + } + mutex_unlock(&uuid_mutex); + return ret; +} + +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_super_block *disk_super; + struct block_device *bdev; + struct buffer_head *bh; + int ret; + u64 devid; + u64 transid; + + mutex_lock(&uuid_mutex); + + bdev = open_bdev_exclusive(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto error; + } + + ret = set_blocksize(bdev, 4096); + if (ret) + goto error_close; + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + transid = btrfs_super_generation(disk_super); + if (disk_super->label[0]) + printk(KERN_INFO "device label %s ", disk_super->label); + else { + /* FIXME, make a readl uuid parser */ + printk(KERN_INFO "device fsid %llx-%llx ", + *(unsigned long long *)disk_super->fsid, + *(unsigned long long *)(disk_super->fsid + 8)); + } + printk(KERN_INFO "devid %llu transid %llu %s\n", + (unsigned long long)devid, (unsigned long long)transid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); + + brelse(bh); +error_close: + close_bdev_exclusive(bdev, flags); +error: + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * this uses a pretty simple search, the expectation is that it is + * called very infrequently and that a given device has a small number + * of extents + */ +static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 num_bytes, u64 *start) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 hole_size = 0; + u64 last_byte = 0; + u64 search_start = 0; + u64 search_end = device->total_bytes; + int ret; + int slot = 0; + int start_found; + struct extent_buffer *l; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + start_found = 0; + + /* FIXME use last free of some kind */ + + /* we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max((u64)1024 * 1024, search_start); + + if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + search_start = max(root->fs_info->alloc_start, search_start); + + key.objectid = device->devid; + key.offset = 
search_start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto error; + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto error; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; +no_more_items: + if (!start_found) { + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + *start = search_start; + start_found = 1; + goto check_pending; + } + *start = last_byte > search_start ? + last_byte : search_start; + if (search_end <= *start) { + ret = -ENOSPC; + goto error; + } + goto check_pending; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + goto no_more_items; + + if (key.offset >= search_start && key.offset > last_byte && + start_found) { + if (last_byte < search_start) + last_byte = search_start; + hole_size = key.offset - last_byte; + if (key.offset > last_byte && + hole_size >= num_bytes) { + *start = last_byte; + goto check_pending; + } + } + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + start_found = 1; + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); +next: + path->slots[0]++; + cond_resched(); + } +check_pending: + /* we have to make sure we didn't find an extent that has already + * been allocated by the map tree or the original allocation + */ + BUG_ON(*start < search_start); + + if (*start + num_bytes > search_end) { + ret = -ENOSPC; + goto error; + } + /* check for pending inserts here */ + ret = 0; + +error: + btrfs_free_path(path); + return ret; +} + +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 start) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf = NULL; + struct btrfs_dev_extent *extent = NULL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, + BTRFS_DEV_EXTENT_KEY); + BUG_ON(ret); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + BUG_ON(found_key.offset > start || found_key.offset + + btrfs_dev_extent_length(leaf, extent) < start); + ret = 0; + } else if (ret == 0) { + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } + BUG_ON(ret); + + if (device->bytes_used > 0) + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return ret; +} + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + + 
WARN_ON(!device->in_fs_metadata); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), + BTRFS_UUID_SIZE); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_chunk(struct btrfs_root *root, + u64 objectid, u64 *offset) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_key found_key; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); + if (ret) { + *offset = 0; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != objectid) + *offset = 0; + else { + chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_chunk); + *offset = found_key.offset + + btrfs_chunk_length(path->nodes[0], chunk); + } + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + BTRFS_DEV_ITEM_KEY); + if (ret) { + *objectid = 1; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *objectid = found_key.offset + 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * the device information is stored in the chunk root + * the btrfs_device struct should be fully filled in + */ +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*dev_item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_generation(leaf, dev_item, 0); + btrfs_set_device_type(leaf, dev_item, device->type); + 
btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_group(leaf, dev_item, 0); + btrfs_set_device_seek_speed(leaf, dev_item, 0); + btrfs_set_device_bandwidth(leaf, dev_item, 0); + btrfs_set_device_start_offset(leaf, dev_item, 0); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + ptr = (unsigned long)btrfs_device_fsid(dev_item); + write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_rm_dev_item(struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_trans_handle *trans; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + lock_chunks(root); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; +out: + btrfs_free_path(path); + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + return ret; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_device *device; + struct btrfs_device *next_device; + struct block_device *bdev; + struct buffer_head *bh = NULL; + struct btrfs_super_block *disk_super; + u64 all_avail; + u64 devid; + u64 num_devices; + u8 *dev_uuid; + int ret = 0; + + mutex_lock(&uuid_mutex); + mutex_lock(&root->fs_info->volume_mutex); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && + root->fs_info->fs_devices->rw_devices <= 4) { + printk(KERN_ERR "btrfs: unable to go below four devices " + "on raid10\n"); + ret = -EINVAL; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && + root->fs_info->fs_devices->rw_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid1\n"); + ret = -EINVAL; + goto out; + } + + if (strcmp(device_path, "missing") == 0) { + struct list_head *cur; + struct list_head *devices; + struct btrfs_device *tmp; + + device = NULL; + devices = &root->fs_info->fs_devices->devices; + list_for_each(cur, devices) { + tmp = list_entry(cur, struct btrfs_device, dev_list); + if (tmp->in_fs_metadata && !tmp->bdev) { + device = tmp; + break; + } + } + bdev = NULL; + bh = NULL; + disk_super = NULL; + if (!device) { + printk(KERN_ERR "btrfs: no missing devices found to " + "remove\n"); + goto out; + } + } else { + bdev = open_bdev_exclusive(device_path, FMODE_READ, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto out; + } + + set_blocksize(bdev, 4096); + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + dev_uuid = disk_super->dev_item.uuid; + device = 
btrfs_find_device(root, devid, dev_uuid, + disk_super->fsid); + if (!device) { + ret = -ENOENT; + goto error_brelse; + } + } + + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { + printk(KERN_ERR "btrfs: unable to remove the only writeable " + "device\n"); + ret = -EINVAL; + goto error_brelse; + } + + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + root->fs_info->fs_devices->rw_devices--; + } + + ret = btrfs_shrink_device(device, 0); + if (ret) + goto error_brelse; + + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); + if (ret) + goto error_brelse; + + device->in_fs_metadata = 0; + list_del_init(&device->dev_list); + device->fs_devices->num_devices--; + + next_device = list_entry(root->fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (device->bdev == root->fs_info->sb->s_bdev) + root->fs_info->sb->s_bdev = next_device->bdev; + if (device->bdev == root->fs_info->fs_devices->latest_bdev) + root->fs_info->fs_devices->latest_bdev = next_device->bdev; + + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + device->bdev = NULL; + device->fs_devices->open_devices--; + } + + num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + + if (device->fs_devices->open_devices == 0) { + struct btrfs_fs_devices *fs_devices; + fs_devices = root->fs_info->fs_devices; + while (fs_devices) { + if (fs_devices->seed == device->fs_devices) + break; + fs_devices = fs_devices->seed; + } + fs_devices->seed = device->fs_devices->seed; + device->fs_devices->seed = NULL; + __btrfs_close_devices(device->fs_devices); + free_fs_devices(device->fs_devices); + } + + /* + * at this point, the device is zero sized. We want to + * remove it from the devices list and zero out the old super + */ + if (device->writeable) { + /* make sure this device isn't detected as part of + * the FS anymore + */ + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } + + kfree(device->name); + kfree(device); + ret = 0; + +error_brelse: + brelse(bh); +error_close: + if (bdev) + close_bdev_exclusive(bdev, FMODE_READ); +out: + mutex_unlock(&root->fs_info->volume_mutex); + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * does all the dirty work required for changing file system's UUID. 
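+ * In the code below that means: clone the current device list so the
+ * original (seed) fsid keeps a record of its devices in fs_uuids, move
+ * the live devices onto a new fs_devices structure that becomes the
+ * ->seed list, generate a fresh fsid for the sprouted filesystem, and
+ * clear BTRFS_SUPER_FLAG_SEEDING in the superblock flags.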
+ */ +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *old_devices; + struct btrfs_fs_devices *seed_devices; + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + BUG_ON(!mutex_is_locked(&uuid_mutex)); + if (!fs_devices->seeding) + return -EINVAL; + + seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!seed_devices) + return -ENOMEM; + + old_devices = clone_fs_devices(fs_devices); + if (IS_ERR(old_devices)) { + kfree(seed_devices); + return PTR_ERR(old_devices); + } + + list_add(&old_devices->list, &fs_uuids); + + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); + seed_devices->opened = 1; + INIT_LIST_HEAD(&seed_devices->devices); + INIT_LIST_HEAD(&seed_devices->alloc_list); + list_splice_init(&fs_devices->devices, &seed_devices->devices); + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); + list_for_each_entry(device, &seed_devices->devices, dev_list) { + device->fs_devices = seed_devices; + } + + fs_devices->seeding = 0; + fs_devices->num_devices = 0; + fs_devices->open_devices = 0; + fs_devices->seed = seed_devices; + + generate_random_uuid(fs_devices->fsid); + memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + super_flags = btrfs_super_flags(disk_super) & + ~BTRFS_SUPER_FLAG_SEEDING; + btrfs_set_super_flags(disk_super, super_flags); + + return 0; +} + +/* + * strore the expected generation for seed devices in device items. + */ +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dev_item *dev_item; + struct btrfs_device *device; + struct btrfs_key key; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + u64 devid; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root = root->fs_info->chunk_root; + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = BTRFS_DEV_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto error; + + leaf = path->nodes[0]; +next_slot: + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret > 0) + break; + if (ret < 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root, path); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || + key.type != BTRFS_DEV_ITEM_KEY) + break; + + dev_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_item); + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + BUG_ON(!device); + + if (device->fs_devices->seeding) { + btrfs_set_device_generation(leaf, dev_item, + device->generation); + btrfs_mark_buffer_dirty(leaf); + } + + path->slots[0]++; + goto next_slot; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_trans_handle *trans; + struct 
btrfs_device *device; + struct block_device *bdev; + struct list_head *cur; + struct list_head *devices; + struct super_block *sb = root->fs_info->sb; + u64 total_bytes; + int seeding_dev = 0; + int ret = 0; + + if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + if (!bdev) + return -EIO; + + if (root->fs_info->fs_devices->seeding) { + seeding_dev = 1; + down_write(&sb->s_umount); + mutex_lock(&uuid_mutex); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + mutex_lock(&root->fs_info->volume_mutex); + + devices = &root->fs_info->fs_devices->devices; + list_for_each(cur, devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + ret = -ENOMEM; + goto error; + } + + device->name = kstrdup(device_path, GFP_NOFS); + if (!device->name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + + ret = find_next_devid(root, &device->devid); + if (ret) { + kfree(device); + goto error; + } + + trans = btrfs_start_transaction(root, 1); + lock_chunks(root); + + device->barriers = 1; + device->writeable = 1; + device->work.func = pending_bios_fn; + generate_random_uuid(device->uuid); + spin_lock_init(&device->io_lock); + device->generation = trans->transid; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->dev_root = root->fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->mode = 0; + set_blocksize(device->bdev, 4096); + + if (seeding_dev) { + sb->s_flags &= ~MS_RDONLY; + ret = btrfs_prepare_sprout(trans, root); + BUG_ON(ret); + } + + device->fs_devices = root->fs_info->fs_devices; + list_add(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->num_devices++; + root->fs_info->fs_devices->open_devices++; + root->fs_info->fs_devices->rw_devices++; + root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes + device->total_bytes); + + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); + btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes + 1); + + if (seeding_dev) { + ret = init_first_rw_device(trans, root, device); + BUG_ON(ret); + ret = btrfs_finish_sprout(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_add_device(trans, root, device); + } + + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + + ret = btrfs_relocate_sys_chunks(root); + BUG_ON(ret); + } +out: + mutex_unlock(&root->fs_info->volume_mutex); + return ret; +error: + close_bdev_exclusive(bdev, 0); + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + } + goto out; +} + +static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + + root = 
device->dev_root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_mark_buffer_dirty(leaf); + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + struct btrfs_super_block *super_copy = + &device->dev_root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = new_size - device->total_bytes; + + if (!device->writeable) + return -EACCES; + if (new_size <= device->total_bytes) + return -EINVAL; + + btrfs_set_super_total_bytes(super_copy, old_total + diff); + device->fs_devices->total_rw_bytes += diff; + + device->total_bytes = new_size; + return btrfs_update_device(trans, device); +} + +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + int ret; + lock_chunks(device->dev_root); + ret = __btrfs_grow_device(trans, device, new_size); + unlock_chunks(device->dev_root); + return ret; +} + +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + root = root->fs_info->chunk_root; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = chunk_objectid; + key.offset = chunk_offset; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 + chunk_offset) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)(ptr + len); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + len += btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + if (key.objectid == chunk_objectid && + key.offset == chunk_offset) { + memmove(ptr, ptr + len, array_size - (cur + len)); + array_size -= len; + btrfs_set_super_sys_array_size(super_copy, array_size); + } else { + ptr += len; + cur += len; + } + } + return ret; +} + +static int btrfs_relocate_chunk(struct 
btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct extent_map_tree *em_tree; + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct map_lookup *map; + int ret; + int i; + + printk(KERN_INFO "btrfs relocating chunk %llu\n", + (unsigned long long)chunk_offset); + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + BUG_ON(ret); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + lock_chunks(root); + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + spin_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, + map->stripes[i].physical); + BUG_ON(ret); + + if (map->stripes[i].dev) { + ret = btrfs_update_device(trans, map->stripes[i].dev); + BUG_ON(ret); + } + } + ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, + chunk_offset); + + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); + BUG_ON(ret); + } + + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + + spin_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; + + /* once for the tree */ + free_extent_map(em); + /* once for us */ + free_extent_map(em); + + unlock_chunks(root); + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_relocate_sys_chunks(struct btrfs_root *root) +{ + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + struct btrfs_key key; + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + + ret = btrfs_previous_item(chunk_root, path, key.objectid, + key.type); + if (ret < 0) + goto error; + if (ret > 0) + break; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + chunk = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); + btrfs_release_path(chunk_root, path); + + if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); + BUG_ON(ret); + } + + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +int btrfs_balance(struct btrfs_root *dev_root) +{ + int ret; + struct list_head *cur; + struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_device 
*device; + u64 old_size; + u64 size_to_free; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; + struct btrfs_trans_handle *trans; + struct btrfs_key found_key; + + if (dev_root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&dev_root->fs_info->volume_mutex); + dev_root = dev_root->fs_info->dev_root; + + /* step one make some room on all the devices */ + list_for_each(cur, devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + old_size = device->total_bytes; + size_to_free = div_factor(old_size, 1); + size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + if (!device->writeable || + device->total_bytes - device->bytes_used > size_to_free) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); + BUG_ON(ret); + + trans = btrfs_start_transaction(dev_root, 1); + BUG_ON(!trans); + + ret = btrfs_grow_device(trans, device, old_size); + BUG_ON(ret); + + btrfs_end_transaction(trans, dev_root); + } + + /* step two, relocate all the chunks */ + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + /* + * this shouldn't happen, it means the last relocate + * failed + */ + if (ret == 0) + break; + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid) + break; + + chunk = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_chunk); + key.offset = found_key.offset; + /* chunk zero is special */ + if (key.offset == 0) + break; + + btrfs_release_path(chunk_root, path); + ret = btrfs_relocate_chunk(chunk_root, + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); + BUG_ON(ret); + } + ret = 0; +error: + btrfs_free_path(path); + mutex_unlock(&dev_root->fs_info->volume_mutex); + return ret; +} + +/* + * shrinking a device means finding all of the device extents past + * the new size, and then following the back refs to the chunks. 
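+ * The device size is reduced (and total_rw_bytes / the super block's
+ * total_bytes adjusted) up front; each device extent found past the
+ * new size is then handed to btrfs_relocate_chunk() via the chunk it
+ * belongs to.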
+ * The chunk relocation code actually frees the device extent + */ +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + + path->reada = 2; + + lock_chunks(root); + + device->total_bytes = new_size; + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); + + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto done; + + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto done; + if (ret) { + ret = 0; + goto done; + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + + if (key.objectid != device->devid) + goto done; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (key.offset + length <= new_size) + goto done; + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + btrfs_release_path(root, path); + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); + if (ret) + goto done; + } + +done: + btrfs_free_path(path); + return ret; +} + +static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_chunk *chunk, int item_size) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key disk_key; + u32 array_size; + u8 *ptr; + + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); + memcpy(ptr, &disk_key, sizeof(disk_key)); + ptr += sizeof(disk_key); + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + return 0; +} + +static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, + int num_stripes, int sub_stripes) +{ + if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) + return calc_size; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + return calc_size * (num_stripes / sub_stripes); + else + return calc_size * num_stripes; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 
start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct list_head private_devs; + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = 1024 * 1024 * 1024; + u64 max_chunk_size = calc_size; + u64 min_free; + u64 avail; + u64 max_avail = 0; + u64 dev_offset; + int num_stripes = 1; + int min_stripes = 1; + int sub_stripes = 0; + int looped = 0; + int ret; + int index; + int stripe_len = 64 * 1024; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + if (type & (BTRFS_BLOCK_GROUP_RAID0)) { + num_stripes = fs_devices->rw_devices; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_DUP)) { + num_stripes = 2; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID1)) { + num_stripes = min_t(u64, 2, fs_devices->rw_devices); + if (num_stripes < 2) + return -ENOSPC; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + num_stripes = fs_devices->rw_devices; + if (num_stripes < 4) + return -ENOSPC; + num_stripes &= ~(u32)1; + sub_stripes = 2; + min_stripes = 4; + } + + if (type & BTRFS_BLOCK_GROUP_DATA) { + max_chunk_size = 10 * calc_size; + min_stripe_size = 64 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + max_chunk_size = 4 * calc_size; + min_stripe_size = 32 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + calc_size = 8 * 1024 * 1024; + max_chunk_size = calc_size * 2; + min_stripe_size = 1 * 1024 * 1024; + } + + /* we don't want a chunk larger than 10% of writeable space */ + max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + max_chunk_size); + +again: + if (!map || map->num_stripes != num_stripes) { + kfree(map); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) + return -ENOMEM; + map->num_stripes = num_stripes; + } + + if (calc_size * num_stripes > max_chunk_size) { + calc_size = max_chunk_size; + do_div(calc_size, num_stripes); + do_div(calc_size, stripe_len); + calc_size *= stripe_len; + } + /* we don't want tiny stripes */ + calc_size = max_t(u64, min_stripe_size, calc_size); + + do_div(calc_size, stripe_len); + calc_size *= stripe_len; + + cur = fs_devices->alloc_list.next; + index = 0; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = calc_size * 2; + else + min_free = calc_size; + + /* + * we add 1MB because we never use the first 1MB of the device, unless + * we've looped, then we are likely allocating the maximum amount of + * space left already + */ + if (!looped) + min_free += 1024 * 1024; + + INIT_LIST_HEAD(&private_devs); + while (index < num_stripes) { + device = list_entry(cur, struct btrfs_device, dev_alloc_list); + BUG_ON(!device->writeable); + if (device->total_bytes > device->bytes_used) + avail = device->total_bytes - device->bytes_used; + else + avail = 0; + cur = cur->next; + + if (device->in_fs_metadata && avail >= min_free) { + ret = find_free_dev_extent(trans, device, + min_free, &dev_offset); + if (ret == 0) { + list_move_tail(&device->dev_alloc_list, + &private_devs); + map->stripes[index].dev = device; + map->stripes[index].physical = dev_offset; + index++; + if (type & BTRFS_BLOCK_GROUP_DUP) { + map->stripes[index].dev = device; + map->stripes[index].physical = + dev_offset + calc_size; + 
index++; + } + } + } else if (device->in_fs_metadata && avail > max_avail) + max_avail = avail; + if (cur == &fs_devices->alloc_list) + break; + } + list_splice(&private_devs, &fs_devices->alloc_list); + if (index < num_stripes) { + if (index >= min_stripes) { + num_stripes = index; + if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + num_stripes /= sub_stripes; + num_stripes *= sub_stripes; + } + looped = 1; + goto again; + } + if (!looped && max_avail > 0) { + looped = 1; + calc_size = max_avail; + goto again; + } + kfree(map); + return -ENOSPC; + } + map->sector_size = extent_root->sectorsize; + map->stripe_len = stripe_len; + map->io_align = stripe_len; + map->io_width = stripe_len; + map->type = type; + map->num_stripes = num_stripes; + map->sub_stripes = sub_stripes; + + *map_ret = map; + *stripe_size = calc_size; + *num_bytes = chunk_bytes_by_type(type, calc_size, + num_stripes, sub_stripes); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + kfree(map); + return -ENOMEM; + } + em->bdev = (struct block_device *)map; + em->start = start; + em->len = *num_bytes; + em->block_start = 0; + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, *num_bytes); + BUG_ON(ret); + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + ret = btrfs_alloc_dev_extent(trans, device, + info->chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, dev_offset, calc_size); + BUG_ON(ret); + index++; + } + + return 0; +} + +static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup *map, u64 chunk_offset, + u64 chunk_size, u64 stripe_size) +{ + u64 dev_offset; + struct btrfs_key key; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + struct btrfs_device *device; + struct btrfs_chunk *chunk; + struct btrfs_stripe *stripe; + size_t item_size = btrfs_chunk_item_size(map->num_stripes); + int index = 0; + int ret; + + chunk = kzalloc(item_size, GFP_NOFS); + if (!chunk) + return -ENOMEM; + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + device->bytes_used += stripe_size; + ret = btrfs_update_device(trans, device); + BUG_ON(ret); + index++; + } + + index = 0; + stripe = &chunk->stripe; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + index++; + } + + btrfs_set_stack_chunk_length(chunk, chunk_size); + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); + btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = 
chunk_offset; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + item_size); + BUG_ON(ret); + } + kfree(chunk); + return 0; +} + +/* + * Chunk allocation falls into two parts. The first part does works + * that make the new allocated chunk useable, but not do any operation + * that modifies the chunk tree. The second part does the works that + * require modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type) +{ + u64 chunk_offset; + u64 chunk_size; + u64 stripe_size; + struct map_lookup *map; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + int ret; + + ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, + &chunk_offset); + if (ret) + return ret; + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, type); + if (ret) + return ret; + + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + return 0; +} + +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + u64 chunk_offset; + u64 sys_chunk_offset; + u64 chunk_size; + u64 sys_chunk_size; + u64 stripe_size; + u64 sys_stripe_size; + u64 alloc_profile; + struct map_lookup *map; + struct map_lookup *sys_map; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + + ret = find_next_chunk(fs_info->chunk_root, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); + BUG_ON(ret); + + alloc_profile = BTRFS_BLOCK_GROUP_METADATA | + (fs_info->metadata_alloc_profile & + fs_info->avail_metadata_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, alloc_profile); + BUG_ON(ret); + + sys_chunk_offset = chunk_offset + chunk_size; + + alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | + (fs_info->system_alloc_profile & + fs_info->avail_system_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, + &sys_chunk_size, &sys_stripe_size, + sys_chunk_offset, alloc_profile); + BUG_ON(ret); + + ret = btrfs_add_device(trans, fs_info->chunk_root, device); + BUG_ON(ret); + + /* + * Modifying chunk tree needs allocating new blocks from both + * system block group and metadata block group. So we only can + * do operations require modifying the chunk tree after both + * block groups were created. 
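+ * That is why __btrfs_alloc_chunk() (the part that does not touch the
+ * chunk tree) runs for both the metadata and the system chunk above,
+ * before either __finish_chunk_alloc() call below is made.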
+ */ + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + + ret = __finish_chunk_alloc(trans, extent_root, sys_map, + sys_chunk_offset, sys_chunk_size, + sys_stripe_size); + BUG_ON(ret); + return 0; +} + +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int readonly = 0; + int i; + + spin_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + spin_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (!map->stripes[i].dev->writeable) { + readonly = 1; + break; + } + } + free_extent_map(em); + return readonly; +} + +void btrfs_mapping_init(struct btrfs_mapping_tree *tree) +{ + extent_map_tree_init(&tree->map_tree, GFP_NOFS); +} + +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +{ + struct extent_map *em; + + while (1) { + spin_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); + spin_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } +} + +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + spin_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + ret = map->num_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID10) + ret = map->sub_stripes; + else + ret = 1; + free_extent_map(em); + return ret; +} + +static int find_live_mirror(struct map_lookup *map, int first, int num, + int optimal) +{ + int i; + if (map->stripes[optimal].dev->bdev) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev) + return i; + } + /* we couldn't find one that doesn't fail. 
Just return something + * and the io error handling code will clean up eventually + */ + return optimal; +} + +static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, + int mirror_num, struct page *unplug_page) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + u64 offset; + u64 stripe_offset; + u64 stripe_nr; + int stripes_allocated = 8; + int stripes_required = 1; + int stripe_index; + int i; + int num_stripes; + int max_errors = 0; + struct btrfs_multi_bio *multi = NULL; + + if (multi_ret && !(rw & (1 << BIO_RW))) + stripes_allocated = 1; +again: + if (multi_ret) { + multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + GFP_NOFS); + if (!multi) + return -ENOMEM; + + atomic_set(&multi->error, 0); + } + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); + spin_unlock(&em_tree->lock); + + if (!em && unplug_page) + return 0; + + if (!em) { + printk(KERN_CRIT "unable to find logical %llu len %llu\n", + (unsigned long long)logical, + (unsigned long long)*length); + BUG(); + } + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + offset = logical - em->start; + + if (mirror_num > map->num_stripes) + mirror_num = 0; + + /* if our multi bio struct is too small, back off and try again */ + if (rw & (1 << BIO_RW)) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + stripes_required = map->num_stripes; + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripes_required = map->sub_stripes; + max_errors = 1; + } + } + if (multi_ret && rw == WRITE && + stripes_allocated < stripes_required) { + stripes_allocated = map->num_stripes; + free_extent_map(em); + kfree(multi); + goto again; + } + stripe_nr = offset; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + do_div(stripe_nr, map->stripe_len); + + stripe_offset = stripe_nr * map->stripe_len; + BUG_ON(offset < stripe_offset); + + /* stripe_offset is the offset of this block in its stripe*/ + stripe_offset = offset - stripe_offset; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + /* we limit the length of each bio to what fits in a stripe */ + *length = min_t(u64, em->len - offset, + map->stripe_len - stripe_offset); + } else { + *length = em->len - offset; + } + + if (!multi_ret && !unplug_page) + goto out; + + num_stripes = 1; + stripe_index = 0; + if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + else { + stripe_index = find_live_mirror(map, 0, + map->num_stripes, + current->pid % map->num_stripes); + } + + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + if (rw & (1 << BIO_RW)) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + + stripe_index = do_div(stripe_nr, factor); + stripe_index *= map->sub_stripes; + + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->sub_stripes; + else if (mirror_num) + stripe_index += mirror_num - 1; + else { + stripe_index = find_live_mirror(map, stripe_index, + map->sub_stripes, stripe_index + + current->pid % 
map->sub_stripes); + } + } else { + /* + * after this do_div call, stripe_nr is the number of stripes + * on this device we have to walk to find the data, and + * stripe_index is the number of our device in the stripe array + */ + stripe_index = do_div(stripe_nr, map->num_stripes); + } + BUG_ON(stripe_index >= map->num_stripes); + + for (i = 0; i < num_stripes; i++) { + if (unplug_page) { + struct btrfs_device *device; + struct backing_dev_info *bdi; + + device = map->stripes[stripe_index].dev; + if (device->bdev) { + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, unplug_page); + } + } else { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + multi->stripes[i].dev = map->stripes[stripe_index].dev; + } + stripe_index++; + } + if (multi_ret) { + *multi_ret = multi; + multi->num_stripes = num_stripes; + multi->max_errors = max_errors; + } +out: + free_extent_map(em); + return 0; +} + +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num) +{ + return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + mirror_num, NULL); +} + +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map_tree *em_tree = &map_tree->map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 length; + u64 stripe_nr; + int i, j, nr = 0; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); + spin_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; + + length = em->len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + do_div(length, map->num_stripes / map->sub_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID0) + do_div(length, map->num_stripes); + + buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + BUG_ON(!buf); + + for (i = 0; i < map->num_stripes; i++) { + if (devid && map->stripes[i].dev->devid != devid) + continue; + if (map->stripes[i].physical > physical || + map->stripes[i].physical + length <= physical) + continue; + + stripe_nr = physical - map->stripes[i].physical; + do_div(stripe_nr, map->stripe_len); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripe_nr = stripe_nr * map->num_stripes + i; + do_div(stripe_nr, map->sub_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_nr = stripe_nr * map->num_stripes + i; + } + bytenr = chunk_start + stripe_nr * map->stripe_len; + WARN_ON(nr >= map->num_stripes); + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) + break; + } + if (j == nr) { + WARN_ON(nr >= map->num_stripes); + buf[nr++] = bytenr; + } + } + + /* sanity check: each collected logical address must map back to a + * stripe that contains 'physical' + */ + for (i = 0; i < nr; i++) { + struct btrfs_multi_bio *multi; + struct btrfs_bio_stripe *stripe; + int ret; + + length = 1; + ret = btrfs_map_block(map_tree, WRITE, buf[i], + &length, &multi, 0); + BUG_ON(ret); + + stripe = multi->stripes; + for (j = 0; j < multi->num_stripes; j++) { + if (physical >= stripe[j].physical && + physical < stripe[j].physical + length) + break; + } + BUG_ON(j >= multi->num_stripes); + kfree(multi); + } + + *logical = buf; + *naddrs = nr; + *stripe_len = map->stripe_len; + + free_extent_map(em); + return 0; +} + +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page) +{ + u64 length = PAGE_CACHE_SIZE; + return 
__btrfs_map_block(map_tree, READ, logical, &length, + NULL, 0, page); +} + +static void end_bio_multi_stripe(struct bio *bio, int err) +{ + struct btrfs_multi_bio *multi = bio->bi_private; + int is_orig_bio = 0; + + if (err) + atomic_inc(&multi->error); + + if (bio == multi->orig_bio) + is_orig_bio = 1; + + if (atomic_dec_and_test(&multi->stripes_pending)) { + if (!is_orig_bio) { + bio_put(bio); + bio = multi->orig_bio; + } + bio->bi_private = multi->private; + bio->bi_end_io = multi->end_io; + /* only send an error to the higher layers if it is + * beyond the tolerance of the multi-bio + */ + if (atomic_read(&multi->error) > multi->max_errors) { + err = -EIO; + } else if (err) { + /* + * this bio is actually up to date, we didn't + * go over the max number of errors + */ + set_bit(BIO_UPTODATE, &bio->bi_flags); + err = 0; + } + kfree(multi); + + bio_endio(bio, err); + } else if (!is_orig_bio) { + bio_put(bio); + } +} + +struct async_sched { + struct bio *bio; + int rw; + struct btrfs_fs_info *info; + struct btrfs_work work; +}; + +/* + * see run_scheduled_bios for a description of why bios are collected for + * async submit. + * + * This will add one bio to the pending list for a device and make sure + * the work struct is scheduled. + */ +static noinline int schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) +{ + int should_queue = 1; + + /* don't bother with additional async steps for reads, right now */ + if (!(rw & (1 << BIO_RW))) { + bio_get(bio); + submit_bio(rw, bio); + bio_put(bio); + return 0; + } + + /* + * nr_async_bios allows us to reliably return congestion to the + * higher layers. Otherwise, the async bio makes it appear we have + * made progress against dirty pages when we've really just put it + * on a queue for later + */ + atomic_inc(&root->fs_info->nr_async_bios); + WARN_ON(bio->bi_next); + bio->bi_next = NULL; + bio->bi_rw |= rw; + + spin_lock(&device->io_lock); + + if (device->pending_bio_tail) + device->pending_bio_tail->bi_next = bio; + + device->pending_bio_tail = bio; + if (!device->pending_bios) + device->pending_bios = bio; + if (device->running_pending) + should_queue = 0; + + spin_unlock(&device->io_lock); + + if (should_queue) + btrfs_queue_worker(&root->fs_info->submit_workers, + &device->work); + return 0; +} + +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit) +{ + struct btrfs_mapping_tree *map_tree; + struct btrfs_device *dev; + struct bio *first_bio = bio; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + struct btrfs_multi_bio *multi = NULL; + int ret; + int dev_nr = 0; + int total_devs = 1; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + mirror_num); + BUG_ON(ret); + + total_devs = multi->num_stripes; + if (map_length < length) { + printk(KERN_CRIT "mapping failed logical %llu bio len %llu " + "len %llu\n", (unsigned long long)logical, + (unsigned long long)length, + (unsigned long long)map_length); + BUG(); + } + multi->end_io = first_bio->bi_end_io; + multi->private = first_bio->bi_private; + multi->orig_bio = first_bio; + atomic_set(&multi->stripes_pending, multi->num_stripes); + + while (dev_nr < total_devs) { + if (total_devs > 1) { + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; + } + bio->bi_private = multi; + bio->bi_end_io = 
end_bio_multi_stripe; + } + bio->bi_sector = multi->stripes[dev_nr].physical >> 9; + dev = multi->stripes[dev_nr].dev; + BUG_ON(rw == WRITE && !dev->writeable); + if (dev && dev->bdev) { + bio->bi_bdev = dev->bdev; + if (async_submit) + schedule_bio(root, dev, rw, bio); + else + submit_bio(rw, bio); + } else { + bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; + bio->bi_sector = logical >> 9; + bio_endio(bio, -EIO); + } + dev_nr++; + } + if (total_devs == 1) + kfree(multi); + return 0; +} + +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *cur_devices; + + cur_devices = root->fs_info->fs_devices; + while (cur_devices) { + if (!fsid || + !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + device = __find_device(&cur_devices->devices, + devid, uuid); + if (device) + return device; + } + cur_devices = cur_devices->seed; + } + return NULL; +} + +static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + u64 devid, u8 *dev_uuid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + return NULL; + list_add(&device->dev_list, + &fs_devices->devices); + device->barriers = 1; + device->dev_root = root->fs_info->dev_root; + device->devid = devid; + device->work.func = pending_bios_fn; + device->fs_devices = fs_devices; + fs_devices->num_devices++; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_alloc_list); + memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + return device; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + + spin_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); + spin_unlock(&map_tree->map_tree.lock); + + /* already mapped? 
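+ (i.e. lookup_extent_mapping() above found an extent_map that already covers this chunk's logical start, so the chunk item can be skipped) 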
*/ + if (em && em->start <= logical && em->start + em->len > logical) { + free_extent_map(em); + return 0; + } else if (em) { + free_extent_map(em); + } + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return -ENOMEM; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + em->bdev = (struct block_device *)map; + em->start = logical; + em->len = length; + em->block_start = 0; + em->block_len = em->len; + + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); + map->sector_size = btrfs_chunk_sector_size(leaf, chunk); + map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + map->type = btrfs_chunk_type(leaf, chunk); + map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + for (i = 0; i < num_stripes; i++) { + map->stripes[i].physical = + btrfs_stripe_offset_nr(leaf, chunk, i); + devid = btrfs_stripe_devid_nr(leaf, chunk, i); + read_extent_buffer(leaf, uuid, (unsigned long) + btrfs_stripe_dev_uuid_nr(chunk, i), + BTRFS_UUID_SIZE); + map->stripes[i].dev = btrfs_find_device(root, devid, uuid, + NULL); + if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + kfree(map); + free_extent_map(em); + return -EIO; + } + if (!map->stripes[i].dev) { + map->stripes[i].dev = + add_missing_dev(root, devid, uuid); + if (!map->stripes[i].dev) { + kfree(map); + free_extent_map(em); + return -EIO; + } + } + map->stripes[i].dev->in_fs_metadata = 1; + } + + spin_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); + spin_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + + return 0; +} + +static int fill_device_from_item(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item, + struct btrfs_device *device) +{ + unsigned long ptr; + + device->devid = btrfs_device_id(leaf, dev_item); + device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->type = btrfs_device_type(leaf, dev_item); + device->io_align = btrfs_device_io_align(leaf, dev_item); + device->io_width = btrfs_device_io_width(leaf, dev_item); + device->sector_size = btrfs_device_sector_size(leaf, dev_item); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + + return 0; +} + +static int open_seed_devices(struct btrfs_root *root, u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + int ret; + + mutex_lock(&uuid_mutex); + + fs_devices = root->fs_info->fs_devices->seed; + while (fs_devices) { + if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + ret = 0; + goto out; + } + fs_devices = fs_devices->seed; + } + + fs_devices = find_fsid(fsid); + if (!fs_devices) { + ret = -ENOENT; + goto out; + } + + fs_devices = clone_fs_devices(fs_devices); + if (IS_ERR(fs_devices)) { + ret = PTR_ERR(fs_devices); + goto out; + } + + ret = __btrfs_open_devices(fs_devices, FMODE_READ, + root->fs_info->bdev_holder); + if (ret) + goto out; + + if (!fs_devices->seeding) { + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + ret = -EINVAL; + goto out; + } + + fs_devices->seed = root->fs_info->fs_devices->seed; + root->fs_info->fs_devices->seed = fs_devices; +out: + mutex_unlock(&uuid_mutex); + return ret; +} + +static int read_one_dev(struct btrfs_root *root, + 
struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item) +{ + struct btrfs_device *device; + u64 devid; + int ret; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + + if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { + ret = open_seed_devices(root, fs_uuid); + if (ret && !btrfs_test_opt(root, DEGRADED)) + return ret; + } + + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + if (!device || !device->bdev) { + if (!btrfs_test_opt(root, DEGRADED)) + return -EIO; + + if (!device) { + printk(KERN_WARNING "warning devid %llu missing\n", + (unsigned long long)devid); + device = add_missing_dev(root, devid, dev_uuid); + if (!device) + return -ENOMEM; + } + } + + if (device->fs_devices != root->fs_info->fs_devices) { + BUG_ON(device->writeable); + if (device->generation != + btrfs_device_generation(leaf, dev_item)) + return -EINVAL; + } + + fill_device_from_item(leaf, dev_item, device); + device->dev_root = root->fs_info->dev_root; + device->in_fs_metadata = 1; + if (device->writeable) + device->fs_devices->total_rw_bytes += device->total_bytes; + ret = 0; + return ret; +} + +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) +{ + struct btrfs_dev_item *dev_item; + + dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, + dev_item); + return read_one_dev(root, buf, dev_item); +} + +int btrfs_read_sys_array(struct btrfs_root *root) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct extent_buffer *sb; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + unsigned long sb_ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, + BTRFS_SUPER_INFO_SIZE); + if (!sb) + return -ENOMEM; + btrfs_set_buffer_uptodate(sb); + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); ptr += len; + sb_ptr += len; + cur += len; + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)sb_ptr; + ret = read_one_chunk(root, &key, sb, chunk); + if (ret) + break; + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + ptr += len; + sb_ptr += len; + cur += len; + } + free_extent_buffer(sb); + return ret; +} + +int btrfs_read_chunk_tree(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + int slot; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first we search for all of the device items, and then we + * read in all of the chunk items. 
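+ * (the first pass walks the keys with objectid BTRFS_DEV_ITEMS_OBJECTID; + * the second pass restarts the search at objectid 0 to pick up the + * chunk items.) 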
This way we can create chunk + * mappings that reference all of the devices that are afound + */ + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = 0; +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) + break; + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, + struct btrfs_dev_item); + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; + } + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = read_one_chunk(root, &found_key, leaf, chunk); + if (ret) + goto error; + } + path->slots[0]++; + } + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + key.objectid = 0; + btrfs_release_path(root, path); + goto again; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 00000000000..86c44e9ae11 --- /dev/null +++ b/fs/btrfs/volumes.h @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_VOLUMES_ +#define __BTRFS_VOLUMES_ + +#include <linux/bio.h> +#include "async-thread.h" + +struct buffer_head; +struct btrfs_device { + struct list_head dev_list; + struct list_head dev_alloc_list; + struct btrfs_fs_devices *fs_devices; + struct btrfs_root *dev_root; + struct bio *pending_bios; + struct bio *pending_bio_tail; + int running_pending; + u64 generation; + + int barriers; + int writeable; + int in_fs_metadata; + + spinlock_t io_lock; + + struct block_device *bdev; + + /* the mode sent to open_bdev_exclusive */ + fmode_t mode; + + char *name; + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device */ + u64 total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + + /* minimal io size for this device */ + u32 sector_size; + + /* type and info about this device */ + u64 type; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_UUID_SIZE]; + + struct btrfs_work work; +}; + +struct btrfs_fs_devices { + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* the device with this id has the most recent coyp of the super */ + u64 latest_devid; + u64 latest_trans; + u64 num_devices; + u64 open_devices; + u64 rw_devices; + u64 total_rw_bytes; + struct block_device *latest_bdev; + /* all of the devices in the FS */ + struct list_head devices; + + /* devices not currently being allocated */ + struct list_head alloc_list; + struct list_head list; + + struct btrfs_fs_devices *seed; + int seeding; + + int opened; +}; + +struct btrfs_bio_stripe { + struct btrfs_device *dev; + u64 physical; +}; + +struct btrfs_multi_bio { + atomic_t stripes_pending; + bio_end_io_t *end_io; + struct bio *orig_bio; + void *private; + atomic_t error; + int max_errors; + int num_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num); +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len); +int btrfs_read_sys_array(struct btrfs_root *root); +int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type); +void btrfs_mapping_init(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit); +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder); +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret); +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int 
btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page); +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size); +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid); +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); +int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_balance(struct btrfs_root *dev_root); +void btrfs_unlock_volumes(void); +void btrfs_lock_volumes(void); +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +#endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c new file mode 100644 index 00000000000..7f332e27089 --- /dev/null +++ b/fs/btrfs/xattr.c @@ -0,0 +1,322 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/xattr.h> +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "xattr.h" +#include "disk-io.h" + + +ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret = 0; + unsigned long data_ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* lookup the xattr by name */ + di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, + strlen(name), 0); + if (!di || IS_ERR(di)) { + ret = -ENODATA; + goto out; + } + + leaf = path->nodes[0]; + /* if size is 0, that means we want the size of the attr */ + if (!size) { + ret = btrfs_dir_data_len(leaf, di); + goto out; + } + + /* now get the data out of our dir_item */ + if (btrfs_dir_data_len(leaf, di) > size) { + ret = -ERANGE; + goto out; + } + data_ptr = (unsigned long)((char *)(di + 1) + + btrfs_dir_name_len(leaf, di)); + read_extent_buffer(leaf, buffer, data_ptr, + btrfs_dir_data_len(leaf, di)); + ret = btrfs_dir_data_len(leaf, di); + +out: + btrfs_free_path(path); + return ret; +} + +int __btrfs_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + int ret = 0, mod = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + /* first lets see if we already have this xattr */ + di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, + strlen(name), -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + /* ok we 
already have this xattr, lets remove it */ + if (di) { + /* if we want create only exit */ + if (flags & XATTR_CREATE) { + ret = -EEXIST; + goto out; + } + + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto out; + btrfs_release_path(root, path); + + /* if we don't have a value then we are removing the xattr */ + if (!value) { + mod = 1; + goto out; + } + } else { + btrfs_release_path(root, path); + + if (flags & XATTR_REPLACE) { + /* we couldn't find the attr to replace */ + ret = -ENODATA; + goto out; + } + } + + /* ok we have to create a completely new xattr */ + ret = btrfs_insert_xattr_item(trans, root, name, strlen(name), + value, size, inode->i_ino); + if (ret) + goto out; + mod = 1; + +out: + if (mod) { + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + } + + btrfs_end_transaction(trans, root); + btrfs_free_path(path); + return ret; +} + +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct btrfs_key key, found_key; + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_item *item; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + int ret = 0, slot, advance; + size_t total_size = 0, size_left = size; + unsigned long name_ptr; + size_t name_len; + u32 nritems; + + /* + * ok we want all objects associated with this id. + * NOTE: we set key.offset = 0; because we want to start with the + * first xattr that we find and walk forward + */ + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + /* search for our xattrs */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + ret = 0; + advance = 0; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + + /* this is where we start walking through the path */ + if (advance || slot >= nritems) { + /* + * if we've reached the last slot in this leaf we need + * to go to the next leaf and reset everything + */ + if (slot >= nritems-1) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } else { + /* + * just walking through the slots on this leaf + */ + slot++; + path->slots[0]++; + } + } + advance = 1; + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) + break; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + + name_len = btrfs_dir_name_len(leaf, di); + total_size += name_len + 1; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + continue; + + if (!buffer || (name_len + 1) > size_left) { + ret = -ERANGE; + goto err; + } + + name_ptr = (unsigned long)(di + 1); + read_extent_buffer(leaf, buffer, name_ptr, name_len); + buffer[name_len] = '\0'; + + size_left -= name_len + 1; + buffer += name_len + 1; + } + ret = total_size; + +err: + btrfs_free_path(path); + + return ret; +} + +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. 
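+ * The array below is NULL-terminated; only the POSIX ACL handlers are + * listed, and only when CONFIG_FS_POSIX_ACL is enabled.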
+ */ +struct xattr_handler *btrfs_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, +#endif + NULL, +}; + +/* + * Check if the attribute is in a supported namespace. + * + * This applied after the check for the synthetic attributes in the system + * namespace. + */ +static bool btrfs_is_valid_xattr(const char *name) +{ + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_getxattr(dentry->d_inode, name, buffer, size); +} + +int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + return __btrfs_setxattr(dentry->d_inode, name, value, size, flags); +} + +int btrfs_removexattr(struct dentry *dentry, const char *name) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h new file mode 100644 index 00000000000..5b1d08f8e68 --- /dev/null +++ b/fs/btrfs/xattr.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __XATTR__ +#define __XATTR__ + +#include <linux/xattr.h> + +extern struct xattr_handler btrfs_xattr_acl_access_handler; +extern struct xattr_handler btrfs_xattr_acl_default_handler; +extern struct xattr_handler *btrfs_xattr_handlers[]; + +extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size); +extern int __btrfs_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags); + +extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +extern int btrfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern int btrfs_removexattr(struct dentry *dentry, const char *name); + +#endif /* __XATTR__ */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 00000000000..ecfbce836d3 --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,632 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. + * Created by David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/zlib.h> +#include <linux/zutil.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include "compression.h" + +/* Plan: call deflate() with avail_in == *sourcelen, + avail_out = *dstlen - 12 and flush == Z_FINISH. + If it doesn't manage to finish, call it again with + avail_in == 0 and avail_out set to the remaining 12 + bytes for it to clean up. + Q: Is 12 bytes sufficient? +*/ +#define STREAM_END_SPACE 12 + +struct workspace { + z_stream inf_strm; + z_stream def_strm; + char *buf; + struct list_head list; +}; + +static LIST_HEAD(idle_workspace); +static DEFINE_SPINLOCK(workspace_lock); +static unsigned long num_workspace; +static atomic_t alloc_workspace = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); + +/* + * this finds an available zlib workspace or allocates a new one + * NULL or an ERR_PTR is returned if things go bad. 
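+ * (in practice only ERR_PTR(-ENOMEM) is returned on failure, never NULL; + * at most roughly one workspace per online CPU is kept allocated, and + * callers that race past that limit sleep on workspace_wait until a + * workspace is freed)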
+ */ +static struct workspace *find_zlib_workspace(void) +{ + struct workspace *workspace; + int ret; + int cpus = num_online_cpus(); + +again: + spin_lock(&workspace_lock); + if (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + num_workspace--; + spin_unlock(&workspace_lock); + return workspace; + + } + spin_unlock(&workspace_lock); + if (atomic_read(&alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&alloc_workspace) > cpus) + schedule(); + finish_wait(&workspace_wait, &wait); + goto again; + } + atomic_inc(&alloc_workspace); + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) { + ret = -ENOMEM; + goto fail; + } + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); + if (!workspace->def_strm.workspace) { + ret = -ENOMEM; + goto fail; + } + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!workspace->inf_strm.workspace) { + ret = -ENOMEM; + goto fail_inflate; + } + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->buf) { + ret = -ENOMEM; + goto fail_kmalloc; + } + return workspace; + +fail_kmalloc: + vfree(workspace->inf_strm.workspace); +fail_inflate: + vfree(workspace->def_strm.workspace); +fail: + kfree(workspace); + atomic_dec(&alloc_workspace); + wake_up(&workspace_wait); + return ERR_PTR(ret); +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static int free_workspace(struct workspace *workspace) +{ + spin_lock(&workspace_lock); + if (num_workspace < num_online_cpus()) { + list_add_tail(&workspace->list, &idle_workspace); + num_workspace++; + spin_unlock(&workspace_lock); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; + } + spin_unlock(&workspace_lock); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + + atomic_dec(&alloc_workspace); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct workspace *workspace; + while (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + atomic_dec(&alloc_workspace); + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. 
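+ * + * For example (hypothetical numbers): a 64K input that compresses 2:1 + * would come back with *total_in == 65536, *total_out == 32768 and + * *out_pages == 8 (with 4K pages), assuming nr_dest_pages and max_out + * allow it.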
+ * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + int ret; + struct workspace *workspace; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + int out_written = 0; + int in_read = 0; + unsigned long bytes_left; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + workspace = find_zlib_workspace(); + if (!workspace) + return -1; + + if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + ret = -1; + goto out; + } + + workspace->def_strm.total_in = 0; + workspace->def_strm.total_out = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->def_strm.next_in = data_in; + workspace->def_strm.next_out = cpage_out; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + + out_written = 0; + in_read = 0; + + while (workspace->def_strm.total_in < len) { + ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->def_strm); + ret = -1; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->def_strm.total_in > 8192 && + workspace->def_strm.total_in < + workspace->def_strm.total_out) { + ret = -1; + goto out; + } + /* we need another page for writing out. 
Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->def_strm.avail_out == 0) { + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->def_strm.total_in >= len) + break; + + /* we've read in a full page, get a new one */ + if (workspace->def_strm.avail_in == 0) { + if (workspace->def_strm.total_out > max_out) + break; + + bytes_left = len - workspace->def_strm.total_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, + start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + workspace->def_strm.avail_in = min(bytes_left, + PAGE_CACHE_SIZE); + workspace->def_strm.next_in = data_in; + } + } + workspace->def_strm.avail_in = 0; + ret = zlib_deflate(&workspace->def_strm, Z_FINISH); + zlib_deflateEnd(&workspace->def_strm); + + if (ret != Z_STREAM_END) { + ret = -1; + goto out; + } + + if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + ret = -1; + goto out; + } + + ret = 0; + *total_out = workspace->def_strm.total_out; + *total_in = workspace->def_strm.total_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + free_workspace(workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + char *data_in; + size_t total_out = 0; + unsigned long page_bytes_left; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + struct page *page_out; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + unsigned long start_byte; + unsigned long current_buf_start; + char *kaddr; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.total_out = 0; + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + page_out = bvec[page_out_index].bv_page; + page_bytes_left = PAGE_CACHE_SIZE; + pg_offset = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
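+ (The negative window-bits value passed to zlib_inflateInit2() below puts + zlib into raw deflate mode, with the window size taken from the CINFO + nibble of the header; the two zlib header bytes are then skipped by hand + by advancing next_in past them.) 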
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + while (workspace->inf_strm.total_in < srclen) { + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + /* + * buf start is the byte offset we're of the start of + * our workspace buffer + */ + buf_start = total_out; + + /* total_out is the last byte of the workspace buffer */ + total_out = workspace->inf_strm.total_out; + + working_bytes = total_out - buf_start; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + if (working_bytes == 0) { + /* we didn't make progress in this inflate + * call, we're done + */ + if (ret != Z_STREAM_END) + ret = -1; + break; + } + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + goto next; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, + bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + pg_offset += bytes; + page_bytes_left -= bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (page_bytes_left == 0) { + page_out_index++; + if (page_out_index >= vcnt) { + ret = 0; + goto done; + } + + page_out = bvec[page_out_index].bv_page; + pg_offset = 0; + page_bytes_left = PAGE_CACHE_SIZE; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + goto next; + + /* the next page in the biovec might not + * be adjacent to the last page, but it + * might still be found inside this working + * buffer. 
bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + + buf_offset; + } + } + } +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + + if (workspace->inf_strm.avail_in == 0) { + unsigned long tmp; + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + tmp = srclen - workspace->inf_strm.total_in; + workspace->inf_strm.avail_in = min(tmp, + PAGE_CACHE_SIZE); + } + } + if (ret != Z_STREAM_END) + ret = -1; + else + ret = 0; +done: + zlib_inflateEnd(&workspace->inf_strm); + if (data_in) + kunmap(pages_in[page_in_index]); +out: + free_workspace(workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + unsigned long bytes_left = destlen; + unsigned long total_out = 0; + char *kaddr; + + if (destlen > PAGE_CACHE_SIZE) + return -ENOMEM; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = srclen; + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->inf_strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + + while (bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long pg_offset = 0; + + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + if (total_out == buf_start) { + ret = -1; + break; + } + + if (total_out <= start_byte) + goto next; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, bytes_left); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + } + + if (ret != Z_STREAM_END && bytes_left != 0) + ret = -1; + else + ret = 0; + + zlib_inflateEnd(&workspace->inf_strm); +out: + free_workspace(workspace); + return ret; +} + +void btrfs_zlib_exit(void) +{ + free_workspaces(); +} diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c index c73fa89b5f8..170d289ac78 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c @@ -22,9 +22,7 @@ #define BIT_DIVIDER_MIPS 1043 -static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */ - -#include <linux/errno.h> +static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241}; struct pushpull { unsigned char *buf; @@ -43,7 +41,9 @@ struct rubin_state { int bits[8]; }; -static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve) +static inline void init_pushpull(struct pushpull *pp, char *buf, + unsigned buflen, unsigned ofs, + unsigned reserve) { pp->buf = buf; pp->buflen = buflen; @@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) { - if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) { + if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) return -ENOSPC; - } - if (bit) { - pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7))); - } - else { - pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7))); - } + if (bit) + pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7))); + else + pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7))); + pp->ofs++; return 0; @@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits) rs->p = (long) (2 * UPPER_BIT_RUBIN); rs->bit_number = (long) 0; rs->bit_divider = div; + for (c=0; c<8; c++) rs->bits[c] = bits[c]; } @@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol) long i0, i1; int ret; - while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { + while ((rs->q >= UPPER_BIT_RUBIN) || + ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { rs->bit_number++; ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 
1 : 0, 0); @@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol) rs->p <<= 1; } i0 = A * rs->p / (A + B); - if (i0 <= 0) { + if (i0 <= 0) i0 = 1; - } - if (i0 >= rs->p) { + + if (i0 >= rs->p) i0 = rs->p - 1; - } + i1 = rs->p - i0; if (symbol == 0) @@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits) /* behalve lower */ rs->rec_q = 0; - for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) + for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; + rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) ; } -static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q) +static void __do_decode(struct rubin_state *rs, unsigned long p, + unsigned long q) { register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; unsigned long rec_q; @@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B) __do_decode(rs, p, q); i0 = A * rs->p / (A + B); - if (i0 <= 0) { + if (i0 <= 0) i0 = 1; - } - if (i0 >= rs->p) { + + if (i0 >= rs->p) i0 = rs->p - 1; - } threshold = rs->q + i0; symbol = rs->rec_q >= threshold; @@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte) struct rubin_state rs_copy; rs_copy = *rs; - for (i=0;i<8;i++) { - ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1); + for (i=0; i<8; i++) { + ret = encode(rs, rs->bit_divider-rs->bits[i], + rs->bits[i], byte & 1); if (ret) { /* Failed. Restore old state */ *rs = rs_copy; return ret; } - byte=byte>>1; + byte >>= 1 ; } return 0; } @@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs) int i, result = 0, bit_divider = rs->bit_divider; for (i = 0; i < 8; i++) - result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i; + result |= decode(rs, bit_divider - rs->bits[i], + rs->bits[i]) << i; return result; } @@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs) static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, - unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) + unsigned char *cpage_out, uint32_t *sourcelen, + uint32_t *dstlen) { int outpos = 0; int pos=0; @@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen, void *model) { - return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); + return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); } #endif static int jffs2_dynrubin_compress(unsigned char *data_in, @@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, return -1; memset(histo, 0, 256); - for (i=0; i<mysrclen; i++) { + for (i=0; i<mysrclen; i++) histo[data_in[i]]++; - } memset(bits, 0, sizeof(int)*8); for (i=0; i<256; i++) { if (i&128) @@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, cpage_out[i] = bits[i]; } - ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen); + ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, + &mydstlen); if (ret) return ret; @@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, return 0; } -static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in, - unsigned char *page_out, uint32_t srclen, uint32_t destlen) +static void 
rubin_do_decompress(int bit_divider, int *bits, + unsigned char *cdata_in, + unsigned char *page_out, uint32_t srclen, + uint32_t destlen) { int outpos = 0; struct rubin_state rs; @@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); init_decode(&rs, bit_divider, bits); - while (outpos < destlen) { + while (outpos < destlen) page_out[outpos++] = in_byte(&rs); - } } @@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in, uint32_t sourcelen, uint32_t dstlen, void *model) { - rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); + rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); return 0; } @@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in, for (c=0; c<8; c++) bits[c] = data_in[c]; - rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen); + rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, + dstlen); return 0; } static struct jffs2_compressor jffs2_rubinmips_comp = { - .priority = JFFS2_RUBINMIPS_PRIORITY, - .name = "rubinmips", - .compr = JFFS2_COMPR_DYNRUBIN, - .compress = NULL, /*&jffs2_rubinmips_compress,*/ - .decompress = &jffs2_rubinmips_decompress, + .priority = JFFS2_RUBINMIPS_PRIORITY, + .name = "rubinmips", + .compr = JFFS2_COMPR_DYNRUBIN, + .compress = NULL, /*&jffs2_rubinmips_compress,*/ + .decompress = &jffs2_rubinmips_decompress, #ifdef JFFS2_RUBINMIPS_DISABLED - .disabled = 1, + .disabled = 1, #else - .disabled = 0, + .disabled = 0, #endif }; int jffs2_rubinmips_init(void) { - return jffs2_register_compressor(&jffs2_rubinmips_comp); + return jffs2_register_compressor(&jffs2_rubinmips_comp); } void jffs2_rubinmips_exit(void) { - jffs2_unregister_compressor(&jffs2_rubinmips_comp); + jffs2_unregister_compressor(&jffs2_rubinmips_comp); } static struct jffs2_compressor jffs2_dynrubin_comp = { - .priority = JFFS2_DYNRUBIN_PRIORITY, - .name = "dynrubin", - .compr = JFFS2_COMPR_RUBINMIPS, - .compress = jffs2_dynrubin_compress, - .decompress = &jffs2_dynrubin_decompress, + .priority = JFFS2_DYNRUBIN_PRIORITY, + .name = "dynrubin", + .compr = JFFS2_COMPR_RUBINMIPS, + .compress = jffs2_dynrubin_compress, + .decompress = &jffs2_dynrubin_decompress, #ifdef JFFS2_DYNRUBIN_DISABLED - .disabled = 1, + .disabled = 1, #else - .disabled = 0, + .disabled = 0, #endif }; int jffs2_dynrubin_init(void) { - return jffs2_register_compressor(&jffs2_dynrubin_comp); + return jffs2_register_compressor(&jffs2_dynrubin_comp); } void jffs2_dynrubin_exit(void) { - jffs2_unregister_compressor(&jffs2_dynrubin_comp); + jffs2_unregister_compressor(&jffs2_dynrubin_comp); } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 259461b910a..c32b4a1ad6c 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock { /* For NAND, if the failure did not occur at the device level for a specific physical page, don't bother updating the bad block table. */ - if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { + if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) { /* We had a device-level failure to erase. Let's see if we've failed too many times. 
*/ if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { @@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr) struct erase_priv_struct *priv = (void *)instr->priv; if(instr->state != MTD_ERASE_DONE) { - printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state); + printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", + (unsigned long long)instr->addr, instr->state); jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); } else { jffs2_erase_succeeded(priv->c, priv->jeb); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 5198ada6739..6d720243f5f 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno) blk_free_devt(part_devt(part)); rcu_assign_pointer(ptbl->part[partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); diff --git a/include/acpi/acdisasm.h b/include/acpi/acdisasm.h deleted file mode 100644 index 0c1ed387073..00000000000 --- a/include/acpi/acdisasm.h +++ /dev/null @@ -1,445 +0,0 @@ -/****************************************************************************** - * - * Name: acdisasm.h - AML disassembler - * - *****************************************************************************/ - -/* - * Copyright (C) 2000 - 2008, Intel Corp. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce at minimum a disclaimer - * substantially similar to the "NO WARRANTY" disclaimer below - * ("Disclaimer") and any redistribution must be conditioned upon - * including a substantially similar Disclaimer requirement for further - * binary redistribution. - * 3. Neither the names of the above-listed copyright holders nor the names - * of any contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGES. 
- */ - -#ifndef __ACDISASM_H__ -#define __ACDISASM_H__ - -#include "amlresrc.h" - -#define BLOCK_NONE 0 -#define BLOCK_PAREN 1 -#define BLOCK_BRACE 2 -#define BLOCK_COMMA_LIST 4 -#define ACPI_DEFAULT_RESNAME *(u32 *) "__RD" - -struct acpi_external_list { - char *path; - char *internal_path; - struct acpi_external_list *next; - u32 value; - u16 length; - u8 type; -}; - -extern struct acpi_external_list *acpi_gbl_external_list; - -typedef const struct acpi_dmtable_info { - u8 opcode; - u8 offset; - char *name; - -} acpi_dmtable_info; - -/* - * Values for Opcode above. - * Note: 0-7 must not change, used as a flag shift value - */ -#define ACPI_DMT_FLAG0 0 -#define ACPI_DMT_FLAG1 1 -#define ACPI_DMT_FLAG2 2 -#define ACPI_DMT_FLAG3 3 -#define ACPI_DMT_FLAG4 4 -#define ACPI_DMT_FLAG5 5 -#define ACPI_DMT_FLAG6 6 -#define ACPI_DMT_FLAG7 7 -#define ACPI_DMT_FLAGS0 8 -#define ACPI_DMT_FLAGS2 9 -#define ACPI_DMT_UINT8 10 -#define ACPI_DMT_UINT16 11 -#define ACPI_DMT_UINT24 12 -#define ACPI_DMT_UINT32 13 -#define ACPI_DMT_UINT56 14 -#define ACPI_DMT_UINT64 15 -#define ACPI_DMT_STRING 16 -#define ACPI_DMT_NAME4 17 -#define ACPI_DMT_NAME6 18 -#define ACPI_DMT_NAME8 19 -#define ACPI_DMT_CHKSUM 20 -#define ACPI_DMT_SPACEID 21 -#define ACPI_DMT_GAS 22 -#define ACPI_DMT_ASF 23 -#define ACPI_DMT_DMAR 24 -#define ACPI_DMT_HEST 25 -#define ACPI_DMT_HESTNTFY 26 -#define ACPI_DMT_HESTNTYP 27 -#define ACPI_DMT_MADT 28 -#define ACPI_DMT_SRAT 29 -#define ACPI_DMT_EXIT 30 -#define ACPI_DMT_SIG 31 - -typedef -void (*acpi_dmtable_handler) (struct acpi_table_header * table); - -struct acpi_dmtable_data { - char *signature; - struct acpi_dmtable_info *table_info; - acpi_dmtable_handler table_handler; - char *name; -}; - -struct acpi_op_walk_info { - u32 level; - u32 last_level; - u32 count; - u32 bit_offset; - u32 flags; - struct acpi_walk_state *walk_state; -}; - -typedef -acpi_status(*asl_walk_callback) (union acpi_parse_object * op, - u32 level, void *context); - -struct acpi_resource_tag { - u32 bit_index; - char *tag; -}; - -/* Strings used for decoding flags to ASL keywords */ - -extern const char *acpi_gbl_word_decode[]; -extern const char *acpi_gbl_irq_decode[]; -extern const char *acpi_gbl_lock_rule[]; -extern const char *acpi_gbl_access_types[]; -extern const char *acpi_gbl_update_rules[]; -extern const char *acpi_gbl_match_ops[]; - -extern struct acpi_dmtable_info acpi_dm_table_info_asf0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf1a[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf2[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf2a[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf3[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf4[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf_hdr[]; -extern struct acpi_dmtable_info acpi_dm_table_info_boot[]; -extern struct acpi_dmtable_info acpi_dm_table_info_bert[]; -extern struct acpi_dmtable_info acpi_dm_table_info_cpep[]; -extern struct acpi_dmtable_info acpi_dm_table_info_cpep0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dbgp[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar_hdr[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar_scope[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar2[]; -extern struct acpi_dmtable_info 
acpi_dm_table_info_ecdt[]; -extern struct acpi_dmtable_info acpi_dm_table_info_einj[]; -extern struct acpi_dmtable_info acpi_dm_table_info_einj0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_erst[]; -extern struct acpi_dmtable_info acpi_dm_table_info_facs[]; -extern struct acpi_dmtable_info acpi_dm_table_info_fadt1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_fadt2[]; -extern struct acpi_dmtable_info acpi_dm_table_info_gas[]; -extern struct acpi_dmtable_info acpi_dm_table_info_header[]; -extern struct acpi_dmtable_info acpi_dm_table_info_hest[]; -extern struct acpi_dmtable_info acpi_dm_table_info_hest9[]; -extern struct acpi_dmtable_info acpi_dm_table_info_hest_notify[]; -extern struct acpi_dmtable_info acpi_dm_table_info_hpet[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt2[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt3[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt4[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt5[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt6[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt7[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt8[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt9[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt10[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt_hdr[]; -extern struct acpi_dmtable_info acpi_dm_table_info_mcfg[]; -extern struct acpi_dmtable_info acpi_dm_table_info_mcfg0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_rsdp1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_rsdp2[]; -extern struct acpi_dmtable_info acpi_dm_table_info_sbst[]; -extern struct acpi_dmtable_info acpi_dm_table_info_slic[]; -extern struct acpi_dmtable_info acpi_dm_table_info_slit[]; -extern struct acpi_dmtable_info acpi_dm_table_info_spcr[]; -extern struct acpi_dmtable_info acpi_dm_table_info_spmi[]; -extern struct acpi_dmtable_info acpi_dm_table_info_srat[]; -extern struct acpi_dmtable_info acpi_dm_table_info_srat_hdr[]; -extern struct acpi_dmtable_info acpi_dm_table_info_srat0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_srat1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_srat2[]; -extern struct acpi_dmtable_info acpi_dm_table_info_tcpa[]; -extern struct acpi_dmtable_info acpi_dm_table_info_wdrt[]; - -/* - * dmtable - */ -void acpi_dm_dump_data_table(struct acpi_table_header *table); - -acpi_status -acpi_dm_dump_table(u32 table_length, - u32 table_offset, - void *table, - u32 sub_table_length, struct acpi_dmtable_info *info); - -void acpi_dm_line_header(u32 offset, u32 byte_length, char *name); - -void acpi_dm_line_header2(u32 offset, u32 byte_length, char *name, u32 value); - -/* - * dmtbdump - */ -void acpi_dm_dump_asf(struct acpi_table_header *table); - -void acpi_dm_dump_cpep(struct acpi_table_header *table); - -void acpi_dm_dump_dmar(struct acpi_table_header *table); - -void acpi_dm_dump_einj(struct acpi_table_header *table); - -void acpi_dm_dump_erst(struct acpi_table_header *table); - -void acpi_dm_dump_fadt(struct acpi_table_header *table); - -void acpi_dm_dump_hest(struct acpi_table_header *table); - -void acpi_dm_dump_mcfg(struct acpi_table_header *table); - -void acpi_dm_dump_madt(struct acpi_table_header *table); - -u32 acpi_dm_dump_rsdp(struct acpi_table_header *table); - -void 
acpi_dm_dump_rsdt(struct acpi_table_header *table); - -void acpi_dm_dump_slit(struct acpi_table_header *table); - -void acpi_dm_dump_srat(struct acpi_table_header *table); - -void acpi_dm_dump_xsdt(struct acpi_table_header *table); - -/* - * dmwalk - */ -void -acpi_dm_disassemble(struct acpi_walk_state *walk_state, - union acpi_parse_object *origin, u32 num_opcodes); - -void -acpi_dm_walk_parse_tree(union acpi_parse_object *op, - asl_walk_callback descending_callback, - asl_walk_callback ascending_callback, void *context); - -/* - * dmopcode - */ -void -acpi_dm_disassemble_one_op(struct acpi_walk_state *walk_state, - struct acpi_op_walk_info *info, - union acpi_parse_object *op); - -void acpi_dm_decode_internal_object(union acpi_operand_object *obj_desc); - -u32 acpi_dm_list_type(union acpi_parse_object *op); - -void acpi_dm_method_flags(union acpi_parse_object *op); - -void acpi_dm_field_flags(union acpi_parse_object *op); - -void acpi_dm_address_space(u8 space_id); - -void acpi_dm_region_flags(union acpi_parse_object *op); - -void acpi_dm_match_op(union acpi_parse_object *op); - -u8 acpi_dm_comma_if_list_member(union acpi_parse_object *op); - -void acpi_dm_comma_if_field_member(union acpi_parse_object *op); - -/* - * dmnames - */ -u32 acpi_dm_dump_name(char *name); - -acpi_status -acpi_ps_display_object_pathname(struct acpi_walk_state *walk_state, - union acpi_parse_object *op); - -void acpi_dm_namestring(char *name); - -/* - * dmobject - */ -void -acpi_dm_display_internal_object(union acpi_operand_object *obj_desc, - struct acpi_walk_state *walk_state); - -void acpi_dm_display_arguments(struct acpi_walk_state *walk_state); - -void acpi_dm_display_locals(struct acpi_walk_state *walk_state); - -void -acpi_dm_dump_method_info(acpi_status status, - struct acpi_walk_state *walk_state, - union acpi_parse_object *op); - -/* - * dmbuffer - */ -void acpi_dm_disasm_byte_list(u32 level, u8 * byte_data, u32 byte_count); - -void -acpi_dm_byte_list(struct acpi_op_walk_info *info, union acpi_parse_object *op); - -void acpi_dm_is_eisa_id(union acpi_parse_object *op); - -void acpi_dm_eisa_id(u32 encoded_id); - -u8 acpi_dm_is_unicode_buffer(union acpi_parse_object *op); - -u8 acpi_dm_is_string_buffer(union acpi_parse_object *op); - -/* - * dmresrc - */ -void acpi_dm_dump_integer8(u8 value, char *name); - -void acpi_dm_dump_integer16(u16 value, char *name); - -void acpi_dm_dump_integer32(u32 value, char *name); - -void acpi_dm_dump_integer64(u64 value, char *name); - -void -acpi_dm_resource_template(struct acpi_op_walk_info *info, - union acpi_parse_object *op, - u8 * byte_data, u32 byte_count); - -acpi_status acpi_dm_is_resource_template(union acpi_parse_object *op); - -void acpi_dm_indent(u32 level); - -void acpi_dm_bit_list(u16 mask); - -void acpi_dm_decode_attribute(u8 attribute); - -void acpi_dm_descriptor_name(void); - -/* - * dmresrcl - */ -void -acpi_dm_word_descriptor(union aml_resource *resource, u32 length, u32 level); - -void -acpi_dm_dword_descriptor(union aml_resource *resource, u32 length, u32 level); - -void -acpi_dm_extended_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_qword_descriptor(union aml_resource *resource, u32 length, u32 level); - -void -acpi_dm_memory24_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_memory32_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_fixed_memory32_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void 
-acpi_dm_generic_register_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_interrupt_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_vendor_large_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void acpi_dm_vendor_common(char *name, u8 * byte_data, u32 length, u32 level); - -/* - * dmresrcs - */ -void -acpi_dm_irq_descriptor(union aml_resource *resource, u32 length, u32 level); - -void -acpi_dm_dma_descriptor(union aml_resource *resource, u32 length, u32 level); - -void acpi_dm_io_descriptor(union aml_resource *resource, u32 length, u32 level); - -void -acpi_dm_fixed_io_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_start_dependent_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_end_dependent_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_vendor_small_descriptor(union aml_resource *resource, - u32 length, u32 level); - -/* - * dmutils - */ -void acpi_dm_add_to_external_list(char *path, u8 type, u32 value); - -/* - * dmrestag - */ -void acpi_dm_find_resources(union acpi_parse_object *root); - -void -acpi_dm_check_resource_reference(union acpi_parse_object *op, - struct acpi_walk_state *walk_state); - -#endif /* __ACDISASM_H__ */ diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index 84f5cb24286..eda04546cdf 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -153,8 +153,9 @@ #define AE_AML_CIRCULAR_REFERENCE (acpi_status) (0x001E | AE_CODE_AML) #define AE_AML_BAD_RESOURCE_LENGTH (acpi_status) (0x001F | AE_CODE_AML) #define AE_AML_ILLEGAL_ADDRESS (acpi_status) (0x0020 | AE_CODE_AML) +#define AE_AML_INFINITE_LOOP (acpi_status) (0x0021 | AE_CODE_AML) -#define AE_CODE_AML_MAX 0x0020 +#define AE_CODE_AML_MAX 0x0021 /* * Internal exceptions used for control @@ -175,6 +176,8 @@ #define AE_CODE_CTRL_MAX 0x000D +/* Exception strings for acpi_format_exception */ + #ifdef DEFINE_ACPI_GLOBALS /* @@ -267,6 +270,7 @@ char const *acpi_gbl_exception_names_aml[] = { "AE_AML_CIRCULAR_REFERENCE", "AE_AML_BAD_RESOURCE_LENGTH", "AE_AML_ILLEGAL_ADDRESS", + "AE_AML_INFINITE_LOOP" }; char const *acpi_gbl_exception_names_ctrl[] = { diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h index db8852d8bcf..5c823d5ab78 100644 --- a/include/acpi/acoutput.h +++ b/include/acpi/acoutput.h @@ -45,9 +45,9 @@ #define __ACOUTPUT_H__ /* - * Debug levels and component IDs. These are used to control the - * granularity of the output of the DEBUG_PRINT macro -- on a per- - * component basis and a per-exception-type basis. + * Debug levels and component IDs. These are used to control the + * granularity of the output of the ACPI_DEBUG_PRINT macro -- on a + * per-component basis and a per-exception-type basis. 
*/ /* Component IDs are used in the global "DebugLayer" */ @@ -69,8 +69,10 @@ #define ACPI_COMPILER 0x00001000 #define ACPI_TOOLS 0x00002000 +#define ACPI_EXAMPLE 0x00004000 +#define ACPI_DRIVER 0x00008000 -#define ACPI_ALL_COMPONENTS 0x00003FFF +#define ACPI_ALL_COMPONENTS 0x0000FFFF #define ACPI_COMPONENT_DEFAULT (ACPI_ALL_COMPONENTS) /* Component IDs reserved for ACPI drivers */ @@ -78,7 +80,7 @@ #define ACPI_ALL_DRIVERS 0xFFFF0000 /* - * Raw debug output levels, do not use these in the DEBUG_PRINT macros + * Raw debug output levels, do not use these in the ACPI_DEBUG_PRINT macros */ #define ACPI_LV_INIT 0x00000001 #define ACPI_LV_DEBUG_OBJECT 0x00000002 @@ -176,4 +178,95 @@ #define ACPI_NORMAL_DEFAULT (ACPI_LV_INIT | ACPI_LV_DEBUG_OBJECT) #define ACPI_DEBUG_ALL (ACPI_LV_AML_DISASSEMBLE | ACPI_LV_ALL_EXCEPTIONS | ACPI_LV_ALL) +#if defined (ACPI_DEBUG_OUTPUT) || !defined (ACPI_NO_ERROR_MESSAGES) +/* + * Module name is included in both debug and non-debug versions primarily for + * error messages. The __FILE__ macro is not very useful for this, because it + * often includes the entire pathname to the module + */ +#define ACPI_MODULE_NAME(name) static const char ACPI_UNUSED_VAR _acpi_module_name[] = name; +#else +#define ACPI_MODULE_NAME(name) +#endif + +/* + * Ascii error messages can be configured out + */ +#ifndef ACPI_NO_ERROR_MESSAGES +#define AE_INFO _acpi_module_name, __LINE__ + +/* + * Error reporting. Callers module and line number are inserted by AE_INFO, + * the plist contains a set of parens to allow variable-length lists. + * These macros are used for both the debug and non-debug versions of the code. + */ +#define ACPI_INFO(plist) acpi_info plist +#define ACPI_WARNING(plist) acpi_warning plist +#define ACPI_EXCEPTION(plist) acpi_exception plist +#define ACPI_ERROR(plist) acpi_error plist + +#else + +/* No error messages */ + +#define ACPI_INFO(plist) +#define ACPI_WARNING(plist) +#define ACPI_EXCEPTION(plist) +#define ACPI_ERROR(plist) + +#endif /* ACPI_NO_ERROR_MESSAGES */ + +/* + * Debug macros that are conditionally compiled + */ +#ifdef ACPI_DEBUG_OUTPUT + +/* + * If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header, + * define it now. This is the case where there the compiler does not support + * a __FUNCTION__ macro or equivalent. + */ +#ifndef ACPI_GET_FUNCTION_NAME +#define ACPI_GET_FUNCTION_NAME _acpi_function_name + +/* + * The Name parameter should be the procedure name as a quoted string. + * The function name is also used by the function exit macros below. + * Note: (const char) is used to be compatible with the debug interfaces + * and macros such as __FUNCTION__. 
+ */ +#define ACPI_FUNCTION_NAME(name) static const char _acpi_function_name[] = #name; + +#else +/* Compiler supports __FUNCTION__ (or equivalent) -- Ignore this macro */ + +#define ACPI_FUNCTION_NAME(name) +#endif /* ACPI_GET_FUNCTION_NAME */ + +/* + * Common parameters used for debug output functions: + * line number, function name, module(file) name, component ID + */ +#define ACPI_DEBUG_PARAMETERS __LINE__, ACPI_GET_FUNCTION_NAME, _acpi_module_name, _COMPONENT + +/* + * Master debug print macros + * Print message if and only if: + * 1) Debug print for the current component is enabled + * 2) Debug error level or trace level for the print statement is enabled + */ +#define ACPI_DEBUG_PRINT(plist) acpi_debug_print plist +#define ACPI_DEBUG_PRINT_RAW(plist) acpi_debug_print_raw plist + +#else +/* + * This is the non-debug case -- make everything go away, + * leaving no executable debug code! + */ +#define ACPI_FUNCTION_NAME(a) +#define ACPI_DEBUG_PRINT(pl) +#define ACPI_DEBUG_PRINT_RAW(pl) + +#endif /* ACPI_DEBUG_OUTPUT */ + #endif /* __ACOUTPUT_H__ */ diff --git a/include/acpi/acpi.h b/include/acpi/acpi.h index c515ef6cc89..472b7bf0c5d 100644 --- a/include/acpi/acpi.h +++ b/include/acpi/acpi.h @@ -1,6 +1,6 @@ /****************************************************************************** * - * Name: acpi.h - Master include file, Publics and external data. + * Name: acpi.h - Master public include file used to interface to ACPICA * *****************************************************************************/ @@ -45,25 +45,22 @@ #define __ACPI_H__ /* - * Common includes for all ACPI driver files - * We put them here because we don't want to duplicate them - * in the rest of the source code again and again. + * Public include files for use by code that will interface to ACPICA. + * + * Information includes the ACPICA data types, names, exceptions, and + * external interface prototypes. Also included are the definitions for + * all ACPI tables (FADT, MADT, etc.) + * + * Note: The order of these include files is important. 
*/ -#include "acnames.h" /* Global ACPI names and strings */ -#include "acconfig.h" /* Configuration constants */ -#include "platform/acenv.h" /* Target environment specific items */ -#include "actypes.h" /* Fundamental common data types */ -#include "acexcep.h" /* ACPI exception codes */ -#include "acmacros.h" /* C macros */ +#include "platform/acenv.h" /* Environment-specific items */ +#include "acnames.h" /* Common ACPI names and strings */ +#include "actypes.h" /* ACPICA data types and structures */ +#include "acexcep.h" /* ACPICA exceptions */ #include "actbl.h" /* ACPI table definitions */ -#include "aclocal.h" /* Internal data types */ #include "acoutput.h" /* Error output and Debug macros */ -#include "acpiosxf.h" /* Interfaces to the ACPI-to-OS layer */ +#include "acrestyp.h" /* Resource Descriptor structs */ +#include "acpiosxf.h" /* OSL interfaces (ACPICA-to-OS) */ #include "acpixf.h" /* ACPI core subsystem external interfaces */ -#include "acobject.h" /* ACPI internal object */ -#include "acstruct.h" /* Common structures */ -#include "acglobal.h" /* All global variables */ -#include "achware.h" /* Hardware defines and interfaces */ -#include "acutils.h" /* Utility interfaces */ #endif /* __ACPI_H__ */ diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h index b91440ac0d1..a62720a7edc 100644 --- a/include/acpi/acpiosxf.h +++ b/include/acpi/acpiosxf.h @@ -121,8 +121,11 @@ acpi_os_wait_semaphore(acpi_semaphore handle, u32 units, u16 timeout); acpi_status acpi_os_signal_semaphore(acpi_semaphore handle, u32 units); /* - * Mutex primitives + * Mutex primitives. May be configured to use semaphores instead via + * ACPI_MUTEX_TYPE (see platform/acenv.h) */ +#if (ACPI_MUTEX_TYPE != ACPI_BINARY_SEMAPHORE) + acpi_status acpi_os_create_mutex(acpi_mutex * out_handle); void acpi_os_delete_mutex(acpi_mutex handle); @@ -130,13 +133,7 @@ void acpi_os_delete_mutex(acpi_mutex handle); acpi_status acpi_os_acquire_mutex(acpi_mutex handle, u16 timeout); void acpi_os_release_mutex(acpi_mutex handle); - -/* Temporary macros for Mutex* interfaces, map to existing semaphore xfaces */ - -#define acpi_os_create_mutex(out_handle) acpi_os_create_semaphore (1, 1, out_handle) -#define acpi_os_delete_mutex(handle) (void) acpi_os_delete_semaphore (handle) -#define acpi_os_acquire_mutex(handle,time) acpi_os_wait_semaphore (handle, 1, time) -#define acpi_os_release_mutex(handle) (void) acpi_os_signal_semaphore (handle, 1) +#endif /* * Memory allocation and mapping diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 33bc0e3b195..c8e8cf45830 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -45,9 +45,32 @@ #ifndef __ACXFACE_H__ #define __ACXFACE_H__ +/* Current ACPICA subsystem version in YYYYMMDD format */ + +#define ACPI_CA_VERSION 0x20081204 + #include "actypes.h" #include "actbl.h" +extern u8 acpi_gbl_permanent_mmap; + +/* + * Globals that are publically available, allowing for + * run time configuration + */ +extern u32 acpi_dbg_level; +extern u32 acpi_dbg_layer; +extern u8 acpi_gbl_enable_interpreter_slack; +extern u8 acpi_gbl_all_methods_serialized; +extern u8 acpi_gbl_create_osi_method; +extern u8 acpi_gbl_leave_wake_gpes_disabled; +extern acpi_name acpi_gbl_trace_method_name; +extern u32 acpi_gbl_trace_flags; + +extern u32 acpi_current_gpe_count; +extern struct acpi_table_fadt acpi_gbl_FADT; + +extern u32 acpi_rsdt_forced; /* * Global interfaces */ @@ -79,11 +102,6 @@ const char *acpi_format_exception(acpi_status exception); acpi_status acpi_purge_cached_objects(void); 
-#ifdef ACPI_FUTURE_USAGE -acpi_status -acpi_install_initialization_handler(acpi_init_handler handler, u32 function); -#endif - /* * ACPI Memory management */ @@ -193,9 +211,12 @@ acpi_status acpi_get_id(acpi_handle object, acpi_owner_id * out_type); acpi_status acpi_get_parent(acpi_handle object, acpi_handle * out_handle); /* - * Event handler interfaces + * Handler interfaces */ acpi_status +acpi_install_initialization_handler(acpi_init_handler handler, u32 function); + +acpi_status acpi_install_fixed_event_handler(u32 acpi_event, acpi_event_handler handler, void *context); @@ -227,6 +248,10 @@ acpi_install_gpe_handler(acpi_handle gpe_device, u32 gpe_number, u32 type, acpi_event_handler address, void *context); +acpi_status +acpi_remove_gpe_handler(acpi_handle gpe_device, + u32 gpe_number, acpi_event_handler address); + #ifdef ACPI_FUTURE_USAGE acpi_status acpi_install_exception_handler(acpi_exception_handler handler); #endif @@ -238,10 +263,6 @@ acpi_status acpi_acquire_global_lock(u16 timeout, u32 * handle); acpi_status acpi_release_global_lock(u32 handle); -acpi_status -acpi_remove_gpe_handler(acpi_handle gpe_device, - u32 gpe_number, acpi_event_handler address); - acpi_status acpi_enable_event(u32 event, u32 flags); acpi_status acpi_disable_event(u32 event, u32 flags); @@ -250,6 +271,9 @@ acpi_status acpi_clear_event(u32 event); acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status); +/* + * GPE Interfaces + */ acpi_status acpi_set_gpe_type(acpi_handle gpe_device, u32 gpe_number, u8 type); acpi_status acpi_enable_gpe(acpi_handle gpe_device, u32 gpe_number); @@ -263,6 +287,12 @@ acpi_get_gpe_status(acpi_handle gpe_device, u32 gpe_number, u32 flags, acpi_event_status * event_status); +acpi_status acpi_disable_all_gpes(void); + +acpi_status acpi_enable_all_runtime_gpes(void); + +acpi_status acpi_get_gpe_device(u32 gpe_index, acpi_handle *gpe_device); + acpi_status acpi_install_gpe_block(acpi_handle gpe_device, struct acpi_generic_address *gpe_block_address, @@ -313,6 +343,8 @@ acpi_resource_to_address64(struct acpi_resource *resource, /* * Hardware (ACPI device) interfaces */ +acpi_status acpi_reset(void); + acpi_status acpi_get_register(u32 register_id, u32 * return_value); acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value); @@ -320,12 +352,14 @@ acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value); acpi_status acpi_set_register(u32 register_id, u32 value); acpi_status -acpi_set_firmware_waking_vector(acpi_physical_address physical_address); +acpi_set_firmware_waking_vector(u32 physical_address); -#ifdef ACPI_FUTURE_USAGE acpi_status -acpi_get_firmware_waking_vector(acpi_physical_address * physical_address); -#endif +acpi_set_firmware_waking_vector64(u64 physical_address); + +acpi_status acpi_read(u32 *value, struct acpi_generic_address *reg); + +acpi_status acpi_write(u32 value, struct acpi_generic_address *reg); acpi_status acpi_get_sleep_type_data(u8 sleep_state, u8 * slp_typ_a, u8 * slp_typ_b); @@ -340,4 +374,42 @@ acpi_status acpi_leave_sleep_state_prep(u8 sleep_state); acpi_status acpi_leave_sleep_state(u8 sleep_state); +/* + * Debug output + */ +void ACPI_INTERNAL_VAR_XFACE +acpi_error(const char *module_name, + u32 line_number, const char *format, ...) ACPI_PRINTF_LIKE(3); + +void ACPI_INTERNAL_VAR_XFACE +acpi_exception(const char *module_name, + u32 line_number, + acpi_status status, const char *format, ...) 
ACPI_PRINTF_LIKE(4); + +void ACPI_INTERNAL_VAR_XFACE +acpi_warning(const char *module_name, + u32 line_number, const char *format, ...) ACPI_PRINTF_LIKE(3); + +void ACPI_INTERNAL_VAR_XFACE +acpi_info(const char *module_name, + u32 line_number, const char *format, ...) ACPI_PRINTF_LIKE(3); + +#ifdef ACPI_DEBUG_OUTPUT + +void ACPI_INTERNAL_VAR_XFACE +acpi_debug_print(u32 requested_debug_level, + u32 line_number, + const char *function_name, + const char *module_name, + u32 component_id, const char *format, ...) ACPI_PRINTF_LIKE(6); + +void ACPI_INTERNAL_VAR_XFACE +acpi_debug_print_raw(u32 requested_debug_level, + u32 line_number, + const char *function_name, + const char *module_name, + u32 component_id, + const char *format, ...) ACPI_PRINTF_LIKE(6); +#endif + #endif /* __ACXFACE_H__ */ diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h new file mode 100644 index 00000000000..9ffe00feada --- /dev/null +++ b/include/acpi/acrestyp.h @@ -0,0 +1,405 @@ +/****************************************************************************** + * + * Name: acrestyp.h - Defines, types, and structures for resource descriptors + * + *****************************************************************************/ + +/* + * Copyright (C) 2000 - 2008, Intel Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. 
+ */ + +#ifndef __ACRESTYP_H__ +#define __ACRESTYP_H__ + +/* + * Definitions for Resource Attributes + */ +typedef u16 acpi_rs_length; /* Resource Length field is fixed at 16 bits */ +typedef u32 acpi_rsdesc_size; /* Max Resource Descriptor size is (Length+3) = (64_k-1)+3 */ + +/* + * Memory Attributes + */ +#define ACPI_READ_ONLY_MEMORY (u8) 0x00 +#define ACPI_READ_WRITE_MEMORY (u8) 0x01 + +#define ACPI_NON_CACHEABLE_MEMORY (u8) 0x00 +#define ACPI_CACHABLE_MEMORY (u8) 0x01 +#define ACPI_WRITE_COMBINING_MEMORY (u8) 0x02 +#define ACPI_PREFETCHABLE_MEMORY (u8) 0x03 + +/* + * IO Attributes + * The ISA IO ranges are: n000-n0_fFh, n400-n4_fFh, n800-n8_fFh, n_c00-n_cFFh. + * The non-ISA IO ranges are: n100-n3_fFh, n500-n7_fFh, n900-n_bFFh, n_cd0-n_fFFh. + */ +#define ACPI_NON_ISA_ONLY_RANGES (u8) 0x01 +#define ACPI_ISA_ONLY_RANGES (u8) 0x02 +#define ACPI_ENTIRE_RANGE (ACPI_NON_ISA_ONLY_RANGES | ACPI_ISA_ONLY_RANGES) + +/* Type of translation - 1=Sparse, 0=Dense */ + +#define ACPI_SPARSE_TRANSLATION (u8) 0x01 + +/* + * IO Port Descriptor Decode + */ +#define ACPI_DECODE_10 (u8) 0x00 /* 10-bit IO address decode */ +#define ACPI_DECODE_16 (u8) 0x01 /* 16-bit IO address decode */ + +/* + * IRQ Attributes + */ +#define ACPI_LEVEL_SENSITIVE (u8) 0x00 +#define ACPI_EDGE_SENSITIVE (u8) 0x01 + +#define ACPI_ACTIVE_HIGH (u8) 0x00 +#define ACPI_ACTIVE_LOW (u8) 0x01 + +#define ACPI_EXCLUSIVE (u8) 0x00 +#define ACPI_SHARED (u8) 0x01 + +/* + * DMA Attributes + */ +#define ACPI_COMPATIBILITY (u8) 0x00 +#define ACPI_TYPE_A (u8) 0x01 +#define ACPI_TYPE_B (u8) 0x02 +#define ACPI_TYPE_F (u8) 0x03 + +#define ACPI_NOT_BUS_MASTER (u8) 0x00 +#define ACPI_BUS_MASTER (u8) 0x01 + +#define ACPI_TRANSFER_8 (u8) 0x00 +#define ACPI_TRANSFER_8_16 (u8) 0x01 +#define ACPI_TRANSFER_16 (u8) 0x02 + +/* + * Start Dependent Functions Priority definitions + */ +#define ACPI_GOOD_CONFIGURATION (u8) 0x00 +#define ACPI_ACCEPTABLE_CONFIGURATION (u8) 0x01 +#define ACPI_SUB_OPTIMAL_CONFIGURATION (u8) 0x02 + +/* + * 16, 32 and 64-bit Address Descriptor resource types + */ +#define ACPI_MEMORY_RANGE (u8) 0x00 +#define ACPI_IO_RANGE (u8) 0x01 +#define ACPI_BUS_NUMBER_RANGE (u8) 0x02 + +#define ACPI_ADDRESS_NOT_FIXED (u8) 0x00 +#define ACPI_ADDRESS_FIXED (u8) 0x01 + +#define ACPI_POS_DECODE (u8) 0x00 +#define ACPI_SUB_DECODE (u8) 0x01 + +#define ACPI_PRODUCER (u8) 0x00 +#define ACPI_CONSUMER (u8) 0x01 + +/* + * If possible, pack the following structures to byte alignment + */ +#ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED +#pragma pack(1) +#endif + +/* UUID data structures for use in vendor-defined resource descriptors */ + +struct acpi_uuid { + u8 data[ACPI_UUID_LENGTH]; +}; + +struct acpi_vendor_uuid { + u8 subtype; + u8 data[ACPI_UUID_LENGTH]; +}; + +/* + * Structures used to describe device resources + */ +struct acpi_resource_irq { + u8 descriptor_length; + u8 triggering; + u8 polarity; + u8 sharable; + u8 interrupt_count; + u8 interrupts[1]; +}; + +struct acpi_resource_dma { + u8 type; + u8 bus_master; + u8 transfer; + u8 channel_count; + u8 channels[1]; +}; + +struct acpi_resource_start_dependent { + u8 descriptor_length; + u8 compatibility_priority; + u8 performance_robustness; +}; + +/* + * The END_DEPENDENT_FUNCTIONS_RESOURCE struct is not + * needed because it has no fields + */ + +struct acpi_resource_io { + u8 io_decode; + u8 alignment; + u8 address_length; + u16 minimum; + u16 maximum; +}; + +struct acpi_resource_fixed_io { + u16 address; + u8 address_length; +}; + +struct acpi_resource_vendor { + u16 byte_length; + u8 byte_data[1]; 
+}; + +/* Vendor resource with UUID info (introduced in ACPI 3.0) */ + +struct acpi_resource_vendor_typed { + u16 byte_length; + u8 uuid_subtype; + u8 uuid[ACPI_UUID_LENGTH]; + u8 byte_data[1]; +}; + +struct acpi_resource_end_tag { + u8 checksum; +}; + +struct acpi_resource_memory24 { + u8 write_protect; + u16 minimum; + u16 maximum; + u16 alignment; + u16 address_length; +}; + +struct acpi_resource_memory32 { + u8 write_protect; + u32 minimum; + u32 maximum; + u32 alignment; + u32 address_length; +}; + +struct acpi_resource_fixed_memory32 { + u8 write_protect; + u32 address; + u32 address_length; +}; + +struct acpi_memory_attribute { + u8 write_protect; + u8 caching; + u8 range_type; + u8 translation; +}; + +struct acpi_io_attribute { + u8 range_type; + u8 translation; + u8 translation_type; + u8 reserved1; +}; + +union acpi_resource_attribute { + struct acpi_memory_attribute mem; + struct acpi_io_attribute io; + + /* Used for the *word_space macros */ + + u8 type_specific; +}; + +struct acpi_resource_source { + u8 index; + u16 string_length; + char *string_ptr; +}; + +/* Fields common to all address descriptors, 16/32/64 bit */ + +#define ACPI_RESOURCE_ADDRESS_COMMON \ + u8 resource_type; \ + u8 producer_consumer; \ + u8 decode; \ + u8 min_address_fixed; \ + u8 max_address_fixed; \ + union acpi_resource_attribute info; + +struct acpi_resource_address { +ACPI_RESOURCE_ADDRESS_COMMON}; + +struct acpi_resource_address16 { + ACPI_RESOURCE_ADDRESS_COMMON u16 granularity; + u16 minimum; + u16 maximum; + u16 translation_offset; + u16 address_length; + struct acpi_resource_source resource_source; +}; + +struct acpi_resource_address32 { + ACPI_RESOURCE_ADDRESS_COMMON u32 granularity; + u32 minimum; + u32 maximum; + u32 translation_offset; + u32 address_length; + struct acpi_resource_source resource_source; +}; + +struct acpi_resource_address64 { + ACPI_RESOURCE_ADDRESS_COMMON u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; + struct acpi_resource_source resource_source; +}; + +struct acpi_resource_extended_address64 { + ACPI_RESOURCE_ADDRESS_COMMON u8 revision_iD; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; + u64 type_specific; +}; + +struct acpi_resource_extended_irq { + u8 producer_consumer; + u8 triggering; + u8 polarity; + u8 sharable; + u8 interrupt_count; + struct acpi_resource_source resource_source; + u32 interrupts[1]; +}; + +struct acpi_resource_generic_register { + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +}; + +/* ACPI_RESOURCE_TYPEs */ + +#define ACPI_RESOURCE_TYPE_IRQ 0 +#define ACPI_RESOURCE_TYPE_DMA 1 +#define ACPI_RESOURCE_TYPE_START_DEPENDENT 2 +#define ACPI_RESOURCE_TYPE_END_DEPENDENT 3 +#define ACPI_RESOURCE_TYPE_IO 4 +#define ACPI_RESOURCE_TYPE_FIXED_IO 5 +#define ACPI_RESOURCE_TYPE_VENDOR 6 +#define ACPI_RESOURCE_TYPE_END_TAG 7 +#define ACPI_RESOURCE_TYPE_MEMORY24 8 +#define ACPI_RESOURCE_TYPE_MEMORY32 9 +#define ACPI_RESOURCE_TYPE_FIXED_MEMORY32 10 +#define ACPI_RESOURCE_TYPE_ADDRESS16 11 +#define ACPI_RESOURCE_TYPE_ADDRESS32 12 +#define ACPI_RESOURCE_TYPE_ADDRESS64 13 +#define ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64 14 /* ACPI 3.0 */ +#define ACPI_RESOURCE_TYPE_EXTENDED_IRQ 15 +#define ACPI_RESOURCE_TYPE_GENERIC_REGISTER 16 +#define ACPI_RESOURCE_TYPE_MAX 16 + +/* Master union for resource descriptors */ + +union acpi_resource_data { + struct acpi_resource_irq irq; + struct acpi_resource_dma dma; + struct acpi_resource_start_dependent 
start_dpf; + struct acpi_resource_io io; + struct acpi_resource_fixed_io fixed_io; + struct acpi_resource_vendor vendor; + struct acpi_resource_vendor_typed vendor_typed; + struct acpi_resource_end_tag end_tag; + struct acpi_resource_memory24 memory24; + struct acpi_resource_memory32 memory32; + struct acpi_resource_fixed_memory32 fixed_memory32; + struct acpi_resource_address16 address16; + struct acpi_resource_address32 address32; + struct acpi_resource_address64 address64; + struct acpi_resource_extended_address64 ext_address64; + struct acpi_resource_extended_irq extended_irq; + struct acpi_resource_generic_register generic_reg; + + /* Common fields */ + + struct acpi_resource_address address; /* Common 16/32/64 address fields */ +}; + +/* Common resource header */ + +struct acpi_resource { + u32 type; + u32 length; + union acpi_resource_data data; +}; + +/* restore default alignment */ + +#pragma pack() + +#define ACPI_RS_SIZE_NO_DATA 8 /* Id + Length fields */ +#define ACPI_RS_SIZE_MIN (u32) ACPI_ROUND_UP_TO_NATIVE_WORD (12) +#define ACPI_RS_SIZE(type) (u32) (ACPI_RS_SIZE_NO_DATA + sizeof (type)) + +#define ACPI_NEXT_RESOURCE(res) (struct acpi_resource *)((u8 *) res + res->length) + +struct acpi_pci_routing_table { + u32 length; + u32 pin; + acpi_integer address; /* here for 64-bit alignment */ + u32 source_index; + char source[4]; /* pad to 64 bits so sizeof() works in all cases */ +}; + +#endif /* __ACRESTYP_H__ */ diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index 13a3d9ad92d..813e4b6c2c0 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -288,6 +288,31 @@ enum acpi_prefered_pm_profiles { #define ACPI_FADT_OFFSET(f) (u8) ACPI_OFFSET (struct acpi_table_fadt, f) +union acpi_name_union { + u32 integer; + char ascii[4]; +}; + +/* + * Internal ACPI Table Descriptor. One per ACPI table + */ +struct acpi_table_desc { + acpi_physical_address address; + struct acpi_table_header *pointer; + u32 length; /* Length fixed at 32 bits */ + union acpi_name_union signature; + acpi_owner_id owner_id; + u8 flags; +}; + +/* Flags for above */ + +#define ACPI_TABLE_ORIGIN_UNKNOWN (0) +#define ACPI_TABLE_ORIGIN_MAPPED (1) +#define ACPI_TABLE_ORIGIN_ALLOCATED (2) +#define ACPI_TABLE_ORIGIN_MASK (3) +#define ACPI_TABLE_IS_LOADED (4) + /* * Get the remaining ACPI tables */ diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 63f5b4cf4de..18963b96811 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -627,7 +627,7 @@ struct acpi_hest_aer_common { u32 uncorrectable_error_mask; u32 uncorrectable_error_severity; u32 correctable_error_mask; - u32 advanced_error_cababilities; + u32 advanced_error_capabilities; }; /* Hardware Error Notification */ diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 8222e8de0d1..a20aab51017 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -204,11 +204,10 @@ typedef u32 acpi_physical_address; /******************************************************************************* * - * OS-dependent and compiler-dependent types + * OS-dependent types * * If the defaults below are not appropriate for the host system, they can - * be defined in the compiler-specific or OS-specific header, and this will - * take precedence. + * be defined in the OS-specific header, and this will take precedence. 
* ******************************************************************************/ @@ -218,12 +217,6 @@ typedef u32 acpi_physical_address; #define acpi_thread_id acpi_size #endif -/* Object returned from acpi_os_create_lock */ - -#ifndef acpi_spinlock -#define acpi_spinlock void * -#endif - /* Flags for acpi_os_acquire_lock/acpi_os_release_lock */ #ifndef acpi_cpu_flags @@ -233,9 +226,51 @@ typedef u32 acpi_physical_address; /* Object returned from acpi_os_create_cache */ #ifndef acpi_cache_t +#ifdef ACPI_USE_LOCAL_CACHE #define acpi_cache_t struct acpi_memory_list +#else +#define acpi_cache_t void * +#endif +#endif + +/* + * Synchronization objects - Mutexes, Semaphores, and spin_locks + */ +#if (ACPI_MUTEX_TYPE == ACPI_BINARY_SEMAPHORE) +/* + * These macros are used if the host OS does not support a mutex object. + * Map the OSL Mutex interfaces to binary semaphores. + */ +#define acpi_mutex acpi_semaphore +#define acpi_os_create_mutex(out_handle) acpi_os_create_semaphore (1, 1, out_handle) +#define acpi_os_delete_mutex(handle) (void) acpi_os_delete_semaphore (handle) +#define acpi_os_acquire_mutex(handle,time) acpi_os_wait_semaphore (handle, 1, time) +#define acpi_os_release_mutex(handle) (void) acpi_os_signal_semaphore (handle, 1) +#endif + +/* Configurable types for synchronization objects */ + +#ifndef acpi_spinlock +#define acpi_spinlock void * +#endif + +#ifndef acpi_semaphore +#define acpi_semaphore void * +#endif + +#ifndef acpi_mutex +#define acpi_mutex void * #endif +/******************************************************************************* + * + * Compiler-dependent types + * + * If the defaults below are not appropriate for the host compiler, they can + * be defined in the compiler-specific header, and this will take precedence. + * + ******************************************************************************/ + /* Use C99 uintptr_t for pointer casting if available, "void *" otherwise */ #ifndef acpi_uintptr_t @@ -268,6 +303,43 @@ typedef u32 acpi_physical_address; #define ACPI_EXPORT_SYMBOL(symbol) #endif +/****************************************************************************** + * + * ACPI Specification constants (Do not change unless the specification changes) + * + *****************************************************************************/ + +/* Number of distinct FADT-based GPE register blocks (GPE0 and GPE1) */ + +#define ACPI_MAX_GPE_BLOCKS 2 + +/* Default ACPI register widths */ + +#define ACPI_GPE_REGISTER_WIDTH 8 +#define ACPI_PM1_REGISTER_WIDTH 16 +#define ACPI_PM2_REGISTER_WIDTH 8 +#define ACPI_PM_TIMER_WIDTH 32 + +/* Names within the namespace are 4 bytes long */ + +#define ACPI_NAME_SIZE 4 +#define ACPI_PATH_SEGMENT_LENGTH 5 /* 4 chars for name + 1 char for separator */ +#define ACPI_PATH_SEPARATOR '.' 
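/*
 * Hedged sketch, not taken from this patch: the synchronization-object
 * mapping earlier in this hunk means callers use the mutex-style OSL
 * interface whether ACPI_MUTEX_TYPE is ACPI_OSL_MUTEX (the host supplies
 * acpi_os_*_mutex) or the default ACPI_BINARY_SEMAPHORE (the macros route
 * the same calls to the semaphore OSL). The function and lock below are
 * hypothetical.
 */
static acpi_status example_locked_update(acpi_mutex lock)
{
	acpi_status status;

	status = acpi_os_acquire_mutex(lock, ACPI_WAIT_FOREVER);
	if (ACPI_FAILURE(status))
		return status;

	/* ... touch state shared with other contexts ... */

	acpi_os_release_mutex(lock);
	return AE_OK;
}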
+ +/* Sizes for ACPI table headers */ + +#define ACPI_OEM_ID_SIZE 6 +#define ACPI_OEM_TABLE_ID_SIZE 8 + +/* ACPI/PNP hardware IDs */ + +#define PCI_ROOT_HID_STRING "PNP0A03" +#define PCI_EXPRESS_ROOT_HID_STRING "PNP0A08" + +/* PM Timer ticks per second (HZ) */ + +#define PM_TIMER_FREQUENCY 3579545 + /******************************************************************************* * * Independent types @@ -291,13 +363,18 @@ typedef u32 acpi_physical_address; #endif /* - * Mescellaneous types + * Miscellaneous types */ typedef u32 acpi_status; /* All ACPI Exceptions */ typedef u32 acpi_name; /* 4-byte ACPI name */ typedef char *acpi_string; /* Null terminated ASCII string */ typedef void *acpi_handle; /* Actually a ptr to a NS Node */ +/* Owner IDs are used to track namespace nodes for selective deletion */ + +typedef u8 acpi_owner_id; +#define ACPI_OWNER_ID_MAX 0xFF + struct uint64_struct { u32 lo; u32 hi; @@ -313,13 +390,8 @@ struct uint32_struct { u32 hi; }; -/* Synchronization objects */ - -#define acpi_mutex void * -#define acpi_semaphore void * - /* - * Acpi integer width. In ACPI version 1, integers are 32 bits. In ACPI + * Acpi integer width. In ACPI version 1, integers are 32 bits. In ACPI * version 2, integers are 64 bits. Note that this pertains to the ACPI integer * type only, not other integers used in the implementation of the ACPI CA * subsystem. @@ -338,10 +410,75 @@ typedef unsigned long long acpi_integer; #define ACPI_MAX16_DECIMAL_DIGITS 5 #define ACPI_MAX8_DECIMAL_DIGITS 3 +/* PM Timer ticks per second (HZ) */ + +#define PM_TIMER_FREQUENCY 3579545 + /* * Constants with special meanings */ #define ACPI_ROOT_OBJECT ACPI_ADD_PTR (acpi_handle, NULL, ACPI_MAX_PTR) +#define ACPI_WAIT_FOREVER 0xFFFF /* u16, as per ACPI spec */ +#define ACPI_DO_NOT_WAIT 0 + +/******************************************************************************* + * + * Commonly used macros + * + ******************************************************************************/ + +/* Data manipulation */ + +#define ACPI_LOWORD(l) ((u16)(u32)(l)) +#define ACPI_HIWORD(l) ((u16)((((u32)(l)) >> 16) & 0xFFFF)) +#define ACPI_LOBYTE(l) ((u8)(u16)(l)) +#define ACPI_HIBYTE(l) ((u8)((((u16)(l)) >> 8) & 0xFF)) + +/* Full 64-bit integer must be available on both 32-bit and 64-bit platforms */ + +struct acpi_integer_overlay { + u32 lo_dword; + u32 hi_dword; +}; + +#define ACPI_LODWORD(integer) (ACPI_CAST_PTR (struct acpi_integer_overlay, &integer)->lo_dword) +#define ACPI_HIDWORD(integer) (ACPI_CAST_PTR (struct acpi_integer_overlay, &integer)->hi_dword) + +#define ACPI_SET_BIT(target,bit) ((target) |= (bit)) +#define ACPI_CLEAR_BIT(target,bit) ((target) &= ~(bit)) +#define ACPI_MIN(a,b) (((a)<(b))?(a):(b)) +#define ACPI_MAX(a,b) (((a)>(b))?(a):(b)) + +/* Size calculation */ + +#define ACPI_ARRAY_LENGTH(x) (sizeof(x) / sizeof((x)[0])) + +/* Pointer manipulation */ + +#define ACPI_CAST_PTR(t, p) ((t *) (acpi_uintptr_t) (p)) +#define ACPI_CAST_INDIRECT_PTR(t, p) ((t **) (acpi_uintptr_t) (p)) +#define ACPI_ADD_PTR(t, a, b) ACPI_CAST_PTR (t, (ACPI_CAST_PTR (u8, (a)) + (acpi_size)(b))) +#define ACPI_PTR_DIFF(a, b) (acpi_size) (ACPI_CAST_PTR (u8, (a)) - ACPI_CAST_PTR (u8, (b))) + +/* Pointer/Integer type conversions */ + +#define ACPI_TO_POINTER(i) ACPI_ADD_PTR (void, (void *) NULL,(acpi_size) i) +#define ACPI_TO_INTEGER(p) ACPI_PTR_DIFF (p, (void *) NULL) +#define ACPI_OFFSET(d, f) (acpi_size) ACPI_PTR_DIFF (&(((d *)0)->f), (void *) NULL) +#define ACPI_PHYSADDR_TO_PTR(i) ACPI_TO_POINTER(i) +#define ACPI_PTR_TO_PHYSADDR(i) 
ACPI_TO_INTEGER(i) + +#ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED +#define ACPI_COMPARE_NAME(a,b) (*ACPI_CAST_PTR (u32, (a)) == *ACPI_CAST_PTR (u32, (b))) +#else +#define ACPI_COMPARE_NAME(a,b) (!ACPI_STRNCMP (ACPI_CAST_PTR (char, (a)), ACPI_CAST_PTR (char, (b)), ACPI_NAME_SIZE)) +#endif + +/******************************************************************************* + * + * Miscellaneous constants + * + ******************************************************************************/ /* * Initialization sequence @@ -414,7 +551,7 @@ typedef unsigned long long acpi_integer; #define ACPI_NOTIFY_MAX 0x0B /* - * Types associated with ACPI names and objects. The first group of + * Types associated with ACPI names and objects. The first group of * values (up to ACPI_TYPE_EXTERNAL_MAX) correspond to the definition * of the ACPI object_type() operator (See the ACPI Spec). Therefore, * only add to the first group if the spec changes. @@ -732,6 +869,15 @@ struct acpi_buffer { #define ACPI_NAME_TYPE_MAX 1 /* + * Predefined Namespace items + */ +struct acpi_predefined_names { + char *name; + u8 type; + char *val; +}; + +/* * Structure and flags for acpi_get_system_info */ #define ACPI_SYS_MODE_UNKNOWN 0x0000 @@ -787,7 +933,7 @@ acpi_status(*acpi_exception_handler) (acpi_status aml_status, u16 opcode, u32 aml_offset, void *context); -/* Table Event handler (Load, load_table etc) and types */ +/* Table Event handler (Load, load_table, etc.) and types */ typedef acpi_status(*acpi_tbl_handler) (u32 event, void *table, void *context); @@ -823,6 +969,12 @@ acpi_status(*acpi_walk_callback) (acpi_handle obj_handle, #define ACPI_INTERRUPT_NOT_HANDLED 0x00 #define ACPI_INTERRUPT_HANDLED 0x01 +/* Length of _HID, _UID, _CID, and UUID values */ + +#define ACPI_DEVICE_ID_LENGTH 0x09 +#define ACPI_MAX_CID_LENGTH 48 +#define ACPI_UUID_LENGTH 16 + /* Common string version of device HIDs and UIDs */ struct acpica_device_id { @@ -900,357 +1052,28 @@ struct acpi_mem_space_context { }; /* - * Definitions for Resource Attributes - */ -typedef u16 acpi_rs_length; /* Resource Length field is fixed at 16 bits */ -typedef u32 acpi_rsdesc_size; /* Max Resource Descriptor size is (Length+3) = (64_k-1)+3 */ - -/* - * Memory Attributes - */ -#define ACPI_READ_ONLY_MEMORY (u8) 0x00 -#define ACPI_READ_WRITE_MEMORY (u8) 0x01 - -#define ACPI_NON_CACHEABLE_MEMORY (u8) 0x00 -#define ACPI_CACHABLE_MEMORY (u8) 0x01 -#define ACPI_WRITE_COMBINING_MEMORY (u8) 0x02 -#define ACPI_PREFETCHABLE_MEMORY (u8) 0x03 - -/* - * IO Attributes - * The ISA IO ranges are: n000-n0_fFh, n400-n4_fFh, n800-n8_fFh, n_c00-n_cFFh. - * The non-ISA IO ranges are: n100-n3_fFh, n500-n7_fFh, n900-n_bFFh, n_cd0-n_fFFh. 
+ * struct acpi_memory_list is used only if the ACPICA local cache is enabled */ -#define ACPI_NON_ISA_ONLY_RANGES (u8) 0x01 -#define ACPI_ISA_ONLY_RANGES (u8) 0x02 -#define ACPI_ENTIRE_RANGE (ACPI_NON_ISA_ONLY_RANGES | ACPI_ISA_ONLY_RANGES) - -/* Type of translation - 1=Sparse, 0=Dense */ - -#define ACPI_SPARSE_TRANSLATION (u8) 0x01 - -/* - * IO Port Descriptor Decode - */ -#define ACPI_DECODE_10 (u8) 0x00 /* 10-bit IO address decode */ -#define ACPI_DECODE_16 (u8) 0x01 /* 16-bit IO address decode */ - -/* - * IRQ Attributes - */ -#define ACPI_LEVEL_SENSITIVE (u8) 0x00 -#define ACPI_EDGE_SENSITIVE (u8) 0x01 - -#define ACPI_ACTIVE_HIGH (u8) 0x00 -#define ACPI_ACTIVE_LOW (u8) 0x01 - -#define ACPI_EXCLUSIVE (u8) 0x00 -#define ACPI_SHARED (u8) 0x01 - -/* - * DMA Attributes - */ -#define ACPI_COMPATIBILITY (u8) 0x00 -#define ACPI_TYPE_A (u8) 0x01 -#define ACPI_TYPE_B (u8) 0x02 -#define ACPI_TYPE_F (u8) 0x03 - -#define ACPI_NOT_BUS_MASTER (u8) 0x00 -#define ACPI_BUS_MASTER (u8) 0x01 - -#define ACPI_TRANSFER_8 (u8) 0x00 -#define ACPI_TRANSFER_8_16 (u8) 0x01 -#define ACPI_TRANSFER_16 (u8) 0x02 - -/* - * Start Dependent Functions Priority definitions - */ -#define ACPI_GOOD_CONFIGURATION (u8) 0x00 -#define ACPI_ACCEPTABLE_CONFIGURATION (u8) 0x01 -#define ACPI_SUB_OPTIMAL_CONFIGURATION (u8) 0x02 - -/* - * 16, 32 and 64-bit Address Descriptor resource types - */ -#define ACPI_MEMORY_RANGE (u8) 0x00 -#define ACPI_IO_RANGE (u8) 0x01 -#define ACPI_BUS_NUMBER_RANGE (u8) 0x02 - -#define ACPI_ADDRESS_NOT_FIXED (u8) 0x00 -#define ACPI_ADDRESS_FIXED (u8) 0x01 - -#define ACPI_POS_DECODE (u8) 0x00 -#define ACPI_SUB_DECODE (u8) 0x01 - -#define ACPI_PRODUCER (u8) 0x00 -#define ACPI_CONSUMER (u8) 0x01 - -/* - * If possible, pack the following structures to byte alignment - */ -#ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED -#pragma pack(1) +struct acpi_memory_list { + char *list_name; + void *list_head; + u16 object_size; + u16 max_depth; + u16 current_depth; + u16 link_offset; + +#ifdef ACPI_DBG_TRACK_ALLOCATIONS + + /* Statistics for debug memory tracking only */ + + u32 total_allocated; + u32 total_freed; + u32 max_occupied; + u32 total_size; + u32 current_total_size; + u32 requests; + u32 hits; #endif - -/* UUID data structures for use in vendor-defined resource descriptors */ - -struct acpi_uuid { - u8 data[ACPI_UUID_LENGTH]; -}; - -struct acpi_vendor_uuid { - u8 subtype; - u8 data[ACPI_UUID_LENGTH]; -}; - -/* - * Structures used to describe device resources - */ -struct acpi_resource_irq { - u8 descriptor_length; - u8 triggering; - u8 polarity; - u8 sharable; - u8 interrupt_count; - u8 interrupts[1]; -}; - -struct acpi_resource_dma { - u8 type; - u8 bus_master; - u8 transfer; - u8 channel_count; - u8 channels[1]; -}; - -struct acpi_resource_start_dependent { - u8 descriptor_length; - u8 compatibility_priority; - u8 performance_robustness; -}; - -/* - * END_DEPENDENT_FUNCTIONS_RESOURCE struct is not - * needed because it has no fields - */ - -struct acpi_resource_io { - u8 io_decode; - u8 alignment; - u8 address_length; - u16 minimum; - u16 maximum; -}; - -struct acpi_resource_fixed_io { - u16 address; - u8 address_length; -}; - -struct acpi_resource_vendor { - u16 byte_length; - u8 byte_data[1]; -}; - -/* Vendor resource with UUID info (introduced in ACPI 3.0) */ - -struct acpi_resource_vendor_typed { - u16 byte_length; - u8 uuid_subtype; - u8 uuid[ACPI_UUID_LENGTH]; - u8 byte_data[1]; -}; - -struct acpi_resource_end_tag { - u8 checksum; -}; - -struct acpi_resource_memory24 { - u8 write_protect; - u16 minimum; - 
u16 maximum; - u16 alignment; - u16 address_length; -}; - -struct acpi_resource_memory32 { - u8 write_protect; - u32 minimum; - u32 maximum; - u32 alignment; - u32 address_length; -}; - -struct acpi_resource_fixed_memory32 { - u8 write_protect; - u32 address; - u32 address_length; -}; - -struct acpi_memory_attribute { - u8 write_protect; - u8 caching; - u8 range_type; - u8 translation; -}; - -struct acpi_io_attribute { - u8 range_type; - u8 translation; - u8 translation_type; - u8 reserved1; -}; - -union acpi_resource_attribute { - struct acpi_memory_attribute mem; - struct acpi_io_attribute io; - - /* Used for the *word_space macros */ - - u8 type_specific; -}; - -struct acpi_resource_source { - u8 index; - u16 string_length; - char *string_ptr; -}; - -/* Fields common to all address descriptors, 16/32/64 bit */ - -#define ACPI_RESOURCE_ADDRESS_COMMON \ - u8 resource_type; \ - u8 producer_consumer; \ - u8 decode; \ - u8 min_address_fixed; \ - u8 max_address_fixed; \ - union acpi_resource_attribute info; - -struct acpi_resource_address { -ACPI_RESOURCE_ADDRESS_COMMON}; - -struct acpi_resource_address16 { - ACPI_RESOURCE_ADDRESS_COMMON u16 granularity; - u16 minimum; - u16 maximum; - u16 translation_offset; - u16 address_length; - struct acpi_resource_source resource_source; -}; - -struct acpi_resource_address32 { - ACPI_RESOURCE_ADDRESS_COMMON u32 granularity; - u32 minimum; - u32 maximum; - u32 translation_offset; - u32 address_length; - struct acpi_resource_source resource_source; -}; - -struct acpi_resource_address64 { - ACPI_RESOURCE_ADDRESS_COMMON u64 granularity; - u64 minimum; - u64 maximum; - u64 translation_offset; - u64 address_length; - struct acpi_resource_source resource_source; -}; - -struct acpi_resource_extended_address64 { - ACPI_RESOURCE_ADDRESS_COMMON u8 revision_iD; - u64 granularity; - u64 minimum; - u64 maximum; - u64 translation_offset; - u64 address_length; - u64 type_specific; -}; - -struct acpi_resource_extended_irq { - u8 producer_consumer; - u8 triggering; - u8 polarity; - u8 sharable; - u8 interrupt_count; - struct acpi_resource_source resource_source; - u32 interrupts[1]; -}; - -struct acpi_resource_generic_register { - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_size; - u64 address; -}; - -/* ACPI_RESOURCE_TYPEs */ - -#define ACPI_RESOURCE_TYPE_IRQ 0 -#define ACPI_RESOURCE_TYPE_DMA 1 -#define ACPI_RESOURCE_TYPE_START_DEPENDENT 2 -#define ACPI_RESOURCE_TYPE_END_DEPENDENT 3 -#define ACPI_RESOURCE_TYPE_IO 4 -#define ACPI_RESOURCE_TYPE_FIXED_IO 5 -#define ACPI_RESOURCE_TYPE_VENDOR 6 -#define ACPI_RESOURCE_TYPE_END_TAG 7 -#define ACPI_RESOURCE_TYPE_MEMORY24 8 -#define ACPI_RESOURCE_TYPE_MEMORY32 9 -#define ACPI_RESOURCE_TYPE_FIXED_MEMORY32 10 -#define ACPI_RESOURCE_TYPE_ADDRESS16 11 -#define ACPI_RESOURCE_TYPE_ADDRESS32 12 -#define ACPI_RESOURCE_TYPE_ADDRESS64 13 -#define ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64 14 /* ACPI 3.0 */ -#define ACPI_RESOURCE_TYPE_EXTENDED_IRQ 15 -#define ACPI_RESOURCE_TYPE_GENERIC_REGISTER 16 -#define ACPI_RESOURCE_TYPE_MAX 16 - -union acpi_resource_data { - struct acpi_resource_irq irq; - struct acpi_resource_dma dma; - struct acpi_resource_start_dependent start_dpf; - struct acpi_resource_io io; - struct acpi_resource_fixed_io fixed_io; - struct acpi_resource_vendor vendor; - struct acpi_resource_vendor_typed vendor_typed; - struct acpi_resource_end_tag end_tag; - struct acpi_resource_memory24 memory24; - struct acpi_resource_memory32 memory32; - struct acpi_resource_fixed_memory32 fixed_memory32; - struct 
acpi_resource_address16 address16; - struct acpi_resource_address32 address32; - struct acpi_resource_address64 address64; - struct acpi_resource_extended_address64 ext_address64; - struct acpi_resource_extended_irq extended_irq; - struct acpi_resource_generic_register generic_reg; - - /* Common fields */ - - struct acpi_resource_address address; /* Common 16/32/64 address fields */ -}; - -struct acpi_resource { - u32 type; - u32 length; - union acpi_resource_data data; -}; - -/* restore default alignment */ - -#pragma pack() - -#define ACPI_RS_SIZE_NO_DATA 8 /* Id + Length fields */ -#define ACPI_RS_SIZE_MIN (u32) ACPI_ROUND_UP_TO_NATIVE_WORD (12) -#define ACPI_RS_SIZE(type) (u32) (ACPI_RS_SIZE_NO_DATA + sizeof (type)) - -#define ACPI_NEXT_RESOURCE(res) (struct acpi_resource *)((u8 *) res + res->length) - -struct acpi_pci_routing_table { - u32 length; - u32 pin; - acpi_integer address; /* here for 64-bit alignment */ - u32 source_index; - char source[4]; /* pad to 64 bits so sizeof() works in all cases */ }; #endif /* __ACTYPES_H__ */ diff --git a/include/acpi/platform/acenv.h b/include/acpi/platform/acenv.h index fcd2572e428..e62f10d9a7d 100644 --- a/include/acpi/platform/acenv.h +++ b/include/acpi/platform/acenv.h @@ -44,14 +44,26 @@ #ifndef __ACENV_H__ #define __ACENV_H__ -/* +/* Types for ACPI_MUTEX_TYPE */ + +#define ACPI_BINARY_SEMAPHORE 0 +#define ACPI_OSL_MUTEX 1 + +/* Types for DEBUGGER_THREADING */ + +#define DEBUGGER_SINGLE_THREADED 0 +#define DEBUGGER_MULTI_THREADED 1 + +/****************************************************************************** + * * Configuration for ACPI tools and utilities - */ + * + *****************************************************************************/ #ifdef ACPI_LIBRARY /* * Note: The non-debug version of the acpi_library does not contain any - * debug support, for minimimal size. The debug version uses ACPI_FULL_DEBUG + * debug support, for minimal size. The debug version uses ACPI_FULL_DEBUG */ #define ACPI_USE_LOCAL_CACHE #endif @@ -75,17 +87,6 @@ #define ACPI_DBG_TRACK_ALLOCATIONS #endif -#ifdef ACPI_DASM_APP -#ifndef MSDOS -#define ACPI_DEBUG_OUTPUT -#endif -#define ACPI_APPLICATION -#define ACPI_DISASSEMBLER -#define ACPI_NO_METHOD_EXECUTION -#define ACPI_LARGE_NAMESPACE_NODE -#define ACPI_DATA_TABLE_DISASSEMBLY -#endif - #ifdef ACPI_APPLICATION #define ACPI_USE_SYSTEM_CLIBRARY #define ACPI_USE_LOCAL_CACHE @@ -179,6 +180,19 @@ /*! [End] no source code translation !*/ +/****************************************************************************** + * + * Miscellaneous configuration + * + *****************************************************************************/ + +/* + * Are mutexes supported by the host? default is no, use binary semaphores. + */ +#ifndef ACPI_MUTEX_TYPE +#define ACPI_MUTEX_TYPE ACPI_BINARY_SEMAPHORE +#endif + /* * Debugger threading model * Use single threaded if the entire subsystem is contained in an application @@ -187,9 +201,6 @@ * By default the model is single threaded if ACPI_APPLICATION is set, * multi-threaded if ACPI_APPLICATION is not set. 
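For the ACPI_MUTEX_TYPE configuration above, a minimal sketch of how a host platform header could opt into native OSL mutexes instead of the default binary-semaphore emulation. The header name is hypothetical and not part of this patch; it assumes the host actually implements the acpi_os_*_mutex() OSL entry points.

/* Hypothetical acmyos.h -- assumes acpi_os_*_mutex() is implemented by
 * the host OSL; otherwise leave the ACPI_BINARY_SEMAPHORE default alone. */
#ifndef __ACMYOS_H__
#define __ACMYOS_H__

#define ACPI_MUTEX_TYPE		ACPI_OSL_MUTEX

#endif /* __ACMYOS_H__ */

aclinux.h (below) does the opposite and explicitly selects ACPI_BINARY_SEMAPHORE for the kernel build.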
*/ -#define DEBUGGER_SINGLE_THREADED 0 -#define DEBUGGER_MULTI_THREADED 1 - #ifndef DEBUGGER_THREADING #ifdef ACPI_APPLICATION #define DEBUGGER_THREADING DEBUGGER_SINGLE_THREADED diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index 0515e754449..6d49b2a498c 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -46,6 +46,7 @@ #define ACPI_USE_SYSTEM_CLIBRARY #define ACPI_USE_DO_WHILE_0 +#define ACPI_MUTEX_TYPE ACPI_BINARY_SEMAPHORE #ifdef __KERNEL__ @@ -70,9 +71,6 @@ #define ACPI_EXPORT_SYMBOL(symbol) EXPORT_SYMBOL(symbol); #define strtoul simple_strtoul -/* Full namespace pathname length limit - arbitrary */ -#define ACPI_PATHNAME_MAX 256 - #else /* !__KERNEL__ */ #include <stdarg.h> diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fba8051fb29..6fce2fc2d12 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -131,22 +131,6 @@ extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity); */ void acpi_unregister_gsi (u32 gsi); -struct acpi_prt_entry { - struct list_head node; - struct acpi_pci_id id; - u8 pin; - struct { - acpi_handle handle; - u32 index; - } link; - u32 irq; -}; - -struct acpi_prt_list { - int count; - struct list_head entries; -}; - struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); @@ -270,6 +254,7 @@ int acpi_check_mem_region(resource_size_t start, resource_size_t n, #ifdef CONFIG_PM_SLEEP void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); +void __init acpi_s4_no_nvs(void); #endif /* CONFIG_PM_SLEEP */ #else /* CONFIG_ACPI */ diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 0f50d4cc436..45f6297821b 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -59,9 +59,7 @@ enum async_tx_flags { }; #ifdef CONFIG_DMA_ENGINE -void async_tx_issue_pending_all(void); -enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); -void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx); +#define async_tx_issue_pending_all dma_issue_pending_all #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL #include <asm/async_tx.h> #else @@ -77,19 +75,6 @@ static inline void async_tx_issue_pending_all(void) do { } while (0); } -static inline enum dma_status -dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) -{ - return DMA_SUCCESS; -} - -static inline void -async_tx_run_dependencies(struct dma_async_tx_descriptor *tx, - struct dma_chan *host_chan) -{ - do { } while (0); -} - static inline struct dma_chan * async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, enum dma_transaction_type tx_type, struct page **dst, int dst_count, diff --git a/include/linux/atmel-mci.h b/include/linux/atmel-mci.h index 2a2213eefd8..2f1f95737ac 100644 --- a/include/linux/atmel-mci.h +++ b/include/linux/atmel-mci.h @@ -3,7 +3,7 @@ #define ATMEL_MCI_MAX_NR_SLOTS 2 -struct dma_slave; +#include <linux/dw_dmac.h> /** * struct mci_slot_pdata - board-specific per-slot configuration @@ -28,11 +28,11 @@ struct mci_slot_pdata { /** * struct mci_platform_data - board-specific MMC/SDcard configuration - * @dma_slave: DMA slave interface to use in data transfers, or NULL. + * @dma_slave: DMA slave interface to use in data transfers. * @slot: Per-slot configuration data. 
*/ struct mci_platform_data { - struct dma_slave *dma_slave; + struct dw_dma_slave dma_slave; struct mci_slot_pdata slot[ATMEL_MCI_MAX_NR_SLOTS]; }; diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 1ee9488ca2e..79ca2da81c8 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -31,6 +31,10 @@ struct backlight_device; struct fb_info; struct backlight_ops { + unsigned int options; + +#define BL_CORE_SUSPENDRESUME (1 << 0) + /* Notify the backlight driver some property has changed */ int (*update_status)(struct backlight_device *); /* Return the current backlight brightness (accounting for power, @@ -51,7 +55,19 @@ struct backlight_properties { modes; 4: full off), see FB_BLANK_XXX */ int power; /* FB Blanking active? (values as for power) */ + /* Due to be removed, please use (state & BL_CORE_FBBLANK) */ int fb_blank; + /* Flags used to signal drivers of state changes */ + /* Upper 4 bits are reserved for driver internal use */ + unsigned int state; + +#define BL_CORE_SUSPENDED (1 << 0) /* backlight is suspended */ +#define BL_CORE_FBBLANK (1 << 1) /* backlight is under an fb blank event */ +#define BL_CORE_DRIVER4 (1 << 28) /* reserved for driver specific use */ +#define BL_CORE_DRIVER3 (1 << 29) /* reserved for driver specific use */ +#define BL_CORE_DRIVER2 (1 << 30) /* reserved for driver specific use */ +#define BL_CORE_DRIVER1 (1 << 31) /* reserved for driver specific use */ + }; struct backlight_device { diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index adb0b084eb5..64dea2ab326 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -29,32 +29,6 @@ #include <linux/dma-mapping.h> /** - * enum dma_state - resource PNP/power management state - * @DMA_RESOURCE_SUSPEND: DMA device going into low power state - * @DMA_RESOURCE_RESUME: DMA device returning to full power - * @DMA_RESOURCE_AVAILABLE: DMA device available to the system - * @DMA_RESOURCE_REMOVED: DMA device removed from the system - */ -enum dma_state { - DMA_RESOURCE_SUSPEND, - DMA_RESOURCE_RESUME, - DMA_RESOURCE_AVAILABLE, - DMA_RESOURCE_REMOVED, -}; - -/** - * enum dma_state_client - state of the channel in the client - * @DMA_ACK: client would like to use, or was using this channel - * @DMA_DUP: client has already seen this channel, or is not using this channel - * @DMA_NAK: client does not want to see any more channels - */ -enum dma_state_client { - DMA_ACK, - DMA_DUP, - DMA_NAK, -}; - -/** * typedef dma_cookie_t - an opaque DMA cookie * * if dma_cookie_t is >0 it's a DMA request cookie, <0 it's an error code @@ -89,23 +63,13 @@ enum dma_transaction_type { DMA_MEMSET, DMA_MEMCPY_CRC32C, DMA_INTERRUPT, + DMA_PRIVATE, DMA_SLAVE, }; /* last transaction type for creation of the capabilities mask */ #define DMA_TX_TYPE_END (DMA_SLAVE + 1) -/** - * enum dma_slave_width - DMA slave register access width. - * @DMA_SLAVE_WIDTH_8BIT: Do 8-bit slave register accesses - * @DMA_SLAVE_WIDTH_16BIT: Do 16-bit slave register accesses - * @DMA_SLAVE_WIDTH_32BIT: Do 32-bit slave register accesses - */ -enum dma_slave_width { - DMA_SLAVE_WIDTH_8BIT, - DMA_SLAVE_WIDTH_16BIT, - DMA_SLAVE_WIDTH_32BIT, -}; /** * enum dma_ctrl_flags - DMA flags to augment operation preparation, @@ -132,32 +96,6 @@ enum dma_ctrl_flags { typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; /** - * struct dma_slave - Information about a DMA slave - * @dev: device acting as DMA slave - * @dma_dev: required DMA master device. 
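For the backlight.h hunk above, a minimal sketch of a driver that lets the core manage suspend/resume and fb-blank state via the new options/state bits. The driver name and the hardware write are hypothetical placeholders, not part of this patch.

#include <linux/backlight.h>
#include <linux/fb.h>

static int mybl_write_hw(int level)
{
	/* hypothetical hardware hook */
	return 0;
}

static int mybl_update_status(struct backlight_device *bd)
{
	int level = bd->props.brightness;

	/* the core sets BL_CORE_SUSPENDED / BL_CORE_FBBLANK in props.state
	 * when BL_CORE_SUSPENDRESUME is requested in the ops' options field */
	if (bd->props.power != FB_BLANK_UNBLANK ||
	    bd->props.state & (BL_CORE_SUSPENDED | BL_CORE_FBBLANK))
		level = 0;

	return mybl_write_hw(level);
}

static struct backlight_ops mybl_ops = {
	.options	= BL_CORE_SUSPENDRESUME,
	.update_status	= mybl_update_status,
};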
If non-NULL, the client can not be - * bound to other masters than this. - * @tx_reg: physical address of data register used for - * memory-to-peripheral transfers - * @rx_reg: physical address of data register used for - * peripheral-to-memory transfers - * @reg_width: peripheral register width - * - * If dma_dev is non-NULL, the client can not be bound to other DMA - * masters than the one corresponding to this device. The DMA master - * driver may use this to determine if there is controller-specific - * data wrapped around this struct. Drivers of platform code that sets - * the dma_dev field must therefore make sure to use an appropriate - * controller-specific dma slave structure wrapping this struct. - */ -struct dma_slave { - struct device *dev; - struct device *dma_dev; - dma_addr_t tx_reg; - dma_addr_t rx_reg; - enum dma_slave_width reg_width; -}; - -/** * struct dma_chan_percpu - the per-CPU part of struct dma_chan * @refcount: local_t used for open-coded "bigref" counting * @memcpy_count: transaction counter @@ -165,7 +103,6 @@ struct dma_slave { */ struct dma_chan_percpu { - local_t refcount; /* stats */ unsigned long memcpy_count; unsigned long bytes_transferred; @@ -176,13 +113,14 @@ struct dma_chan_percpu { * @device: ptr to the dma device who supplies this channel, always !%NULL * @cookie: last cookie value returned to client * @chan_id: channel ID for sysfs - * @class_dev: class device for sysfs + * @dev: class device for sysfs * @refcount: kref, used in "bigref" slow-mode * @slow_ref: indicates that the DMA channel is free * @rcu: the DMA channel's RCU head * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu * @client-count: how many clients are using this channel + * @table_count: number of appearances in the mem-to-mem allocation table */ struct dma_chan { struct dma_device *device; @@ -190,73 +128,47 @@ struct dma_chan { /* sysfs */ int chan_id; - struct device dev; - - struct kref refcount; - int slow_ref; - struct rcu_head rcu; + struct dma_chan_dev *dev; struct list_head device_node; struct dma_chan_percpu *local; int client_count; + int table_count; }; -#define to_dma_chan(p) container_of(p, struct dma_chan, dev) - -void dma_chan_cleanup(struct kref *kref); - -static inline void dma_chan_get(struct dma_chan *chan) -{ - if (unlikely(chan->slow_ref)) - kref_get(&chan->refcount); - else { - local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount)); - put_cpu(); - } -} +/** + * struct dma_chan_dev - relate sysfs device node to backing channel device + * @chan - driver channel device + * @device - sysfs device + * @dev_id - parent dma_device dev_id + * @idr_ref - reference count to gate release of dma_device dev_id + */ +struct dma_chan_dev { + struct dma_chan *chan; + struct device device; + int dev_id; + atomic_t *idr_ref; +}; -static inline void dma_chan_put(struct dma_chan *chan) +static inline const char *dma_chan_name(struct dma_chan *chan) { - if (unlikely(chan->slow_ref)) - kref_put(&chan->refcount, dma_chan_cleanup); - else { - local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount)); - put_cpu(); - } + return dev_name(&chan->dev->device); } -/* - * typedef dma_event_callback - function pointer to a DMA event callback - * For each channel added to the system this routine is called for each client. - * If the client would like to use the channel it returns '1' to signal (ack) - * the dmaengine core to take out a reference on the channel and its - * corresponding device. 
A client must not 'ack' an available channel more - * than once. When a channel is removed all clients are notified. If a client - * is using the channel it must 'ack' the removal. A client must not 'ack' a - * removed channel more than once. - * @client - 'this' pointer for the client context - * @chan - channel to be acted upon - * @state - available or removed - */ -struct dma_client; -typedef enum dma_state_client (*dma_event_callback) (struct dma_client *client, - struct dma_chan *chan, enum dma_state state); +void dma_chan_cleanup(struct kref *kref); /** - * struct dma_client - info on the entity making use of DMA services - * @event_callback: func ptr to call when something happens - * @cap_mask: only return channels that satisfy the requested capabilities - * a value of zero corresponds to any capability - * @slave: data for preparing slave transfer. Must be non-NULL iff the - * DMA_SLAVE capability is requested. - * @global_node: list_head for global dma_client_list + * typedef dma_filter_fn - callback filter for dma_request_channel + * @chan: channel to be reviewed + * @filter_param: opaque parameter passed through dma_request_channel + * + * When this optional parameter is specified in a call to dma_request_channel a + * suitable channel is passed to this routine for further dispositioning before + * being returned. Where 'suitable' indicates a non-busy channel that + * satisfies the given capability mask. It returns 'true' to indicate that the + * channel is suitable. */ -struct dma_client { - dma_event_callback event_callback; - dma_cap_mask_t cap_mask; - struct dma_slave *slave; - struct list_head global_node; -}; +typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); typedef void (*dma_async_tx_callback)(void *dma_async_param); /** @@ -323,14 +235,10 @@ struct dma_device { dma_cap_mask_t cap_mask; int max_xor; - struct kref refcount; - struct completion done; - int dev_id; struct device *dev; - int (*device_alloc_chan_resources)(struct dma_chan *chan, - struct dma_client *client); + int (*device_alloc_chan_resources)(struct dma_chan *chan); void (*device_free_chan_resources)(struct dma_chan *chan); struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)( @@ -362,9 +270,8 @@ struct dma_device { /* --- public DMA engine API --- */ -void dma_async_client_register(struct dma_client *client); -void dma_async_client_unregister(struct dma_client *client); -void dma_async_client_chan_request(struct dma_client *client); +void dmaengine_get(void); +void dmaengine_put(void); dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, @@ -406,6 +313,12 @@ __dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) set_bit(tx_type, dstp->bits); } +#define dma_cap_zero(mask) __dma_cap_zero(&(mask)) +static inline void __dma_cap_zero(dma_cap_mask_t *dstp) +{ + bitmap_zero(dstp->bits, DMA_TX_TYPE_END); +} + #define dma_has_cap(tx, mask) __dma_has_cap((tx), &(mask)) static inline int __dma_has_cap(enum dma_transaction_type tx_type, dma_cap_mask_t *srcp) @@ -475,11 +388,25 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, } enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); +#ifdef CONFIG_DMA_ENGINE +enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); +#else +static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) +{ + return DMA_SUCCESS; +} +#endif /* --- 
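For the dma_filter_fn interface above, a minimal sketch of a filter callback. The policy shown (matching the channel's parent device against a pointer passed through filter_param) is purely illustrative and not part of this patch.

static bool my_filter(struct dma_chan *chan, void *filter_param)
{
	/* filter_param is passed through unchanged from dma_request_channel();
	 * here it is assumed to identify the DMA controller we want */
	struct device *wanted_dma_dev = filter_param;

	return chan->device->dev == wanted_dma_dev;
}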
DMA device --- */ int dma_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); +void dma_run_dependencies(struct dma_async_tx_descriptor *tx); +struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type); +void dma_issue_pending_all(void); +#define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y) +struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param); +void dma_release_channel(struct dma_chan *chan); /* --- Helper iov-locking functions --- */ diff --git a/include/linux/dw_dmac.h b/include/linux/dw_dmac.h index 04d217b442b..d797dde247f 100644 --- a/include/linux/dw_dmac.h +++ b/include/linux/dw_dmac.h @@ -22,14 +22,34 @@ struct dw_dma_platform_data { }; /** + * enum dw_dma_slave_width - DMA slave register access width. + * @DMA_SLAVE_WIDTH_8BIT: Do 8-bit slave register accesses + * @DMA_SLAVE_WIDTH_16BIT: Do 16-bit slave register accesses + * @DMA_SLAVE_WIDTH_32BIT: Do 32-bit slave register accesses + */ +enum dw_dma_slave_width { + DW_DMA_SLAVE_WIDTH_8BIT, + DW_DMA_SLAVE_WIDTH_16BIT, + DW_DMA_SLAVE_WIDTH_32BIT, +}; + +/** * struct dw_dma_slave - Controller-specific information about a slave - * @slave: Generic information about the slave - * @ctl_lo: Platform-specific initializer for the CTL_LO register + * + * @dma_dev: required DMA master device + * @tx_reg: physical address of data register used for + * memory-to-peripheral transfers + * @rx_reg: physical address of data register used for + * peripheral-to-memory transfers + * @reg_width: peripheral register width * @cfg_hi: Platform-specific initializer for the CFG_HI register * @cfg_lo: Platform-specific initializer for the CFG_LO register */ struct dw_dma_slave { - struct dma_slave slave; + struct device *dma_dev; + dma_addr_t tx_reg; + dma_addr_t rx_reg; + enum dw_dma_slave_width reg_width; u32 cfg_hi; u32 cfg_lo; }; @@ -54,9 +74,4 @@ struct dw_dma_slave { #define DWC_CFGL_HS_DST_POL (1 << 18) /* dst handshake active low */ #define DWC_CFGL_HS_SRC_POL (1 << 19) /* src handshake active low */ -static inline struct dw_dma_slave *to_dw_dma_slave(struct dma_slave *slave) -{ - return container_of(slave, struct dw_dma_slave, slave); -} - #endif /* DW_DMAC_H */ diff --git a/include/linux/leds-pca9532.h b/include/linux/leds-pca9532.h index 81b4207deb9..96eea90f01a 100644 --- a/include/linux/leds-pca9532.h +++ b/include/linux/leds-pca9532.h @@ -15,6 +15,7 @@ #define __LINUX_PCA9532_H #include <linux/leds.h> +#include <linux/workqueue.h> enum pca9532_state { PCA9532_OFF = 0x0, @@ -31,6 +32,7 @@ struct pca9532_led { struct i2c_client *client; char *name; struct led_classdev ldev; + struct work_struct work; enum pca9532_type type; enum pca9532_state state; }; diff --git a/include/linux/leds.h b/include/linux/leds.h index d3a73f5a48c..24489da701e 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -32,7 +32,10 @@ struct led_classdev { int brightness; int flags; + /* Lower 16 bits reflect status */ #define LED_SUSPENDED (1 << 0) + /* Upper 16 bits reflect control information */ +#define LED_CORE_SUSPENDRESUME (1 << 16) /* Set LED brightness level */ /* Must not sleep, use a workqueue if needed */ @@ -62,7 +65,7 @@ struct led_classdev { extern int led_classdev_register(struct device *parent, struct led_classdev *led_cdev); -extern void led_classdev_unregister(struct led_classdev *lcd); +extern void led_classdev_unregister(struct led_classdev *led_cdev); extern void led_classdev_suspend(struct led_classdev 
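For the dmaengine.h channel-allocation interfaces above, a sketch of how a consumer might allocate and later release a private channel; my_filter is the illustrative filter shown earlier, and the wrapper function is an assumption, not part of this patch.

static int my_grab_channel(struct device *wanted_dma_dev)
{
	dma_cap_mask_t mask;
	struct dma_chan *chan;

	dma_cap_zero(mask);
	dma_cap_set(DMA_SLAVE, mask);

	chan = dma_request_channel(mask, my_filter, wanted_dma_dev);
	if (!chan)
		return -ENODEV;		/* no free channel satisfied mask + filter */

	/* ... use 'chan' exclusively, then hand it back ... */
	dma_release_channel(chan);
	return 0;
}

Opportunistic mem-to-mem users, by contrast, bracket their usage with dmaengine_get()/dmaengine_put() and pick a channel per operation via dma_find_channel().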
*led_cdev); extern void led_classdev_resume(struct led_classdev *led_cdev); diff --git a/include/linux/mfd/wm8350/pmic.h b/include/linux/mfd/wm8350/pmic.h index 96acbfc8aa1..be3264e286e 100644 --- a/include/linux/mfd/wm8350/pmic.h +++ b/include/linux/mfd/wm8350/pmic.h @@ -13,6 +13,10 @@ #ifndef __LINUX_MFD_WM8350_PMIC_H #define __LINUX_MFD_WM8350_PMIC_H +#include <linux/platform_device.h> +#include <linux/leds.h> +#include <linux/regulator/machine.h> + /* * Register values. */ @@ -700,6 +704,33 @@ struct wm8350; struct platform_device; struct regulator_init_data; +/* + * WM8350 LED platform data + */ +struct wm8350_led_platform_data { + const char *name; + const char *default_trigger; + int max_uA; +}; + +struct wm8350_led { + struct platform_device *pdev; + struct mutex mutex; + struct work_struct work; + spinlock_t value_lock; + enum led_brightness value; + struct led_classdev cdev; + int max_uA_index; + int enabled; + + struct regulator *isink; + struct regulator_consumer_supply isink_consumer; + struct regulator_init_data isink_init; + struct regulator *dcdc; + struct regulator_consumer_supply dcdc_consumer; + struct regulator_init_data dcdc_init; +}; + struct wm8350_pmic { /* Number of regulators of each type on this device */ int max_dcdc; @@ -717,10 +748,15 @@ struct wm8350_pmic { /* regulator devices */ struct platform_device *pdev[NUM_WM8350_REGULATORS]; + + /* LED devices */ + struct wm8350_led led[2]; }; int wm8350_register_regulator(struct wm8350 *wm8350, int reg, struct regulator_init_data *initdata); +int wm8350_register_led(struct wm8350 *wm8350, int lednum, int dcdc, int isink, + struct wm8350_led_platform_data *pdata); /* * Additional DCDC control not supported via regulator API diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index 00e2b575021..88d3d8fbf9f 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -520,6 +520,7 @@ struct cfi_fixup { #define CFI_MFR_AMD 0x0001 #define CFI_MFR_ATMEL 0x001F +#define CFI_MFR_SAMSUNG 0x00EC #define CFI_MFR_ST 0x0020 /* STMicroelectronics */ void cfi_fixup(struct mtd_info *mtd, struct cfi_fixup* fixups); diff --git a/include/linux/mtd/ftl.h b/include/linux/mtd/ftl.h index 0be442f881d..0555f7a0b9e 100644 --- a/include/linux/mtd/ftl.h +++ b/include/linux/mtd/ftl.h @@ -32,25 +32,25 @@ #define _LINUX_FTL_H typedef struct erase_unit_header_t { - u_int8_t LinkTargetTuple[5]; - u_int8_t DataOrgTuple[10]; - u_int8_t NumTransferUnits; - u_int32_t EraseCount; - u_int16_t LogicalEUN; - u_int8_t BlockSize; - u_int8_t EraseUnitSize; - u_int16_t FirstPhysicalEUN; - u_int16_t NumEraseUnits; - u_int32_t FormattedSize; - u_int32_t FirstVMAddress; - u_int16_t NumVMPages; - u_int8_t Flags; - u_int8_t Code; - u_int32_t SerialNumber; - u_int32_t AltEUHOffset; - u_int32_t BAMOffset; - u_int8_t Reserved[12]; - u_int8_t EndTuple[2]; + uint8_t LinkTargetTuple[5]; + uint8_t DataOrgTuple[10]; + uint8_t NumTransferUnits; + uint32_t EraseCount; + uint16_t LogicalEUN; + uint8_t BlockSize; + uint8_t EraseUnitSize; + uint16_t FirstPhysicalEUN; + uint16_t NumEraseUnits; + uint32_t FormattedSize; + uint32_t FirstVMAddress; + uint16_t NumVMPages; + uint8_t Flags; + uint8_t Code; + uint32_t SerialNumber; + uint32_t AltEUHOffset; + uint32_t BAMOffset; + uint8_t Reserved[12]; + uint8_t EndTuple[2]; } erase_unit_header_t; /* Flags in erase_unit_header_t */ diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h index aa30244492c..b981b877221 100644 --- a/include/linux/mtd/map.h +++ b/include/linux/mtd/map.h @@ -223,6 +223,7 @@ struct 
map_info { must leave it enabled. */ void (*set_vpp)(struct map_info *, int); + unsigned long pfow_base; unsigned long map_priv_1; unsigned long map_priv_2; void *fldrv_priv; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 64433eb411d..3aa5d77c2cd 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -15,6 +15,8 @@ #include <linux/mtd/compatmac.h> #include <mtd/mtd-abi.h> +#include <asm/div64.h> + #define MTD_CHAR_MAJOR 90 #define MTD_BLOCK_MAJOR 31 #define MAX_MTD_DEVICES 32 @@ -25,20 +27,20 @@ #define MTD_ERASE_DONE 0x08 #define MTD_ERASE_FAILED 0x10 -#define MTD_FAIL_ADDR_UNKNOWN 0xffffffff +#define MTD_FAIL_ADDR_UNKNOWN -1LL /* If the erase fails, fail_addr might indicate exactly which block failed. If fail_addr = MTD_FAIL_ADDR_UNKNOWN, the failure was not at the device level or was not specific to any particular block. */ struct erase_info { struct mtd_info *mtd; - u_int32_t addr; - u_int32_t len; - u_int32_t fail_addr; + uint64_t addr; + uint64_t len; + uint64_t fail_addr; u_long time; u_long retries; - u_int dev; - u_int cell; + unsigned dev; + unsigned cell; void (*callback) (struct erase_info *self); u_long priv; u_char state; @@ -46,9 +48,9 @@ struct erase_info { }; struct mtd_erase_region_info { - u_int32_t offset; /* At which this region starts, from the beginning of the MTD */ - u_int32_t erasesize; /* For this region */ - u_int32_t numblocks; /* Number of blocks of erasesize in this region */ + uint64_t offset; /* At which this region starts, from the beginning of the MTD */ + uint32_t erasesize; /* For this region */ + uint32_t numblocks; /* Number of blocks of erasesize in this region */ unsigned long *lockmap; /* If keeping bitmap of locks */ }; @@ -100,14 +102,14 @@ struct mtd_oob_ops { struct mtd_info { u_char type; - u_int32_t flags; - u_int32_t size; // Total size of the MTD + uint32_t flags; + uint64_t size; // Total size of the MTD /* "Major" erase size for the device. Naïve users may take this * to be the only erase size available, or may use the more detailed * information below if they desire */ - u_int32_t erasesize; + uint32_t erasesize; /* Minimal writable flash unit size. In case of NOR flash it is 1 (even * though individual bits can be cleared), in case of NAND flash it is * one NAND page (or half, or one-fourths of it), in case of ECC-ed NOR @@ -115,10 +117,20 @@ struct mtd_info { * Any driver registering a struct mtd_info must ensure a writesize of * 1 or larger. */ - u_int32_t writesize; + uint32_t writesize; + + uint32_t oobsize; // Amount of OOB data per block (e.g. 16) + uint32_t oobavail; // Available OOB bytes per block - u_int32_t oobsize; // Amount of OOB data per block (e.g. 16) - u_int32_t oobavail; // Available OOB bytes per block + /* + * If erasesize is a power of 2 then the shift is stored in + * erasesize_shift otherwise erasesize_shift is zero. Ditto writesize. + */ + unsigned int erasesize_shift; + unsigned int writesize_shift; + /* Masks based on erasesize_shift and writesize_shift */ + unsigned int erasesize_mask; + unsigned int writesize_mask; // Kernel-only stuff starts here. 
const char *name; @@ -190,8 +202,8 @@ struct mtd_info { void (*sync) (struct mtd_info *mtd); /* Chip-supported device locking */ - int (*lock) (struct mtd_info *mtd, loff_t ofs, size_t len); - int (*unlock) (struct mtd_info *mtd, loff_t ofs, size_t len); + int (*lock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); + int (*unlock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); /* Power Management functions */ int (*suspend) (struct mtd_info *mtd); @@ -221,6 +233,35 @@ struct mtd_info { void (*put_device) (struct mtd_info *mtd); }; +static inline uint32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->erasesize_shift) + return sz >> mtd->erasesize_shift; + do_div(sz, mtd->erasesize); + return sz; +} + +static inline uint32_t mtd_mod_by_eb(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->erasesize_shift) + return sz & mtd->erasesize_mask; + return do_div(sz, mtd->erasesize); +} + +static inline uint32_t mtd_div_by_ws(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->writesize_shift) + return sz >> mtd->writesize_shift; + do_div(sz, mtd->writesize); + return sz; +} + +static inline uint32_t mtd_mod_by_ws(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->writesize_shift) + return sz & mtd->writesize_mask; + return do_div(sz, mtd->writesize); +} /* Kernel-side ioctl definitions */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 733d3f3b4eb..db5b63da2a7 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -335,17 +335,12 @@ struct nand_buffers { * @erase_cmd: [INTERN] erase command write function, selectable due to AND support * @scan_bbt: [REPLACEABLE] function to scan bad block table * @chip_delay: [BOARDSPECIFIC] chip dependent delay for transfering data from array to read regs (tR) - * @wq: [INTERN] wait queue to sleep on if a NAND operation is in progress * @state: [INTERN] the current state of the NAND device * @oob_poi: poison value buffer * @page_shift: [INTERN] number of address bits in a page (column address bits) * @phys_erase_shift: [INTERN] number of address bits in a physical eraseblock * @bbt_erase_shift: [INTERN] number of address bits in a bbt entry * @chip_shift: [INTERN] number of address bits in one chip - * @datbuf: [INTERN] internal buffer for one page + oob - * @oobbuf: [INTERN] oob buffer for one eraseblock - * @oobdirty: [INTERN] indicates that oob_buf must be reinitialized - * @data_poi: [INTERN] pointer to a data buffer * @options: [BOARDSPECIFIC] various chip options. They can partly be set to inform nand_scan about * special functionality. 
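For the mtd.h helpers above, a brief sketch of how a caller might use them to validate an erase request against the now 64-bit mtd->size; the function name is illustrative only.

static int my_erase_sanity_check(struct mtd_info *mtd, loff_t ofs, uint64_t len)
{
	if (ofs < 0 || ofs > mtd->size || len > mtd->size - ofs)
		return -EINVAL;

	/* offset and length must both be erase-block aligned */
	if (mtd_mod_by_eb(ofs, mtd) || mtd_mod_by_eb(len, mtd))
		return -EINVAL;

	/* number of whole eraseblocks covered by the request */
	return mtd_div_by_eb(len, mtd);
}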
See the defines for further explanation * @badblockpos: [INTERN] position of the bad block marker in the oob area @@ -399,7 +394,7 @@ struct nand_chip { int bbt_erase_shift; int chip_shift; int numchips; - unsigned long chipsize; + uint64_t chipsize; int pagemask; int pagebuf; int subpagesize; diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index c92b4d43960..a45dd831b3f 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -36,9 +36,9 @@ struct mtd_partition { char *name; /* identifier string */ - u_int32_t size; /* partition size */ - u_int32_t offset; /* offset within the master MTD space */ - u_int32_t mask_flags; /* master MTD flags to mask out for this partition */ + uint64_t size; /* partition size */ + uint64_t offset; /* offset within the master MTD space */ + uint32_t mask_flags; /* master MTD flags to mask out for this partition */ struct nand_ecclayout *ecclayout; /* out of band layout for this partition (NAND only)*/ struct mtd_info **mtdp; /* pointer to store the MTD object */ }; diff --git a/include/linux/mtd/pfow.h b/include/linux/mtd/pfow.h new file mode 100644 index 00000000000..b730d4f8465 --- /dev/null +++ b/include/linux/mtd/pfow.h @@ -0,0 +1,159 @@ +/* Primary function overlay window definitions + * and service functions used by LPDDR chips + */ +#ifndef __LINUX_MTD_PFOW_H +#define __LINUX_MTD_PFOW_H + +#include <linux/mtd/qinfo.h> + +/* PFOW registers addressing */ +/* Address of symbol "P" */ +#define PFOW_QUERY_STRING_P 0x0000 +/* Address of symbol "F" */ +#define PFOW_QUERY_STRING_F 0x0002 +/* Address of symbol "O" */ +#define PFOW_QUERY_STRING_O 0x0004 +/* Address of symbol "W" */ +#define PFOW_QUERY_STRING_W 0x0006 +/* Identification info for LPDDR chip */ +#define PFOW_MANUFACTURER_ID 0x0020 +#define PFOW_DEVICE_ID 0x0022 +/* Address in PFOW where prog buffer can be found */ +#define PFOW_PROGRAM_BUFFER_OFFSET 0x0040 +/* Size of program buffer in words */ +#define PFOW_PROGRAM_BUFFER_SIZE 0x0042 +/* Address command code register */ +#define PFOW_COMMAND_CODE 0x0080 +/* command data register */ +#define PFOW_COMMAND_DATA 0x0084 +/* command address register lower address bits */ +#define PFOW_COMMAND_ADDRESS_L 0x0088 +/* command address register upper address bits */ +#define PFOW_COMMAND_ADDRESS_H 0x008a +/* number of bytes to be programmed lower address bits */ +#define PFOW_DATA_COUNT_L 0x0090 +/* number of bytes to be programmed higher address bits */ +#define PFOW_DATA_COUNT_H 0x0092 +/* command execution register, the only possible value is 0x01 */ +#define PFOW_COMMAND_EXECUTE 0x00c0 +/* 0x01 should be written at this address to clear buffer */ +#define PFOW_CLEAR_PROGRAM_BUFFER 0x00c4 +/* device program/erase suspend register */ +#define PFOW_PROGRAM_ERASE_SUSPEND 0x00c8 +/* device status register */ +#define PFOW_DSR 0x00cc + +/* LPDDR memory device command codes */ +/* They are possible values of PFOW command code register */ +#define LPDDR_WORD_PROGRAM 0x0041 +#define LPDDR_BUFF_PROGRAM 0x00E9 +#define LPDDR_BLOCK_ERASE 0x0020 +#define LPDDR_LOCK_BLOCK 0x0061 +#define LPDDR_UNLOCK_BLOCK 0x0062 +#define LPDDR_READ_BLOCK_LOCK_STATUS 0x0065 +#define LPDDR_INFO_QUERY 0x0098 +#define LPDDR_READ_OTP 0x0097 +#define LPDDR_PROG_OTP 0x00C0 +#define LPDDR_RESUME 0x00D0 + +/* Defines possible value of PFOW command execution register */ +#define LPDDR_START_EXECUTION 0x0001 + +/* Defines possible value of PFOW program/erase suspend register */ +#define LPDDR_SUSPEND 0x0001 + +/* Possible values of
PFOW device status register */ +/* access R - read; RC read & clearable */ +#define DSR_DPS (1<<1) /* RC; device protect status + * 0 - not protected 1 - locked */ +#define DSR_PSS (1<<2) /* R; program suspend status; + * 0-prog in progress/completed, + * 1- prog suspended */ +#define DSR_VPPS (1<<3) /* RC; 0-Vpp OK, * 1-Vpp low */ +#define DSR_PROGRAM_STATUS (1<<4) /* RC; 0-successful, 1-error */ +#define DSR_ERASE_STATUS (1<<5) /* RC; erase or blank check status; + * 0-success erase/blank check, + * 1 blank check error */ +#define DSR_ESS (1<<6) /* R; erase suspend status; + * 0-erase in progress/complete, + * 1 erase suspended */ +#define DSR_READY_STATUS (1<<7) /* R; Device status + * 0-busy, + * 1-ready */ +#define DSR_RPS (0x3<<8) /* RC; region program status + * 00 - Success, + * 01-re-program attempt in region with + * object mode data, + * 10-object mode program w attempt in + * region with control mode data + * 11-attempt to program invalid half + * with 0x41 command */ +#define DSR_AOS (1<<12) /* RC; 1- AO related failure */ +#define DSR_AVAILABLE (1<<15) /* R; Device availbility + * 1 - Device available + * 0 - not available */ + +/* The superset of all possible error bits in DSR */ +#define DSR_ERR 0x133A + +static inline void send_pfow_command(struct map_info *map, + unsigned long cmd_code, unsigned long adr, + unsigned long len, map_word *datum) +{ + int bits_per_chip = map_bankwidth(map) * 8; + int chipnum; + struct lpddr_private *lpddr = map->fldrv_priv; + chipnum = adr >> lpddr->chipshift; + + map_write(map, CMD(cmd_code), map->pfow_base + PFOW_COMMAND_CODE); + map_write(map, CMD(adr & ((1<<bits_per_chip) - 1)), + map->pfow_base + PFOW_COMMAND_ADDRESS_L); + map_write(map, CMD(adr>>bits_per_chip), + map->pfow_base + PFOW_COMMAND_ADDRESS_H); + if (len) { + map_write(map, CMD(len & ((1<<bits_per_chip) - 1)), + map->pfow_base + PFOW_DATA_COUNT_L); + map_write(map, CMD(len>>bits_per_chip), + map->pfow_base + PFOW_DATA_COUNT_H); + } + if (datum) + map_write(map, *datum, map->pfow_base + PFOW_COMMAND_DATA); + + /* Command execution start */ + map_write(map, CMD(LPDDR_START_EXECUTION), + map->pfow_base + PFOW_COMMAND_EXECUTE); +} + +static inline void print_drs_error(unsigned dsr) +{ + int prog_status = (dsr & DSR_RPS) >> 8; + + if (!(dsr & DSR_AVAILABLE)) + printk(KERN_NOTICE"DSR.15: (0) Device not Available\n"); + if (prog_status & 0x03) + printk(KERN_NOTICE"DSR.9,8: (11) Attempt to program invalid " + "half with 41h command\n"); + else if (prog_status & 0x02) + printk(KERN_NOTICE"DSR.9,8: (10) Object Mode Program attempt " + "in region with Control Mode data\n"); + else if (prog_status & 0x01) + printk(KERN_NOTICE"DSR.9,8: (01) Program attempt in region " + "with Object Mode data\n"); + if (!(dsr & DSR_READY_STATUS)) + printk(KERN_NOTICE"DSR.7: (0) Device is Busy\n"); + if (dsr & DSR_ESS) + printk(KERN_NOTICE"DSR.6: (1) Erase Suspended\n"); + if (dsr & DSR_ERASE_STATUS) + printk(KERN_NOTICE"DSR.5: (1) Erase/Blank check error\n"); + if (dsr & DSR_PROGRAM_STATUS) + printk(KERN_NOTICE"DSR.4: (1) Program Error\n"); + if (dsr & DSR_VPPS) + printk(KERN_NOTICE"DSR.3: (1) Vpp low detect, operation " + "aborted\n"); + if (dsr & DSR_PSS) + printk(KERN_NOTICE"DSR.2: (1) Program suspended\n"); + if (dsr & DSR_DPS) + printk(KERN_NOTICE"DSR.1: (1) Aborted Erase/Program attempt " + "on locked block\n"); +} +#endif /* __LINUX_MTD_PFOW_H */ diff --git a/include/linux/mtd/physmap.h b/include/linux/mtd/physmap.h index c8e63a5ee72..76f7cabf07d 100644 --- a/include/linux/mtd/physmap.h +++ 
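For the pfow.h definitions above, a loose sketch of how they combine to issue an LPDDR block erase and check the result. This is illustration only: the function name is made up, and a real driver would sleep with a timeout rather than spin on DSR.

static void lpddr_erase_block_sketch(struct map_info *map, unsigned long adr)
{
	map_word dsr;

	/* queue the erase and start execution via the PFOW window */
	send_pfow_command(map, LPDDR_BLOCK_ERASE, adr, 0, NULL);

	/* poll DSR.7 until the device reports ready */
	do {
		dsr = map_read(map, map->pfow_base + PFOW_DSR);
	} while (!(CMDVAL(dsr) & DSR_READY_STATUS));

	if (CMDVAL(dsr) & DSR_ERR)
		print_drs_error(CMDVAL(dsr));
}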
b/include/linux/mtd/physmap.h @@ -24,6 +24,7 @@ struct physmap_flash_data { unsigned int width; void (*set_vpp)(struct map_info *, int); unsigned int nr_parts; + unsigned int pfow_base; struct mtd_partition *parts; }; diff --git a/include/linux/mtd/qinfo.h b/include/linux/mtd/qinfo.h new file mode 100644 index 00000000000..7b3d487d8b3 --- /dev/null +++ b/include/linux/mtd/qinfo.h @@ -0,0 +1,91 @@ +#ifndef __LINUX_MTD_QINFO_H +#define __LINUX_MTD_QINFO_H + +#include <linux/mtd/map.h> +#include <linux/wait.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/mtd/mtd.h> +#include <linux/mtd/flashchip.h> +#include <linux/mtd/partitions.h> + +/* lpddr_private describes lpddr flash chip in memory map * @ManufactId - Chip Manufacture ID * @DevId - Chip Device ID * @qinfo - pointer to qinfo records describing the chip * @numchips - number of chips including virtual RWW partitions * @chipshift - Chip/partition size 2^chipshift * @chips - per-chip data structure */ +struct lpddr_private { + uint16_t ManufactId; + uint16_t DevId; + struct qinfo_chip *qinfo; + int numchips; + unsigned long chipshift; + struct flchip chips[0]; +}; + +/* qinfo_query_info structure contains request information for + * each qinfo record + * @major - major number of qinfo record + * @minor - minor number of qinfo record + * @id_str - descriptive string to access the record + * @desc - detailed description for the qinfo record + */ +struct qinfo_query_info { + uint8_t major; + uint8_t minor; + char *id_str; + char *desc; +}; + +/* + * qinfo_chip structure contains necessary qinfo records data + * @DevSizeShift - Device size 2^n bytes + * @BufSizeShift - Program buffer size 2^n bytes + * @TotalBlocksNum - Total number of blocks + * @UniformBlockSizeShift - Uniform block size 2^UniformBlockSizeShift bytes + * @HWPartsNum - Number of hardware partitions + * @SuspEraseSupp - Suspend erase supported + * @SingleWordProgTime - Single word program 2^SingleWordProgTime u-sec + * @ProgBufferTime - Program buffer write 2^ProgBufferTime u-sec + * @BlockEraseTime - Block erase 2^BlockEraseTime m-sec + */ +struct qinfo_chip { + /* General device info */ + uint16_t DevSizeShift; + uint16_t BufSizeShift; + /* Erase block information */ + uint16_t TotalBlocksNum; + uint16_t UniformBlockSizeShift; + /* Partition information */ + uint16_t HWPartsNum; + /* Optional features */ + uint16_t SuspEraseSupp; + /* Operation typical time */ + uint16_t SingleWordProgTime; + uint16_t ProgBufferTime; + uint16_t BlockEraseTime; +}; + +/* defines for fixup usage */ +#define LPDDR_MFR_ANY 0xffff +#define LPDDR_ID_ANY 0xffff +#define NUMONYX_MFGR_ID 0x0089 +#define R18_DEVICE_ID_1G 0x893c + +static inline map_word lpddr_build_cmd(u_long cmd, struct map_info *map) +{ + map_word val = { {0} }; + val.x[0] = cmd; + return val; +} + +#define CMD(x) lpddr_build_cmd(x, map) +#define CMDVAL(cmd) cmd.x[0] + +struct mtd_info *lpddr_cmdset(struct map_info *); + +#endif + diff --git a/include/linux/mtd/sharpsl.h b/include/linux/mtd/sharpsl.h new file mode 100644 index 00000000000..25f4d2a845c --- /dev/null +++ b/include/linux/mtd/sharpsl.h @@ -0,0 +1,20 @@ +/* + * SharpSL NAND support + * + * Copyright (C) 2008 Dmitry Baryshkov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ */ + +#include <linux/mtd/nand.h> +#include <linux/mtd/nand_ecc.h> +#include <linux/mtd/partitions.h> + +struct sharpsl_nand_platform_data { + struct nand_bbt_descr *badblock_pattern; + struct nand_ecclayout *ecc_layout; + struct mtd_partition *partitions; + unsigned int nr_partitions; +}; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 114091be887..f2455681337 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1125,9 +1125,6 @@ struct softnet_data struct sk_buff *completion_queue; struct napi_struct backlog; -#ifdef CONFIG_NET_DMA - struct dma_chan *net_dma; -#endif }; DECLARE_PER_CPU(struct softnet_data,softnet_data); diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 1ce9fe572e5..1d9518bc4c5 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -164,4 +164,22 @@ void oprofile_put_buff(unsigned long *buf, unsigned int start, unsigned long oprofile_get_cpu_buffer_size(void); void oprofile_cpu_buffer_inc_smpl_lost(void); +/* cpu buffer functions */ + +struct op_sample; + +struct op_entry { + struct ring_buffer_event *event; + struct op_sample *sample; + unsigned long irq_flags; + unsigned long size; + unsigned long *data; +}; + +void oprofile_write_reserve(struct op_entry *entry, + struct pt_regs * const regs, + unsigned long pc, int code, int size); +int oprofile_add_data(struct op_entry *entry, unsigned long val); +int oprofile_write_commit(struct op_entry *entry); + #endif /* OPROFILE_H */ diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index f7cc204fab0..20998746518 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -223,7 +223,6 @@ struct hotplug_params { #ifdef CONFIG_ACPI #include <acpi/acpi.h> #include <acpi/acpi_bus.h> -#include <acpi/actypes.h> extern acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus, struct hotplug_params *hpp); int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags); diff --git a/include/linux/spi/tdo24m.h b/include/linux/spi/tdo24m.h new file mode 100644 index 00000000000..7572d4e1fe7 --- /dev/null +++ b/include/linux/spi/tdo24m.h @@ -0,0 +1,13 @@ +#ifndef __TDO24M_H__ +#define __TDO24M_H__ + +enum tdo24m_model { + TDO24M, + TDO35S, +}; + +struct tdo24m_platform_data { + enum tdo24m_model model; +}; + +#endif /* __TDO24M_H__ */ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 2ce8207686e..2b409c44db8 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -232,6 +232,11 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern void hibernation_set_ops(struct platform_hibernation_ops *ops); extern int hibernate(void); +extern int hibernate_nvs_register(unsigned long start, unsigned long size); +extern int hibernate_nvs_alloc(void); +extern void hibernate_nvs_free(void); +extern void hibernate_nvs_save(void); +extern void hibernate_nvs_restore(void); #else /* CONFIG_HIBERNATION */ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} @@ -239,6 +244,14 @@ static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } +static inline int hibernate_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +static inline int hibernate_nvs_alloc(void) { return 0; } +static inline void hibernate_nvs_free(void) {} +static inline void 
hibernate_nvs_save(void) {} +static inline void hibernate_nvs_restore(void) {} #endif /* CONFIG_HIBERNATION */ #ifdef CONFIG_PM_SLEEP diff --git a/include/net/netdma.h b/include/net/netdma.h index f28c6e064e8..8ba8ce284ee 100644 --- a/include/net/netdma.h +++ b/include/net/netdma.h @@ -24,17 +24,6 @@ #include <linux/dmaengine.h> #include <linux/skbuff.h> -static inline struct dma_chan *get_softnet_dma(void) -{ - struct dma_chan *chan; - rcu_read_lock(); - chan = rcu_dereference(__get_cpu_var(softnet_data).net_dma); - if (chan) - dma_chan_get(chan); - rcu_read_unlock(); - return chan; -} - int dma_skb_copy_datagram_iovec(struct dma_chan* chan, struct sk_buff *skb, int offset, struct iovec *to, size_t len, struct dma_pinned_list *pinned_list); diff --git a/kernel/cred.c b/kernel/cred.c index ff7bc071991..043f78c133c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -506,6 +506,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) else old = get_cred(&init_cred); + *new = *old; get_uid(new->user); get_group_info(new->group_info); @@ -529,6 +530,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) error: put_cred(new); + put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f77d3819ef5..45e8541ab7e 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -258,12 +258,12 @@ int hibernation_snapshot(int platform_mode) { int error; - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); + error = platform_begin(platform_mode); if (error) return error; - error = platform_begin(platform_mode); + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); if (error) goto Close; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5d2ab836e99..f5fc2d7680f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -25,6 +25,7 @@ #include <linux/syscalls.h> #include <linux/console.h> #include <linux/highmem.h> +#include <linux/list.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -static void chain_free(struct chain_allocator *ca, int clear_page_nosave) -{ - free_list_of_pages(ca->chain, clear_page_nosave); - memset(ca, 0, sizeof(struct chain_allocator)); -} - /** * Data types related to memory bitmaps. 
* @@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) struct bm_block { - struct bm_block *next; /* next element of the list */ + struct list_head hook; /* hook into a list of bitmap blocks */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ unsigned long *data; /* bitmap representing pages */ @@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb) return bb->end_pfn - bb->start_pfn; } -struct zone_bitmap { - struct zone_bitmap *next; /* next element of the list */ - unsigned long start_pfn; /* minimal pfn in this zone */ - unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ - struct bm_block *bm_blocks; /* list of bitmap blocks */ - struct bm_block *cur_block; /* recently used bitmap block */ -}; - /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct zone_bitmap *zone_bm; struct bm_block *block; int bit; }; struct memory_bitmap { - struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ + struct list_head blocks; /* list of bitmap blocks */ struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -273,11 +259,7 @@ struct memory_bitmap { static void memory_bm_position_reset(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; - - zone_bm = bm->zone_bm_list; - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; + bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); bm->cur.bit = 0; } @@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects + * @nr_blocks - number of blocks to allocate + * @list - list to put the allocated blocks into + * @ca - chain allocator to be used for allocating memory */ - -static inline struct bm_block * -create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) +static int create_bm_block_list(unsigned long pages, + struct list_head *list, + struct chain_allocator *ca) { - struct bm_block *bblist = NULL; + unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); while (nr_blocks-- > 0) { struct bm_block *bb; bb = chain_alloc(ca, sizeof(struct bm_block)); if (!bb) - return NULL; - - bb->next = bblist; - bblist = bb; + return -ENOMEM; + list_add(&bb->hook, list); } - return bblist; + + return 0; } +struct mem_extent { + struct list_head hook; + unsigned long start; + unsigned long end; +}; + /** - * create_zone_bm_list - create a list of zone bitmap objects + * free_mem_extents - free a list of memory extents + * @list - list of extents to empty */ +static void free_mem_extents(struct list_head *list) +{ + struct mem_extent *ext, *aux; -static inline struct zone_bitmap * -create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) + list_for_each_entry_safe(ext, aux, list, hook) { + list_del(&ext->hook); + kfree(ext); + } +} + +/** + * create_mem_extents - create a list of memory extents representing + * contiguous ranges of PFNs + * @list - list to put the extents into + * @gfp_mask - mask to use for memory allocations + */ +static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { - struct zone_bitmap *zbmlist = NULL; + struct zone *zone; - while (nr_zones-- > 0) { - struct zone_bitmap *zbm; + INIT_LIST_HEAD(list); - zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); - if (!zbm) 
- return NULL; + for_each_zone(zone) { + unsigned long zone_start, zone_end; + struct mem_extent *ext, *cur, *aux; + + if (!populated_zone(zone)) + continue; - zbm->next = zbmlist; - zbmlist = zbm; + zone_start = zone->zone_start_pfn; + zone_end = zone->zone_start_pfn + zone->spanned_pages; + + list_for_each_entry(ext, list, hook) + if (zone_start <= ext->end) + break; + + if (&ext->hook == list || zone_end < ext->start) { + /* New extent is necessary */ + struct mem_extent *new_ext; + + new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); + if (!new_ext) { + free_mem_extents(list); + return -ENOMEM; + } + new_ext->start = zone_start; + new_ext->end = zone_end; + list_add_tail(&new_ext->hook, &ext->hook); + continue; + } + + /* Merge this zone's range of PFNs with the existing one */ + if (zone_start < ext->start) + ext->start = zone_start; + if (zone_end > ext->end) + ext->end = zone_end; + + /* More merging may be possible */ + cur = ext; + list_for_each_entry_safe_continue(cur, aux, list, hook) { + if (zone_end < cur->start) + break; + if (zone_end < cur->end) + ext->end = cur->end; + list_del(&cur->hook); + kfree(cur); + } } - return zbmlist; + + return 0; } /** * memory_bm_create - allocate memory for a memory bitmap */ - static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) { struct chain_allocator ca; - struct zone *zone; - struct zone_bitmap *zone_bm; - struct bm_block *bb; - unsigned int nr; + struct list_head mem_extents; + struct mem_extent *ext; + int error; chain_init(&ca, gfp_mask, safe_needed); + INIT_LIST_HEAD(&bm->blocks); - /* Compute the number of zones */ - nr = 0; - for_each_zone(zone) - if (populated_zone(zone)) - nr++; - - /* Allocate the list of zones bitmap objects */ - zone_bm = create_zone_bm_list(nr, &ca); - bm->zone_bm_list = zone_bm; - if (!zone_bm) { - chain_free(&ca, PG_UNSAFE_CLEAR); - return -ENOMEM; - } - - /* Initialize the zone bitmap objects */ - for_each_zone(zone) { - unsigned long pfn; + error = create_mem_extents(&mem_extents, gfp_mask); + if (error) + return error; - if (!populated_zone(zone)) - continue; + list_for_each_entry(ext, &mem_extents, hook) { + struct bm_block *bb; + unsigned long pfn = ext->start; + unsigned long pages = ext->end - ext->start; - zone_bm->start_pfn = zone->zone_start_pfn; - zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* Allocate the list of bitmap block objects */ - nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - bb = create_bm_block_list(nr, &ca); - zone_bm->bm_blocks = bb; - zone_bm->cur_block = bb; - if (!bb) - goto Free; + bb = list_entry(bm->blocks.prev, struct bm_block, hook); - nr = zone->spanned_pages; - pfn = zone->zone_start_pfn; - /* Initialize the bitmap block objects */ - while (bb) { - unsigned long *ptr; + error = create_bm_block_list(pages, bm->blocks.prev, &ca); + if (error) + goto Error; - ptr = get_image_page(gfp_mask, safe_needed); - bb->data = ptr; - if (!ptr) - goto Free; + list_for_each_entry_continue(bb, &bm->blocks, hook) { + bb->data = get_image_page(gfp_mask, safe_needed); + if (!bb->data) { + error = -ENOMEM; + goto Error; + } bb->start_pfn = pfn; - if (nr >= BM_BITS_PER_BLOCK) { + if (pages >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - nr -= BM_BITS_PER_BLOCK; + pages -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ - pfn += nr; + pfn += pages; } bb->end_pfn = pfn; - bb = bb->next; } - zone_bm = zone_bm->next; } + bm->p_list = ca.chain; memory_bm_position_reset(bm); - return 0; + Exit: + 
free_mem_extents(&mem_extents); + return error; - Free: + Error: bm->p_list = ca.chain; memory_bm_free(bm, PG_UNSAFE_CLEAR); - return -ENOMEM; + goto Exit; } /** * memory_bm_free - free memory occupied by the memory bitmap @bm */ - static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct zone_bitmap *zone_bm; + struct bm_block *bb; - /* Free the list of bit blocks for each zone_bitmap object */ - zone_bm = bm->zone_bm_list; - while (zone_bm) { - struct bm_block *bb; + list_for_each_entry(bb, &bm->blocks, hook) + if (bb->data) + free_image_page(bb->data, clear_nosave_free); - bb = zone_bm->bm_blocks; - while (bb) { - if (bb->data) - free_image_page(bb->data, clear_nosave_free); - bb = bb->next; - } - zone_bm = zone_bm->next; - } free_list_of_pages(bm->p_list, clear_nosave_free); - bm->zone_bm_list = NULL; + + INIT_LIST_HEAD(&bm->blocks); } /** @@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. */ - static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { - struct zone_bitmap *zone_bm; struct bm_block *bb; - /* Check if the pfn is from the current zone */ - zone_bm = bm->cur.zone_bm; - if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = bm->zone_bm_list; - /* We don't assume that the zones are sorted by pfns */ - while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = zone_bm->next; - - if (!zone_bm) - return -EFAULT; - } - bm->cur.zone_bm = zone_bm; - } - /* Check if the pfn corresponds to the current bitmap block */ - bb = zone_bm->cur_block; + /* + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. + */ + bb = bm->cur.block; if (pfn < bb->start_pfn) - bb = zone_bm->bm_blocks; + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) + break; - while (pfn >= bb->end_pfn) { - bb = bb->next; + if (pfn >= bb->end_pfn) + list_for_each_entry_continue(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn && pfn < bb->end_pfn) + break; - BUG_ON(!bb); - } - zone_bm->cur_block = bb; + if (&bb->hook == &bm->blocks) + return -EFAULT; + + /* The block has been found */ + bm->cur.block = bb; pfn -= bb->start_pfn; + bm->cur.bit = pfn + 1; *bit_nr = pfn; *addr = bb->data; return 0; @@ -519,6 +529,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + return !memory_bm_find_bit(bm, pfn, &addr, &bit); +} + /** * memory_bm_next_pfn - find the pfn that corresponds to the next set bit * in the bitmap @bm. 
If the pfn cannot be found, BM_END_OF_MAP is @@ -530,29 +548,21 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; struct bm_block *bb; int bit; + bb = bm->cur.block; do { - bb = bm->cur.block; - do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = bb->next; - bm->cur.block = bb; - bm->cur.bit = 0; - } while (bb); - zone_bm = bm->cur.zone_bm->next; - if (zone_bm) { - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - bm->cur.bit = 0; - } - } while (zone_bm); + bit = bm->cur.bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); + bm->cur.block = bb; + bm->cur.bit = 0; + } while (&bb->hook != &bm->blocks); + memory_bm_position_reset(bm); return BM_END_OF_MAP; @@ -808,8 +818,7 @@ static unsigned int count_free_highmem_pages(void) * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ - -static struct page *saveable_highmem_page(unsigned long pfn) +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -817,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(!PageHighMem(page)); @@ -846,13 +857,16 @@ unsigned int count_highmem_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_highmem_page(pfn)) + if (saveable_highmem_page(zone, pfn)) n++; } return n; } #else -static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} #endif /* CONFIG_HIGHMEM */ /** @@ -863,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. */ - -static struct page *saveable_page(unsigned long pfn) +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -872,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(PageHighMem(page)); @@ -903,7 +918,7 @@ unsigned int count_data_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if(saveable_page(pfn)) + if (saveable_page(zone, pfn)) n++; } return n; @@ -944,7 +959,7 @@ static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? 
- saveable_highmem_page(pfn) : saveable_page(pfn); + saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) @@ -966,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); + dst = kmap_atomic(d_page, KM_USER0); memcpy(dst, buffer, PAGE_SIZE); kunmap_atomic(dst, KM_USER0); } else { @@ -975,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) } } #else -#define page_is_saveable(zone, pfn) saveable_page(pfn) +#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { @@ -1459,9 +1474,7 @@ load_header(struct swsusp_info *info) * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set * the corresponding bit in the memory bitmap @bm */ - -static inline void -unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -1469,8 +1482,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - memory_bm_set_bit(bm, buf[j]); + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); + else + return -EFAULT; } + + return 0; } /* List of "safe" pages that may be used to store data loaded from the suspend @@ -1608,7 +1626,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_page = page; if (safe_highmem_pages > 0) { @@ -1677,7 +1695,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) static inline void * get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) { - return NULL; + return ERR_PTR(-EINVAL); } static inline void copy_last_highmem_page(void) {} @@ -1788,8 +1806,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; - struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); + struct page *page; + unsigned long pfn = memory_bm_next_pfn(bm); + if (pfn == BM_END_OF_MAP) + return ERR_PTR(-EFAULT); + + page = pfn_to_page(pfn); if (PageHighMem(page)) return get_highmem_page_buffer(page, ca); @@ -1805,7 +1828,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_address = page_address(page); pbe->address = safe_pages_list; @@ -1868,7 +1891,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return error; } else if (handle->prev <= nr_meta_pages) { - unpack_orig_pfns(buffer, ©_bm); + error = unpack_orig_pfns(buffer, ©_bm); + if (error) + return error; + if (handle->prev == nr_meta_pages) { error = prepare_image(&orig_bm, ©_bm); if (error) @@ -1879,12 +1905,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); handle->sync_read = 0; - if (!handle->buffer) - return -ENOMEM; + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); } } else { copy_last_highmem_page(); handle->buffer = get_buffer(&orig_bm, &ca); + 
if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); if (handle->buffer != buffer) handle->sync_read = 0; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 023ff2a31d8..a92c9145155 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -262,3 +262,125 @@ int swsusp_shrink_memory(void) return 0; } + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * hibernate_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + hibernate_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * hibernate_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. 
+ */ +void hibernate_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a9d9760dc7b..8b0daf0662e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -168,7 +168,13 @@ rb_event_length(struct ring_buffer_event *event) */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - return rb_event_length(event); + unsigned length = rb_event_length(event); + if (event->type != RINGBUF_TYPE_DATA) + return length; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) + length -= sizeof(event->array[0]); + return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); diff --git a/net/core/dev.c b/net/core/dev.c index bab8bcedd62..5f736f1ceea 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -170,25 +170,6 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; static struct list_head ptype_all __read_mostly; /* Taps */ -#ifdef CONFIG_NET_DMA -struct net_dma { - struct dma_client client; - spinlock_t lock; - cpumask_t channel_mask; - struct dma_chan **channels; -}; - -static enum dma_state_client -netdev_dma_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state); - -static struct net_dma net_dma = { - .client = { - .event_callback = netdev_dma_event, - }, -}; -#endif - /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. @@ -2754,14 +2735,7 @@ out: * There may not be any more sk_buffs coming right now, so push * any pending DMA copies to hardware */ - if (!cpus_empty(net_dma.channel_mask)) { - int chan_idx; - for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) { - struct dma_chan *chan = net_dma.channels[chan_idx]; - if (chan) - dma_async_memcpy_issue_pending(chan); - } - } + dma_issue_pending_all(); #endif return; @@ -4952,122 +4926,6 @@ static int dev_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -#ifdef CONFIG_NET_DMA -/** - * net_dma_rebalance - try to maintain one DMA channel per CPU - * @net_dma: DMA client and associated data (lock, channels, channel_mask) - * - * This is called when the number of channels allocated to the net_dma client - * changes. The net_dma client tries to have one DMA channel per CPU. - */ - -static void net_dma_rebalance(struct net_dma *net_dma) -{ - unsigned int cpu, i, n, chan_idx; - struct dma_chan *chan; - - if (cpus_empty(net_dma->channel_mask)) { - for_each_online_cpu(cpu) - rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL); - return; - } - - i = 0; - cpu = first_cpu(cpu_online_map); - - for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) { - chan = net_dma->channels[chan_idx]; - - n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask)) - + (i < (num_online_cpus() % - cpus_weight(net_dma->channel_mask)) ? 
1 : 0)); - - while(n) { - per_cpu(softnet_data, cpu).net_dma = chan; - cpu = next_cpu(cpu, cpu_online_map); - n--; - } - i++; - } -} - -/** - * netdev_dma_event - event callback for the net_dma_client - * @client: should always be net_dma_client - * @chan: DMA channel for the event - * @state: DMA state to be handled - */ -static enum dma_state_client -netdev_dma_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state) -{ - int i, found = 0, pos = -1; - struct net_dma *net_dma = - container_of(client, struct net_dma, client); - enum dma_state_client ack = DMA_DUP; /* default: take no action */ - - spin_lock(&net_dma->lock); - switch (state) { - case DMA_RESOURCE_AVAILABLE: - for (i = 0; i < nr_cpu_ids; i++) - if (net_dma->channels[i] == chan) { - found = 1; - break; - } else if (net_dma->channels[i] == NULL && pos < 0) - pos = i; - - if (!found && pos >= 0) { - ack = DMA_ACK; - net_dma->channels[pos] = chan; - cpu_set(pos, net_dma->channel_mask); - net_dma_rebalance(net_dma); - } - break; - case DMA_RESOURCE_REMOVED: - for (i = 0; i < nr_cpu_ids; i++) - if (net_dma->channels[i] == chan) { - found = 1; - pos = i; - break; - } - - if (found) { - ack = DMA_ACK; - cpu_clear(pos, net_dma->channel_mask); - net_dma->channels[i] = NULL; - net_dma_rebalance(net_dma); - } - break; - default: - break; - } - spin_unlock(&net_dma->lock); - - return ack; -} - -/** - * netdev_dma_register - register the networking subsystem as a DMA client - */ -static int __init netdev_dma_register(void) -{ - net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma), - GFP_KERNEL); - if (unlikely(!net_dma.channels)) { - printk(KERN_NOTICE - "netdev_dma: no memory for net_dma.channels\n"); - return -ENOMEM; - } - spin_lock_init(&net_dma.lock); - dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask); - dma_async_client_register(&net_dma.client); - dma_async_client_chan_request(&net_dma.client); - return 0; -} - -#else -static int __init netdev_dma_register(void) { return -ENODEV; } -#endif /* CONFIG_NET_DMA */ /** * netdev_increment_features - increment feature set by one @@ -5287,14 +5145,15 @@ static int __init net_dev_init(void) if (register_pernet_device(&default_device_ops)) goto out; - netdev_dma_register(); - open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); dst_init(); dev_mcast_init(); + #ifdef CONFIG_NET_DMA + dmaengine_get(); + #endif rc = 0; out: return rc; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bd6ff907d9e..ce572f9dff0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1313,7 +1313,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if ((available < target) && (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && - __get_cpu_var(softnet_data).net_dma) { + dma_find_channel(DMA_MEMCPY)) { preempt_enable_no_resched(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); @@ -1523,7 +1523,7 @@ do_prequeue: if (!(flags & MSG_TRUNC)) { #ifdef CONFIG_NET_DMA if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = get_softnet_dma(); + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); if (tp->ucopy.dma_chan) { tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( @@ -1628,7 +1628,6 @@ skip_copy: /* Safe to free early-copied skbs now */ __skb_queue_purge(&sk->sk_async_wait_queue); - dma_chan_put(tp->ucopy.dma_chan); tp->ucopy.dma_chan = NULL; } if (tp->ucopy.pinned_list) { diff --git a/net/ipv4/tcp_input.c 
b/net/ipv4/tcp_input.c
index 99b7ecbe889..a6961d75c7e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5005,7 +5005,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
 		return 0;
 	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-		tp->ucopy.dma_chan = get_softnet_dma();
+		tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
 	if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 9d839fa9331..19d7b429a26 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1594,7 +1594,7 @@ process:
 #ifdef CONFIG_NET_DMA
 		struct tcp_sock *tp = tcp_sk(sk);
 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-			tp->ucopy.dma_chan = get_softnet_dma();
+			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
 		if (tp->ucopy.dma_chan)
 			ret = tcp_v4_do_rcv(sk, skb);
 		else
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1297306d729..e5b85d45bee 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1675,7 +1675,7 @@ process:
 #ifdef CONFIG_NET_DMA
 		struct tcp_sock *tp = tcp_sk(sk);
 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-			tp->ucopy.dma_chan = get_softnet_dma();
+			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
 		if (tp->ucopy.dma_chan)
 			ret = tcp_v6_do_rcv(sk, skb);
 		else
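
The networking hunks above all follow the same conversion: the removed net_dma
client, its per-cpu softnet_data.net_dma pointer, and get_softnet_dma() are
replaced by on-demand channel lookup. A minimal, hypothetical consumer sketch
of that pattern follows; only the dmaengine calls (dmaengine_get(),
dma_find_channel(), dma_issue_pending_all()) come from this series, while the
example_* names are invented for illustration.

#include <linux/dmaengine.h>

static void example_subsystem_init(void)
{
	/*
	 * Pin the dmaengine core so opportunistically shared channels stay
	 * available; the net_dev_init() hunk above does the same under
	 * CONFIG_NET_DMA instead of registering a dma_client.
	 */
	dmaengine_get();
}

static void example_receive_path(void)
{
	/*
	 * Ask for any memcpy-capable channel on demand; this replaces the
	 * removed per-cpu softnet_data.net_dma pointer.  A NULL return
	 * means fall back to an ordinary CPU copy.
	 */
	struct dma_chan *chan = dma_find_channel(DMA_MEMCPY);

	if (chan) {
		/* ... queue asynchronous copies against 'chan' here ... */
	}

	/*
	 * Kick every channel with pending descriptors; this replaces the
	 * per-channel dma_async_memcpy_issue_pending() loop removed above.
	 */
	dma_issue_pending_all();
}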
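
The kernel/power/swsusp.c hunk introduces the hibernate_nvs_* helpers without
showing a caller. The sketch below is a hypothetical platform-side sequence,
not part of the patch: the physical address, size, and hook names are made up,
and the declarations are assumed to live in linux/suspend.h.

#include <linux/suspend.h>	/* assumed home of the hibernate_nvs_* declarations */

static int example_platform_setup(void)
{
	/*
	 * Describe the firmware NVS range that must survive hibernation.
	 * The range need not be page aligned; hibernate_nvs_register()
	 * splits it into per-page nvs_page entries internally.
	 * (Address and size here are invented.)
	 */
	return hibernate_nvs_register(0x7ff00000, 0x2000);
}

static int example_prepare_hibernation(void)
{
	int error;

	/* Back each registered page with a RAM page ... */
	error = hibernate_nvs_alloc();
	if (error)
		return error;

	/* ... and copy the NVS contents into those pages. */
	hibernate_nvs_save();
	return 0;
}

static void example_finish_restore(void)
{
	/*
	 * The real resume path calls this with interrupts disabled, which
	 * is why hibernate_nvs_restore() leaves the ioremap()ed mappings
	 * in place; hibernate_nvs_free() drops them afterwards.
	 */
	hibernate_nvs_restore();
	hibernate_nvs_free();
}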
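
For the reworked memory bitmap in kernel/power/snapshot.c, the per-zone
zone_bitmap chains are replaced by one flat list of bm_block entries built from
merged zone extents. The following sketch illustrates how snapshot.c itself
drives the new iterator; the functions are static to that file, so the
example_walk_bitmap() wrapper is purely illustrative.

static void example_walk_bitmap(struct memory_bitmap *bm)
{
	unsigned long pfn;

	/* Rewind to the first block on bm->blocks. */
	memory_bm_position_reset(bm);

	/*
	 * memory_bm_next_pfn() now walks the single block list and returns
	 * BM_END_OF_MAP once the last block is exhausted, instead of
	 * chaining through per-zone block lists.
	 */
	for (pfn = memory_bm_next_pfn(bm);
	     pfn != BM_END_OF_MAP;
	     pfn = memory_bm_next_pfn(bm)) {
		/* pfn has its bit set in one of the bm_block pfn ranges */
	}
}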