-rw-r--r-- Documentation/ABI/testing/sysfs-bus-usb | 2
-rw-r--r-- Documentation/DMA-API-HOWTO.txt (renamed from Documentation/PCI/PCI-DMA-mapping.txt) | 0
-rw-r--r-- Documentation/cgroups/memory.txt | 2
-rw-r--r-- Documentation/circular-buffers.txt | 234
-rw-r--r-- Documentation/filesystems/00-INDEX | 2
-rw-r--r-- Documentation/filesystems/ceph.txt | 140
-rw-r--r-- Documentation/filesystems/tmpfs.txt | 6
-rw-r--r-- Documentation/ioctl/ioctl-number.txt | 1
-rw-r--r-- Documentation/kobject.txt | 60
-rw-r--r-- Documentation/memory-barriers.txt | 20
-rw-r--r-- Documentation/volatile-considered-harmful.txt | 6
-rw-r--r-- MAINTAINERS | 61
-rw-r--r-- Makefile | 2
-rw-r--r-- arch/alpha/include/asm/core_marvel.h | 1
-rw-r--r-- arch/alpha/include/asm/core_mcpcia.h | 1
-rw-r--r-- arch/alpha/include/asm/core_titan.h | 1
-rw-r--r-- arch/alpha/include/asm/core_tsunami.h | 1
-rw-r--r-- arch/alpha/kernel/sys_dp264.c | 2
-rw-r--r-- arch/alpha/kernel/sys_titan.c | 2
-rw-r--r-- arch/alpha/kernel/traps.c | 10
-rw-r--r-- arch/arm/common/locomo.c | 10
-rw-r--r-- arch/arm/mach-ixp23xx/include/mach/memory.h | 2
-rw-r--r-- arch/arm/mach-kirkwood/mv88f6281gtw_ge-setup.c | 4
-rw-r--r-- arch/arm/mach-mmp/include/mach/uncompress.h | 5
-rw-r--r-- arch/arm/mach-orion5x/wrt350n-v2-setup.c | 2
-rw-r--r-- arch/arm/mach-pxa/Kconfig | 11
-rw-r--r-- arch/arm/mach-pxa/imote2.c | 4
-rw-r--r-- arch/arm/mach-pxa/include/mach/uncompress.h | 11
-rw-r--r-- arch/arm/mach-pxa/raumfeld.c | 4
-rw-r--r-- arch/arm/mach-pxa/stargate2.c | 5
-rw-r--r-- arch/arm/tools/mach-types | 75
-rw-r--r-- arch/cris/arch-v32/drivers/pci/bios.c | 2
-rw-r--r-- arch/frv/mb93090-mb00/pci-frv.c | 6
-rw-r--r-- arch/microblaze/Kconfig | 3
-rw-r--r-- arch/microblaze/Makefile | 4
-rw-r--r-- arch/microblaze/boot/Makefile | 6
-rw-r--r-- arch/microblaze/include/asm/processor.h | 1
-rw-r--r-- arch/microblaze/include/asm/segment.h | 49
-rw-r--r-- arch/microblaze/include/asm/thread_info.h | 5
-rw-r--r-- arch/microblaze/include/asm/tlbflush.h | 3
-rw-r--r-- arch/microblaze/include/asm/uaccess.h | 447
-rw-r--r-- arch/microblaze/kernel/dma.c | 2
-rw-r--r-- arch/microblaze/kernel/head.S | 12
-rw-r--r-- arch/microblaze/kernel/hw_exception_handler.S | 112
-rw-r--r-- arch/microblaze/kernel/misc.S | 15
-rw-r--r-- arch/microblaze/kernel/process.c | 10
-rw-r--r-- arch/microblaze/kernel/setup.c | 24
-rw-r--r-- arch/microblaze/kernel/traps.c | 6
-rw-r--r-- arch/microblaze/lib/Makefile | 3
-rw-r--r-- arch/microblaze/lib/fastcopy.S | 6
-rw-r--r-- arch/microblaze/lib/memcpy.c | 2
-rw-r--r-- arch/microblaze/lib/memset.c | 15
-rw-r--r-- arch/microblaze/lib/uaccess.c | 48
-rw-r--r-- arch/microblaze/lib/uaccess_old.S | 45
-rw-r--r-- arch/microblaze/mm/fault.c | 24
-rw-r--r-- arch/microblaze/mm/init.c | 9
-rw-r--r-- arch/microblaze/mm/pgtable.c | 2
-rw-r--r-- arch/powerpc/Kconfig | 13
-rw-r--r-- arch/powerpc/include/asm/asm-compat.h | 2
-rw-r--r-- arch/powerpc/include/asm/ppc-opcode.h | 6
-rw-r--r-- arch/powerpc/include/asm/syscall.h | 6
-rw-r--r-- arch/powerpc/kernel/head_fsl_booke.S | 7
-rw-r--r-- arch/powerpc/kernel/iommu.c | 7
-rw-r--r-- arch/powerpc/kernel/misc.S | 28
-rw-r--r-- arch/powerpc/kernel/setup_32.c | 6
-rw-r--r-- arch/powerpc/kernel/setup_64.c | 6
-rw-r--r-- arch/powerpc/mm/mem.c | 6
-rw-r--r-- arch/s390/boot/compressed/misc.c | 8
-rw-r--r-- arch/s390/include/asm/system.h | 9
-rw-r--r-- arch/s390/kernel/head.S | 3
-rw-r--r-- arch/s390/kernel/head64.S | 2
-rw-r--r-- arch/s390/kernel/setup.c | 4
-rw-r--r-- arch/s390/kernel/smp.c | 6
-rw-r--r-- arch/s390/mm/maccess.c | 26
-rw-r--r-- arch/sh/boards/mach-ecovec24/setup.c | 2
-rw-r--r-- arch/sh/boards/mach-se/7724/setup.c | 8
-rw-r--r-- arch/sh/configs/ecovec24_defconfig | 236
-rw-r--r-- arch/sh/include/asm/elf.h | 6
-rw-r--r-- arch/sh/include/asm/mmu.h | 7
-rw-r--r-- arch/sh/include/cpu-sh4/cpu/mmu_context.h | 4
-rw-r--r-- arch/sh/include/cpu-sh4/cpu/watchdog.h | 6
-rw-r--r-- arch/sh/kernel/cpufreq.c | 4
-rw-r--r-- arch/sh/kernel/dwarf.c | 4
-rw-r--r-- arch/sh/kernel/idle.c | 2
-rw-r--r-- arch/sh/kernel/perf_event.c | 2
-rw-r--r-- arch/sh/kernel/process_64.c | 7
-rw-r--r-- arch/sh/kernel/return_address.c | 3
-rw-r--r-- arch/sh/kernel/smp.c | 1
-rw-r--r-- arch/sh/mm/pmb.c | 4
-rw-r--r-- arch/sh/mm/tlb-pteaex.c | 30
-rw-r--r-- arch/sh/mm/tlb-sh3.c | 19
-rw-r--r-- arch/sh/mm/tlb-sh4.c | 28
-rw-r--r-- arch/sh/mm/tlb-urb.c | 22
-rw-r--r-- arch/sh/mm/tlbflush_32.c | 19
-rw-r--r-- arch/sh/mm/uncached.c | 9
-rw-r--r-- arch/sparc/include/asm/stat.h | 4
-rw-r--r-- arch/sparc/kernel/perf_event.c | 2
-rw-r--r-- arch/sparc/kernel/sysfs.c | 4
-rw-r--r-- arch/sparc/kernel/us2e_cpufreq.c | 8
-rw-r--r-- arch/sparc/kernel/us3_cpufreq.c | 8
-rw-r--r-- arch/x86/include/asm/fixmap.h | 6
-rw-r--r-- arch/x86/include/asm/hw_irq.h | 1
-rw-r--r-- arch/x86/include/asm/msr-index.h | 2
-rw-r--r-- arch/x86/kernel/apic/io_apic.c | 8
-rw-r--r-- arch/x86/kernel/cpu/perf_event_amd.c | 8
-rw-r--r-- arch/x86/kernel/head32.c | 4
-rw-r--r-- arch/x86/kernel/head64.c | 3
-rw-r--r-- arch/x86/kernel/irqinit.c | 22
-rw-r--r-- arch/x86/kernel/process.c | 32
-rw-r--r-- arch/x86/kernel/setup.c | 10
-rw-r--r-- arch/x86/kernel/smpboot.c | 2
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S | 2
-rw-r--r-- arch/x86/mm/init.c | 32
-rw-r--r-- arch/x86/pci/acpi.c | 22
-rw-r--r-- arch/x86/pci/i386.c | 5
-rw-r--r-- drivers/acpi/scan.c | 38
-rw-r--r-- drivers/ata/libata-sff.c | 43
-rw-r--r-- drivers/ata/pata_via.c | 5
-rw-r--r-- drivers/base/class.c | 2
-rw-r--r-- drivers/base/core.c | 6
-rw-r--r-- drivers/base/cpu.c | 16
-rw-r--r-- drivers/base/firmware_class.c | 2
-rw-r--r-- drivers/base/node.c | 7
-rw-r--r-- drivers/base/platform.c | 33
-rw-r--r-- drivers/base/power/main.c | 31
-rw-r--r-- drivers/char/agp/intel-agp.c | 34
-rw-r--r-- drivers/char/hvc_console.c | 31
-rw-r--r-- drivers/char/ipmi/ipmi_msghandler.c | 10
-rw-r--r-- drivers/char/tty_buffer.c | 4
-rw-r--r-- drivers/char/tty_port.c | 2
-rw-r--r-- drivers/char/virtio_console.c | 15
-rw-r--r-- drivers/char/vt_ioctl.c | 39
-rw-r--r-- drivers/edac/edac_mce_amd.c | 7
-rw-r--r-- drivers/firewire/core-device.c | 103
-rw-r--r-- drivers/firewire/core-iso.c | 5
-rw-r--r-- drivers/firewire/ohci.c | 4
-rw-r--r-- drivers/gpio/max730x.c | 4
-rw-r--r-- drivers/gpu/drm/drm_crtc_helper.c | 1
-rw-r--r-- drivers/gpu/drm/drm_edid.c | 9
-rw-r--r-- drivers/gpu/drm/drm_fb_helper.c | 2
-rw-r--r-- drivers/gpu/drm/drm_fops.c | 16
-rw-r--r-- drivers/gpu/drm/i915/i915_dma.c | 46
-rw-r--r-- drivers/gpu/drm/i915/i915_drv.c | 4
-rw-r--r-- drivers/gpu/drm/i915/i915_drv.h | 4
-rw-r--r-- drivers/gpu/drm/i915/i915_gem.c | 31
-rw-r--r-- drivers/gpu/drm/i915/i915_gem_tiling.c | 7
-rw-r--r-- drivers/gpu/drm/i915/i915_reg.h | 14
-rw-r--r-- drivers/gpu/drm/i915/intel_bios.c | 5
-rw-r--r-- drivers/gpu/drm/i915/intel_display.c | 16
-rw-r--r-- drivers/gpu/drm/i915/intel_lvds.c | 52
-rw-r--r-- drivers/gpu/drm/i915/intel_overlay.c | 13
-rw-r--r-- drivers/gpu/drm/nouveau/Makefile | 2
-rw-r--r-- drivers/gpu/drm/nouveau/nouveau_bios.c | 28
-rw-r--r-- drivers/gpu/drm/nouveau/nouveau_bios.h | 3
-rw-r--r-- drivers/gpu/drm/nouveau/nouveau_bo.c | 3
-rw-r--r-- drivers/gpu/drm/nouveau/nouveau_connector.c | 2
-rw-r--r-- drivers/gpu/drm/nouveau/nouveau_dma.c | 5
-rw-r--r-- drivers/gpu/drm/nouveau/nouveau_drv.c | 10
-rw-r--r-- drivers/gpu/drm/nouveau/nouveau_drv.h | 6
-rw-r--r-- drivers/gpu/drm/nouveau/nouveau_irq.c | 609
-rw-r--r-- drivers/gpu/drm/nouveau/nouveau_state.c | 5
-rw-r--r-- drivers/gpu/drm/nouveau/nv04_crtc.c | 6
-rw-r--r-- drivers/gpu/drm/nouveau/nv04_fbcon.c | 6
-rw-r--r-- drivers/gpu/drm/nouveau/nv50_display.c | 4
-rw-r--r-- drivers/gpu/drm/nouveau/nv50_fb.c | 32
-rw-r--r-- drivers/gpu/drm/nouveau/nv50_fbcon.c | 2
-rw-r--r-- drivers/gpu/drm/nouveau/nv50_graph.c | 22
-rw-r--r-- drivers/gpu/drm/nouveau/nv50_grctx.c | 13
-rw-r--r-- drivers/gpu/drm/radeon/Makefile | 2
-rw-r--r-- drivers/gpu/drm/radeon/atom.c | 91
-rw-r--r-- drivers/gpu/drm/radeon/atom.h | 8
-rw-r--r-- drivers/gpu/drm/radeon/atombios_crtc.c | 98
-rw-r--r-- drivers/gpu/drm/radeon/atombios_dp.c | 6
-rw-r--r-- drivers/gpu/drm/radeon/evergreen.c | 11
-rw-r--r-- drivers/gpu/drm/radeon/r100.c | 25
-rw-r--r-- drivers/gpu/drm/radeon/r200.c | 1
-rw-r--r-- drivers/gpu/drm/radeon/r300.c | 5
-rw-r--r-- drivers/gpu/drm/radeon/r420.c | 2
-rw-r--r-- drivers/gpu/drm/radeon/r520.c | 9
-rw-r--r-- drivers/gpu/drm/radeon/r600.c | 30
-rw-r--r-- drivers/gpu/drm/radeon/r600_audio.c | 52
-rw-r--r-- drivers/gpu/drm/radeon/r600_blit_shaders.c | 35
-rw-r--r-- drivers/gpu/drm/radeon/r600_cp.c | 3
-rw-r--r-- drivers/gpu/drm/radeon/r600_cs.c | 70
-rw-r--r-- drivers/gpu/drm/radeon/r600_hdmi.c | 191
-rw-r--r-- drivers/gpu/drm/radeon/r600_reg.h | 10
-rw-r--r-- drivers/gpu/drm/radeon/r600d.h | 49
-rw-r--r-- drivers/gpu/drm/radeon/radeon.h | 66
-rw-r--r-- drivers/gpu/drm/radeon/radeon_asic.c | 772
-rw-r--r-- drivers/gpu/drm/radeon/radeon_asic.h | 545
-rw-r--r-- drivers/gpu/drm/radeon/radeon_atombios.c | 461
-rw-r--r-- drivers/gpu/drm/radeon/radeon_combios.c | 7
-rw-r--r-- drivers/gpu/drm/radeon/radeon_connectors.c | 2
-rw-r--r-- drivers/gpu/drm/radeon/radeon_cs.c | 11
-rw-r--r-- drivers/gpu/drm/radeon/radeon_device.c | 237
-rw-r--r-- drivers/gpu/drm/radeon/radeon_display.c | 68
-rw-r--r-- drivers/gpu/drm/radeon/radeon_drv.c | 11
-rw-r--r-- drivers/gpu/drm/radeon/radeon_drv.h | 3
-rw-r--r-- drivers/gpu/drm/radeon/radeon_encoders.c | 121
-rw-r--r-- drivers/gpu/drm/radeon/radeon_i2c.c | 153
-rw-r--r-- drivers/gpu/drm/radeon/radeon_irq_kms.c | 22
-rw-r--r-- drivers/gpu/drm/radeon/radeon_legacy_crtc.c | 8
-rw-r--r-- drivers/gpu/drm/radeon/radeon_legacy_tv.c | 29
-rw-r--r-- drivers/gpu/drm/radeon/radeon_mode.h | 12
-rw-r--r-- drivers/gpu/drm/radeon/radeon_object.c | 6
-rw-r--r-- drivers/gpu/drm/radeon/radeon_pm.c | 46
-rw-r--r-- drivers/gpu/drm/radeon/radeon_reg.h | 1
-rw-r--r-- drivers/gpu/drm/radeon/reg_srcs/r600 | 75
-rw-r--r-- drivers/gpu/drm/radeon/rs400.c | 7
-rw-r--r-- drivers/gpu/drm/radeon/rs600.c | 33
-rw-r--r-- drivers/gpu/drm/radeon/rs600d.h | 53
-rw-r--r-- drivers/gpu/drm/radeon/rs690.c | 122
-rw-r--r-- drivers/gpu/drm/radeon/rs690d.h | 3
-rw-r--r-- drivers/gpu/drm/radeon/rv515.c | 45
-rw-r--r-- drivers/gpu/drm/radeon/rv770.c | 31
-rw-r--r-- drivers/gpu/drm/ttm/ttm_bo.c | 4
-rw-r--r-- drivers/gpu/drm/ttm/ttm_memory.c | 18
-rw-r--r-- drivers/gpu/drm/ttm/ttm_tt.c | 23
-rw-r--r-- drivers/gpu/drm/vmwgfx/Kconfig | 2
-rw-r--r-- drivers/hwmon/Kconfig | 4
-rw-r--r-- drivers/hwmon/coretemp.c | 4
-rw-r--r-- drivers/hwmon/w83793.c | 2
-rw-r--r-- drivers/i2c/busses/i2c-scmi.c | 32
-rw-r--r-- drivers/ide/ide-probe.c | 12
-rw-r--r-- drivers/ide/via82cxxx.c | 57
-rw-r--r-- drivers/infiniband/core/sysfs.c | 1
-rw-r--r-- drivers/isdn/hisax/avma1_cs.c | 12
-rw-r--r-- drivers/isdn/hisax/elsa_cs.c | 12
-rw-r--r-- drivers/isdn/hisax/sedlbauer_cs.c | 12
-rw-r--r-- drivers/isdn/hisax/teles_cs.c | 12
-rw-r--r-- drivers/misc/c2port/core.c | 4
-rw-r--r-- drivers/mmc/core/mmc.c | 3
-rw-r--r-- drivers/mtd/maps/omap_nor.c | 0
-rw-r--r-- drivers/net/arm/ks8695net.c | 24
-rw-r--r-- drivers/net/atlx/atl1.c | 2
-rw-r--r-- drivers/net/benet/be_ethtool.c | 2
-rw-r--r-- drivers/net/bnx2.c | 14
-rw-r--r-- drivers/net/bonding/bond_main.c | 40
-rw-r--r-- drivers/net/can/bfin_can.c | 97
-rw-r--r-- drivers/net/e1000/e1000.h | 1
-rw-r--r-- drivers/net/e1000/e1000_main.c | 9
-rw-r--r-- drivers/net/e1000e/e1000.h | 1
-rw-r--r-- drivers/net/e1000e/netdev.c | 11
-rw-r--r-- drivers/net/gianfar.c | 5
-rw-r--r-- drivers/net/gianfar.h | 6
-rw-r--r-- drivers/net/igb/e1000_82575.c | 1
-rw-r--r-- drivers/net/igb/e1000_hw.h | 1
-rw-r--r-- drivers/net/igb/e1000_mac.c | 6
-rw-r--r-- drivers/net/igb/igb.h | 1
-rw-r--r-- drivers/net/igb/igb_main.c | 23
-rw-r--r-- drivers/net/igbvf/igbvf.h | 1
-rw-r--r-- drivers/net/igbvf/netdev.c | 11
-rw-r--r-- drivers/net/ixgbe/ixgbe.h | 7
-rw-r--r-- drivers/net/ixgbe/ixgbe_82599.c | 78
-rw-r--r-- drivers/net/ixgbe/ixgbe_ethtool.c | 21
-rw-r--r-- drivers/net/ixgbe/ixgbe_fcoe.c | 39
-rw-r--r-- drivers/net/ixgbe/ixgbe_main.c | 65
-rw-r--r-- drivers/net/ixgbe/ixgbe_type.h | 2
-rw-r--r-- drivers/net/ixgbevf/ethtool.c | 42
-rw-r--r-- drivers/net/ixgbevf/ixgbevf_main.c | 77
-rw-r--r-- drivers/net/ixgbevf/vf.h | 6
-rw-r--r-- drivers/net/jme.c | 35
-rw-r--r-- drivers/net/jme.h | 2
-rw-r--r-- drivers/net/ks8851.c | 1
-rw-r--r-- drivers/net/ksz884x.c | 2
-rw-r--r-- drivers/net/mlx4/main.c | 1
-rw-r--r-- drivers/net/netxen/netxen_nic.h | 4
-rw-r--r-- drivers/net/netxen/netxen_nic_ctx.c | 14
-rw-r--r-- drivers/net/netxen/netxen_nic_init.c | 2
-rw-r--r-- drivers/net/netxen/netxen_nic_main.c | 49
-rw-r--r-- drivers/net/pcmcia/pcnet_cs.c | 3
-rw-r--r-- drivers/net/r8169.c | 54
-rw-r--r-- drivers/net/tulip/uli526x.c | 8
-rw-r--r-- drivers/net/usb/smsc95xx.c | 18
-rw-r--r-- drivers/net/via-velocity.c | 2
-rw-r--r-- drivers/net/wireless/ath/ath9k/xmit.c | 21
-rw-r--r-- drivers/net/wireless/iwlwifi/iwl-tx.c | 2
-rw-r--r-- drivers/net/wireless/wl12xx/wl1251_debugfs.c | 3
-rw-r--r-- drivers/pci/hotplug/pciehp_hpc.c | 5
-rw-r--r-- drivers/pci/ioapic.c | 9
-rw-r--r-- drivers/pci/pci-sysfs.c | 2
-rw-r--r-- drivers/pci/pci.c | 44
-rw-r--r-- drivers/pci/probe.c | 53
-rw-r--r-- drivers/pci/quirks.c | 59
-rw-r--r-- drivers/pci/setup-res.c | 14
-rw-r--r-- drivers/pcmcia/at91_cf.c | 2
-rw-r--r-- drivers/pcmcia/au1000_generic.c | 13
-rw-r--r-- drivers/pcmcia/bfin_cf_pcmcia.c | 12
-rw-r--r-- drivers/pcmcia/cs.c | 124
-rw-r--r-- drivers/pcmcia/db1xxx_ss.c | 27
-rw-r--r-- drivers/pcmcia/ds.c | 8
-rw-r--r-- drivers/pcmcia/i82092.c | 16
-rw-r--r-- drivers/pcmcia/i82365.c | 11
-rw-r--r-- drivers/pcmcia/m32r_cfc.c | 11
-rw-r--r-- drivers/pcmcia/m32r_pcc.c | 12
-rw-r--r-- drivers/pcmcia/m8xx_pcmcia.c | 17
-rw-r--r-- drivers/pcmcia/omap_cf.c | 12
-rw-r--r-- drivers/pcmcia/pd6729.c | 80
-rw-r--r-- drivers/pcmcia/pxa2xx_base.c | 8
-rw-r--r-- drivers/pcmcia/rsrc_nonstatic.c | 19
-rw-r--r-- drivers/pcmcia/sa1100_generic.c | 13
-rw-r--r-- drivers/pcmcia/sa1111_generic.c | 12
-rw-r--r-- drivers/pcmcia/tcic.c | 12
-rw-r--r-- drivers/pcmcia/vrc4171_card.c | 13
-rw-r--r-- drivers/pcmcia/yenta_socket.c | 17
-rw-r--r-- drivers/platform/x86/Kconfig | 10
-rw-r--r-- drivers/platform/x86/Makefile | 1
-rw-r--r-- drivers/platform/x86/asus-laptop.c | 4
-rw-r--r-- drivers/platform/x86/eeepc-wmi.c | 157
-rw-r--r-- drivers/regulator/core.c | 1
-rw-r--r-- drivers/regulator/lp3971.c | 10
-rw-r--r-- drivers/regulator/max1586.c | 2
-rw-r--r-- drivers/regulator/max8649.c | 3
-rw-r--r-- drivers/regulator/max8660.c | 2
-rw-r--r-- drivers/regulator/max8925-regulator.c | 6
-rw-r--r-- drivers/rtc/rtc-mc13783.c | 23
-rw-r--r-- drivers/s390/block/dasd_3990_erp.c | 7
-rw-r--r-- drivers/s390/block/dasd_eckd.c | 4
-rw-r--r-- drivers/s390/char/sclp_async.c | 1
-rw-r--r-- drivers/s390/char/sclp_cmd.c | 14
-rw-r--r-- drivers/s390/char/zcore.c | 31
-rw-r--r-- drivers/serial/cpm_uart/cpm_uart_cpm2.c | 4
-rw-r--r-- drivers/serial/serial_cs.c | 1
-rw-r--r-- drivers/serial/sh-sci.c | 5
-rw-r--r-- drivers/serial/sh-sci.h | 35
-rw-r--r-- drivers/sh/intc.c | 31
-rw-r--r-- drivers/staging/et131x/et1310_mac.c | 2
-rw-r--r-- drivers/usb/class/cdc-acm.c | 2
-rw-r--r-- drivers/usb/class/cdc-wdm.c | 134
-rw-r--r-- drivers/usb/core/devio.c | 17
-rw-r--r-- drivers/usb/core/urb.c | 1
-rw-r--r-- drivers/usb/gadget/Kconfig | 2
-rw-r--r-- drivers/usb/gadget/epautoconf.c | 2
-rw-r--r-- drivers/usb/gadget/f_mass_storage.c | 3
-rw-r--r-- drivers/usb/gadget/gadget_chips.h | 8
-rw-r--r-- drivers/usb/gadget/goku_udc.c | 2
-rw-r--r-- drivers/usb/gadget/multi.c | 2
-rw-r--r-- drivers/usb/host/Makefile | 4
-rw-r--r-- drivers/usb/host/ehci-hcd.c | 2
-rw-r--r-- drivers/usb/host/ehci-sched.c | 28
-rw-r--r-- drivers/usb/host/ehci.h | 5
-rw-r--r-- drivers/usb/host/r8a66597-hcd.c | 16
-rw-r--r-- drivers/usb/host/xhci-mem.c | 9
-rw-r--r-- drivers/usb/host/xhci.c (renamed from drivers/usb/host/xhci-hcd.c) | 1
-rw-r--r-- drivers/usb/musb/musb_core.c | 13
-rw-r--r-- drivers/usb/musb/musb_core.h | 4
-rw-r--r-- drivers/usb/musb/musb_host.c | 2
-rw-r--r-- drivers/usb/musb/musb_regs.h | 28
-rw-r--r-- drivers/usb/serial/Kconfig | 4
-rw-r--r-- drivers/usb/serial/console.c | 1
-rw-r--r-- drivers/usb/serial/cp210x.c | 5
-rw-r--r-- drivers/usb/serial/ftdi_sio.c | 7
-rw-r--r-- drivers/usb/serial/ftdi_sio_ids.h | 7
-rw-r--r-- drivers/usb/serial/generic.c | 49
-rw-r--r-- drivers/usb/serial/option.c | 53
-rw-r--r-- drivers/usb/serial/qcserial.c | 29
-rw-r--r-- drivers/usb/storage/unusual_devs.h | 23
-rw-r--r-- drivers/uwb/hwa-rc.c | 2
-rw-r--r-- drivers/uwb/i1480/dfu/usb.c | 12
-rw-r--r-- drivers/uwb/wlp/messages.c | 106
-rw-r--r-- drivers/vhost/net.c | 10
-rw-r--r-- drivers/vhost/vhost.c | 18
-rw-r--r-- drivers/video/geode/lxfb.h | 2
-rw-r--r-- drivers/video/geode/lxfb_ops.c | 10
-rw-r--r-- drivers/video/omap2/displays/panel-generic.c | 22
-rw-r--r-- drivers/video/omap2/dss/dss.c | 3
-rw-r--r-- drivers/video/omap2/vram.c | 11
-rw-r--r-- drivers/video/pxa168fb.c | 2
-rw-r--r-- fs/Kconfig | 1
-rw-r--r-- fs/Makefile | 1
-rw-r--r-- fs/afs/security.c | 5
-rw-r--r-- fs/binfmt_aout.c | 14
-rw-r--r-- fs/binfmt_elf_fdpic.c | 2
-rw-r--r-- fs/ceph/Kconfig | 27
-rw-r--r-- fs/ceph/Makefile | 39
-rw-r--r-- fs/ceph/README | 20
-rw-r--r-- fs/ceph/addr.c | 1194
-rw-r--r-- fs/ceph/armor.c | 99
-rw-r--r-- fs/ceph/auth.c | 257
-rw-r--r-- fs/ceph/auth.h | 84
-rw-r--r-- fs/ceph/auth_none.c | 121
-rw-r--r-- fs/ceph/auth_none.h | 28
-rw-r--r-- fs/ceph/auth_x.c | 679
-rw-r--r-- fs/ceph/auth_x.h | 49
-rw-r--r-- fs/ceph/auth_x_protocol.h | 90
-rw-r--r-- fs/ceph/buffer.c | 78
-rw-r--r-- fs/ceph/buffer.h | 39
-rw-r--r-- fs/ceph/caps.c | 2932
-rw-r--r-- fs/ceph/ceph_debug.h | 37
-rw-r--r-- fs/ceph/ceph_frag.c | 21
-rw-r--r-- fs/ceph/ceph_frag.h | 109
-rw-r--r-- fs/ceph/ceph_fs.c | 74
-rw-r--r-- fs/ceph/ceph_fs.h | 650
-rw-r--r-- fs/ceph/ceph_hash.c | 118
-rw-r--r-- fs/ceph/ceph_hash.h | 13
-rw-r--r-- fs/ceph/ceph_strings.c | 176
-rw-r--r-- fs/ceph/crush/crush.c | 151
-rw-r--r-- fs/ceph/crush/crush.h | 180
-rw-r--r-- fs/ceph/crush/hash.c | 149
-rw-r--r-- fs/ceph/crush/hash.h | 17
-rw-r--r-- fs/ceph/crush/mapper.c | 596
-rw-r--r-- fs/ceph/crush/mapper.h | 20
-rw-r--r-- fs/ceph/crypto.c | 408
-rw-r--r-- fs/ceph/crypto.h | 48
-rw-r--r-- fs/ceph/debugfs.c | 483
-rw-r--r-- fs/ceph/decode.h | 194
-rw-r--r-- fs/ceph/dir.c | 1222
-rw-r--r-- fs/ceph/export.c | 223
-rw-r--r-- fs/ceph/file.c | 937
-rw-r--r-- fs/ceph/inode.c | 1766
-rw-r--r-- fs/ceph/ioctl.c | 160
-rw-r--r-- fs/ceph/ioctl.h | 40
-rw-r--r-- fs/ceph/mds_client.c | 3042
-rw-r--r-- fs/ceph/mds_client.h | 335
-rw-r--r-- fs/ceph/mdsmap.c | 174
-rw-r--r-- fs/ceph/mdsmap.h | 54
-rw-r--r-- fs/ceph/messenger.c | 2239
-rw-r--r-- fs/ceph/messenger.h | 255
-rw-r--r-- fs/ceph/mon_client.c | 834
-rw-r--r-- fs/ceph/mon_client.h | 119
-rw-r--r-- fs/ceph/msgpool.c | 186
-rw-r--r-- fs/ceph/msgpool.h | 27
-rw-r--r-- fs/ceph/msgr.h | 158
-rw-r--r-- fs/ceph/osd_client.c | 1550
-rw-r--r-- fs/ceph/osd_client.h | 166
-rw-r--r-- fs/ceph/osdmap.c | 1022
-rw-r--r-- fs/ceph/osdmap.h | 125
-rw-r--r-- fs/ceph/pagelist.c | 54
-rw-r--r-- fs/ceph/pagelist.h | 54
-rw-r--r-- fs/ceph/rados.h | 374
-rw-r--r-- fs/ceph/snap.c | 906
-rw-r--r-- fs/ceph/super.c | 1030
-rw-r--r-- fs/ceph/super.h | 901
-rw-r--r-- fs/ceph/types.h | 29
-rw-r--r-- fs/ceph/xattr.c | 844
-rw-r--r-- fs/cifs/cifsfs.c | 3
-rw-r--r-- fs/cifs/cifsfs.h | 3
-rw-r--r-- fs/cifs/cifsglob.h | 1
-rw-r--r-- fs/cifs/cifsproto.h | 6
-rw-r--r-- fs/cifs/cifssmb.c | 135
-rw-r--r-- fs/cifs/dir.c | 2
-rw-r--r-- fs/cifs/file.c | 7
-rw-r--r-- fs/cifs/inode.c | 297
-rw-r--r-- fs/ext3/ialloc.c | 4
-rw-r--r-- fs/ext3/inode.c | 2
-rw-r--r-- fs/ext4/ialloc.c | 4
-rw-r--r-- fs/ext4/inode.c | 4
-rw-r--r-- fs/ext4/super.c | 29
-rw-r--r-- fs/fat/namei_vfat.c | 6
-rw-r--r-- fs/fscache/object.c | 6
-rw-r--r-- fs/fscache/operation.c | 4
-rw-r--r-- fs/fscache/page.c | 1
-rw-r--r-- fs/logfs/dev_bdev.c | 9
-rw-r--r-- fs/logfs/dir.c | 4
-rw-r--r-- fs/logfs/journal.c | 7
-rw-r--r-- fs/logfs/logfs.h | 1
-rw-r--r-- fs/logfs/readwrite.c | 13
-rw-r--r-- fs/logfs/segment.c | 54
-rw-r--r-- fs/logfs/super.c | 15
-rw-r--r-- fs/namei.c | 18
-rw-r--r-- fs/nfs/file.c | 3
-rw-r--r-- fs/nfs/nfs4xdr.c | 2
-rw-r--r-- fs/nilfs2/segbuf.c | 8
-rw-r--r-- fs/nilfs2/segment.c | 15
-rw-r--r-- fs/ocfs2/acl.c | 77
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c | 4
-rw-r--r-- fs/ocfs2/inode.c | 15
-rw-r--r-- fs/ocfs2/localalloc.c | 10
-rw-r--r-- fs/ocfs2/locks.c | 2
-rw-r--r-- fs/ocfs2/namei.c | 28
-rw-r--r-- fs/ocfs2/ocfs2.h | 14
-rw-r--r-- fs/ocfs2/refcounttree.c | 1
-rw-r--r-- fs/ocfs2/suballoc.c | 129
-rw-r--r-- fs/ocfs2/suballoc.h | 5
-rw-r--r-- fs/ocfs2/xattr.c | 12
-rw-r--r-- fs/partitions/msdos.c | 85
-rw-r--r-- fs/proc/base.c | 5
-rw-r--r-- fs/proc/kcore.c | 2
-rw-r--r-- fs/read_write.c | 2
-rw-r--r-- fs/reiserfs/journal.c | 15
-rw-r--r-- fs/reiserfs/xattr_security.c | 2
-rw-r--r-- include/acpi/acpi_drivers.h | 2
-rw-r--r-- include/drm/drmP.h | 34
-rw-r--r-- include/drm/drm_mem_util.h | 65
-rw-r--r-- include/drm/drm_pciids.h | 1
-rw-r--r-- include/drm/ttm/ttm_bo_driver.h | 1
-rw-r--r-- include/linux/circ_buf.h | 4
-rw-r--r-- include/linux/clockchips.h | 2
-rw-r--r-- include/linux/ext3_fs.h | 6
-rw-r--r-- include/linux/ext3_fs_i.h | 2
-rw-r--r-- include/linux/fscache-cache.h | 2
-rw-r--r-- include/linux/if_tunnel.h | 1
-rw-r--r-- include/linux/ioport.h | 2
-rw-r--r-- include/linux/kfifo.h | 2
-rw-r--r-- include/linux/mmc/mmc.h | 1
-rw-r--r-- include/linux/netdevice.h | 8
-rw-r--r-- include/linux/netfilter/nfnetlink.h | 2
-rw-r--r-- include/linux/netfilter_ipv6.h | 1
-rw-r--r-- include/linux/netlink.h | 2
-rw-r--r-- include/linux/rcupdate.h | 23
-rw-r--r-- include/linux/reiserfs_xattr.h | 5
-rw-r--r-- include/linux/serial_sci.h | 4
-rw-r--r-- include/linux/skbuff.h | 6
-rw-r--r-- include/linux/socket.h | 1
-rw-r--r-- include/linux/sunrpc/bc_xprt.h | 5
-rw-r--r-- include/linux/syscalls.h | 2
-rw-r--r-- include/linux/tracepoint.h | 2
-rw-r--r-- include/linux/tty.h | 10
-rw-r--r-- include/linux/usb.h | 18
-rw-r--r-- include/linux/vt.h | 3
-rw-r--r-- include/net/bluetooth/bluetooth.h | 2
-rw-r--r-- include/net/netlink.h | 6
-rw-r--r-- include/pcmcia/ss.h | 6
-rw-r--r-- init/main.c | 2
-rw-r--r-- ipc/syscall.c | 2
-rw-r--r-- kernel/cgroup.c | 1
-rw-r--r-- kernel/cpuset.c | 106
-rw-r--r-- kernel/cred.c | 6
-rw-r--r-- kernel/early_res.c | 6
-rw-r--r-- kernel/irq/chip.c | 35
-rw-r--r-- kernel/irq/manage.c | 22
-rw-r--r-- kernel/kthread.c | 2
-rw-r--r-- kernel/posix-cpu-timers.c | 10
-rw-r--r-- kernel/rcupdate.c | 23
-rw-r--r-- kernel/resource.c | 44
-rw-r--r-- kernel/sched.c | 12
-rw-r--r-- kernel/slow-work.c | 2
-rw-r--r-- kernel/slow-work.h | 8
-rw-r--r-- kernel/softlockup.c | 4
-rw-r--r-- kernel/time/tick-oneshot.c | 52
-rw-r--r-- kernel/time/timekeeping.c | 3
-rw-r--r-- kernel/time/timer_list.c | 3
-rw-r--r-- kernel/timer.c | 1
-rw-r--r-- kernel/trace/ring_buffer.c | 14
-rw-r--r-- lib/Kconfig.debug | 3
-rw-r--r-- mm/bootmem.c | 13
-rw-r--r-- mm/ksm.c | 2
-rw-r--r-- mm/memcontrol.c | 50
-rw-r--r-- mm/memory.c | 1
-rw-r--r-- mm/mempolicy.c | 50
-rw-r--r-- mm/mmu_context.c | 1
-rw-r--r-- mm/nommu.c | 13
-rw-r--r-- net/8021q/vlan.c | 2
-rw-r--r-- net/8021q/vlan_core.c | 4
-rw-r--r-- net/8021q/vlan_dev.c | 71
-rw-r--r-- net/bluetooth/hci_sysfs.c | 3
-rw-r--r-- net/bluetooth/l2cap.c | 48
-rw-r--r-- net/bluetooth/rfcomm/core.c | 41
-rw-r--r-- net/bluetooth/rfcomm/sock.c | 38
-rw-r--r-- net/bluetooth/sco.c | 38
-rw-r--r-- net/core/dev.c | 8
-rw-r--r-- net/core/netpoll.c | 7
-rw-r--r-- net/ipv4/devinet.c | 2
-rw-r--r-- net/ipv4/fib_trie.c | 4
-rw-r--r-- net/ipv4/ip_gre.c | 4
-rw-r--r-- net/ipv4/ipmr.c | 14
-rw-r--r-- net/ipv4/route.c | 38
-rw-r--r-- net/ipv4/tcp.c | 65
-rw-r--r-- net/ipv4/tcp_input.c | 3
-rw-r--r-- net/ipv4/tcp_ipv4.c | 5
-rw-r--r-- net/ipv6/addrconf.c | 2
-rw-r--r-- net/ipv6/ip6mr.c | 14
-rw-r--r-- net/ipv6/netfilter/ip6table_raw.c | 2
-rw-r--r-- net/ipv6/route.c | 15
-rw-r--r-- net/key/af_key.c | 8
-rw-r--r-- net/netfilter/nf_conntrack_netlink.c | 4
-rw-r--r-- net/netfilter/nfnetlink.c | 4
-rw-r--r-- net/netfilter/xt_hashlimit.c | 4
-rw-r--r-- net/netfilter/xt_recent.c | 2
-rw-r--r-- net/netlink/af_netlink.c | 17
-rw-r--r-- net/rxrpc/ar-accept.c | 6
-rw-r--r-- net/sched/Kconfig | 5
-rw-r--r-- net/sched/cls_cgroup.c | 36
-rw-r--r-- net/socket.c | 4
-rw-r--r-- net/sunrpc/auth_gss/auth_gss.c | 5
-rw-r--r-- net/sunrpc/bc_svc.c | 15
-rw-r--r-- net/sunrpc/clnt.c | 1
-rw-r--r-- net/sunrpc/rpc_pipe.c | 2
-rw-r--r-- net/sunrpc/xprt.c | 22
-rw-r--r-- net/sunrpc/xprtsock.c | 3
-rwxr-xr-x scripts/get_maintainer.pl | 2
-rwxr-xr-x scripts/kernel-doc | 3
-rw-r--r-- sound/arm/pxa2xx-pcm-lib.c | 1
-rw-r--r-- sound/core/pcm_lib.c | 6
-rw-r--r-- sound/oss/vidc.c | 4
-rw-r--r-- sound/pci/ac97/ac97_patch.c | 2
-rw-r--r-- sound/pci/cmipci.c | 14
-rw-r--r-- sound/pci/hda/hda_intel.c | 1
-rw-r--r-- sound/pci/hda/patch_conexant.c | 15
-rw-r--r-- sound/pci/hda/patch_nvhdmi.c | 15
-rw-r--r-- sound/pci/hda/patch_realtek.c | 10
-rw-r--r-- sound/pci/hda/patch_sigmatel.c | 2
-rw-r--r-- sound/soc/codecs/tlv320dac33.c | 10
-rw-r--r-- sound/soc/codecs/wm_hubs.c | 2
-rw-r--r-- sound/soc/imx/Kconfig | 2
-rw-r--r-- sound/soc/sh/Kconfig | 1
-rw-r--r-- tools/perf/builtin-probe.c | 1
-rw-r--r-- tools/perf/builtin-top.c | 13
-rw-r--r-- tools/perf/util/probe-event.c | 2
-rw-r--r-- tools/perf/util/probe-finder.c | 18
-rw-r--r-- tools/perf/util/probe-finder.h | 1
-rw-r--r-- tools/perf/util/symbol.c | 18
-rw-r--r-- tools/perf/util/symbol.h | 3
603 files changed, 36027 insertions, 4543 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index a986e9bbba3..bcebb9eaedc 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -160,7 +160,7 @@ Description:
match the driver to the device. For example:
# echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id
-What: /sys/bus/usb/device/.../avoid_reset
+What: /sys/bus/usb/device/.../avoid_reset_quirk
Date: December 2009
Contact: Oliver Neukum <oliver@neukum.org>
Description:
diff --git a/Documentation/PCI/PCI-DMA-mapping.txt b/Documentation/DMA-API-HOWTO.txt
index 52618ab069a..52618ab069a 100644
--- a/Documentation/PCI/PCI-DMA-mapping.txt
+++ b/Documentation/DMA-API-HOWTO.txt
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index f8bc802d70b..3a6aecd078b 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -340,7 +340,7 @@ Note:
5.3 swappiness
Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.
- Following cgroups' swapiness can't be changed.
+ Following cgroups' swappiness can't be changed.
- root cgroup (uses /proc/sys/vm/swappiness).
- a cgroup which uses hierarchy and it has child cgroup.
- a cgroup which uses hierarchy and not the root of hierarchy.
diff --git a/Documentation/circular-buffers.txt b/Documentation/circular-buffers.txt
new file mode 100644
index 00000000000..8117e5bf606
--- /dev/null
+++ b/Documentation/circular-buffers.txt
@@ -0,0 +1,234 @@
+ ================
+ CIRCULAR BUFFERS
+ ================
+
+By: David Howells <dhowells@redhat.com>
+ Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+
+Linux provides a number of features that can be used to implement circular
+buffering. There are two sets of such features:
+
+ (1) Convenience functions for determining information about power-of-2 sized
+ buffers.
+
+ (2) Memory barriers for when the producer and the consumer of objects in the
+ buffer don't want to share a lock.
+
+To use these facilities, as discussed below, there needs to be just one
+producer and just one consumer. It is possible to handle multiple producers by
+serialising them, and to handle multiple consumers by serialising them.
+
+
+Contents:
+
+ (*) What is a circular buffer?
+
+ (*) Measuring power-of-2 buffers.
+
+ (*) Using memory barriers with circular buffers.
+ - The producer.
+ - The consumer.
+
+
+==========================
+WHAT IS A CIRCULAR BUFFER?
+==========================
+
+First of all, what is a circular buffer? A circular buffer is a buffer of
+fixed, finite size into which there are two indices:
+
+ (1) A 'head' index - the point at which the producer inserts items into the
+ buffer.
+
+ (2) A 'tail' index - the point at which the consumer finds the next item in
+ the buffer.
+
+Typically when the tail pointer is equal to the head pointer, the buffer is
+empty; and the buffer is full when the head pointer is one less than the tail
+pointer.
+
+The head index is incremented when items are added, and the tail index when
+items are removed. The tail index should never jump the head index, and both
+indices should be wrapped to 0 when they reach the end of the buffer, thus
+allowing an infinite amount of data to flow through the buffer.
+
+Typically, items will all be of the same unit size, but this isn't strictly
+required to use the techniques below. The indices can be increased by more
+than 1 if multiple items or variable-sized items are to be included in the
+buffer, provided that neither index overtakes the other. The implementer must
+be careful, however, as a region more than one unit in size may wrap the end of
+the buffer and be broken into two segments.
+
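For concreteness, the state such a buffer carries might look like the
following minimal sketch (hypothetical type and field names; the snippets
later in this file assume a similar shape):

    struct item;                        /* whatever the buffer carries */

    struct circ_item_buf {              /* hypothetical container */
            unsigned long   head;       /* next slot the producer fills */
            unsigned long   tail;       /* next slot the consumer reads */
            unsigned long   size;       /* number of slots; a power of 2 below */
            struct item     *items[];   /* the slots themselves */
    };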
+
+============================
+MEASURING POWER-OF-2 BUFFERS
+============================
+
+Calculation of the occupancy or the remaining capacity of an arbitrarily sized
+circular buffer would normally be a slow operation, requiring the use of a
+modulus (divide) instruction. However, if the buffer is of a power-of-2 size,
+then a much quicker bitwise-AND instruction can be used instead.
+
+Linux provides a set of macros for handling power-of-2 circular buffers. These
+can be made use of by:
+
+ #include <linux/circ_buf.h>
+
+The macros are:
+
+ (*) Measure the remaining capacity of a buffer:
+
+ CIRC_SPACE(head_index, tail_index, buffer_size);
+
+ This returns the amount of space left in the buffer[1] into which items
+ can be inserted.
+
+
+ (*) Measure the maximum consecutive immediate space in a buffer:
+
+ CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
+
+ This returns the amount of consecutive space left in the buffer[1] into
+ which items can be immediately inserted without having to wrap back to the
+ beginning of the buffer.
+
+
+ (*) Measure the occupancy of a buffer:
+
+ CIRC_CNT(head_index, tail_index, buffer_size);
+
+ This returns the number of items currently occupying a buffer[2].
+
+
+ (*) Measure the non-wrapping occupancy of a buffer:
+
+ CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
+
+ This returns the number of consecutive items[2] that can be extracted from
+ the buffer without having to wrap back to the beginning of the buffer.
+
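A minimal userspace sketch of the power-of-2 trick, with macro definitions
matching <linux/circ_buf.h> (the concrete indices are arbitrary):

    #include <stdio.h>

    /* same definitions as <linux/circ_buf.h> */
    #define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
    #define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

    int main(void)
    {
            unsigned long head = 5, tail = 2, size = 8; /* size: power of 2 */

            /* occupancy: (5 - 2) & 7 == 3 */
            printf("occupied: %lu\n", CIRC_CNT(head, tail, size));

            /* space: (2 - 5 - 1) & 7 == 4; cnt + space == size - 1, since
             * one slot always stays empty so that head == tail can
             * unambiguously mean "buffer empty" rather than "full" */
            printf("free:     %lu\n", CIRC_SPACE(head, tail, size));
            return 0;
    }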
+
+Each of these macros will nominally return a value between 0 and buffer_size-1,
+however:
+
+ [1] CIRC_SPACE*() are intended to be used in the producer. To the producer
+ they will return a lower bound as the producer controls the head index,
+ but the consumer may still be depleting the buffer on another CPU and
+ moving the tail index.
+
+ To the consumer it will show an upper bound as the producer may be busy
+ depleting the space.
+
+ [2] CIRC_CNT*() are intended to be used in the consumer. To the consumer they
+ will return a lower bound as the consumer controls the tail index, but the
+ producer may still be filling the buffer on another CPU and moving the
+ head index.
+
+ To the producer it will show an upper bound as the consumer may be busy
+ emptying the buffer.
+
+ [3] To a third party, the order in which the writes to the indices by the
+ producer and consumer become visible cannot be guaranteed as they are
+ independent and may be made on different CPUs - so the result in such a
+ situation will merely be a guess, and may even be negative.
+
+
+===========================================
+USING MEMORY BARRIERS WITH CIRCULAR BUFFERS
+===========================================
+
+By using memory barriers in conjunction with circular buffers, you can avoid
+the need to:
+
+ (1) use a single lock to govern access to both ends of the buffer, thus
+ allowing the buffer to be filled and emptied at the same time; and
+
+ (2) use atomic counter operations.
+
+There are two sides to this: the producer that fills the buffer, and the
+consumer that empties it. Only one thing should be filling a buffer at any one
+time, and only one thing should be emptying a buffer at any one time, but the
+two sides can operate simultaneously.
+
+
+THE PRODUCER
+------------
+
+The producer will look something like this:
+
+ spin_lock(&producer_lock);
+
+ unsigned long head = buffer->head;
+ unsigned long tail = ACCESS_ONCE(buffer->tail);
+
+ if (CIRC_SPACE(head, tail, buffer->size) >= 1) {
+ /* insert one item into the buffer */
+ struct item *item = buffer[head];
+
+ produce_item(item);
+
+ smp_wmb(); /* commit the item before incrementing the head */
+
+ buffer->head = (head + 1) & (buffer->size - 1);
+
+ /* wake_up() will make sure that the head is committed before
+ * waking anyone up */
+ wake_up(consumer);
+ }
+
+ spin_unlock(&producer_lock);
+
+This will instruct the CPU that the contents of the new item must be written
+before the head index makes it available to the consumer and then instructs the
+CPU that the revised head index must be written before the consumer is woken.
+
+Note that wake_up() doesn't have to be the exact mechanism used, but whatever
+is used must guarantee a (write) memory barrier between the update of the head
+index and the change of state of the consumer, if a change of state occurs.
+
+
+THE CONSUMER
+------------
+
+The consumer will look something like this:
+
+ spin_lock(&consumer_lock);
+
+ unsigned long head = ACCESS_ONCE(buffer->head);
+ unsigned long tail = buffer->tail;
+
+ if (CIRC_CNT(head, tail, buffer->size) >= 1) {
+ /* read index before reading contents at that index */
+ smp_read_barrier_depends();
+
+ /* extract one item from the buffer */
+ struct item *item = buffer[tail];
+
+ consume_item(item);
+
+ smp_mb(); /* finish reading descriptor before incrementing tail */
+
+ buffer->tail = (tail + 1) & (buffer->size - 1);
+ }
+
+ spin_unlock(&consumer_lock);
+
+This will instruct the CPU to make sure the index is up to date before reading
+the new item, and then it shall make sure the CPU has finished reading the item
+before it writes the new tail pointer, which will erase the item.
+
+
+Note the use of ACCESS_ONCE() in both algorithms to read the opposition index.
+This prevents the compiler from discarding and reloading its cached value -
+which some compilers will do across smp_read_barrier_depends(). This isn't
+strictly needed if you can be sure that the opposition index will _only_ be
+used the once.
+
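For reference, ACCESS_ONCE() at this point in the kernel is simply a
volatile cast, defined in <linux/compiler.h>:

    /* The volatile qualifier forbids the compiler from merging,
     * re-reading, or discarding the access, so the index is loaded from
     * memory exactly once per use of the macro. */
    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

    /* typical use: take a single, stable snapshot of the opposition index */
    unsigned long tail = ACCESS_ONCE(buffer->tail);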
+
+===============
+FURTHER READING
+===============
+
+See also Documentation/memory-barriers.txt for a description of Linux's memory
+barrier facilities.
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 3bae418c6ad..4303614b5ad 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -16,6 +16,8 @@ befs.txt
- information about the BeOS filesystem for Linux.
bfs.txt
- info for the SCO UnixWare Boot Filesystem (BFS).
+ceph.txt
+ - info for the Ceph Distributed File System
cifs.txt
- description of the CIFS filesystem.
coda.txt
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
new file mode 100644
index 00000000000..0660c9f5dee
--- /dev/null
+++ b/Documentation/filesystems/ceph.txt
@@ -0,0 +1,140 @@
+Ceph Distributed File System
+============================
+
+Ceph is a distributed network file system designed to provide good
+performance, reliability, and scalability.
+
+Basic features include:
+
+ * POSIX semantics
+ * Seamless scaling from 1 to many thousands of nodes
+ * High availability and reliability. No single point of failure.
+ * N-way replication of data across storage nodes
+ * Fast recovery from node failures
+ * Automatic rebalancing of data on node addition/removal
+ * Easy deployment: most FS components are userspace daemons
+
+Also,
+ * Flexible snapshots (on any directory)
+ * Recursive accounting (nested files, directories, bytes)
+
+In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
+on symmetric access by all clients to shared block devices, Ceph
+separates data and metadata management into independent server
+clusters, similar to Lustre. Unlike Lustre, however, metadata and
+storage nodes run entirely as user space daemons. Storage nodes
+utilize btrfs to store data objects, leveraging its advanced features
+(checksumming, metadata replication, etc.). File data is striped
+across storage nodes in large chunks to distribute workload and
+facilitate high throughputs. When storage nodes fail, data is
+re-replicated in a distributed fashion by the storage nodes themselves
+(with some minimal coordination from a cluster monitor), making the
+system extremely efficient and scalable.
+
+Metadata servers effectively form a large, consistent, distributed
+in-memory cache above the file namespace that is extremely scalable,
+dynamically redistributes metadata in response to workload changes,
+and can tolerate arbitrary (well, non-Byzantine) node failures. The
+metadata server takes a somewhat unconventional approach to metadata
+storage to significantly improve performance for common workloads. In
+particular, inodes with only a single link are embedded in
+directories, allowing entire directories of dentries and inodes to be
+loaded into its cache with a single I/O operation. The contents of
+extremely large directories can be fragmented and managed by
+independent metadata servers, allowing scalable concurrent access.
+
+The system offers automatic data rebalancing/migration when scaling
+from a small cluster of just a few nodes to many hundreds, without
+requiring an administrator to carve the data set into static volumes or
+go through the tedious process of migrating data between servers.
+When the file system approaches full, new nodes can be easily added
+and things will "just work."
+
+Ceph includes a flexible snapshot mechanism that allows a user to create
+a snapshot on any subdirectory (and its nested contents) in the
+system. Snapshot creation and deletion are as simple as 'mkdir
+.snap/foo' and 'rmdir .snap/foo'.
+
+Ceph also provides some recursive accounting on directories for nested
+files and bytes. That is, a 'getfattr -d foo' on any directory in the
+system will reveal the total number of nested regular files and
+subdirectories, and a summation of all nested file sizes. This makes
+the identification of large disk space consumers relatively quick, as
+no 'du' or similar recursive scan of the file system is required.
+
+
+Mount Syntax
+============
+
+The basic mount syntax is:
+
+ # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
+
+You only need to specify a single monitor, as the client will get the
+full list when it connects. (However, if the monitor you specify
+happens to be down, the mount won't succeed.) The port can be left
+off if the monitor is using the default. So if the monitor is at
+1.2.3.4,
+
+ # mount -t ceph 1.2.3.4:/ /mnt/ceph
+
+is sufficient. If /sbin/mount.ceph is installed, a hostname can be
+used instead of an IP address.
+
+
+
+Mount Options
+=============
+
+ ip=A.B.C.D[:N]
+ Specify the IP and/or port the client should bind to locally.
+ There is normally not much reason to do this. If the IP is not
+ specified, the client's IP address is determined by looking at the
+ address its connection to the monitor originates from.
+
+ wsize=X
+ Specify the maximum write size in bytes. By default there is no
+ maximum. Ceph will normally size writes based on the file stripe
+ size.
+
+ rsize=X
+ Specify the maximum readahead.
+
+ mount_timeout=X
+ Specify the timeout value for mount (in seconds), in the case
+ of a non-responsive Ceph file system. The default is 30
+ seconds.
+
+ rbytes
+ When stat() is called on a directory, set st_size to 'rbytes',
+ the summation of file sizes over all files nested beneath that
+ directory. This is the default.
+
+ norbytes
+ When stat() is called on a directory, set st_size to the
+ number of entries in that directory.
+
+ nocrc
+ Disable CRC32C calculation for data writes. If set, the storage node
+ must rely on TCP's error correction to detect data corruption
+ in the data payload.
+
+ noasyncreaddir
+ Disable the client's use of its local cache to satisfy readdir
+ requests. (This does not change correctness; the client uses
+ cached metadata only when a lease or capability ensures it is
+ valid.)
+
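Several of these options can be combined on one command line, following the
mount syntax above; for example (hypothetical monitor address, mount point,
and rsize value):

    # mount -t ceph 1.2.3.4:/ /mnt/ceph -o rsize=524288,norbytes,noasyncreaddir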
+
+More Information
+================
+
+For more information on Ceph, see the home page at
+ http://ceph.newdream.net/
+
+The Linux kernel client source tree is available at
+ git://ceph.newdream.net/git/ceph-client.git
+ git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+
+and the source for the full system is at
+ git://ceph.newdream.net/git/ceph.git
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index 3015da0c6b2..fe09a2cb185 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -82,11 +82,13 @@ tmpfs has a mount option to set the NUMA memory allocation policy for
all files in that instance (if CONFIG_NUMA is enabled) - which can be
adjusted on the fly via 'mount -o remount ...'
-mpol=default prefers to allocate memory from the local node
+mpol=default use the process allocation policy
+ (see set_mempolicy(2))
mpol=prefer:Node prefers to allocate memory from the given Node
mpol=bind:NodeList allocates memory only from nodes in NodeList
mpol=interleave prefers to allocate from each node in turn
mpol=interleave:NodeList allocates from each node of NodeList in turn
+mpol=local prefers to allocate memory from the local node
NodeList format is a comma-separated list of decimal numbers and ranges,
a range being two hyphen-separated decimal numbers, the smallest and
@@ -134,3 +136,5 @@ Author:
Christoph Rohland <cr@sap.com>, 1.12.01
Updated:
Hugh Dickins, 4 June 2007
+Updated:
+ KOSAKI Motohiro, 16 Mar 2010
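As a quick illustration of the mpol= options described above (hypothetical
mount point and node numbers), the policy of a live tmpfs instance can be
switched on the fly:

    # mount -o remount,mpol=bind:0-1 /mnt/tmp
    # mount -o remount,mpol=local /mnt/tmp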
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 35c9b51d20e..dd5806f4fcc 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -291,6 +291,7 @@ Code Seq#(hex) Include File Comments
0x92 00-0F drivers/usb/mon/mon_bin.c
0x93 60-7F linux/auto_fs.h
0x94 all fs/btrfs/ioctl.h
+0x97 00-7F fs/ceph/ioctl.h Ceph file system
0x99 00-0F 537-Addinboard driver
<mailto:buk@buks.ipn.de>
0xA0 all linux/sdp/sdp.h Industrial Device Project
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt
index bdb13817e1e..3ab2472509c 100644
--- a/Documentation/kobject.txt
+++ b/Documentation/kobject.txt
@@ -59,37 +59,56 @@ nice to have in other objects. The C language does not allow for the
direct expression of inheritance, so other techniques - such as structure
embedding - must be used.
-So, for example, the UIO code has a structure that defines the memory
-region associated with a uio device:
+(As an aside, for those familiar with the kernel linked list implementation,
+this is analogous to how "list_head" structs are rarely useful on
+their own, but are invariably found embedded in the larger objects of
+interest.)
-struct uio_mem {
+So, for example, the UIO code in drivers/uio/uio.c has a structure that
+defines the memory region associated with a uio device:
+
+ struct uio_map {
struct kobject kobj;
- unsigned long addr;
- unsigned long size;
- int memtype;
- void __iomem *internal_addr;
-};
+ struct uio_mem *mem;
+ };
-If you have a struct uio_mem structure, finding its embedded kobject is
+If you have a struct uio_map structure, finding its embedded kobject is
just a matter of using the kobj member. Code that works with kobjects will
often have the opposite problem, however: given a struct kobject pointer,
what is the pointer to the containing structure? You must avoid tricks
(such as assuming that the kobject is at the beginning of the structure)
and, instead, use the container_of() macro, found in <linux/kernel.h>:
- container_of(pointer, type, member)
+ container_of(pointer, type, member)
+
+where:
+
+ * "pointer" is the pointer to the embedded kobject,
+ * "type" is the type of the containing structure, and
+ * "member" is the name of the structure field to which "pointer" points.
+
+The return value from container_of() is a pointer to the corresponding
+container type. So, for example, a pointer "kp" to a struct kobject
+embedded *within* a struct uio_map could be converted to a pointer to the
+*containing* uio_map structure with:
+
+ struct uio_map *u_map = container_of(kp, struct uio_map, kobj);
+
+For convenience, programmers often define a simple macro for "back-casting"
+kobject pointers to the containing type. Exactly this happens in the
+earlier drivers/uio/uio.c, as you can see here:
+
+ struct uio_map {
+ struct kobject kobj;
+ struct uio_mem *mem;
+ };
-where pointer is the pointer to the embedded kobject, type is the type of
-the containing structure, and member is the name of the structure field to
-which pointer points. The return value from container_of() is a pointer to
-the given type. So, for example, a pointer "kp" to a struct kobject
-embedded within a struct uio_mem could be converted to a pointer to the
-containing uio_mem structure with:
+ #define to_map(map) container_of(map, struct uio_map, kobj)
- struct uio_mem *u_mem = container_of(kp, struct uio_mem, kobj);
+where the macro argument "map" is a pointer to the struct kobject in
+question. That macro is subsequently invoked with:
-Programmers often define a simple macro for "back-casting" kobject pointers
-to the containing type.
+ struct uio_map *map = to_map(kobj);
Initialization of kobjects
@@ -387,4 +406,5 @@ called, and the objects in the former circle release each other.
Example code to copy from
For a more complete example of using ksets and kobjects properly, see the
-sample/kobject/kset-example.c code.
+example programs samples/kobject/{kobject-example.c,kset-example.c},
+which will be built as loadable modules if you select CONFIG_SAMPLE_KOBJECT.
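For readers new to container_of(), it is, conceptually, just pointer
arithmetic with offsetof(); the following standalone sketch (hypothetical
struct names) is behaviourally equivalent to the kernel macro, minus its
type checking:

    #include <stdio.h>
    #include <stddef.h>

    /* subtract the member's offset from the member's address to recover
     * the address of the containing structure */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct kobj { int refcount; };      /* stand-in for struct kobject */

    struct uio_map_like {               /* hypothetical container type */
            int mem;
            struct kobj kobj;
    };

    int main(void)
    {
            struct uio_map_like map;
            struct kobj *kp = &map.kobj; /* only the embedded member is known */

            struct uio_map_like *m = container_of(kp, struct uio_map_like, kobj);
            printf("container recovered: %s\n", m == &map ? "yes" : "no");
            return 0;
    }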
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 7f5809eddee..631ad2f1b22 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -3,6 +3,7 @@
============================
By: David Howells <dhowells@redhat.com>
+ Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Contents:
@@ -60,6 +61,10 @@ Contents:
- And then there's the Alpha.
+ (*) Example uses.
+
+ - Circular buffers.
+
(*) References.
@@ -2226,6 +2231,21 @@ The Alpha defines the Linux kernel's memory barrier model.
See the subsection on "Cache Coherency" above.
+============
+EXAMPLE USES
+============
+
+CIRCULAR BUFFERS
+----------------
+
+Memory barriers can be used to implement circular buffering without the need
+of a lock to serialise the producer with the consumer. See:
+
+ Documentation/circular-buffers.txt
+
+for details.
+
+
==========
REFERENCES
==========
diff --git a/Documentation/volatile-considered-harmful.txt b/Documentation/volatile-considered-harmful.txt
index 991c26a6ef6..db0cb228d64 100644
--- a/Documentation/volatile-considered-harmful.txt
+++ b/Documentation/volatile-considered-harmful.txt
@@ -63,9 +63,9 @@ way to perform a busy wait is:
cpu_relax();
The cpu_relax() call can lower CPU power consumption or yield to a
-hyperthreaded twin processor; it also happens to serve as a memory barrier,
-so, once again, volatile is unnecessary. Of course, busy-waiting is
-generally an anti-social act to begin with.
+hyperthreaded twin processor; it also happens to serve as a compiler
+barrier, so, once again, volatile is unnecessary. Of course, busy-
+waiting is generally an anti-social act to begin with.
There are still a few rare situations where volatile makes sense in the
kernel:
diff --git a/MAINTAINERS b/MAINTAINERS
index 382eaa4d006..3d29fa38988 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -797,12 +797,12 @@ M: Michael Petchkovsky <mkpetch@internode.on.net>
S: Maintained
ARM/NOMADIK ARCHITECTURE
-M: Alessandro Rubini <rubini@unipv.it>
-M: STEricsson <STEricsson_nomadik_linux@list.st.com>
-L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
-S: Maintained
-F: arch/arm/mach-nomadik/
-F: arch/arm/plat-nomadik/
+M: Alessandro Rubini <rubini@unipv.it>
+M: STEricsson <STEricsson_nomadik_linux@list.st.com>
+L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+S: Maintained
+F: arch/arm/mach-nomadik/
+F: arch/arm/plat-nomadik/
ARM/OPENMOKO NEO FREERUNNER (GTA02) MACHINE SUPPORT
M: Nelson Castillo <arhuaco@freaks-unidos.net>
@@ -1441,6 +1441,15 @@ F: arch/powerpc/include/asm/spu*.h
F: arch/powerpc/oprofile/*cell*
F: arch/powerpc/platforms/cell/
+CEPH DISTRIBUTED FILE SYSTEM CLIENT
+M: Sage Weil <sage@newdream.net>
+L: ceph-devel@vger.kernel.org
+W: http://ceph.newdream.net/
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+S: Supported
+F: Documentation/filesystems/ceph.txt
+F: fs/ceph
+
CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
M: David Vrabel <david.vrabel@csr.com>
L: linux-usb@vger.kernel.org
@@ -1917,17 +1926,17 @@ F: drivers/scsi/dpt*
F: drivers/scsi/dpt/
DRBD DRIVER
-P: Philipp Reisner
-P: Lars Ellenberg
-M: drbd-dev@lists.linbit.com
-L: drbd-user@lists.linbit.com
-W: http://www.drbd.org
-T: git git://git.drbd.org/linux-2.6-drbd.git drbd
-T: git git://git.drbd.org/drbd-8.3.git
-S: Supported
-F: drivers/block/drbd/
-F: lib/lru_cache.c
-F: Documentation/blockdev/drbd/
+P: Philipp Reisner
+P: Lars Ellenberg
+M: drbd-dev@lists.linbit.com
+L: drbd-user@lists.linbit.com
+W: http://www.drbd.org
+T: git git://git.drbd.org/linux-2.6-drbd.git drbd
+T: git git://git.drbd.org/drbd-8.3.git
+S: Supported
+F: drivers/block/drbd/
+F: lib/lru_cache.c
+F: Documentation/blockdev/drbd/
DRIVER CORE, KOBJECTS, AND SYSFS
M: Greg Kroah-Hartman <gregkh@suse.de>
@@ -3074,6 +3083,7 @@ F: include/scsi/*iscsi*
ISDN SUBSYSTEM
M: Karsten Keil <isdn@linux-pingi.de>
L: isdn4linux@listserv.isdn4linux.de (subscribers-only)
+L: netdev@vger.kernel.org
W: http://www.isdn4linux.de
T: git git://git.kernel.org/pub/scm/linux/kernel/git/kkeil/isdn-2.6.git
S: Maintained
@@ -3260,6 +3270,16 @@ S: Maintained
F: include/linux/kexec.h
F: kernel/kexec.c
+KEYS/KEYRINGS:
+M: David Howells <dhowells@redhat.com>
+L: keyrings@linux-nfs.org
+S: Maintained
+F: Documentation/keys.txt
+F: include/linux/key.h
+F: include/linux/key-type.h
+F: include/keys/
+F: security/keys/
+
KGDB
M: Jason Wessel <jason.wessel@windriver.com>
L: kgdb-bugreport@lists.sourceforge.net
@@ -3509,8 +3529,8 @@ F: drivers/scsi/sym53c8xx_2/
LTP (Linux Test Project)
M: Rishikesh K Rajak <risrajak@linux.vnet.ibm.com>
M: Garrett Cooper <yanegomi@gmail.com>
-M: Mike Frysinger <vapier@gentoo.org>
-M: Subrata Modak <subrata@linux.vnet.ibm.com>
+M: Mike Frysinger <vapier@gentoo.org>
+M: Subrata Modak <subrata@linux.vnet.ibm.com>
L: ltp-list@lists.sourceforge.net (subscribers-only)
W: http://ltp.sourceforge.net/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/galak/ltp.git
@@ -5414,7 +5434,6 @@ S: Maintained
F: sound/soc/codecs/twl4030*
TIPC NETWORK LAYER
-M: Per Liden <per.liden@ericsson.com>
M: Jon Maloy <jon.maloy@ericsson.com>
M: Allan Stephens <allan.stephens@windriver.com>
L: tipc-discussion@lists.sourceforge.net
@@ -6192,7 +6211,7 @@ F: arch/x86/
X86 PLATFORM DRIVERS
M: Matthew Garrett <mjg@redhat.com>
L: platform-driver-x86@vger.kernel.org
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/mjg59/platform-drivers-x86.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/mjg59/platform-drivers-x86.git
S: Maintained
F: drivers/platform/x86
diff --git a/Makefile b/Makefile
index 08ff02da7ce..67c1001cfbf 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 34
-EXTRAVERSION = -rc1
+EXTRAVERSION = -rc3
NAME = Man-Eating Seals of Antiquity
# *DOCUMENTATION*
diff --git a/arch/alpha/include/asm/core_marvel.h b/arch/alpha/include/asm/core_marvel.h
index 30d55fe7aaf..dad300fa14c 100644
--- a/arch/alpha/include/asm/core_marvel.h
+++ b/arch/alpha/include/asm/core_marvel.h
@@ -12,7 +12,6 @@
#define __ALPHA_MARVEL__H__
#include <linux/types.h>
-#include <linux/pci.h>
#include <linux/spinlock.h>
#include <asm/compiler.h>
diff --git a/arch/alpha/include/asm/core_mcpcia.h b/arch/alpha/include/asm/core_mcpcia.h
index acf55b48347..21ac53383b3 100644
--- a/arch/alpha/include/asm/core_mcpcia.h
+++ b/arch/alpha/include/asm/core_mcpcia.h
@@ -6,7 +6,6 @@
#define MCPCIA_ONE_HAE_WINDOW 1
#include <linux/types.h>
-#include <linux/pci.h>
#include <asm/compiler.h>
/*
diff --git a/arch/alpha/include/asm/core_titan.h b/arch/alpha/include/asm/core_titan.h
index a17f6f33b68..8cf79d1219e 100644
--- a/arch/alpha/include/asm/core_titan.h
+++ b/arch/alpha/include/asm/core_titan.h
@@ -2,7 +2,6 @@
#define __ALPHA_TITAN__H__
#include <linux/types.h>
-#include <linux/pci.h>
#include <asm/compiler.h>
/*
diff --git a/arch/alpha/include/asm/core_tsunami.h b/arch/alpha/include/asm/core_tsunami.h
index 58d4fe48742..8e39ecf0941 100644
--- a/arch/alpha/include/asm/core_tsunami.h
+++ b/arch/alpha/include/asm/core_tsunami.h
@@ -2,7 +2,6 @@
#define __ALPHA_TSUNAMI__H__
#include <linux/types.h>
-#include <linux/pci.h>
#include <asm/compiler.h>
/*
diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c
index d64e1e497e7..4026502ab70 100644
--- a/arch/alpha/kernel/sys_dp264.c
+++ b/arch/alpha/kernel/sys_dp264.c
@@ -224,7 +224,7 @@ static void
dp264_device_interrupt(unsigned long vector)
{
#if 1
- printk("dp264_device_interrupt: NOT IMPLEMENTED YET!! \n");
+ printk("dp264_device_interrupt: NOT IMPLEMENTED YET!!\n");
#else
unsigned long pld;
unsigned int i;
diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c
index 288053342c8..9008d0f20c5 100644
--- a/arch/alpha/kernel/sys_titan.c
+++ b/arch/alpha/kernel/sys_titan.c
@@ -171,7 +171,7 @@ titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
static void
titan_device_interrupt(unsigned long vector)
{
- printk("titan_device_interrupt: NOT IMPLEMENTED YET!! \n");
+ printk("titan_device_interrupt: NOT IMPLEMENTED YET!!\n");
}
static void
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index 6ee7655b756..b14f015008a 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
+#include <linux/ratelimit.h>
#include <asm/gentrap.h>
#include <asm/uaccess.h>
@@ -771,8 +772,7 @@ asmlinkage void
do_entUnaUser(void __user * va, unsigned long opcode,
unsigned long reg, struct pt_regs *regs)
{
- static int cnt = 0;
- static unsigned long last_time;
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
unsigned long tmp1, tmp2, tmp3, tmp4;
unsigned long fake_reg, *reg_addr = &fake_reg;
@@ -783,15 +783,11 @@ do_entUnaUser(void __user * va, unsigned long opcode,
with the unaliged access. */
if (!test_thread_flag (TIF_UAC_NOPRINT)) {
- if (cnt >= 5 && time_after(jiffies, last_time + 5 * HZ)) {
- cnt = 0;
- }
- if (++cnt < 5) {
+ if (__ratelimit(&ratelimit)) {
printk("%s(%d): unaligned trap at %016lx: %p %lx %ld\n",
current->comm, task_pid_nr(current),
regs->pc - 4, va, opcode, reg);
}
- last_time = jiffies;
}
if (test_thread_flag (TIF_UAC_SIGBUS))
goto give_sigbus;
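For context on the API this hunk converts to: DEFINE_RATELIMIT_STATE(name,
interval, burst) declares state that lets __ratelimit() succeed at most
'burst' times per 'interval' jiffies, so the hunk above preserves the old
behaviour of at most 5 messages per 5 seconds. A minimal sketch of the
pattern (hypothetical function and message):

    #include <linux/kernel.h>
    #include <linux/ratelimit.h>

    /* allow at most 5 passes per 5-second window */
    static DEFINE_RATELIMIT_STATE(my_rs, 5 * HZ, 5);

    static void report_event(void)
    {
            /* __ratelimit() returns nonzero while under the burst limit */
            if (__ratelimit(&my_rs))
                    printk(KERN_WARNING "event happened\n");
    }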
diff --git a/arch/arm/common/locomo.c b/arch/arm/common/locomo.c
index 90ae00b631c..9dff07c80dd 100644
--- a/arch/arm/common/locomo.c
+++ b/arch/arm/common/locomo.c
@@ -290,7 +290,7 @@ static int locomo_suspend(struct platform_device *dev, pm_message_t state)
save->LCM_GPO = locomo_readl(lchip->base + LOCOMO_GPO); /* GPIO */
locomo_writel(0x00, lchip->base + LOCOMO_GPO);
save->LCM_SPICT = locomo_readl(lchip->base + LOCOMO_SPI + LOCOMO_SPICT); /* SPI */
- locomo_writel(0x40, lchip->base + LOCOMO_SPICT);
+ locomo_writel(0x40, lchip->base + LOCOMO_SPI + LOCOMO_SPICT);
save->LCM_GPE = locomo_readl(lchip->base + LOCOMO_GPE); /* GPIO */
locomo_writel(0x00, lchip->base + LOCOMO_GPE);
save->LCM_ASD = locomo_readl(lchip->base + LOCOMO_ASD); /* ADSTART */
@@ -418,7 +418,7 @@ __locomo_probe(struct device *me, struct resource *mem, int irq)
/* Longtime timer */
locomo_writel(0, lchip->base + LOCOMO_LTINT);
/* SPI */
- locomo_writel(0, lchip->base + LOCOMO_SPIIE);
+ locomo_writel(0, lchip->base + LOCOMO_SPI + LOCOMO_SPIIE);
locomo_writel(6 + 8 + 320 + 30 - 10, lchip->base + LOCOMO_ASD);
r = locomo_readl(lchip->base + LOCOMO_ASD);
@@ -707,7 +707,7 @@ void locomo_m62332_senddata(struct locomo_dev *ldev, unsigned int dac_data, int
udelay(DAC_SCL_HIGH_HOLD_TIME); /* 4.7 usec */
if (locomo_readl(mapbase + LOCOMO_DAC) & LOCOMO_DAC_SDAOEB) { /* High is error */
printk(KERN_WARNING "locomo: m62332_senddata Error 1\n");
- return;
+ goto out;
}
/* Send Sub address (LSB is channel select) */
@@ -735,7 +735,7 @@ void locomo_m62332_senddata(struct locomo_dev *ldev, unsigned int dac_data, int
udelay(DAC_SCL_HIGH_HOLD_TIME); /* 4.7 usec */
if (locomo_readl(mapbase + LOCOMO_DAC) & LOCOMO_DAC_SDAOEB) { /* High is error */
printk(KERN_WARNING "locomo: m62332_senddata Error 2\n");
- return;
+ goto out;
}
/* Send DAC data */
@@ -760,9 +760,9 @@ void locomo_m62332_senddata(struct locomo_dev *ldev, unsigned int dac_data, int
udelay(DAC_SCL_HIGH_HOLD_TIME); /* 4.7 usec */
if (locomo_readl(mapbase + LOCOMO_DAC) & LOCOMO_DAC_SDAOEB) { /* High is error */
printk(KERN_WARNING "locomo: m62332_senddata Error 3\n");
- return;
}
+out:
/* stop */
r = locomo_readl(mapbase + LOCOMO_DAC);
r &= ~(LOCOMO_DAC_SCLOEB);
diff --git a/arch/arm/mach-ixp23xx/include/mach/memory.h b/arch/arm/mach-ixp23xx/include/mach/memory.h
index 94a3a86cfeb..6ef65d813f1 100644
--- a/arch/arm/mach-ixp23xx/include/mach/memory.h
+++ b/arch/arm/mach-ixp23xx/include/mach/memory.h
@@ -19,7 +19,7 @@
*/
#define PHYS_OFFSET (0x00000000)
-#define IXP23XX_PCI_SDRAM_OFFSET (*((volatile int *)IXP23XX_PCI_SDRAM_BAR) & 0xfffffff0))
+#define IXP23XX_PCI_SDRAM_OFFSET (*((volatile int *)IXP23XX_PCI_SDRAM_BAR) & 0xfffffff0)
#define __phys_to_bus(x) ((x) + (IXP23XX_PCI_SDRAM_OFFSET - PHYS_OFFSET))
#define __bus_to_phys(x) ((x) - (IXP23XX_PCI_SDRAM_OFFSET - PHYS_OFFSET))
diff --git a/arch/arm/mach-kirkwood/mv88f6281gtw_ge-setup.c b/arch/arm/mach-kirkwood/mv88f6281gtw_ge-setup.c
index 0358f45766c..5e6f711b1c6 100644
--- a/arch/arm/mach-kirkwood/mv88f6281gtw_ge-setup.c
+++ b/arch/arm/mach-kirkwood/mv88f6281gtw_ge-setup.c
@@ -74,9 +74,9 @@ static struct gpio_keys_button mv88f6281gtw_ge_button_pins[] = {
.desc = "SWR Button",
.active_low = 1,
}, {
- .code = KEY_F1,
+ .code = KEY_WPS_BUTTON,
.gpio = 46,
- .desc = "WPS Button(F1)",
+ .desc = "WPS Button",
.active_low = 1,
},
};
diff --git a/arch/arm/mach-mmp/include/mach/uncompress.h b/arch/arm/mach-mmp/include/mach/uncompress.h
index a7dcc530721..85bd8a2d84b 100644
--- a/arch/arm/mach-mmp/include/mach/uncompress.h
+++ b/arch/arm/mach-mmp/include/mach/uncompress.h
@@ -14,7 +14,7 @@
#define UART2_BASE (APB_PHYS_BASE + 0x17000)
#define UART3_BASE (APB_PHYS_BASE + 0x18000)
-static volatile unsigned long *UART = (unsigned long *)UART2_BASE;
+static volatile unsigned long *UART;
static inline void putc(char c)
{
@@ -37,6 +37,9 @@ static inline void flush(void)
static inline void arch_decomp_setup(void)
{
+ /* default to UART2 */
+ UART = (unsigned long *)UART2_BASE;
+
if (machine_is_avengers_lite())
UART = (unsigned long *)UART3_BASE;
}
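This change (and the PXA equivalent further down) stops relying on static initializers in the decompressor and assigns the defaults at runtime in arch_decomp_setup(), presumably because the decompressor cannot depend on writable, correctly placed initialized data. A sketch of the pattern with placeholder addresses, not the real UART bases:

#include <stdio.h>

#define UART2_BASE 0x17000UL
#define UART3_BASE 0x18000UL

static unsigned long uart_base; /* zero-initialized, lives in .bss */

static int machine_is_avengers_lite(void) { return 0; }        /* stand-in */

static void arch_decomp_setup(void)
{
        uart_base = UART2_BASE; /* default to UART2 */
        if (machine_is_avengers_lite())
                uart_base = UART3_BASE;
}

int main(void)
{
        arch_decomp_setup();
        printf("console at 0x%lx\n", uart_base);
        return 0;
}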
diff --git a/arch/arm/mach-orion5x/wrt350n-v2-setup.c b/arch/arm/mach-orion5x/wrt350n-v2-setup.c
index cb0feca193d..f9f222ebb7e 100644
--- a/arch/arm/mach-orion5x/wrt350n-v2-setup.c
+++ b/arch/arm/mach-orion5x/wrt350n-v2-setup.c
@@ -77,7 +77,7 @@ static struct gpio_keys_button wrt350n_v2_buttons[] = {
.desc = "Reset Button",
.active_low = 1,
}, {
- .code = KEY_WLAN,
+ .code = KEY_WPS_BUTTON,
.gpio = 2,
.desc = "WPS Button",
.active_low = 1,
diff --git a/arch/arm/mach-pxa/Kconfig b/arch/arm/mach-pxa/Kconfig
index 38fbd0a0e40..5b6ee46fa7f 100644
--- a/arch/arm/mach-pxa/Kconfig
+++ b/arch/arm/mach-pxa/Kconfig
@@ -272,7 +272,6 @@ config MACH_H5000
config MACH_HIMALAYA
bool "HTC Himalaya Support"
select CPU_PXA26x
- select FB_W100
config MACH_MAGICIAN
bool "Enable HTC Magician Support"
@@ -454,6 +453,13 @@ config PXA_SHARPSL
config SHARPSL_PM
bool
select APM_EMULATION
+ select SHARPSL_PM_MAX1111
+
+config SHARPSL_PM_MAX1111
+ bool
+ depends on !CORGI_SSP_DEPRECATED
+ select HWMON
+ select SENSORS_MAX1111
config CORGI_SSP_DEPRECATED
bool
@@ -547,7 +553,6 @@ config MACH_E740
bool "Toshiba e740"
default y
depends on ARCH_PXA_ESERIES
- select FB_W100
help
Say Y here if you intend to run this kernel on a Toshiba
e740 family PDA.
@@ -556,7 +561,6 @@ config MACH_E750
bool "Toshiba e750"
default y
depends on ARCH_PXA_ESERIES
- select FB_W100
help
Say Y here if you intend to run this kernel on a Toshiba
e750 family PDA.
@@ -573,7 +577,6 @@ config MACH_E800
bool "Toshiba e800"
default y
depends on ARCH_PXA_ESERIES
- select FB_W100
help
Say Y here if you intend to run this kernel on a Toshiba
e800 family PDA.
diff --git a/arch/arm/mach-pxa/imote2.c b/arch/arm/mach-pxa/imote2.c
index b2f878bd460..5161dca8ccc 100644
--- a/arch/arm/mach-pxa/imote2.c
+++ b/arch/arm/mach-pxa/imote2.c
@@ -559,10 +559,6 @@ static void __init imote2_init(void)
pxa_set_btuart_info(NULL);
pxa_set_stuart_info(NULL);
- /* SPI chip select directions - all other directions should
- * be handled by drivers.*/
- gpio_direction_output(37, 0);
-
platform_add_devices(imote2_devices, ARRAY_SIZE(imote2_devices));
pxa2xx_set_spi_info(1, &pxa_ssp_master_0_info);
diff --git a/arch/arm/mach-pxa/include/mach/uncompress.h b/arch/arm/mach-pxa/include/mach/uncompress.h
index 5ef91d9d17e..759b851ec98 100644
--- a/arch/arm/mach-pxa/include/mach/uncompress.h
+++ b/arch/arm/mach-pxa/include/mach/uncompress.h
@@ -16,9 +16,9 @@
#define BTUART_BASE (0x40200000)
#define STUART_BASE (0x40700000)
-static unsigned long uart_base = FFUART_BASE;
-static unsigned int uart_shift = 2;
-static unsigned int uart_is_pxa = 1;
+static unsigned long uart_base;
+static unsigned int uart_shift;
+static unsigned int uart_is_pxa;
static inline unsigned char uart_read(int offset)
{
@@ -56,6 +56,11 @@ static inline void flush(void)
static inline void arch_decomp_setup(void)
{
+ /* initialize to default */
+ uart_base = FFUART_BASE;
+ uart_shift = 2;
+ uart_is_pxa = 1;
+
if (machine_is_littleton() || machine_is_intelmote2()
|| machine_is_csb726() || machine_is_stargate2()
|| machine_is_cm_x300() || machine_is_balloon3())
diff --git a/arch/arm/mach-pxa/raumfeld.c b/arch/arm/mach-pxa/raumfeld.c
index 3184bdc1452..44bb675e47f 100644
--- a/arch/arm/mach-pxa/raumfeld.c
+++ b/arch/arm/mach-pxa/raumfeld.c
@@ -37,8 +37,6 @@
#include <linux/lis3lv02d.h>
#include <linux/pda_power.h>
#include <linux/power_supply.h>
-#include <linux/pda_power.h>
-#include <linux/power_supply.h>
#include <linux/regulator/max8660.h>
#include <linux/regulator/machine.h>
#include <linux/regulator/fixed.h>
@@ -444,7 +442,7 @@ static struct gpio_keys_button gpio_keys_button[] = {
.active_low = 0,
.wakeup = 0,
.debounce_interval = 5, /* ms */
- .desc = "on/off button",
+ .desc = "on_off button",
},
};
diff --git a/arch/arm/mach-pxa/stargate2.c b/arch/arm/mach-pxa/stargate2.c
index a98a434f011..2041eb1d90b 100644
--- a/arch/arm/mach-pxa/stargate2.c
+++ b/arch/arm/mach-pxa/stargate2.c
@@ -764,11 +764,6 @@ static void __init stargate2_init(void)
pxa_set_btuart_info(NULL);
pxa_set_stuart_info(NULL);
- /* spi chip selects */
- gpio_direction_output(37, 0);
- gpio_direction_output(24, 0);
- gpio_direction_output(39, 0);
-
platform_add_devices(ARRAY_AND_SIZE(stargate2_devices));
pxa2xx_set_spi_info(1, &pxa_ssp_master_0_info);
diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types
index 31c2f4c30a9..1536f1784ca 100644
--- a/arch/arm/tools/mach-types
+++ b/arch/arm/tools/mach-types
@@ -12,7 +12,7 @@
#
# http://www.arm.linux.org.uk/developer/machines/?action=new
#
-# Last update: Sat Feb 20 14:16:15 2010
+# Last update: Sat Mar 20 15:35:41 2010
#
# machine_is_xxx CONFIG_xxxx MACH_TYPE_xxx number
#
@@ -2663,7 +2663,7 @@ reb01 MACH_REB01 REB01 2675
aquila MACH_AQUILA AQUILA 2676
spark_sls_hw2 MACH_SPARK_SLS_HW2 SPARK_SLS_HW2 2677
sheeva_esata MACH_ESATA_SHEEVAPLUG ESATA_SHEEVAPLUG 2678
-surf7x30 MACH_SURF7X30 SURF7X30 2679
+msm7x30_surf MACH_MSM7X30_SURF MSM7X30_SURF 2679
micro2440 MACH_MICRO2440 MICRO2440 2680
am2440 MACH_AM2440 AM2440 2681
tq2440 MACH_TQ2440 TQ2440 2682
@@ -2678,3 +2678,74 @@ vc088x MACH_VC088X VC088X 2690
mioa702 MACH_MIOA702 MIOA702 2691
hpmin MACH_HPMIN HPMIN 2692
ak880xak MACH_AK880XAK AK880XAK 2693
+arm926tomap850 MACH_ARM926TOMAP850 ARM926TOMAP850 2694
+lkevm MACH_LKEVM LKEVM 2695
+mw6410 MACH_MW6410 MW6410 2696
+terastation_wxl MACH_TERASTATION_WXL TERASTATION_WXL 2697
+cpu8000e MACH_CPU8000E CPU8000E 2698
+catania MACH_CATANIA CATANIA 2699
+tokyo MACH_TOKYO TOKYO 2700
+msm7201a_surf MACH_MSM7201A_SURF MSM7201A_SURF 2701
+msm7201a_ffa MACH_MSM7201A_FFA MSM7201A_FFA 2702
+msm7x25_surf MACH_MSM7X25_SURF MSM7X25_SURF 2703
+msm7x25_ffa MACH_MSM7X25_FFA MSM7X25_FFA 2704
+msm7x27_surf MACH_MSM7X27_SURF MSM7X27_SURF 2705
+msm7x27_ffa MACH_MSM7X27_FFA MSM7X27_FFA 2706
+msm7x30_ffa MACH_MSM7X30_FFA MSM7X30_FFA 2707
+qsd8x50_surf MACH_QSD8X50_SURF QSD8X50_SURF 2708
+qsd8x50_comet MACH_QSD8X50_COMET QSD8X50_COMET 2709
+qsd8x50_ffa MACH_QSD8X50_FFA QSD8X50_FFA 2710
+qsd8x50a_surf MACH_QSD8X50A_SURF QSD8X50A_SURF 2711
+qsd8x50a_ffa MACH_QSD8X50A_FFA QSD8X50A_FFA 2712
+adx_xgcp10 MACH_ADX_XGCP10 ADX_XGCP10 2713
+mcgwumts2a MACH_MCGWUMTS2A MCGWUMTS2A 2714
+mobikt MACH_MOBIKT MOBIKT 2715
+mx53_evk MACH_MX53_EVK MX53_EVK 2716
+igep0030 MACH_IGEP0030 IGEP0030 2717
+axell_h40_h50_ctrl MACH_AXELL_H40_H50_CTRL AXELL_H40_H50_CTRL 2718
+dtcommod MACH_DTCOMMOD DTCOMMOD 2719
+gould MACH_GOULD GOULD 2720
+siberia MACH_SIBERIA SIBERIA 2721
+sbc3530 MACH_SBC3530 SBC3530 2722
+qarm MACH_QARM QARM 2723
+mips MACH_MIPS MIPS 2724
+mx27grb MACH_MX27GRB MX27GRB 2725
+sbc8100 MACH_SBC8100 SBC8100 2726
+saarb MACH_SAARB SAARB 2727
+omap3mini MACH_OMAP3MINI OMAP3MINI 2728
+cnmbook7se MACH_CNMBOOK7SE CNMBOOK7SE 2729
+catan MACH_CATAN CATAN 2730
+harmony MACH_HARMONY HARMONY 2731
+tonga MACH_TONGA TONGA 2732
+cybook_orizon MACH_CYBOOK_ORIZON CYBOOK_ORIZON 2733
+htcrhodiumcdma MACH_HTCRHODIUMCDMA HTCRHODIUMCDMA 2734
+epc_g45 MACH_EPC_G45 EPC_G45 2735
+epc_lpc3250 MACH_EPC_LPC3250 EPC_LPC3250 2736
+mxc91341evb MACH_MXC91341EVB MXC91341EVB 2737
+rtw1000 MACH_RTW1000 RTW1000 2738
+bobcat MACH_BOBCAT BOBCAT 2739
+trizeps6 MACH_TRIZEPS6 TRIZEPS6 2740
+msm7x30_fluid MACH_MSM7X30_FLUID MSM7X30_FLUID 2741
+nedap9263 MACH_NEDAP9263 NEDAP9263 2742
+netgear_ms2110 MACH_NETGEAR_MS2110 NETGEAR_MS2110 2743
+bmx MACH_BMX BMX 2744
+netstream MACH_NETSTREAM NETSTREAM 2745
+vpnext_rcu MACH_VPNEXT_RCU VPNEXT_RCU 2746
+vpnext_mpu MACH_VPNEXT_MPU VPNEXT_MPU 2747
+bcmring_tablet_v1 MACH_BCMRING_TABLET_V1 BCMRING_TABLET_V1 2748
+sgarm10 MACH_SGARM10 SGARM10 2749
+cm_t3517 MACH_CM_T3517 CM_T3517 2750
+omap3_cps MACH_OMAP3_CPS OMAP3_CPS 2751
+axar1500_receiver MACH_AXAR1500_RECEIVER AXAR1500_RECEIVER 2752
+wbd222 MACH_WBD222 WBD222 2753
+mt65xx MACH_MT65XX MT65XX 2754
+msm8x60_surf MACH_MSM8X60_SURF MSM8X60_SURF 2755
+msm8x60_sim MACH_MSM8X60_SIM MSM8X60_SIM 2756
+vmc300 MACH_VMC300 VMC300 2757
+tcc8000_sdk MACH_TCC8000_SDK TCC8000_SDK 2758
+nanos MACH_NANOS NANOS 2759
+stamp9g10 MACH_STAMP9G10 STAMP9G10 2760
+stamp9g45 MACH_STAMP9G45 STAMP9G45 2761
+h6053 MACH_H6053 H6053 2762
+smint01 MACH_SMINT01 SMINT01 2763
+prtlvt2 MACH_PRTLVT2 PRTLVT2 2764
diff --git a/arch/cris/arch-v32/drivers/pci/bios.c b/arch/cris/arch-v32/drivers/pci/bios.c
index d4b9c36ddc0..bc0cfdad1cb 100644
--- a/arch/cris/arch-v32/drivers/pci/bios.c
+++ b/arch/cris/arch-v32/drivers/pci/bios.c
@@ -50,7 +50,7 @@ pcibios_align_resource(void *data, const struct resource *res,
if ((res->flags & IORESOURCE_IO) && (start & 0x300))
start = (start + 0x3ff) & ~0x3ff;
- return start
+ return start;
}
int pcibios_enable_resources(struct pci_dev *dev, int mask)
diff --git a/arch/frv/mb93090-mb00/pci-frv.c b/arch/frv/mb93090-mb00/pci-frv.c
index 1ed15d7fea2..6b4fb28e9f9 100644
--- a/arch/frv/mb93090-mb00/pci-frv.c
+++ b/arch/frv/mb93090-mb00/pci-frv.c
@@ -41,7 +41,7 @@ pcibios_align_resource(void *data, const struct resource *res,
if ((res->flags & IORESOURCE_IO) && (start & 0x300))
start = (start + 0x3ff) & ~0x3ff;
- return start
+ return start;
}
@@ -94,8 +94,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
r = &dev->resource[idx];
if (!r->start)
continue;
- if (pci_claim_resource(dev, idx) < 0)
- printk(KERN_ERR "PCI: Cannot allocate resource region %d of bridge %s\n", idx, pci_name(dev));
+ pci_claim_resource(dev, idx);
}
}
pcibios_allocate_bus_resources(&bus->children);
@@ -125,7 +124,6 @@ static void __init pcibios_allocate_resources(int pass)
DBG("PCI: Resource %08lx-%08lx (f=%lx, d=%d, p=%d)\n",
r->start, r->end, r->flags, disabled, pass);
if (pci_claim_resource(dev, idx) < 0) {
- printk(KERN_ERR "PCI: Cannot allocate resource region %d of device %s\n", idx, pci_name(dev));
/* We'll assign a new address later */
r->end -= r->start;
r->start = 0;
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 203ec61c6d4..76818f92653 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -75,9 +75,6 @@ config LOCKDEP_SUPPORT
config HAVE_LATENCYTOP_SUPPORT
def_bool y
-config PCI
- def_bool n
-
config DTC
def_bool y
diff --git a/arch/microblaze/Makefile b/arch/microblaze/Makefile
index 836832dd9b2..72f6e858374 100644
--- a/arch/microblaze/Makefile
+++ b/arch/microblaze/Makefile
@@ -84,7 +84,7 @@ define archhelp
echo '* linux.bin - Create raw binary'
echo ' linux.bin.gz - Create compressed raw binary'
echo ' simpleImage.<dt> - ELF image with $(arch)/boot/dts/<dt>.dts linked in'
- echo ' - stripped elf with fdt blob
+ echo ' - stripped elf with fdt blob'
echo ' simpleImage.<dt>.unstrip - full ELF image with fdt blob'
echo ' *_defconfig - Select default config from arch/microblaze/configs'
echo ''
@@ -94,3 +94,5 @@ define archhelp
echo ' name of a dts file from the arch/microblaze/boot/dts/ directory'
echo ' (minus the .dts extension).'
endef
+
+MRPROPER_FILES += $(boot)/simpleImage.*
diff --git a/arch/microblaze/boot/Makefile b/arch/microblaze/boot/Makefile
index 902cf9846c3..57f50c2371c 100644
--- a/arch/microblaze/boot/Makefile
+++ b/arch/microblaze/boot/Makefile
@@ -23,8 +23,6 @@ $(obj)/system.dtb: $(obj)/$(DTB).dtb
endif
$(obj)/linux.bin: vmlinux FORCE
- [ -n $(CONFIG_INITRAMFS_SOURCE) ] && [ ! -e $(CONFIG_INITRAMFS_SOURCE) ] && \
- touch $(CONFIG_INITRAMFS_SOURCE) || echo "No CPIO image"
$(call if_changed,objcopy)
$(call if_changed,uimage)
@echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
@@ -62,6 +60,4 @@ quiet_cmd_dtc = DTC $@
$(obj)/%.dtb: $(dtstree)/%.dts FORCE
$(call if_changed,dtc)
-clean-kernel += linux.bin linux.bin.gz simpleImage.*
-
-clean-files += *.dtb simpleImage.*.unstrip
+clean-files += *.dtb simpleImage.*.unstrip linux.bin.ub
diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h
index 563c6b9453f..8eeb09211ec 100644
--- a/arch/microblaze/include/asm/processor.h
+++ b/arch/microblaze/include/asm/processor.h
@@ -14,7 +14,6 @@
#include <asm/ptrace.h>
#include <asm/setup.h>
#include <asm/registers.h>
-#include <asm/segment.h>
#include <asm/entry.h>
#include <asm/current.h>
diff --git a/arch/microblaze/include/asm/segment.h b/arch/microblaze/include/asm/segment.h
deleted file mode 100644
index 0e7102c3fb1..00000000000
--- a/arch/microblaze/include/asm/segment.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
- * Copyright (C) 2008-2009 PetaLogix
- * Copyright (C) 2006 Atmark Techno, Inc.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#ifndef _ASM_MICROBLAZE_SEGMENT_H
-#define _ASM_MICROBLAZE_SEGMENT_H
-
-# ifndef __ASSEMBLY__
-
-typedef struct {
- unsigned long seg;
-} mm_segment_t;
-
-/*
- * On Microblaze the fs value is actually the top of the corresponding
- * address space.
- *
- * The fs value determines whether argument validity checking should be
- * performed or not. If get_fs() == USER_DS, checking is performed, with
- * get_fs() == KERNEL_DS, checking is bypassed.
- *
- * For historical reasons, these macros are grossly misnamed.
- *
- * For non-MMU arch like Microblaze, KERNEL_DS and USER_DS is equal.
- */
-# define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
-
-# ifndef CONFIG_MMU
-# define KERNEL_DS MAKE_MM_SEG(0)
-# define USER_DS KERNEL_DS
-# else
-# define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF)
-# define USER_DS MAKE_MM_SEG(TASK_SIZE - 1)
-# endif
-
-# define get_ds() (KERNEL_DS)
-# define get_fs() (current_thread_info()->addr_limit)
-# define set_fs(val) (current_thread_info()->addr_limit = (val))
-
-# define segment_eq(a, b) ((a).seg == (b).seg)
-
-# endif /* __ASSEMBLY__ */
-#endif /* _ASM_MICROBLAZE_SEGMENT_H */
diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h
index 6e92885d381..b2ca80f6464 100644
--- a/arch/microblaze/include/asm/thread_info.h
+++ b/arch/microblaze/include/asm/thread_info.h
@@ -19,7 +19,6 @@
#ifndef __ASSEMBLY__
# include <linux/types.h>
# include <asm/processor.h>
-# include <asm/segment.h>
/*
* low level task data that entry.S needs immediate access to
@@ -60,6 +59,10 @@ struct cpu_context {
__u32 fsr;
};
+typedef struct {
+ unsigned long seg;
+} mm_segment_t;
+
struct thread_info {
struct task_struct *task; /* main task structure */
struct exec_domain *exec_domain; /* execution domain */
diff --git a/arch/microblaze/include/asm/tlbflush.h b/arch/microblaze/include/asm/tlbflush.h
index bcb8b41d55a..2e1353c2d18 100644
--- a/arch/microblaze/include/asm/tlbflush.h
+++ b/arch/microblaze/include/asm/tlbflush.h
@@ -24,6 +24,7 @@ extern void _tlbie(unsigned long address);
extern void _tlbia(void);
#define __tlbia() { preempt_disable(); _tlbia(); preempt_enable(); }
+#define __tlbie(x) { _tlbie(x); }
static inline void local_flush_tlb_all(void)
{ __tlbia(); }
@@ -31,7 +32,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
{ __tlbia(); }
static inline void local_flush_tlb_page(struct vm_area_struct *vma,
unsigned long vmaddr)
- { _tlbie(vmaddr); }
+ { __tlbie(vmaddr); }
static inline void local_flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{ __tlbia(); }
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 371bd6e56d9..446bec29b14 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -22,101 +22,73 @@
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/pgtable.h>
-#include <asm/segment.h>
#include <linux/string.h>
#define VERIFY_READ 0
#define VERIFY_WRITE 1
-#define __clear_user(addr, n) (memset((void *)(addr), 0, (n)), 0)
-
-#ifndef CONFIG_MMU
-
-extern int ___range_ok(unsigned long addr, unsigned long size);
-
-#define __range_ok(addr, size) \
- ___range_ok((unsigned long)(addr), (unsigned long)(size))
-
-#define access_ok(type, addr, size) (__range_ok((addr), (size)) == 0)
-#define __access_ok(add, size) (__range_ok((addr), (size)) == 0)
-
-/* Undefined function to trigger linker error */
-extern int bad_user_access_length(void);
-
-/* FIXME this is function for optimalization -> memcpy */
-#define __get_user(var, ptr) \
-({ \
- int __gu_err = 0; \
- switch (sizeof(*(ptr))) { \
- case 1: \
- case 2: \
- case 4: \
- (var) = *(ptr); \
- break; \
- case 8: \
- memcpy((void *) &(var), (ptr), 8); \
- break; \
- default: \
- (var) = 0; \
- __gu_err = __get_user_bad(); \
- break; \
- } \
- __gu_err; \
-})
+/*
+ * On Microblaze the fs value is actually the top of the corresponding
+ * address space.
+ *
+ * The fs value determines whether argument validity checking should be
+ * performed or not. If get_fs() == USER_DS, checking is performed, with
+ * get_fs() == KERNEL_DS, checking is bypassed.
+ *
+ * For historical reasons, these macros are grossly misnamed.
+ *
+ * For a non-MMU arch like Microblaze, KERNEL_DS and USER_DS are equal.
+ */
+# define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
-#define __get_user_bad() (bad_user_access_length(), (-EFAULT))
+# ifndef CONFIG_MMU
+# define KERNEL_DS MAKE_MM_SEG(0)
+# define USER_DS KERNEL_DS
+# else
+# define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF)
+# define USER_DS MAKE_MM_SEG(TASK_SIZE - 1)
+# endif
-/* FIXME is not there defined __pu_val */
-#define __put_user(var, ptr) \
-({ \
- int __pu_err = 0; \
- switch (sizeof(*(ptr))) { \
- case 1: \
- case 2: \
- case 4: \
- *(ptr) = (var); \
- break; \
- case 8: { \
- typeof(*(ptr)) __pu_val = (var); \
- memcpy(ptr, &__pu_val, sizeof(__pu_val)); \
- } \
- break; \
- default: \
- __pu_err = __put_user_bad(); \
- break; \
- } \
- __pu_err; \
-})
+# define get_ds() (KERNEL_DS)
+# define get_fs() (current_thread_info()->addr_limit)
+# define set_fs(val) (current_thread_info()->addr_limit = (val))
-#define __put_user_bad() (bad_user_access_length(), (-EFAULT))
+# define segment_eq(a, b) ((a).seg == (b).seg)
-#define put_user(x, ptr) __put_user((x), (ptr))
-#define get_user(x, ptr) __get_user((x), (ptr))
+/*
+ * The exception table consists of pairs of addresses: the first is the
+ * address of an instruction that is allowed to fault, and the second is
+ * the address at which the program should continue. No registers are
+ * modified, so it is entirely up to the continuation code to figure out
+ * what to do.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path. This means when everything is well,
+ * we don't even have to jump over them. Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+struct exception_table_entry {
+ unsigned long insn, fixup;
+};
-#define copy_to_user(to, from, n) (memcpy((to), (from), (n)), 0)
-#define copy_from_user(to, from, n) (memcpy((to), (from), (n)), 0)
+/* Returns 0 if exception not found and fixup otherwise. */
+extern unsigned long search_exception_table(unsigned long);
-#define __copy_to_user(to, from, n) (copy_to_user((to), (from), (n)))
-#define __copy_from_user(to, from, n) (copy_from_user((to), (from), (n)))
-#define __copy_to_user_inatomic(to, from, n) \
- (__copy_to_user((to), (from), (n)))
-#define __copy_from_user_inatomic(to, from, n) \
- (__copy_from_user((to), (from), (n)))
+#ifndef CONFIG_MMU
-static inline unsigned long clear_user(void *addr, unsigned long size)
+/* Check against bounds of physical memory */
+static inline int ___range_ok(unsigned long addr, unsigned long size)
{
- if (access_ok(VERIFY_WRITE, addr, size))
- size = __clear_user(addr, size);
- return size;
+ return ((addr < memory_start) ||
+ ((addr + size) > memory_end));
}
-/* Returns 0 if exception not found and fixup otherwise. */
-extern unsigned long search_exception_table(unsigned long);
+#define __range_ok(addr, size) \
+ ___range_ok((unsigned long)(addr), (unsigned long)(size))
-extern long strncpy_from_user(char *dst, const char *src, long count);
-extern long strnlen_user(const char *src, long count);
+#define access_ok(type, addr, size) (__range_ok((addr), (size)) == 0)
-#else /* CONFIG_MMU */
+#else
/*
* Address is valid if:
@@ -129,24 +101,88 @@ extern long strnlen_user(const char *src, long count);
/* || printk("access_ok failed for %s at 0x%08lx (size %d), seg 0x%08x\n",
type?"WRITE":"READ",addr,size,get_fs().seg)) */
-/*
- * All the __XXX versions macros/functions below do not perform
- * access checking. It is assumed that the necessary checks have been
- * already performed before the finction (macro) is called.
- */
+#endif
-#define get_user(x, ptr) \
-({ \
- access_ok(VERIFY_READ, (ptr), sizeof(*(ptr))) \
- ? __get_user((x), (ptr)) : -EFAULT; \
-})
+#ifdef CONFIG_MMU
+# define __FIXUP_SECTION ".section .fixup,\"ax\"\n"
+# define __EX_TABLE_SECTION ".section __ex_table,\"a\"\n"
+#else
+# define __FIXUP_SECTION ".section .discard,\"ax\"\n"
+# define __EX_TABLE_SECTION ".section .discard,\"a\"\n"
+#endif
-#define put_user(x, ptr) \
-({ \
- access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) \
- ? __put_user((x), (ptr)) : -EFAULT; \
+extern unsigned long __copy_tofrom_user(void __user *to,
+ const void __user *from, unsigned long size);
+
+/* Return: number of bytes not copied, i.e. 0 on success or non-zero on failure. */
+static inline unsigned long __must_check __clear_user(void __user *to,
+ unsigned long n)
+{
+ /* a normal memset with a two-word __ex_table fixup entry */
+ __asm__ __volatile__ ( \
+ "1: sb r0, %2, r0;" \
+ " addik %0, %0, -1;" \
+ " bneid %0, 1b;" \
+ " addik %2, %2, 1;" \
+ "2: " \
+ __EX_TABLE_SECTION \
+ ".word 1b,2b;" \
+ ".previous;" \
+ : "=r"(n) \
+ : "0"(n), "r"(to)
+ );
+ return n;
+}
+
+static inline unsigned long __must_check clear_user(void __user *to,
+ unsigned long n)
+{
+ might_sleep();
+ if (unlikely(!access_ok(VERIFY_WRITE, to, n)))
+ return n;
+
+ return __clear_user(to, n);
+}
+
+/* put_user and get_user macros */
+extern long __user_bad(void);
+
+#define __get_user_asm(insn, __gu_ptr, __gu_val, __gu_err) \
+({ \
+ __asm__ __volatile__ ( \
+ "1:" insn " %1, %2, r0;" \
+ " addk %0, r0, r0;" \
+ "2: " \
+ __FIXUP_SECTION \
+ "3: brid 2b;" \
+ " addik %0, r0, %3;" \
+ ".previous;" \
+ __EX_TABLE_SECTION \
+ ".word 1b,3b;" \
+ ".previous;" \
+ : "=&r"(__gu_err), "=r"(__gu_val) \
+ : "r"(__gu_ptr), "i"(-EFAULT) \
+ ); \
})
+/**
+ * get_user: - Get a simple variable from user space.
+ * @x: Variable to store result.
+ * @ptr: Source address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple variable from user space to kernel
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and the result of
+ * dereferencing @ptr must be assignable to @x without a cast.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ * On error, the variable @x is set to zero.
+ */
+
#define __get_user(x, ptr) \
({ \
unsigned long __gu_val; \
@@ -163,30 +199,74 @@ extern long strnlen_user(const char *src, long count);
__get_user_asm("lw", (ptr), __gu_val, __gu_err); \
break; \
default: \
- __gu_val = 0; __gu_err = -EINVAL; \
+ /* __gu_val = 0; __gu_err = -EINVAL;*/ __gu_err = __user_bad();\
} \
x = (__typeof__(*(ptr))) __gu_val; \
__gu_err; \
})
-#define __get_user_asm(insn, __gu_ptr, __gu_val, __gu_err) \
+
+#define get_user(x, ptr) \
({ \
- __asm__ __volatile__ ( \
- "1:" insn " %1, %2, r0; \
- addk %0, r0, r0; \
- 2: \
- .section .fixup,\"ax\"; \
- 3: brid 2b; \
- addik %0, r0, %3; \
- .previous; \
- .section __ex_table,\"a\"; \
- .word 1b,3b; \
- .previous;" \
- : "=r"(__gu_err), "=r"(__gu_val) \
- : "r"(__gu_ptr), "i"(-EFAULT) \
- ); \
+ access_ok(VERIFY_READ, (ptr), sizeof(*(ptr))) \
+ ? __get_user((x), (ptr)) : -EFAULT; \
+})
+
+#define __put_user_asm(insn, __gu_ptr, __gu_val, __gu_err) \
+({ \
+ __asm__ __volatile__ ( \
+ "1:" insn " %1, %2, r0;" \
+ " addk %0, r0, r0;" \
+ "2: " \
+ __FIXUP_SECTION \
+ "3: brid 2b;" \
+ " addik %0, r0, %3;" \
+ ".previous;" \
+ __EX_TABLE_SECTION \
+ ".word 1b,3b;" \
+ ".previous;" \
+ : "=&r"(__gu_err) \
+ : "r"(__gu_val), "r"(__gu_ptr), "i"(-EFAULT) \
+ ); \
})
+#define __put_user_asm_8(__gu_ptr, __gu_val, __gu_err) \
+({ \
+ __asm__ __volatile__ (" lwi %0, %1, 0;" \
+ "1: swi %0, %2, 0;" \
+ " lwi %0, %1, 4;" \
+ "2: swi %0, %2, 4;" \
+ " addk %0, r0, r0;" \
+ "3: " \
+ __FIXUP_SECTION \
+ "4: brid 3b;" \
+ " addik %0, r0, %3;" \
+ ".previous;" \
+ __EX_TABLE_SECTION \
+ ".word 1b,4b,2b,4b;" \
+ ".previous;" \
+ : "=&r"(__gu_err) \
+ : "r"(&__gu_val), "r"(__gu_ptr), "i"(-EFAULT) \
+ ); \
+})
+
+/**
+ * put_user: - Write a simple value into user space.
+ * @x: Value to copy to user space.
+ * @ptr: Destination address, in user space.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * This macro copies a single simple value from kernel space to user
+ * space. It supports simple types like char and int, but not larger
+ * data types like structures or arrays.
+ *
+ * @ptr must have pointer-to-simple-variable type, and @x must be assignable
+ * to the result of dereferencing @ptr.
+ *
+ * Returns zero on success, or -EFAULT on error.
+ */
+
#define __put_user(x, ptr) \
({ \
__typeof__(*(ptr)) volatile __gu_val = (x); \
@@ -195,7 +275,7 @@ extern long strnlen_user(const char *src, long count);
case 1: \
__put_user_asm("sb", (ptr), __gu_val, __gu_err); \
break; \
- case 2: \
+ case 2: \
__put_user_asm("sh", (ptr), __gu_val, __gu_err); \
break; \
case 4: \
@@ -205,121 +285,82 @@ extern long strnlen_user(const char *src, long count);
__put_user_asm_8((ptr), __gu_val, __gu_err); \
break; \
default: \
- __gu_err = -EINVAL; \
+ /*__gu_err = -EINVAL;*/ __gu_err = __user_bad(); \
} \
__gu_err; \
})
-#define __put_user_asm_8(__gu_ptr, __gu_val, __gu_err) \
-({ \
-__asm__ __volatile__ (" lwi %0, %1, 0; \
- 1: swi %0, %2, 0; \
- lwi %0, %1, 4; \
- 2: swi %0, %2, 4; \
- addk %0,r0,r0; \
- 3: \
- .section .fixup,\"ax\"; \
- 4: brid 3b; \
- addik %0, r0, %3; \
- .previous; \
- .section __ex_table,\"a\"; \
- .word 1b,4b,2b,4b; \
- .previous;" \
- : "=&r"(__gu_err) \
- : "r"(&__gu_val), \
- "r"(__gu_ptr), "i"(-EFAULT) \
- ); \
-})
+#ifndef CONFIG_MMU
-#define __put_user_asm(insn, __gu_ptr, __gu_val, __gu_err) \
-({ \
- __asm__ __volatile__ ( \
- "1:" insn " %1, %2, r0; \
- addk %0, r0, r0; \
- 2: \
- .section .fixup,\"ax\"; \
- 3: brid 2b; \
- addik %0, r0, %3; \
- .previous; \
- .section __ex_table,\"a\"; \
- .word 1b,3b; \
- .previous;" \
- : "=r"(__gu_err) \
- : "r"(__gu_val), "r"(__gu_ptr), "i"(-EFAULT) \
- ); \
-})
+#define put_user(x, ptr) __put_user((x), (ptr))
-/*
- * Return: number of not copied bytes, i.e. 0 if OK or non-zero if fail.
- */
-static inline int clear_user(char *to, int size)
-{
- if (size && access_ok(VERIFY_WRITE, to, size)) {
- __asm__ __volatile__ (" \
- 1: \
- sb r0, %2, r0; \
- addik %0, %0, -1; \
- bneid %0, 1b; \
- addik %2, %2, 1; \
- 2: \
- .section __ex_table,\"a\"; \
- .word 1b,2b; \
- .section .text;" \
- : "=r"(size) \
- : "0"(size), "r"(to)
- );
- }
- return size;
-}
+#else /* CONFIG_MMU */
-#define __copy_from_user(to, from, n) copy_from_user((to), (from), (n))
+#define put_user(x, ptr) \
+({ \
+ access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr))) \
+ ? __put_user((x), (ptr)) : -EFAULT; \
+})
+#endif /* CONFIG_MMU */
+
+/* copy_to_from_user */
+#define __copy_from_user(to, from, n) \
+ __copy_tofrom_user((__force void __user *)(to), \
+ (void __user *)(from), (n))
#define __copy_from_user_inatomic(to, from, n) \
copy_from_user((to), (from), (n))
-#define copy_to_user(to, from, n) \
- (access_ok(VERIFY_WRITE, (to), (n)) ? \
- __copy_tofrom_user((void __user *)(to), \
- (__force const void __user *)(from), (n)) \
- : -EFAULT)
+static inline long copy_from_user(void *to,
+ const void __user *from, unsigned long n)
+{
+ might_sleep();
+ if (access_ok(VERIFY_READ, from, n))
+ return __copy_from_user(to, from, n);
+ return n;
+}
-#define __copy_to_user(to, from, n) copy_to_user((to), (from), (n))
+#define __copy_to_user(to, from, n) \
+ __copy_tofrom_user((void __user *)(to), \
+ (__force const void __user *)(from), (n))
#define __copy_to_user_inatomic(to, from, n) copy_to_user((to), (from), (n))
-#define copy_from_user(to, from, n) \
- (access_ok(VERIFY_READ, (from), (n)) ? \
- __copy_tofrom_user((__force void __user *)(to), \
- (void __user *)(from), (n)) \
- : -EFAULT)
+static inline long copy_to_user(void __user *to,
+ const void *from, unsigned long n)
+{
+ might_sleep();
+ if (access_ok(VERIFY_WRITE, to, n))
+ return __copy_to_user(to, from, n);
+ return n;
+}
+/*
+ * Copy a null terminated string from userspace.
+ */
extern int __strncpy_user(char *to, const char __user *from, int len);
-extern int __strnlen_user(const char __user *sstr, int len);
-#define strncpy_from_user(to, from, len) \
- (access_ok(VERIFY_READ, from, 1) ? \
- __strncpy_user(to, from, len) : -EFAULT)
-#define strnlen_user(str, len) \
- (access_ok(VERIFY_READ, str, 1) ? __strnlen_user(str, len) : 0)
+#define __strncpy_from_user __strncpy_user
-#endif /* CONFIG_MMU */
-
-extern unsigned long __copy_tofrom_user(void __user *to,
- const void __user *from, unsigned long size);
+static inline long
+strncpy_from_user(char *dst, const char __user *src, long count)
+{
+ if (!access_ok(VERIFY_READ, src, 1))
+ return -EFAULT;
+ return __strncpy_from_user(dst, src, count);
+}
/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue. No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
+ * Return the size of a string (including the ending 0)
*
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path. This means when everything is well,
- * we don't even have to jump over them. Further, they do not intrude
- * on our cache or tlb entries.
+ * Return 0 on exception, a value greater than N if too long
*/
-struct exception_table_entry {
- unsigned long insn, fixup;
-};
+extern int __strnlen_user(const char __user *sstr, int len);
+
+static inline long strnlen_user(const char __user *src, long n)
+{
+ if (!access_ok(VERIFY_READ, src, 1))
+ return 0;
+ return __strnlen_user(src, n);
+}
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
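The rewritten header routes every user copy through __copy_tofrom_user() plus an access_ok() check, and keeps the usual kernel contract: the copy helpers return the number of bytes left uncopied, 0 on success. A sketch of how a caller consumes that contract (fake_copy_from_user() and handler() are hypothetical stand-ins, not from this patch):

#include <string.h>

static unsigned long fake_copy_from_user(void *to, const void *from,
                                         unsigned long n)
{
        memcpy(to, from, n);    /* stand-in: the kernel version may copy less */
        return 0;               /* bytes NOT copied */
}

struct args { int x, y; };

static int handler(const void *user_ptr)
{
        struct args a;

        if (fake_copy_from_user(&a, user_ptr, sizeof(a)))
                return -14;     /* -EFAULT: some bytes were not copyable */
        return a.x + a.y;
}

Returning a byte count rather than an error code lets callers distinguish a partial copy from a total failure.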
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index b1084974fcc..4d5b0311601 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -37,7 +37,7 @@ static inline void __dma_sync_page(unsigned long paddr, unsigned long offset,
static unsigned long get_dma_direct_offset(struct device *dev)
{
- if (dev)
+ if (likely(dev))
return (unsigned long)dev->archdata.dma_data;
return PCI_DRAM_OFFSET; /* FIXME Not sure if this is correct */
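likely() and unlikely() are the kernel's branch-prediction annotations around __builtin_expect(); this series applies them here and throughout the microblaze fault path below. A sketch with fallback definitions so it builds outside the tree (dma_offset() is a stand-in, not the kernel function):

#include <stdio.h>

#ifdef __GNUC__
#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define likely(x)   (x)
#define unlikely(x) (x)
#endif

static unsigned long dma_offset(const void *dev, unsigned long fallback)
{
        if (likely(dev))                /* nearly every caller passes a device */
                return 0x80000000UL;    /* placeholder for dev->archdata data */
        return fallback;                /* cold path */
}

int main(void)
{
        int dummy_dev;
        printf("0x%lx\n", dma_offset(&dummy_dev, 0));
        return 0;
}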
diff --git a/arch/microblaze/kernel/head.S b/arch/microblaze/kernel/head.S
index cb7815cfe5a..da6a5f5dc76 100644
--- a/arch/microblaze/kernel/head.S
+++ b/arch/microblaze/kernel/head.S
@@ -51,6 +51,12 @@ swapper_pg_dir:
.text
ENTRY(_start)
+#if CONFIG_KERNEL_BASE_ADDR == 0
+ brai TOPHYS(real_start)
+ .org 0x100
+real_start:
+#endif
+
mfs r1, rmsr
andi r1, r1, ~2
mts rmsr, r1
@@ -99,8 +105,8 @@ no_fdt_arg:
tophys(r4,r4) /* convert to phys address */
ori r3, r0, COMMAND_LINE_SIZE - 1 /* number of loops */
_copy_command_line:
- lbu r2, r5, r6 /* r7=r5+r6 - r5 contain pointer to command line */
- sb r2, r4, r6 /* addr[r4+r6]= r7*/
+ lbu r2, r5, r6 /* r2=r5+r6 - r5 contains pointer to command line */
+ sb r2, r4, r6 /* addr[r4+r6] = r2 */
addik r6, r6, 1 /* increment counting */
bgtid r3, _copy_command_line /* loop for all entries */
addik r3, r3, -1 /* decrement loop */
@@ -128,7 +134,7 @@ _copy_bram:
* virtual to physical.
*/
nop
- addik r3, r0, 63 /* Invalidate all TLB entries */
+ addik r3, r0, MICROBLAZE_TLB_SIZE -1 /* Invalidate all TLB entries */
_invalidate:
mts rtlbx, r3
mts rtlbhi, r0 /* flush: ensure V is clear */
diff --git a/arch/microblaze/kernel/hw_exception_handler.S b/arch/microblaze/kernel/hw_exception_handler.S
index 2b86c03aa84..995a2123635 100644
--- a/arch/microblaze/kernel/hw_exception_handler.S
+++ b/arch/microblaze/kernel/hw_exception_handler.S
@@ -313,13 +313,13 @@ _hw_exception_handler:
mfs r5, rmsr;
nop
swi r5, r1, 0;
- mfs r3, resr
+ mfs r4, resr
nop
- mfs r4, rear;
+ mfs r3, rear;
nop
#ifndef CONFIG_MMU
- andi r5, r3, 0x1000; /* Check ESR[DS] */
+ andi r5, r4, 0x1000; /* Check ESR[DS] */
beqi r5, not_in_delay_slot; /* Branch if ESR[DS] not set */
mfs r17, rbtr; /* ESR[DS] set - return address in BTR */
nop
@@ -327,13 +327,14 @@ not_in_delay_slot:
swi r17, r1, PT_R17
#endif
- andi r5, r3, 0x1F; /* Extract ESR[EXC] */
+ andi r5, r4, 0x1F; /* Extract ESR[EXC] */
#ifdef CONFIG_MMU
/* Calculate exception vector offset = r5 << 2 */
addk r6, r5, r5; /* << 1 */
addk r6, r6, r6; /* << 2 */
+#ifdef DEBUG
/* counting which exception happen */
lwi r5, r0, 0x200 + TOPHYS(r0_ram)
addi r5, r5, 1
@@ -341,6 +342,7 @@ not_in_delay_slot:
lwi r5, r6, 0x200 + TOPHYS(r0_ram)
addi r5, r5, 1
swi r5, r6, 0x200 + TOPHYS(r0_ram)
+#endif
/* end */
/* Load the HW Exception vector */
lwi r6, r6, TOPHYS(_MB_HW_ExceptionVectorTable)
@@ -376,7 +378,7 @@ handle_other_ex: /* Handle Other exceptions here */
swi r18, r1, PT_R18
or r5, r1, r0
- andi r6, r3, 0x1F; /* Load ESR[EC] */
+ andi r6, r4, 0x1F; /* Load ESR[EC] */
lwi r7, r0, PER_CPU(KM) /* MS: saving current kernel mode to regs */
swi r7, r1, PT_MODE
mfs r7, rfsr
@@ -426,11 +428,11 @@ handle_other_ex: /* Handle Other exceptions here */
*/
handle_unaligned_ex:
/* Working registers already saved: R3, R4, R5, R6
- * R3 = ESR
- * R4 = EAR
+ * R4 = ESR
+ * R3 = EAR
*/
#ifdef CONFIG_MMU
- andi r6, r3, 0x1000 /* Check ESR[DS] */
+ andi r6, r4, 0x1000 /* Check ESR[DS] */
beqi r6, _no_delayslot /* Branch if ESR[DS] not set */
mfs r17, rbtr; /* ESR[DS] set - return address in BTR */
nop
@@ -439,7 +441,7 @@ _no_delayslot:
RESTORE_STATE;
bri unaligned_data_trap
#endif
- andi r6, r3, 0x3E0; /* Mask and extract the register operand */
+ andi r6, r4, 0x3E0; /* Mask and extract the register operand */
srl r6, r6; /* r6 >> 5 */
srl r6, r6;
srl r6, r6;
@@ -448,33 +450,33 @@ _no_delayslot:
/* Store the register operand in a temporary location */
sbi r6, r0, TOPHYS(ex_reg_op);
- andi r6, r3, 0x400; /* Extract ESR[S] */
+ andi r6, r4, 0x400; /* Extract ESR[S] */
bnei r6, ex_sw;
ex_lw:
- andi r6, r3, 0x800; /* Extract ESR[W] */
+ andi r6, r4, 0x800; /* Extract ESR[W] */
beqi r6, ex_lhw;
- lbui r5, r4, 0; /* Exception address in r4 */
+ lbui r5, r3, 0; /* Exception address in r3 */
/* Load a word, byte-by-byte from destination address
and save it in tmp space */
sbi r5, r0, TOPHYS(ex_tmp_data_loc_0);
- lbui r5, r4, 1;
+ lbui r5, r3, 1;
sbi r5, r0, TOPHYS(ex_tmp_data_loc_1);
- lbui r5, r4, 2;
+ lbui r5, r3, 2;
sbi r5, r0, TOPHYS(ex_tmp_data_loc_2);
- lbui r5, r4, 3;
+ lbui r5, r3, 3;
sbi r5, r0, TOPHYS(ex_tmp_data_loc_3);
- /* Get the destination register value into r3 */
- lwi r3, r0, TOPHYS(ex_tmp_data_loc_0);
+ /* Get the destination register value into r4 */
+ lwi r4, r0, TOPHYS(ex_tmp_data_loc_0);
bri ex_lw_tail;
ex_lhw:
- lbui r5, r4, 0; /* Exception address in r4 */
+ lbui r5, r3, 0; /* Exception address in r3 */
/* Load a half-word, byte-by-byte from destination
address and save it in tmp space */
sbi r5, r0, TOPHYS(ex_tmp_data_loc_0);
- lbui r5, r4, 1;
+ lbui r5, r3, 1;
sbi r5, r0, TOPHYS(ex_tmp_data_loc_1);
- /* Get the destination register value into r3 */
- lhui r3, r0, TOPHYS(ex_tmp_data_loc_0);
+ /* Get the destination register value into r4 */
+ lhui r4, r0, TOPHYS(ex_tmp_data_loc_0);
ex_lw_tail:
/* Get the destination register number into r5 */
lbui r5, r0, TOPHYS(ex_reg_op);
@@ -502,25 +504,25 @@ ex_sw_tail:
andi r6, r6, 0x800; /* Extract ESR[W] */
beqi r6, ex_shw;
/* Get the word - delay slot */
- swi r3, r0, TOPHYS(ex_tmp_data_loc_0);
+ swi r4, r0, TOPHYS(ex_tmp_data_loc_0);
/* Store the word, byte-by-byte into destination address */
- lbui r3, r0, TOPHYS(ex_tmp_data_loc_0);
- sbi r3, r4, 0;
- lbui r3, r0, TOPHYS(ex_tmp_data_loc_1);
- sbi r3, r4, 1;
- lbui r3, r0, TOPHYS(ex_tmp_data_loc_2);
- sbi r3, r4, 2;
- lbui r3, r0, TOPHYS(ex_tmp_data_loc_3);
- sbi r3, r4, 3;
+ lbui r4, r0, TOPHYS(ex_tmp_data_loc_0);
+ sbi r4, r3, 0;
+ lbui r4, r0, TOPHYS(ex_tmp_data_loc_1);
+ sbi r4, r3, 1;
+ lbui r4, r0, TOPHYS(ex_tmp_data_loc_2);
+ sbi r4, r3, 2;
+ lbui r4, r0, TOPHYS(ex_tmp_data_loc_3);
+ sbi r4, r3, 3;
bri ex_handler_done;
ex_shw:
/* Store the lower half-word, byte-by-byte into destination address */
- swi r3, r0, TOPHYS(ex_tmp_data_loc_0);
- lbui r3, r0, TOPHYS(ex_tmp_data_loc_2);
- sbi r3, r4, 0;
- lbui r3, r0, TOPHYS(ex_tmp_data_loc_3);
- sbi r3, r4, 1;
+ swi r4, r0, TOPHYS(ex_tmp_data_loc_0);
+ lbui r4, r0, TOPHYS(ex_tmp_data_loc_2);
+ sbi r4, r3, 0;
+ lbui r4, r0, TOPHYS(ex_tmp_data_loc_3);
+ sbi r4, r3, 1;
ex_sw_end: /* Exception handling of store word, ends. */
ex_handler_done:
@@ -560,21 +562,16 @@ ex_handler_done:
*/
mfs r11, rpid
nop
- bri 4
- mfs r3, rear /* Get faulting address */
- nop
/* If we are faulting a kernel address, we have to use the
* kernel page tables.
*/
- ori r4, r0, CONFIG_KERNEL_START
- cmpu r4, r3, r4
- bgti r4, ex3
+ ori r5, r0, CONFIG_KERNEL_START
+ cmpu r5, r3, r5
+ bgti r5, ex3
/* First, check if it was a zone fault (which means a user
* tried to access a kernel or read-protected page - always
* a SEGV). All other faults here must be stores, so no
* need to check ESR_S as well. */
- mfs r4, resr
- nop
andi r4, r4, 0x800 /* ESR_Z - zone protection */
bnei r4, ex2
@@ -589,8 +586,6 @@ ex_handler_done:
* tried to access a kernel or read-protected page - always
* a SEGV). All other faults here must be stores, so no
* need to check ESR_S as well. */
- mfs r4, resr
- nop
andi r4, r4, 0x800 /* ESR_Z */
bnei r4, ex2
/* get current task address */
@@ -665,8 +660,6 @@ ex_handler_done:
* R3 = ESR
*/
- mfs r3, rear /* Get faulting address */
- nop
RESTORE_STATE;
bri page_fault_instr_trap
@@ -677,18 +670,15 @@ ex_handler_done:
*/
handle_data_tlb_miss_exception:
/* Working registers already saved: R3, R4, R5, R6
- * R3 = ESR
+ * R3 = EAR, R4 = ESR
*/
mfs r11, rpid
nop
- bri 4
- mfs r3, rear /* Get faulting address */
- nop
/* If we are faulting a kernel address, we have to use the
* kernel page tables. */
- ori r4, r0, CONFIG_KERNEL_START
- cmpu r4, r3, r4
+ ori r6, r0, CONFIG_KERNEL_START
+ cmpu r4, r3, r6
bgti r4, ex5
ori r4, r0, swapper_pg_dir
mts rpid, r0 /* TLB will have 0 TID */
@@ -731,9 +721,8 @@ ex_handler_done:
* Many of these bits are software only. Bits we don't set
* here we (properly should) assume have the appropriate value.
*/
+ brid finish_tlb_load
andni r4, r4, 0x0ce2 /* Make sure 20, 21 are zero */
-
- bri finish_tlb_load
ex7:
/* The bailout. Restore registers to pre-exception conditions
* and call the heavyweights to help us out.
@@ -754,9 +743,6 @@ ex_handler_done:
*/
mfs r11, rpid
nop
- bri 4
- mfs r3, rear /* Get faulting address */
- nop
/* If we are faulting a kernel address, we have to use the
* kernel page tables.
@@ -792,7 +778,7 @@ ex_handler_done:
lwi r4, r5, 0 /* Get Linux PTE */
andi r6, r4, _PAGE_PRESENT
- beqi r6, ex7
+ beqi r6, ex10
ori r4, r4, _PAGE_ACCESSED
swi r4, r5, 0
@@ -805,9 +791,8 @@ ex_handler_done:
* Many of these bits are software only. Bits we don't set
* here we (properly should) assume have the appropriate value.
*/
+ brid finish_tlb_load
andni r4, r4, 0x0ce2 /* Make sure 20, 21 are zero */
-
- bri finish_tlb_load
ex10:
/* The bailout. Restore registers to pre-exception conditions
* and call the heavyweights to help us out.
@@ -837,9 +822,9 @@ ex_handler_done:
andi r5, r5, (MICROBLAZE_TLB_SIZE-1)
ori r6, r0, 1
cmp r31, r5, r6
- blti r31, sem
+ blti r31, ex12
addik r5, r6, 1
- sem:
+ ex12:
/* MS: save back current TLB index */
swi r5, r0, TOPHYS(tlb_index)
@@ -859,7 +844,6 @@ ex_handler_done:
nop
/* Done...restore registers and get out of here. */
- ex12:
mts rpid, r11
nop
bri 4
diff --git a/arch/microblaze/kernel/misc.S b/arch/microblaze/kernel/misc.S
index df16c6287a8..7cf86498326 100644
--- a/arch/microblaze/kernel/misc.S
+++ b/arch/microblaze/kernel/misc.S
@@ -26,9 +26,10 @@
* We avoid flushing the pinned 0, 1 and possibly 2 entries.
*/
.globl _tlbia;
+.type _tlbia, @function
.align 4;
_tlbia:
- addik r12, r0, 63 /* flush all entries (63 - 3) */
+ addik r12, r0, MICROBLAZE_TLB_SIZE - 1 /* flush all entries (63 - 3) */
/* isync */
_tlbia_1:
mts rtlbx, r12
@@ -41,11 +42,13 @@ _tlbia_1:
/* sync */
rtsd r15, 8
nop
+ .size _tlbia, . - _tlbia
/*
* Flush MMU TLB for a particular address (in r5)
*/
.globl _tlbie;
+.type _tlbie, @function
.align 4;
_tlbie:
mts rtlbsx, r5 /* look up the address in TLB */
@@ -59,17 +62,20 @@ _tlbie_1:
rtsd r15, 8
nop
+ .size _tlbie, . - _tlbie
+
/*
* Allocate TLB entry for early console
*/
.globl early_console_reg_tlb_alloc;
+.type early_console_reg_tlb_alloc, @function
.align 4;
early_console_reg_tlb_alloc:
/*
* Load a TLB entry for the UART, so that microblaze_progress() can use
* the UARTs nice and early. We use a 4k real==virtual mapping.
*/
- ori r4, r0, 63
+ ori r4, r0, MICROBLAZE_TLB_SIZE - 1
mts rtlbx, r4 /* TLB slot 2 */
or r4,r5,r0
@@ -86,6 +92,8 @@ early_console_reg_tlb_alloc:
rtsd r15, 8
nop
+ .size early_console_reg_tlb_alloc, . - early_console_reg_tlb_alloc
+
/*
* Copy a whole page (4096 bytes).
*/
@@ -104,6 +112,7 @@ early_console_reg_tlb_alloc:
#define DCACHE_LINE_BYTES (4 * 4)
.globl copy_page;
+.type copy_page, @function
.align 4;
copy_page:
ori r11, r0, (PAGE_SIZE/DCACHE_LINE_BYTES) - 1
@@ -118,3 +127,5 @@ _copy_page_loop:
addik r11, r11, -1
rtsd r15, 8
nop
+
+ .size copy_page, . - copy_page
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index 812f1bf06c9..09bed44dfcd 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -15,6 +15,7 @@
#include <linux/bitops.h>
#include <asm/system.h>
#include <asm/pgalloc.h>
+#include <asm/uaccess.h> /* for USER_DS macros */
#include <asm/cacheflush.h>
void show_regs(struct pt_regs *regs)
@@ -74,7 +75,10 @@ __setup("hlt", hlt_setup);
void default_idle(void)
{
- if (!hlt_counter) {
+ if (likely(hlt_counter)) {
+ while (!need_resched())
+ cpu_relax();
+ } else {
clear_thread_flag(TIF_POLLING_NRFLAG);
smp_mb__after_clear_bit();
local_irq_disable();
@@ -82,9 +86,7 @@ void default_idle(void)
cpu_sleep();
local_irq_enable();
set_thread_flag(TIF_POLLING_NRFLAG);
- } else
- while (!need_resched())
- cpu_relax();
+ }
}
void cpu_idle(void)
diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c
index f974ec7aa35..17c98dbcec8 100644
--- a/arch/microblaze/kernel/setup.c
+++ b/arch/microblaze/kernel/setup.c
@@ -92,6 +92,12 @@ inline unsigned get_romfs_len(unsigned *addr)
}
#endif /* CONFIG_MTD_UCLINUX_EBSS */
+#if defined(CONFIG_EARLY_PRINTK) && defined(CONFIG_SERIAL_UARTLITE_CONSOLE)
+#define eprintk early_printk
+#else
+#define eprintk printk
+#endif
+
void __init machine_early_init(const char *cmdline, unsigned int ram,
unsigned int fdt, unsigned int msr)
{
@@ -139,32 +145,32 @@ void __init machine_early_init(const char *cmdline, unsigned int ram,
setup_early_printk(NULL);
#endif
- early_printk("Ramdisk addr 0x%08x, ", ram);
+ eprintk("Ramdisk addr 0x%08x, ", ram);
if (fdt)
- early_printk("FDT at 0x%08x\n", fdt);
+ eprintk("FDT at 0x%08x\n", fdt);
else
- early_printk("Compiled-in FDT at 0x%08x\n",
+ eprintk("Compiled-in FDT at 0x%08x\n",
(unsigned int)_fdt_start);
#ifdef CONFIG_MTD_UCLINUX
- early_printk("Found romfs @ 0x%08x (0x%08x)\n",
+ eprintk("Found romfs @ 0x%08x (0x%08x)\n",
romfs_base, romfs_size);
- early_printk("#### klimit %p ####\n", old_klimit);
+ eprintk("#### klimit %p ####\n", old_klimit);
BUG_ON(romfs_size < 0); /* What else can we do? */
- early_printk("Moved 0x%08x bytes from 0x%08x to 0x%08x\n",
+ eprintk("Moved 0x%08x bytes from 0x%08x to 0x%08x\n",
romfs_size, romfs_base, (unsigned)&_ebss);
- early_printk("New klimit: 0x%08x\n", (unsigned)klimit);
+ eprintk("New klimit: 0x%08x\n", (unsigned)klimit);
#endif
#if CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR
if (msr)
- early_printk("!!!Your kernel has setup MSR instruction but "
+ eprintk("!!!Your kernel has setup MSR instruction but "
"CPU don't have it %d\n", msr);
#else
if (!msr)
- early_printk("!!!Your kernel not setup MSR instruction but "
+ eprintk("!!!Your kernel not setup MSR instruction but "
"CPU have it %d\n", msr);
#endif
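eprintk resolves at preprocessing time: messages go to early_printk() only when an early console can exist, and through the regular printk() otherwise. The same aliasing pattern in a standalone sketch (HAVE_EARLY_CONSOLE and early_print() are hypothetical):

#include <stdio.h>

#ifdef HAVE_EARLY_CONSOLE
#define eprint(...) early_print(__VA_ARGS__)    /* hypothetical early sink */
#else
#define eprint(...) printf(__VA_ARGS__)
#endif

int main(void)
{
        eprint("Ramdisk addr 0x%08x, ", 0x40000000);
        eprint("FDT at 0x%08x\n", 0x41000000);
        return 0;
}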
diff --git a/arch/microblaze/kernel/traps.c b/arch/microblaze/kernel/traps.c
index eaaaf805f31..5e4570ef515 100644
--- a/arch/microblaze/kernel/traps.c
+++ b/arch/microblaze/kernel/traps.c
@@ -22,13 +22,11 @@ void trap_init(void)
__enable_hw_exceptions();
}
-static int kstack_depth_to_print = 24;
+static unsigned long kstack_depth_to_print = 24;
static int __init kstack_setup(char *s)
{
- kstack_depth_to_print = strict_strtoul(s, 0, NULL);
-
- return 1;
+ return !strict_strtoul(s, 0, &kstack_depth_to_print);
}
__setup("kstack=", kstack_setup);
diff --git a/arch/microblaze/lib/Makefile b/arch/microblaze/lib/Makefile
index b579db068c0..4dfe47d3cd9 100644
--- a/arch/microblaze/lib/Makefile
+++ b/arch/microblaze/lib/Makefile
@@ -10,5 +10,4 @@ else
lib-y += memcpy.o memmove.o
endif
-lib-$(CONFIG_NO_MMU) += uaccess.o
-lib-$(CONFIG_MMU) += uaccess_old.o
+lib-y += uaccess_old.o
diff --git a/arch/microblaze/lib/fastcopy.S b/arch/microblaze/lib/fastcopy.S
index 02e3ab4eddf..fdc48bb065d 100644
--- a/arch/microblaze/lib/fastcopy.S
+++ b/arch/microblaze/lib/fastcopy.S
@@ -30,8 +30,9 @@
*/
#include <linux/linkage.h>
-
+ .text
.globl memcpy
+ .type memcpy, @function
.ent memcpy
memcpy:
@@ -345,9 +346,11 @@ a_done:
rtsd r15, 8
nop
+.size memcpy, . - memcpy
.end memcpy
/*----------------------------------------------------------------------------*/
.globl memmove
+ .type memmove, @function
.ent memmove
memmove:
@@ -659,4 +662,5 @@ d_done:
rtsd r15, 8
nop
+.size memmove, . - memmove
.end memmove
diff --git a/arch/microblaze/lib/memcpy.c b/arch/microblaze/lib/memcpy.c
index cc2108b6b26..014bac92bdf 100644
--- a/arch/microblaze/lib/memcpy.c
+++ b/arch/microblaze/lib/memcpy.c
@@ -53,7 +53,7 @@ void *memcpy(void *v_dst, const void *v_src, __kernel_size_t c)
const uint32_t *i_src;
uint32_t *i_dst;
- if (c >= 4) {
+ if (likely(c >= 4)) {
unsigned value, buf_hold;
/* Align the destination to a word boundary. */
diff --git a/arch/microblaze/lib/memset.c b/arch/microblaze/lib/memset.c
index 4df851d41a2..ecfb663e1fc 100644
--- a/arch/microblaze/lib/memset.c
+++ b/arch/microblaze/lib/memset.c
@@ -33,22 +33,23 @@
#ifdef __HAVE_ARCH_MEMSET
void *memset(void *v_src, int c, __kernel_size_t n)
{
-
char *src = v_src;
#ifdef CONFIG_OPT_LIB_FUNCTION
uint32_t *i_src;
- uint32_t w32;
+ uint32_t w32 = 0;
#endif
/* Truncate c to 8 bits */
c = (c & 0xFF);
#ifdef CONFIG_OPT_LIB_FUNCTION
- /* Make a repeating word out of it */
- w32 = c;
- w32 |= w32 << 8;
- w32 |= w32 << 16;
+ if (unlikely(c)) {
+ /* Make a repeating word out of it */
+ w32 = c;
+ w32 |= w32 << 8;
+ w32 |= w32 << 16;
+ }
- if (n >= 4) {
+ if (likely(n >= 4)) {
/* Align the destination to a word boundary */
/* This is done in an endian-independent manner */
switch ((unsigned) src & 3) {
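The repeating word used for the word-at-a-time fill is built from two shifted ORs, and the new guard skips that work when c is zero, the overwhelmingly common memset argument. The expansion step in isolation:

#include <assert.h>
#include <stdint.h>

/* Expand a byte into a word of four copies, skipping work for zero. */
static uint32_t repeat_byte(int c)
{
        uint32_t w32 = 0;

        c &= 0xFF;                      /* truncate to 8 bits */
        if (c) {
                w32 = c;                /* 0x000000AB */
                w32 |= w32 << 8;        /* 0x0000ABAB */
                w32 |= w32 << 16;       /* 0xABABABAB */
        }
        return w32;
}

int main(void)
{
        assert(repeat_byte(0xAB) == 0xABABABABu);
        assert(repeat_byte(0) == 0);
        return 0;
}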
diff --git a/arch/microblaze/lib/uaccess.c b/arch/microblaze/lib/uaccess.c
deleted file mode 100644
index a853fe089c4..00000000000
--- a/arch/microblaze/lib/uaccess.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2006 Atmark Techno, Inc.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#include <linux/string.h>
-#include <asm/uaccess.h>
-
-#include <asm/bug.h>
-
-long strnlen_user(const char __user *src, long count)
-{
- return strlen(src) + 1;
-}
-
-#define __do_strncpy_from_user(dst, src, count, res) \
- do { \
- char *tmp; \
- strncpy(dst, src, count); \
- for (tmp = dst; *tmp && count > 0; tmp++, count--) \
- ; \
- res = (tmp - dst); \
- } while (0)
-
-long __strncpy_from_user(char *dst, const char __user *src, long count)
-{
- long res;
- __do_strncpy_from_user(dst, src, count, res);
- return res;
-}
-
-long strncpy_from_user(char *dst, const char __user *src, long count)
-{
- long res = -EFAULT;
- if (access_ok(VERIFY_READ, src, 1))
- __do_strncpy_from_user(dst, src, count, res);
- return res;
-}
-
-unsigned long __copy_tofrom_user(void __user *to,
- const void __user *from, unsigned long size)
-{
- memcpy(to, from, size);
- return 0;
-}
diff --git a/arch/microblaze/lib/uaccess_old.S b/arch/microblaze/lib/uaccess_old.S
index 67f991c14b8..5810cec54a7 100644
--- a/arch/microblaze/lib/uaccess_old.S
+++ b/arch/microblaze/lib/uaccess_old.S
@@ -22,6 +22,7 @@
.text
.globl __strncpy_user;
+.type __strncpy_user, @function
.align 4;
__strncpy_user:
@@ -50,7 +51,7 @@ __strncpy_user:
3:
rtsd r15,8
nop
-
+ .size __strncpy_user, . - __strncpy_user
.section .fixup, "ax"
.align 2
@@ -72,6 +73,7 @@ __strncpy_user:
.text
.globl __strnlen_user;
+.type __strnlen_user, @function
.align 4;
__strnlen_user:
addik r3,r6,0
@@ -90,7 +92,7 @@ __strnlen_user:
3:
rtsd r15,8
nop
-
+ .size __strnlen_user, . - __strnlen_user
.section .fixup,"ax"
4:
@@ -108,6 +110,7 @@ __strnlen_user:
*/
.text
.globl __copy_tofrom_user;
+.type __copy_tofrom_user, @function
.align 4;
__copy_tofrom_user:
/*
@@ -116,20 +119,34 @@ __copy_tofrom_user:
* r7, r3 - count
* r4 - tempval
*/
- addik r3,r7,0
- beqi r3,3f
-1:
- lbu r4,r6,r0
- addik r6,r6,1
-2:
- sb r4,r5,r0
- addik r3,r3,-1
- bneid r3,1b
- addik r5,r5,1 /* delay slot */
+ beqid r7, 3f /* a zero size is unlikely */
+ andi r3, r7, 0x3 /* isolate the low bits of the count */
+ bneid r3, 4f /* count not a multiple of 4 -> byte copy */
+ or r3, r5, r6 /* check whether to/from is unaligned */
+ andi r3, r3, 0x3 /* mask the alignment bits */
+ bneid r3, 1f /* unaligned -> jump to the byte loop */
+ or r3, r0, r0
+
+/* at least one 4-byte copy */
+5: lw r4, r6, r3
+6: sw r4, r5, r3
+ addik r7, r7, -4
+ bneid r7, 5b
+ addik r3, r3, 4
+ addik r3, r7, 0
+ rtsd r15, 8
+ nop
+4: or r3, r0, r0
+1: lbu r4,r6,r3
+2: sb r4,r5,r3
+ addik r7,r7,-1
+ bneid r7,1b
+ addik r3,r3,1 /* delay slot */
3:
+ addik r3,r7,0
rtsd r15,8
nop
-
+ .size __copy_tofrom_user, . - __copy_tofrom_user
.section __ex_table,"a"
- .word 1b,3b,2b,3b
+ .word 1b,3b,2b,3b,5b,3b,6b,3b
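The rewritten __copy_tofrom_user first tests whether the count is a multiple of four and whether either pointer has its low two bits set; only fully aligned copies take the new word-per-iteration loop (labels 5/6), everything else drops to the old byte loop. A loose C rendering of that dispatch (exception-table fixups omitted; copy_words_or_bytes() is not the kernel routine):

#include <assert.h>
#include <stdint.h>
#include <string.h>

static unsigned long copy_words_or_bytes(void *dst, const void *src,
                                         unsigned long n)
{
        if (n && !(n & 0x3) && !(((uintptr_t)dst | (uintptr_t)src) & 0x3)) {
                uint32_t *d = dst;
                const uint32_t *s = src;
                for (unsigned long i = 0; i < n / 4; i++)
                        d[i] = s[i];    /* one word per iteration */
        } else {
                unsigned char *d = dst;
                const unsigned char *s = src;
                for (unsigned long i = 0; i < n; i++)
                        d[i] = s[i];    /* byte fallback */
        }
        return 0;       /* "bytes not copied"; a fault would leave a rest */
}

int main(void)
{
        char src[8] = "abcdefg", dst[8] = { 0 };
        assert(copy_words_or_bytes(dst, src, 8) == 0);
        assert(memcmp(dst, src, 8) == 0);
        return 0;
}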
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index d9d249a66ff..7af87f4b2c2 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -106,7 +106,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
regs->esr = error_code;
/* On a kernel SLB miss we can only check for a valid exception entry */
- if (kernel_mode(regs) && (address >= TASK_SIZE)) {
+ if (unlikely(kernel_mode(regs) && (address >= TASK_SIZE))) {
printk(KERN_WARNING "kernel task_size exceed");
_exception(SIGSEGV, regs, code, address);
}
@@ -122,7 +122,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
}
#endif /* CONFIG_KGDB */
- if (in_atomic() || !mm) {
+ if (unlikely(in_atomic() || !mm)) {
if (kernel_mode(regs))
goto bad_area_nosemaphore;
@@ -150,7 +150,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
* source. If this is invalid we can skip the address space check,
* thus avoiding the deadlock.
*/
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if (kernel_mode(regs) && !search_exception_tables(regs->pc))
goto bad_area_nosemaphore;
@@ -158,16 +158,16 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
}
vma = find_vma(mm, address);
- if (!vma)
+ if (unlikely(!vma))
goto bad_area;
if (vma->vm_start <= address)
goto good_area;
- if (!(vma->vm_flags & VM_GROWSDOWN))
+ if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
goto bad_area;
- if (!is_write)
+ if (unlikely(!is_write))
goto bad_area;
/*
@@ -179,7 +179,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
* before setting the user r1. Thus we allow the stack to
* expand to 1MB without further checks.
*/
- if (address + 0x100000 < vma->vm_end) {
+ if (unlikely(address + 0x100000 < vma->vm_end)) {
/* get user regs even if this fault is in kernel mode */
struct pt_regs *uregs = current->thread.regs;
@@ -209,15 +209,15 @@ good_area:
code = SEGV_ACCERR;
/* a write */
- if (is_write) {
- if (!(vma->vm_flags & VM_WRITE))
+ if (unlikely(is_write)) {
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
goto bad_area;
/* a read */
} else {
/* protection fault */
- if (error_code & 0x08000000)
+ if (unlikely(error_code & 0x08000000))
goto bad_area;
- if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC))))
goto bad_area;
}
@@ -235,7 +235,7 @@ survive:
goto do_sigbus;
BUG();
}
- if (fault & VM_FAULT_MAJOR)
+ if (unlikely(fault & VM_FAULT_MAJOR))
current->maj_flt++;
else
current->min_flt++;
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 1608e2e1a44..40bc10ede09 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -165,7 +165,6 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
for (addr = begin; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
- memset((void *)addr, 0xcc, PAGE_SIZE);
free_page(addr);
totalram_pages++;
}
@@ -208,14 +207,6 @@ void __init mem_init(void)
}
#ifndef CONFIG_MMU
-/* Check against bounds of physical memory */
-int ___range_ok(unsigned long addr, unsigned long size)
-{
- return ((addr < memory_start) ||
- ((addr + size) > memory_end));
-}
-EXPORT_SYMBOL(___range_ok);
-
int page_is_ram(unsigned long pfn)
{
return __range_ok(pfn, 0);
diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index 63a6fd07c48..d31312cde6e 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -154,7 +154,7 @@ int map_page(unsigned long va, phys_addr_t pa, int flags)
err = 0;
set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT,
__pgprot(flags)));
- if (mem_init_done)
+ if (unlikely(mem_init_done))
flush_HPTE(0, va, pmd_val(*pd));
/* flush_HPTE(0, va, pg); */
}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8a54eb8e376..2e19500921f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -313,19 +313,6 @@ config 8XX_MINIMAL_FPEMU
It is recommended that you build a soft-float userspace instead.
-config IOMMU_VMERGE
- bool "Enable IOMMU virtual merging"
- depends on PPC64
- default y
- help
- Cause IO segments sent to a device for DMA to be merged virtually
- by the IOMMU when they happen to have been allocated contiguously.
- This doesn't add pressure to the IOMMU allocator. However, some
- drivers don't support getting large merged segments coming back
- from *_map_sg().
-
- Most drivers don't have this problem; it is safe to say Y here.
-
config IOMMU_HELPER
def_bool PPC64
diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h
index c1b475a941e..a9b91ed3d4b 100644
--- a/arch/powerpc/include/asm/asm-compat.h
+++ b/arch/powerpc/include/asm/asm-compat.h
@@ -28,6 +28,7 @@
#define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh)
#define PPC_STLCX stringify_in_c(stdcx.)
#define PPC_CNTLZL stringify_in_c(cntlzd)
+#define PPC_LR_STKOFF 16
/* Move to CR, single-entry optimized version. Only available
* on POWER4 and later.
@@ -51,6 +52,7 @@
#define PPC_STLCX stringify_in_c(stwcx.)
#define PPC_CNTLZL stringify_in_c(cntlzw)
#define PPC_MTOCRF stringify_in_c(mtcrf)
+#define PPC_LR_STKOFF 4
#endif
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index aea71479759..d553bbeb726 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -25,7 +25,7 @@
#define PPC_INST_LDARX 0x7c0000a8
#define PPC_INST_LSWI 0x7c0004aa
#define PPC_INST_LSWX 0x7c00042a
-#define PPC_INST_LWARX 0x7c000029
+#define PPC_INST_LWARX 0x7c000028
#define PPC_INST_LWSYNC 0x7c2004ac
#define PPC_INST_LXVD2X 0x7c000698
#define PPC_INST_MCRXR 0x7c000400
@@ -62,8 +62,8 @@
#define __PPC_T_TLB(t) (((t) & 0x3) << 21)
#define __PPC_WC(w) (((w) & 0x3) << 21)
/*
- * Only use the larx hint bit on 64bit CPUs. Once we verify it doesn't have
- * any side effects on all 32bit processors, we can do this all the time.
+ * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a
+ * larx with EH set as an illegal instruction.
*/
#ifdef CONFIG_PPC64
#define __PPC_EH(eh) (((eh) & 0x1) << 0)
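The LWARX template previously had the EH hint (bit 0 of the larx encoding) baked into the opcode, so __PPC_EH() could never actually leave it clear, and per the comment above e500v1/v2 cores treat a set EH bit as an illegal instruction. A quick check of the corrected composition, using the constants from the lines above:

#include <assert.h>
#include <stdint.h>

#define PPC_INST_LWARX 0x7c000028u              /* base opcode, EH clear */
#define __PPC_EH(eh) (((eh) & 0x1) << 0)        /* optional exclusive hint */

int main(void)
{
        uint32_t plain = PPC_INST_LWARX | __PPC_EH(0);
        uint32_t hinted = PPC_INST_LWARX | __PPC_EH(1);

        assert(plain == 0x7c000028u);   /* safe on e500v1/v2 */
        assert(hinted == 0x7c000029u);  /* 64-bit CPUs only */
        return 0;
}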
diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h
index efa7f0b879f..23913e902fc 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -30,7 +30,7 @@ static inline void syscall_rollback(struct task_struct *task,
static inline long syscall_get_error(struct task_struct *task,
struct pt_regs *regs)
{
- return (regs->ccr & 0x1000) ? -regs->gpr[3] : 0;
+ return (regs->ccr & 0x10000000) ? -regs->gpr[3] : 0;
}
static inline long syscall_get_return_value(struct task_struct *task,
@@ -44,10 +44,10 @@ static inline void syscall_set_return_value(struct task_struct *task,
int error, long val)
{
if (error) {
- regs->ccr |= 0x1000L;
+ regs->ccr |= 0x10000000L;
regs->gpr[3] = -error;
} else {
- regs->ccr &= ~0x1000L;
+ regs->ccr &= ~0x10000000L;
regs->gpr[3] = val;
}
}
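
The corrected masks encode one fact about the 32-bit CCR image: CR0 occupies the top nibble, so its summary-overflow (SO) bit, which the syscall exit path sets on error, is 0x10000000; the old 0x1000 pointed into CR4. A minimal illustration in plain C (names hypothetical):

    #include <stdbool.h>
    #include <stdio.h>

    #define CCR_CR0_SO 0x10000000UL  /* CR0[SO]: set when the syscall failed */

    bool ppc_syscall_failed(unsigned long ccr)
    {
            return (ccr & CCR_CR0_SO) != 0;
    }

    int main(void)
    {
            printf("%d %d\n", ppc_syscall_failed(0x10000000UL),
                   ppc_syscall_failed(0x00001000UL));  /* prints: 1 0 */
            return 0;
    }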
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 25793bb0e78..72552654799 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -747,9 +747,6 @@ finish_tlb_load:
#else
rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */
#endif
-#ifdef CONFIG_SMP
- ori r12, r12, MAS2_M
-#endif
mtspr SPRN_MAS2, r12
#ifdef CONFIG_PTE_64BIT
@@ -887,13 +884,17 @@ KernelSPE:
lwz r3,_MSR(r1)
oris r3,r3,MSR_SPE@h
stw r3,_MSR(r1) /* enable use of SPE after return */
+#ifdef CONFIG_PRINTK
lis r3,87f@h
ori r3,r3,87f@l
mr r4,r2 /* current */
lwz r5,_NIP(r1)
bl printk
+#endif
b ret_from_except
+#ifdef CONFIG_PRINTK
87: .string "SPE used in kernel (task=%p, pc=%x) \n"
+#endif
.align 4,0
#endif /* CONFIG_SPE */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 5547ae6e6b0..ec94f906ea4 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -42,12 +42,7 @@
#define DBG(...)
-#ifdef CONFIG_IOMMU_VMERGE
-static int novmerge = 0;
-#else
-static int novmerge = 1;
-#endif
-
+static int novmerge;
static int protect4gb = 1;
static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S
index 2d29752cbe1..b485a87c94e 100644
--- a/arch/powerpc/kernel/misc.S
+++ b/arch/powerpc/kernel/misc.S
@@ -127,3 +127,31 @@ _GLOBAL(__setup_cpu_power7)
_GLOBAL(__restore_cpu_power7)
/* place holder */
blr
+
+#ifdef CONFIG_EVENT_TRACING
+/*
+ * Get a minimal set of registers for our caller's nth caller.
+ * r3 = regs pointer, r5 = n.
+ *
+ * We only get R1 (stack pointer), NIP (next instruction pointer)
+ * and LR (link register). These are all we can get in the
+ * general case without doing complicated stack unwinding, but
+ * fortunately they are enough to do a stack backtrace, which
+ * is all we need them for.
+ */
+_GLOBAL(perf_arch_fetch_caller_regs)
+ mr r6,r1
+ cmpwi r5,0
+ mflr r4
+ ble 2f
+ mtctr r5
+1: PPC_LL r6,0(r6)
+ bdnz 1b
+ PPC_LL r4,PPC_LR_STKOFF(r6)
+2: PPC_LL r7,0(r6)
+ PPC_LL r7,PPC_LR_STKOFF(r7)
+ PPC_STL r6,GPR1-STACK_FRAME_OVERHEAD(r3)
+ PPC_STL r4,_NIP-STACK_FRAME_OVERHEAD(r3)
+ PPC_STL r7,_LINK-STACK_FRAME_OVERHEAD(r3)
+ blr
+#endif /* CONFIG_EVENT_TRACING */
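
A C rendering of what the assembly above does may help orientation. It assumes the PowerPC ABI back chain (word 0 of every frame holds the caller's stack pointer) and the LR save slot at PPC_LR_STKOFF from asm-compat.h; this is a sketch of the control flow, not a drop-in replacement:

    /* 16 on 64-bit, 4 on 32-bit, per the asm-compat.h hunk above */
    #ifdef __powerpc64__
    #define PPC_LR_STKOFF 16
    #else
    #define PPC_LR_STKOFF 4
    #endif

    struct caller_regs {
            unsigned long gpr1;  /* stack pointer of the nth caller */
            unsigned long nip;   /* where it will resume */
            unsigned long link;  /* where its own caller will resume */
    };

    void fetch_caller_regs(struct caller_regs *regs,
                           unsigned long sp, unsigned long lr, long n)
    {
            unsigned long up;

            if (n > 0) {
                    while (n--)  /* follow the back chain n frames up */
                            sp = *(unsigned long *)sp;
                    lr = *(unsigned long *)(sp + PPC_LR_STKOFF);
            }
            up = *(unsigned long *)sp;  /* one more hop for the saved LR */
            regs->gpr1 = sp;
            regs->nip  = lr;
            regs->link = *(unsigned long *)(up + PPC_LR_STKOFF);
    }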
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index b152de3e64d..8f58986c2ad 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -39,7 +39,6 @@
#include <asm/serial.h>
#include <asm/udbg.h>
#include <asm/mmu_context.h>
-#include <asm/swiotlb.h>
#include "setup.h"
@@ -343,11 +342,6 @@ void __init setup_arch(char **cmdline_p)
ppc_md.setup_arch();
if ( ppc_md.progress ) ppc_md.progress("arch: exit", 0x3eab);
-#ifdef CONFIG_SWIOTLB
- if (ppc_swiotlb_enable)
- swiotlb_init(1);
-#endif
-
paging_init();
/* Initialize the MMU context management stuff */
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 63547394048..914389158a9 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -61,7 +61,6 @@
#include <asm/xmon.h>
#include <asm/udbg.h>
#include <asm/kexec.h>
-#include <asm/swiotlb.h>
#include <asm/mmu_context.h>
#include "setup.h"
@@ -541,11 +540,6 @@ void __init setup_arch(char **cmdline_p)
if (ppc_md.setup_arch)
ppc_md.setup_arch();
-#ifdef CONFIG_SWIOTLB
- if (ppc_swiotlb_enable)
- swiotlb_init(1);
-#endif
-
paging_init();
/* Initialize the MMU context management stuff */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 311224cdb7a..448f972b22f 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -48,6 +48,7 @@
#include <asm/sparsemem.h>
#include <asm/vdso.h>
#include <asm/fixmap.h>
+#include <asm/swiotlb.h>
#include "mmu_decl.h"
@@ -320,6 +321,11 @@ void __init mem_init(void)
struct page *page;
unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
+#ifdef CONFIG_SWIOTLB
+ if (ppc_swiotlb_enable)
+ swiotlb_init(1);
+#endif
+
num_physpages = lmb.memory.size >> PAGE_SHIFT;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
diff --git a/arch/s390/boot/compressed/misc.c b/arch/s390/boot/compressed/misc.c
index a97d6952582..14e0479d388 100644
--- a/arch/s390/boot/compressed/misc.c
+++ b/arch/s390/boot/compressed/misc.c
@@ -24,8 +24,8 @@
/* Symbols defined by linker scripts */
extern char input_data[];
extern int input_len;
-extern int _text;
-extern int _end;
+extern char _text, _end;
+extern char _bss, _ebss;
static void error(char *m);
@@ -129,12 +129,12 @@ unsigned long decompress_kernel(void)
unsigned long output_addr;
unsigned char *output;
+ check_ipl_parmblock((void *) 0, (unsigned long) output + SZ__bss_start);
+ memset(&_bss, 0, &_ebss - &_bss);
free_mem_ptr = (unsigned long)&_end;
free_mem_end_ptr = free_mem_ptr + HEAP_SIZE;
output = (unsigned char *) ((free_mem_end_ptr + 4095UL) & -4096UL);
- check_ipl_parmblock((void *) 0, (unsigned long) output + SZ__bss_start);
-
#ifdef CONFIG_BLK_DEV_INITRD
/*
* Move the initrd right behind the end of the decompressed
diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h
index 67ee6c3c6bb..1741c1556a4 100644
--- a/arch/s390/include/asm/system.h
+++ b/arch/s390/include/asm/system.h
@@ -110,6 +110,7 @@ extern void pfault_fini(void);
#endif /* CONFIG_PFAULT */
extern void cmma_init(void);
+extern int memcpy_real(void *, void *, size_t);
#define finish_arch_switch(prev) do { \
set_fs(current->thread.mm_segment); \
@@ -218,8 +219,8 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
" l %0,%2\n"
"0: nr %0,%5\n"
" lr %1,%0\n"
- " or %0,%2\n"
- " or %1,%3\n"
+ " or %0,%3\n"
+ " or %1,%4\n"
" cs %0,%1,%2\n"
" jnl 1f\n"
" xr %1,%0\n"
@@ -239,8 +240,8 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
" l %0,%2\n"
"0: nr %0,%5\n"
" lr %1,%0\n"
- " or %0,%2\n"
- " or %1,%3\n"
+ " or %0,%3\n"
+ " or %1,%4\n"
" cs %0,%1,%2\n"
" jnl 1f\n"
" xr %1,%0\n"
diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S
index ca4a62bd862..9d1f76702d4 100644
--- a/arch/s390/kernel/head.S
+++ b/arch/s390/kernel/head.S
@@ -517,7 +517,10 @@ startup:
lhi %r1,2 # mode 2 = esame (dump)
sigp %r1,%r0,0x12 # switch to esame mode
sam64 # switch to 64 bit mode
+ larl %r13,4f
+ lmh %r0,%r15,0(%r13) # clear high-order half
jg startup_continue
+4: .fill 16,4,0x0
#else
mvi __LC_AR_MODE_ID,0 # set ESA flag (mode 0)
l %r13,4f-.LPG0(%r13)
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 39580e76865..1f70970de0a 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -21,7 +21,6 @@ startup_continue:
larl %r1,sched_clock_base_cc
mvc 0(8,%r1),__LC_LAST_UPDATE_CLOCK
larl %r13,.LPG1 # get base
- lmh %r0,%r15,.Lzero64-.LPG1(%r13) # clear high-order half
lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers
lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area
# move IPL device to lowcore
@@ -67,7 +66,6 @@ startup_continue:
.L4malign:.quad 0xffffffffffc00000
.Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8
.Lnop: .long 0x07000700
-.Lzero64:.fill 16,4,0x0
.Lparmaddr:
.quad PARMAREA
.align 64
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 77a63ae419f..ba363d99de4 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -401,7 +401,7 @@ setup_lowcore(void)
* Setup lowcore for boot cpu
*/
BUILD_BUG_ON(sizeof(struct _lowcore) != LC_PAGES * 4096);
- lc = __alloc_bootmem(LC_PAGES * PAGE_SIZE, LC_PAGES * PAGE_SIZE, 0);
+ lc = __alloc_bootmem_low(LC_PAGES * PAGE_SIZE, LC_PAGES * PAGE_SIZE, 0);
lc->restart_psw.mask = PSW_BASE_BITS | PSW_DEFAULT_KEY;
lc->restart_psw.addr =
PSW_ADDR_AMODE | (unsigned long) restart_int_handler;
@@ -433,7 +433,7 @@ setup_lowcore(void)
#ifndef CONFIG_64BIT
if (MACHINE_HAS_IEEE) {
lc->extended_save_area_addr = (__u32)
- __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0);
+ __alloc_bootmem_low(PAGE_SIZE, PAGE_SIZE, 0);
/* enable extended save area */
__ctl_set_bit(14, 29);
}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 29f65bce55e..d7d24fc3d6b 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -292,9 +292,9 @@ static void __init smp_get_save_area(unsigned int cpu, unsigned int phy_cpu)
zfcpdump_save_areas[cpu] = kmalloc(sizeof(struct save_area), GFP_KERNEL);
while (raw_sigp(phy_cpu, sigp_stop_and_store_status) == sigp_busy)
cpu_relax();
- memcpy(zfcpdump_save_areas[cpu],
- (void *)(unsigned long) store_prefix() + SAVE_AREA_BASE,
- sizeof(struct save_area));
+ memcpy_real(zfcpdump_save_areas[cpu],
+ (void *)(unsigned long) store_prefix() + SAVE_AREA_BASE,
+ sizeof(struct save_area));
}
struct save_area *zfcpdump_save_areas[NR_CPUS + 1];
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index 81756271dc4..a8c2af8c650 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -59,3 +59,29 @@ long probe_kernel_write(void *dst, void *src, size_t size)
}
return copied < 0 ? -EFAULT : 0;
}
+
+int memcpy_real(void *dest, void *src, size_t count)
+{
+ register unsigned long _dest asm("2") = (unsigned long) dest;
+ register unsigned long _len1 asm("3") = (unsigned long) count;
+ register unsigned long _src asm("4") = (unsigned long) src;
+ register unsigned long _len2 asm("5") = (unsigned long) count;
+ unsigned long flags;
+ int rc = -EFAULT;
+
+ if (!count)
+ return 0;
+ flags = __raw_local_irq_stnsm(0xf8UL);
+ asm volatile (
+ "0: mvcle %1,%2,0x0\n"
+ "1: jo 0b\n"
+ " lhi %0,0x0\n"
+ "2:\n"
+ EX_TABLE(1b,2b)
+ : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1),
+ "+d" (_len2), "=m" (*((long *) dest))
+ : "m" (*((long *) src))
+ : "cc", "memory");
+ __raw_local_irq_ssm(flags);
+ return rc;
+}
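
Callers use memcpy_real() like an ordinary bounded copy; the difference is that source and destination are treated as real (absolute) addresses while DAT is masked off via stnsm, which is exactly what the zfcpdump path in the next hunk needs when lifting save areas out of absolute lowcore. A hedged usage fragment (kernel context assumed, buffer name illustrative):

    struct save_area buf;

    /* copy from absolute storage; returns 0 or -EFAULT */
    if (memcpy_real(&buf, (void *) SAVE_AREA_BASE, sizeof(buf)))
            pr_warn("save area copy from real storage failed\n");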
diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c
index 39ed8722d11..6c13b92742e 100644
--- a/arch/sh/boards/mach-ecovec24/setup.c
+++ b/arch/sh/boards/mach-ecovec24/setup.c
@@ -836,6 +836,8 @@ static void __init sh_eth_init(struct sh_eth_plat_data *pd)
pd->mac_addr[i] = mac_read(a, 0x10 + i);
msleep(10);
}
+
+ i2c_put_adapter(a);
}
#else
static void __init sh_eth_init(struct sh_eth_plat_data *pd)
diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c
index 66cdbc3c7af..ccaa290e9ab 100644
--- a/arch/sh/boards/mach-se/7724/setup.c
+++ b/arch/sh/boards/mach-se/7724/setup.c
@@ -52,6 +52,13 @@
* and change SW41 to use 720p
*/
+/*
+ * about sound
+ *
+ * This setup.c supports FSI slave mode.
+ * Please set jumpers J20, J21 and J22 to the 1-2 connection.
+ */
+
/* Heartbeat */
static struct resource heartbeat_resource = {
.start = PA_LED,
@@ -276,6 +283,7 @@ static struct clk fsimcka_clk = {
.rate = 0, /* unknown */
};
+/* set jumpers J20, J21 and J22 to the 1-2 connection to use slave mode */
struct sh_fsi_platform_info fsi_info = {
.porta_flags = SH_FSI_BRS_INV |
SH_FSI_OUT_SLAVE_MODE |
diff --git a/arch/sh/configs/ecovec24_defconfig b/arch/sh/configs/ecovec24_defconfig
index 18e3356406f..6041c66dd10 100644
--- a/arch/sh/configs/ecovec24_defconfig
+++ b/arch/sh/configs/ecovec24_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.33-rc2
-# Mon Jan 4 11:20:36 2010
+# Linux kernel version: 2.6.34-rc2
+# Mon Mar 29 02:21:58 2010
#
CONFIG_SUPERH=y
CONFIG_SUPERH32=y
@@ -13,8 +13,8 @@ CONFIG_GENERIC_FIND_NEXT_BIT=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_GENERIC_HARDIRQS=y
CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
-CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_IRQ_PER_CPU=y
+CONFIG_SPARSE_IRQ=y
CONFIG_GENERIC_GPIO=y
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_CLOCKEVENTS=y
@@ -32,6 +32,7 @@ CONFIG_ARCH_NO_VIRT_TO_BUS=y
CONFIG_ARCH_HAS_DEFAULT_IDLE=y
CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
CONFIG_DMA_NONCOHERENT=y
+CONFIG_NEED_DMA_MAP_STATE=y
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
CONFIG_CONSTRUCTORS=y
@@ -47,9 +48,11 @@ CONFIG_LOCALVERSION=""
CONFIG_HAVE_KERNEL_GZIP=y
CONFIG_HAVE_KERNEL_BZIP2=y
CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_HAVE_KERNEL_LZO=y
CONFIG_KERNEL_GZIP=y
# CONFIG_KERNEL_BZIP2 is not set
# CONFIG_KERNEL_LZMA is not set
+# CONFIG_KERNEL_LZO is not set
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
@@ -71,14 +74,8 @@ CONFIG_RCU_FANOUT=32
# CONFIG_TREE_RCU_TRACE is not set
# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=14
-CONFIG_GROUP_SCHED=y
-CONFIG_FAIR_GROUP_SCHED=y
-# CONFIG_RT_GROUP_SCHED is not set
-CONFIG_USER_SCHED=y
-# CONFIG_CGROUP_SCHED is not set
# CONFIG_CGROUPS is not set
-CONFIG_SYSFS_DEPRECATED=y
-CONFIG_SYSFS_DEPRECATED_V2=y
+# CONFIG_SYSFS_DEPRECATED_V2 is not set
# CONFIG_RELAY is not set
# CONFIG_NAMESPACES is not set
# CONFIG_BLK_DEV_INITRD is not set
@@ -107,7 +104,7 @@ CONFIG_PERF_USE_VMALLOC=y
#
# Kernel Performance Events And Counters
#
-# CONFIG_PERF_EVENTS is not set
+CONFIG_PERF_EVENTS=y
# CONFIG_PERF_COUNTERS is not set
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_COMPAT_BRK=y
@@ -116,13 +113,13 @@ CONFIG_SLAB=y
# CONFIG_SLOB is not set
# CONFIG_PROFILING is not set
CONFIG_HAVE_OPROFILE=y
-CONFIG_HAVE_IOREMAP_PROT=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_ARCH_TRACEHOOK=y
CONFIG_HAVE_DMA_ATTRS=y
CONFIG_HAVE_CLK=y
CONFIG_HAVE_DMA_API_DEBUG=y
+CONFIG_HAVE_HW_BREAKPOINT=y
#
# GCOV-based kernel profiling
@@ -234,12 +231,12 @@ CONFIG_CPU_SUBTYPE_SH7724=y
CONFIG_QUICKLIST=y
CONFIG_MMU=y
CONFIG_PAGE_OFFSET=0x80000000
-CONFIG_FORCE_MAX_ZONEORDER=11
+CONFIG_FORCE_MAX_ZONEORDER=12
CONFIG_MEMORY_START=0x08000000
CONFIG_MEMORY_SIZE=0x10000000
CONFIG_29BIT=y
-# CONFIG_PMB_ENABLE is not set
-# CONFIG_X2TLB is not set
+# CONFIG_PMB is not set
+CONFIG_X2TLB=y
CONFIG_VSYSCALL=y
CONFIG_ARCH_FLATMEM_ENABLE=y
CONFIG_ARCH_SPARSEMEM_ENABLE=y
@@ -247,6 +244,8 @@ CONFIG_ARCH_SPARSEMEM_DEFAULT=y
CONFIG_MAX_ACTIVE_REGIONS=1
CONFIG_ARCH_POPULATES_NODE_MAP=y
CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_IOREMAP_FIXED=y
+CONFIG_UNCACHED_MAPPING=y
CONFIG_PAGE_SIZE_4KB=y
# CONFIG_PAGE_SIZE_8KB is not set
# CONFIG_PAGE_SIZE_16KB is not set
@@ -262,7 +261,7 @@ CONFIG_PAGEFLAGS_EXTENDED=y
CONFIG_SPLIT_PTLOCK_CPUS=4
# CONFIG_PHYS_ADDR_T_64BIT is not set
CONFIG_ZONE_DMA_FLAG=0
-CONFIG_NR_QUICK=2
+CONFIG_NR_QUICK=1
# CONFIG_KSM is not set
CONFIG_DEFAULT_MMAP_MIN_ADDR=4096
@@ -337,7 +336,6 @@ CONFIG_SECCOMP=y
# CONFIG_PREEMPT_VOLUNTARY is not set
CONFIG_PREEMPT=y
CONFIG_GUSA=y
-# CONFIG_SPARSE_IRQ is not set
#
# Boot options
@@ -347,7 +345,7 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
CONFIG_ENTRY_OFFSET=0x00001000
CONFIG_CMDLINE_OVERWRITE=y
# CONFIG_CMDLINE_EXTEND is not set
-CONFIG_CMDLINE="console=tty0, console=ttySC0,115200 root=/dev/nfs ip=dhcp mem=120M memchunk.vpu=4m"
+CONFIG_CMDLINE="console=tty0, console=ttySC0,115200 root=/dev/nfs ip=dhcp mem=248M memchunk.vpu=8m memchunk.veu0=4m"
#
# Bus options
@@ -373,6 +371,7 @@ CONFIG_SUSPEND=y
CONFIG_SUSPEND_FREEZER=y
# CONFIG_HIBERNATION is not set
CONFIG_PM_RUNTIME=y
+CONFIG_PM_OPS=y
# CONFIG_CPU_IDLE is not set
CONFIG_NET=y
@@ -380,7 +379,6 @@ CONFIG_NET=y
# Networking options
#
CONFIG_PACKET=y
-# CONFIG_PACKET_MMAP is not set
CONFIG_UNIX=y
# CONFIG_NET_KEY is not set
CONFIG_INET=y
@@ -445,7 +443,45 @@ CONFIG_DEFAULT_TCP_CONG="cubic"
# CONFIG_NET_PKTGEN is not set
# CONFIG_HAMRADIO is not set
# CONFIG_CAN is not set
-# CONFIG_IRDA is not set
+CONFIG_IRDA=y
+
+#
+# IrDA protocols
+#
+# CONFIG_IRLAN is not set
+# CONFIG_IRCOMM is not set
+# CONFIG_IRDA_ULTRA is not set
+
+#
+# IrDA options
+#
+# CONFIG_IRDA_CACHE_LAST_LSAP is not set
+# CONFIG_IRDA_FAST_RR is not set
+# CONFIG_IRDA_DEBUG is not set
+
+#
+# Infrared-port device drivers
+#
+
+#
+# SIR device drivers
+#
+# CONFIG_IRTTY_SIR is not set
+
+#
+# Dongle support
+#
+CONFIG_SH_SIR=y
+# CONFIG_KINGSUN_DONGLE is not set
+# CONFIG_KSDAZZLE_DONGLE is not set
+# CONFIG_KS959_DONGLE is not set
+
+#
+# FIR device drivers
+#
+# CONFIG_USB_IRDA is not set
+# CONFIG_SIGMATEL_FIR is not set
+# CONFIG_MCS_FIR is not set
# CONFIG_BT is not set
# CONFIG_AF_RXRPC is not set
CONFIG_WIRELESS=y
@@ -556,6 +592,7 @@ CONFIG_MTD_NAND_IDS=y
# CONFIG_MTD_NAND_NANDSIM is not set
# CONFIG_MTD_NAND_PLATFORM is not set
# CONFIG_MTD_ALAUDA is not set
+# CONFIG_MTD_NAND_SH_FLCTL is not set
# CONFIG_MTD_ONENAND is not set
#
@@ -597,6 +634,7 @@ CONFIG_MISC_DEVICES=y
# CONFIG_ICS932S401 is not set
# CONFIG_ENCLOSURE_SERVICES is not set
# CONFIG_ISL29003 is not set
+# CONFIG_SENSORS_TSL2550 is not set
# CONFIG_DS1682 is not set
# CONFIG_TI_DAC7512 is not set
# CONFIG_C2PORT is not set
@@ -616,6 +654,7 @@ CONFIG_HAVE_IDE=y
#
# SCSI device support
#
+CONFIG_SCSI_MOD=y
# CONFIG_RAID_ATTRS is not set
CONFIG_SCSI=y
CONFIG_SCSI_DMA=y
@@ -768,7 +807,29 @@ CONFIG_KEYBOARD_SH_KEYSC=y
# CONFIG_INPUT_MOUSE is not set
# CONFIG_INPUT_JOYSTICK is not set
# CONFIG_INPUT_TABLET is not set
-# CONFIG_INPUT_TOUCHSCREEN is not set
+CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_ADS7846 is not set
+# CONFIG_TOUCHSCREEN_AD7877 is not set
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879_SPI is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
+# CONFIG_TOUCHSCREEN_DYNAPRO is not set
+# CONFIG_TOUCHSCREEN_EETI is not set
+# CONFIG_TOUCHSCREEN_FUJITSU is not set
+# CONFIG_TOUCHSCREEN_GUNZE is not set
+# CONFIG_TOUCHSCREEN_ELO is not set
+# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
+# CONFIG_TOUCHSCREEN_MCS5000 is not set
+# CONFIG_TOUCHSCREEN_MTOUCH is not set
+# CONFIG_TOUCHSCREEN_INEXIO is not set
+# CONFIG_TOUCHSCREEN_MK712 is not set
+# CONFIG_TOUCHSCREEN_PENMOUNT is not set
+# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
+# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
+# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
+# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
+CONFIG_TOUCHSCREEN_TSC2007=y
+# CONFIG_TOUCHSCREEN_W90X900 is not set
# CONFIG_INPUT_MISC is not set
#
@@ -802,10 +863,10 @@ CONFIG_SERIAL_SH_SCI_NR_UARTS=6
CONFIG_SERIAL_SH_SCI_CONSOLE=y
CONFIG_SERIAL_CORE=y
CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_TIMBERDALE is not set
CONFIG_UNIX98_PTYS=y
# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
-CONFIG_LEGACY_PTYS=y
-CONFIG_LEGACY_PTY_COUNT=256
+# CONFIG_LEGACY_PTYS is not set
# CONFIG_IPMI_HANDLER is not set
CONFIG_HW_RANDOM=y
# CONFIG_HW_RANDOM_TIMERIOMEM is not set
@@ -830,6 +891,7 @@ CONFIG_I2C_HELPER_AUTO=y
# CONFIG_I2C_OCORES is not set
CONFIG_I2C_SH_MOBILE=y
# CONFIG_I2C_SIMTEC is not set
+# CONFIG_I2C_XILINX is not set
#
# External I2C/SMBus adapter drivers
@@ -843,15 +905,9 @@ CONFIG_I2C_SH_MOBILE=y
#
# CONFIG_I2C_PCA_PLATFORM is not set
# CONFIG_I2C_STUB is not set
-
-#
-# Miscellaneous I2C Chip support
-#
-# CONFIG_SENSORS_TSL2550 is not set
# CONFIG_I2C_DEBUG_CORE is not set
# CONFIG_I2C_DEBUG_ALGO is not set
# CONFIG_I2C_DEBUG_BUS is not set
-# CONFIG_I2C_DEBUG_CHIP is not set
CONFIG_SPI=y
CONFIG_SPI_MASTER=y
@@ -882,13 +938,16 @@ CONFIG_GPIOLIB=y
#
# Memory mapped GPIO expanders:
#
+# CONFIG_GPIO_IT8761E is not set
#
# I2C GPIO expanders:
#
+# CONFIG_GPIO_MAX7300 is not set
# CONFIG_GPIO_MAX732X is not set
# CONFIG_GPIO_PCA953X is not set
# CONFIG_GPIO_PCF857X is not set
+# CONFIG_GPIO_ADP5588 is not set
#
# PCI GPIO expanders:
@@ -919,23 +978,26 @@ CONFIG_SSB_POSSIBLE=y
#
# Multifunction device drivers
#
-# CONFIG_MFD_CORE is not set
+CONFIG_MFD_CORE=y
+# CONFIG_MFD_88PM860X is not set
# CONFIG_MFD_SM501 is not set
-# CONFIG_MFD_SH_MOBILE_SDHI is not set
+CONFIG_MFD_SH_MOBILE_SDHI=y
# CONFIG_HTC_PASIC3 is not set
+# CONFIG_HTC_I2CPLD is not set
# CONFIG_TPS65010 is not set
# CONFIG_TWL4030_CORE is not set
# CONFIG_MFD_TMIO is not set
# CONFIG_PMIC_DA903X is not set
# CONFIG_PMIC_ADP5520 is not set
+# CONFIG_MFD_MAX8925 is not set
# CONFIG_MFD_WM8400 is not set
# CONFIG_MFD_WM831X is not set
# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_WM8994 is not set
# CONFIG_MFD_PCF50633 is not set
# CONFIG_MFD_MC13783 is not set
# CONFIG_AB3100_CORE is not set
# CONFIG_EZX_PCAP is not set
-# CONFIG_MFD_88PM8607 is not set
# CONFIG_AB4500_CORE is not set
# CONFIG_REGULATOR is not set
CONFIG_MEDIA_SUPPORT=y
@@ -985,10 +1047,10 @@ CONFIG_SOC_CAMERA=y
# CONFIG_SOC_CAMERA_MT9M001 is not set
# CONFIG_SOC_CAMERA_MT9M111 is not set
# CONFIG_SOC_CAMERA_MT9T031 is not set
-# CONFIG_SOC_CAMERA_MT9T112 is not set
+CONFIG_SOC_CAMERA_MT9T112=y
# CONFIG_SOC_CAMERA_MT9V022 is not set
# CONFIG_SOC_CAMERA_RJ54N1 is not set
-# CONFIG_SOC_CAMERA_TW9910 is not set
+CONFIG_SOC_CAMERA_TW9910=y
# CONFIG_SOC_CAMERA_PLATFORM is not set
# CONFIG_SOC_CAMERA_OV772X is not set
# CONFIG_SOC_CAMERA_OV9640 is not set
@@ -1001,6 +1063,7 @@ CONFIG_RADIO_ADAPTERS=y
# CONFIG_RADIO_SI470X is not set
# CONFIG_USB_MR800 is not set
# CONFIG_RADIO_TEA5764 is not set
+# CONFIG_RADIO_SAA7706H is not set
# CONFIG_RADIO_TEF6862 is not set
# CONFIG_DAB is not set
@@ -1034,6 +1097,7 @@ CONFIG_FB_DEFERRED_IO=y
#
# CONFIG_FB_S1D13XXX is not set
CONFIG_FB_SH_MOBILE_LCDC=y
+# CONFIG_FB_TMIO is not set
# CONFIG_FB_VIRTUAL is not set
# CONFIG_FB_METRONOME is not set
# CONFIG_FB_MB862XX is not set
@@ -1062,7 +1126,46 @@ CONFIG_LOGO=y
# CONFIG_LOGO_SUPERH_MONO is not set
# CONFIG_LOGO_SUPERH_VGA16 is not set
CONFIG_LOGO_SUPERH_CLUT224=y
-# CONFIG_SOUND is not set
+CONFIG_SOUND=y
+CONFIG_SOUND_OSS_CORE=y
+CONFIG_SOUND_OSS_CORE_PRECLAIM=y
+CONFIG_SND=y
+CONFIG_SND_TIMER=y
+CONFIG_SND_PCM=y
+CONFIG_SND_JACK=y
+CONFIG_SND_SEQUENCER=y
+CONFIG_SND_SEQ_DUMMY=y
+CONFIG_SND_OSSEMUL=y
+CONFIG_SND_MIXER_OSS=y
+CONFIG_SND_PCM_OSS=y
+CONFIG_SND_PCM_OSS_PLUGINS=y
+# CONFIG_SND_SEQUENCER_OSS is not set
+# CONFIG_SND_DYNAMIC_MINORS is not set
+CONFIG_SND_SUPPORT_OLD_API=y
+CONFIG_SND_VERBOSE_PROCFS=y
+# CONFIG_SND_VERBOSE_PRINTK is not set
+# CONFIG_SND_DEBUG is not set
+# CONFIG_SND_RAWMIDI_SEQ is not set
+# CONFIG_SND_OPL3_LIB_SEQ is not set
+# CONFIG_SND_OPL4_LIB_SEQ is not set
+# CONFIG_SND_SBAWE_SEQ is not set
+# CONFIG_SND_EMU10K1_SEQ is not set
+# CONFIG_SND_DRIVERS is not set
+# CONFIG_SND_SPI is not set
+CONFIG_SND_SUPERH=y
+# CONFIG_SND_USB is not set
+CONFIG_SND_SOC=y
+
+#
+# SoC Audio support for SuperH
+#
+CONFIG_SND_SOC_SH4_FSI=y
+# CONFIG_SND_FSI_AK4642 is not set
+CONFIG_SND_FSI_DA7210=y
+CONFIG_SND_SOC_I2C_AND_SPI=y
+# CONFIG_SND_SOC_ALL_CODECS is not set
+CONFIG_SND_SOC_DA7210=y
+# CONFIG_SOUND_PRIME is not set
CONFIG_HID_SUPPORT=y
CONFIG_HID=y
# CONFIG_HIDRAW is not set
@@ -1077,6 +1180,7 @@ CONFIG_USB_HID=y
#
# Special HID drivers
#
+# CONFIG_HID_3M_PCT is not set
# CONFIG_HID_A4TECH is not set
# CONFIG_HID_APPLE is not set
# CONFIG_HID_BELKIN is not set
@@ -1091,12 +1195,16 @@ CONFIG_USB_HID=y
# CONFIG_HID_KENSINGTON is not set
# CONFIG_HID_LOGITECH is not set
# CONFIG_HID_MICROSOFT is not set
+# CONFIG_HID_MOSART is not set
# CONFIG_HID_MONTEREY is not set
# CONFIG_HID_NTRIG is not set
+# CONFIG_HID_ORTEK is not set
# CONFIG_HID_PANTHERLORD is not set
# CONFIG_HID_PETALYNX is not set
+# CONFIG_HID_QUANTA is not set
# CONFIG_HID_SAMSUNG is not set
# CONFIG_HID_SONY is not set
+# CONFIG_HID_STANTUM is not set
# CONFIG_HID_SUNPLUS is not set
# CONFIG_HID_GREENASIA is not set
# CONFIG_HID_SMARTJOYPLUS is not set
@@ -1136,6 +1244,7 @@ CONFIG_USB_MON=y
# CONFIG_USB_SL811_HCD is not set
CONFIG_USB_R8A66597_HCD=y
# CONFIG_USB_HWA_HCD is not set
+# CONFIG_USB_GADGET_MUSB_HDRC is not set
#
# USB Device Class drivers
@@ -1188,7 +1297,6 @@ CONFIG_USB_STORAGE=y
# CONFIG_USB_RIO500 is not set
# CONFIG_USB_LEGOTOWER is not set
# CONFIG_USB_LCD is not set
-# CONFIG_USB_BERRY_CHARGE is not set
# CONFIG_USB_LED is not set
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
@@ -1200,8 +1308,45 @@ CONFIG_USB_STORAGE=y
# CONFIG_USB_IOWARRIOR is not set
# CONFIG_USB_TEST is not set
# CONFIG_USB_ISIGHTFW is not set
-# CONFIG_USB_VST is not set
-# CONFIG_USB_GADGET is not set
+CONFIG_USB_GADGET=y
+# CONFIG_USB_GADGET_DEBUG_FILES is not set
+# CONFIG_USB_GADGET_DEBUG_FS is not set
+CONFIG_USB_GADGET_VBUS_DRAW=2
+CONFIG_USB_GADGET_SELECTED=y
+# CONFIG_USB_GADGET_AT91 is not set
+# CONFIG_USB_GADGET_ATMEL_USBA is not set
+# CONFIG_USB_GADGET_FSL_USB2 is not set
+# CONFIG_USB_GADGET_LH7A40X is not set
+# CONFIG_USB_GADGET_OMAP is not set
+# CONFIG_USB_GADGET_PXA25X is not set
+CONFIG_USB_GADGET_R8A66597=y
+CONFIG_USB_R8A66597=y
+# CONFIG_USB_GADGET_PXA27X is not set
+# CONFIG_USB_GADGET_S3C_HSOTG is not set
+# CONFIG_USB_GADGET_IMX is not set
+# CONFIG_USB_GADGET_S3C2410 is not set
+# CONFIG_USB_GADGET_M66592 is not set
+# CONFIG_USB_GADGET_AMD5536UDC is not set
+# CONFIG_USB_GADGET_FSL_QE is not set
+# CONFIG_USB_GADGET_CI13XXX is not set
+# CONFIG_USB_GADGET_NET2280 is not set
+# CONFIG_USB_GADGET_GOKU is not set
+# CONFIG_USB_GADGET_LANGWELL is not set
+# CONFIG_USB_GADGET_DUMMY_HCD is not set
+CONFIG_USB_GADGET_DUALSPEED=y
+# CONFIG_USB_ZERO is not set
+# CONFIG_USB_AUDIO is not set
+# CONFIG_USB_ETH is not set
+# CONFIG_USB_GADGETFS is not set
+CONFIG_USB_FILE_STORAGE=m
+# CONFIG_USB_FILE_STORAGE_TEST is not set
+# CONFIG_USB_MASS_STORAGE is not set
+# CONFIG_USB_G_SERIAL is not set
+# CONFIG_USB_MIDI_GADGET is not set
+# CONFIG_USB_G_PRINTER is not set
+# CONFIG_USB_CDC_COMPOSITE is not set
+# CONFIG_USB_G_NOKIA is not set
+# CONFIG_USB_G_MULTI is not set
#
# OTG and related infrastructure
@@ -1224,10 +1369,8 @@ CONFIG_MMC_BLOCK_BOUNCE=y
# MMC/SD/SDIO Host Controller Drivers
#
# CONFIG_MMC_SDHCI is not set
-# CONFIG_MMC_AT91 is not set
-# CONFIG_MMC_ATMELMCI is not set
CONFIG_MMC_SPI=y
-# CONFIG_MMC_TMIO is not set
+CONFIG_MMC_TMIO=y
# CONFIG_MEMSTICK is not set
# CONFIG_NEW_LEDS is not set
# CONFIG_ACCESSIBILITY is not set
@@ -1253,10 +1396,10 @@ CONFIG_RTC_INTF_DEV=y
# CONFIG_RTC_DRV_DS1374 is not set
# CONFIG_RTC_DRV_DS1672 is not set
# CONFIG_RTC_DRV_MAX6900 is not set
-# CONFIG_RTC_DRV_RS5C372 is not set
+CONFIG_RTC_DRV_RS5C372=y
# CONFIG_RTC_DRV_ISL1208 is not set
# CONFIG_RTC_DRV_X1205 is not set
-CONFIG_RTC_DRV_PCF8563=y
+# CONFIG_RTC_DRV_PCF8563 is not set
# CONFIG_RTC_DRV_PCF8583 is not set
# CONFIG_RTC_DRV_M41T80 is not set
# CONFIG_RTC_DRV_BQ32K is not set
@@ -1303,8 +1446,6 @@ CONFIG_RTC_DRV_PCF8563=y
CONFIG_UIO=y
# CONFIG_UIO_PDRV is not set
CONFIG_UIO_PDRV_GENIRQ=y
-# CONFIG_UIO_SMX is not set
-# CONFIG_UIO_SERCOS3 is not set
#
# TI VLYNQ
@@ -1390,6 +1531,7 @@ CONFIG_MISC_FILESYSTEMS=y
# CONFIG_EFS_FS is not set
# CONFIG_JFFS2_FS is not set
# CONFIG_UBIFS_FS is not set
+# CONFIG_LOGFS is not set
# CONFIG_CRAMFS is not set
# CONFIG_SQUASHFS is not set
# CONFIG_VXFS_FS is not set
@@ -1418,6 +1560,7 @@ CONFIG_SUNRPC=y
# CONFIG_RPCSEC_GSS_KRB5 is not set
# CONFIG_RPCSEC_GSS_SPKM3 is not set
# CONFIG_SMB_FS is not set
+# CONFIG_CEPH_FS is not set
# CONFIG_CIFS is not set
# CONFIG_NCP_FS is not set
# CONFIG_CODA_FS is not set
@@ -1487,6 +1630,7 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_MEMORY_INIT is not set
# CONFIG_RCU_CPU_STALL_DETECTOR is not set
+# CONFIG_LKDTM is not set
# CONFIG_LATENCYTOP is not set
CONFIG_SYSCTL_SYSCALL_CHECK=y
CONFIG_HAVE_FUNCTION_TRACER=y
@@ -1618,7 +1762,7 @@ CONFIG_CRYPTO_HW=y
#
CONFIG_BITREVERSE=y
CONFIG_GENERIC_FIND_LAST_BIT=y
-# CONFIG_CRC_CCITT is not set
+CONFIG_CRC_CCITT=y
# CONFIG_CRC16 is not set
CONFIG_CRC_T10DIF=y
CONFIG_CRC_ITU_T=y
diff --git a/arch/sh/include/asm/elf.h b/arch/sh/include/asm/elf.h
index ac04255022b..ce830faeebb 100644
--- a/arch/sh/include/asm/elf.h
+++ b/arch/sh/include/asm/elf.h
@@ -211,7 +211,9 @@ extern void __kernel_vsyscall;
#define VSYSCALL_AUX_ENT \
if (vdso_enabled) \
- NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_BASE);
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_BASE); \
+ else \
+ NEW_AUX_ENT(AT_IGNORE, 0);
#else
#define VSYSCALL_AUX_ENT
#endif /* CONFIG_VSYSCALL */
@@ -219,7 +221,7 @@ extern void __kernel_vsyscall;
#ifdef CONFIG_SH_FPU
#define FPU_AUX_ENT NEW_AUX_ENT(AT_FPUCW, FPSCR_INIT)
#else
-#define FPU_AUX_ENT
+#define FPU_AUX_ENT NEW_AUX_ENT(AT_IGNORE, 0)
#endif
extern int l1i_cache_shape, l1d_cache_shape, l2_cache_shape;
diff --git a/arch/sh/include/asm/mmu.h b/arch/sh/include/asm/mmu.h
index 19fe84550b4..56e4418c19b 100644
--- a/arch/sh/include/asm/mmu.h
+++ b/arch/sh/include/asm/mmu.h
@@ -66,6 +66,13 @@ int pmb_unmap(void __iomem *addr);
#else
+static inline int
+pmb_bolt_mapping(unsigned long virt, phys_addr_t phys,
+ unsigned long size, pgprot_t prot)
+{
+ return -EINVAL;
+}
+
static inline void __iomem *
pmb_remap_caller(phys_addr_t phys, unsigned long size,
pgprot_t prot, void *caller)
diff --git a/arch/sh/include/cpu-sh4/cpu/mmu_context.h b/arch/sh/include/cpu-sh4/cpu/mmu_context.h
index 03ea75c5315..5963124c1d4 100644
--- a/arch/sh/include/cpu-sh4/cpu/mmu_context.h
+++ b/arch/sh/include/cpu-sh4/cpu/mmu_context.h
@@ -19,6 +19,8 @@
#define MMUCR 0xFF000010 /* MMU Control Register */
+#define MMU_ITLB_ADDRESS_ARRAY 0xF2000000
+#define MMU_ITLB_ADDRESS_ARRAY2 0xF2800000
#define MMU_UTLB_ADDRESS_ARRAY 0xF6000000
#define MMU_UTLB_ADDRESS_ARRAY2 0xF6800000
#define MMU_PAGE_ASSOC_BIT 0x80
@@ -28,6 +30,8 @@
#define MMUCR_URB 0x00FC0000
#define MMUCR_URB_SHIFT 18
#define MMUCR_URB_NENTRIES 64
+#define MMUCR_URC 0x0000FC00
+#define MMUCR_URC_SHIFT 10
#if defined(CONFIG_32BIT) && defined(CONFIG_CPU_SUBTYPE_ST40)
#define MMUCR_SE (1 << 4)
diff --git a/arch/sh/include/cpu-sh4/cpu/watchdog.h b/arch/sh/include/cpu-sh4/cpu/watchdog.h
index 7672301d0c7..7f62b938093 100644
--- a/arch/sh/include/cpu-sh4/cpu/watchdog.h
+++ b/arch/sh/include/cpu-sh4/cpu/watchdog.h
@@ -21,6 +21,12 @@
#define WTCNT 0xffcc0000 /*WDTST*/
#define WTST WTCNT
#define WTBST 0xffcc0008 /*WDTBST*/
+/* Register definitions */
+#elif defined(CONFIG_CPU_SUBTYPE_SH7722) || \
+ defined(CONFIG_CPU_SUBTYPE_SH7723) || \
+ defined(CONFIG_CPU_SUBTYPE_SH7724)
+#define WTCNT 0xa4520000
+#define WTCSR 0xa4520004
#else
/* Register definitions */
#define WTCNT 0xffc00008
diff --git a/arch/sh/kernel/cpufreq.c b/arch/sh/kernel/cpufreq.c
index dce4f3ff093..0fffacea6ed 100644
--- a/arch/sh/kernel/cpufreq.c
+++ b/arch/sh/kernel/cpufreq.c
@@ -48,7 +48,7 @@ static int sh_cpufreq_target(struct cpufreq_policy *policy,
return -ENODEV;
cpus_allowed = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
BUG_ON(smp_processor_id() != cpu);
@@ -66,7 +66,7 @@ static int sh_cpufreq_target(struct cpufreq_policy *policy,
freqs.flags = 0;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
- set_cpus_allowed(current, cpus_allowed);
+ set_cpus_allowed_ptr(current, &cpus_allowed);
clk_set_rate(cpuclk, freq);
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index bd1c497280a..94739ee7aa7 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -727,7 +727,7 @@ static int dwarf_parse_cie(void *entry, void *p, unsigned long len,
unsigned char *end, struct module *mod)
{
struct rb_node **rb_node = &cie_root.rb_node;
- struct rb_node *parent;
+ struct rb_node *parent = *rb_node;
struct dwarf_cie *cie;
unsigned long flags;
int count;
@@ -856,7 +856,7 @@ static int dwarf_parse_fde(void *entry, u32 entry_type,
unsigned char *end, struct module *mod)
{
struct rb_node **rb_node = &fde_root.rb_node;
- struct rb_node *parent;
+ struct rb_node *parent = *rb_node;
struct dwarf_fde *fde;
struct dwarf_cie *cie;
unsigned long flags;
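
Both hunks fix the same latent bug: the rbtree insertion walk only assigns parent inside the loop, so an empty tree left it uninitialized; seeding it from the root slot makes the degenerate case well defined. The idiom in isolation, with a hypothetical node type rather than the kernel rbtree API:

    struct node {
            struct node *left, *right;
            unsigned long key;
    };

    /* Returns the would-be parent of 'key'; NULL only for an empty tree. */
    struct node *find_parent(struct node **root, unsigned long key)
    {
            struct node **p = root;
            struct node *parent = *root;  /* seeded, like '*rb_node' above */

            while (*p) {
                    parent = *p;
                    p = key < parent->key ? &parent->left : &parent->right;
            }
            return parent;
    }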
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 0fd7b41f0a2..273f890b17a 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -112,7 +112,7 @@ void cpu_idle(void)
}
}
-void __cpuinit select_idle_routine(void)
+void __init select_idle_routine(void)
{
/*
* If a platform has set its own idle routine, leave it alone.
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 9f253e9cce0..81b6de41ae5 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -315,7 +315,7 @@ void hw_perf_disable(void)
sh_pmu->disable_all();
}
-int register_sh_pmu(struct sh_pmu *pmu)
+int __cpuinit register_sh_pmu(struct sh_pmu *pmu)
{
if (sh_pmu)
return -EBUSY;
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index c90957a459a..c0d40f671ec 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -504,13 +504,6 @@ out:
return error;
}
-/*
- * These bracket the sleeping functions..
- */
-extern void interruptible_sleep_on(wait_queue_head_t *q);
-
-#define mid_sched ((unsigned long) interruptible_sleep_on)
-
#ifdef CONFIG_FRAME_POINTER
static int in_sh64_switch_to(unsigned long pc)
{
diff --git a/arch/sh/kernel/return_address.c b/arch/sh/kernel/return_address.c
index df3ab581107..cbf1dd5372b 100644
--- a/arch/sh/kernel/return_address.c
+++ b/arch/sh/kernel/return_address.c
@@ -9,6 +9,7 @@
* for more details.
*/
#include <linux/kernel.h>
+#include <linux/module.h>
#include <asm/dwarf.h>
#ifdef CONFIG_DWARF_UNWINDER
@@ -52,3 +53,5 @@ void *return_address(unsigned int depth)
}
#endif
+
+EXPORT_SYMBOL_GPL(return_address);
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index e124cf7008d..002cc612dee 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -69,6 +69,7 @@ asmlinkage void __cpuinit start_secondary(void)
unsigned int cpu;
struct mm_struct *mm = &init_mm;
+ enable_mmu();
atomic_inc(&mm->mm_count);
atomic_inc(&mm->mm_users);
current->active_mm = mm;
diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c
index a4662e2782c..3cc21933063 100644
--- a/arch/sh/mm/pmb.c
+++ b/arch/sh/mm/pmb.c
@@ -323,6 +323,7 @@ static void __clear_pmb_entry(struct pmb_entry *pmbe)
writel_uncached(data_val & ~PMB_V, data);
}
+#ifdef CONFIG_PM
static void set_pmb_entry(struct pmb_entry *pmbe)
{
unsigned long flags;
@@ -331,6 +332,7 @@ static void set_pmb_entry(struct pmb_entry *pmbe)
__set_pmb_entry(pmbe);
spin_unlock_irqrestore(&pmbe->lock, flags);
}
+#endif /* CONFIG_PM */
int pmb_bolt_mapping(unsigned long vaddr, phys_addr_t phys,
unsigned long size, pgprot_t prot)
@@ -802,7 +804,7 @@ void __init pmb_init(void)
writel_uncached(0, PMB_IRMCR);
/* Flush out the TLB */
- __raw_writel(__raw_readl(MMUCR) | MMUCR_TI, MMUCR);
+ local_flush_tlb_all();
ctrl_barrier();
}
diff --git a/arch/sh/mm/tlb-pteaex.c b/arch/sh/mm/tlb-pteaex.c
index 32dc674c550..b71db6af806 100644
--- a/arch/sh/mm/tlb-pteaex.c
+++ b/arch/sh/mm/tlb-pteaex.c
@@ -73,5 +73,35 @@ void local_flush_tlb_one(unsigned long asid, unsigned long page)
jump_to_uncached();
__raw_writel(page, MMU_UTLB_ADDRESS_ARRAY | MMU_PAGE_ASSOC_BIT);
__raw_writel(asid, MMU_UTLB_ADDRESS_ARRAY2 | MMU_PAGE_ASSOC_BIT);
+ __raw_writel(page, MMU_ITLB_ADDRESS_ARRAY | MMU_PAGE_ASSOC_BIT);
+ __raw_writel(asid, MMU_ITLB_ADDRESS_ARRAY2 | MMU_PAGE_ASSOC_BIT);
back_to_cached();
}
+
+void local_flush_tlb_all(void)
+{
+ unsigned long flags, status;
+ int i;
+
+ /*
+	 * Flush the whole TLB.
+ */
+ local_irq_save(flags);
+ jump_to_uncached();
+
+ status = __raw_readl(MMUCR);
+ status = ((status & MMUCR_URB) >> MMUCR_URB_SHIFT);
+
+ if (status == 0)
+ status = MMUCR_URB_NENTRIES;
+
+ for (i = 0; i < status; i++)
+ __raw_writel(0x0, MMU_UTLB_ADDRESS_ARRAY | (i << 8));
+
+ for (i = 0; i < 4; i++)
+ __raw_writel(0x0, MMU_ITLB_ADDRESS_ARRAY | (i << 8));
+
+ back_to_cached();
+ ctrl_barrier();
+ local_irq_restore(flags);
+}
diff --git a/arch/sh/mm/tlb-sh3.c b/arch/sh/mm/tlb-sh3.c
index 4f5f7cbdd50..7a940dbfc2e 100644
--- a/arch/sh/mm/tlb-sh3.c
+++ b/arch/sh/mm/tlb-sh3.c
@@ -77,3 +77,22 @@ void local_flush_tlb_one(unsigned long asid, unsigned long page)
for (i = 0; i < ways; i++)
__raw_writel(data, addr + (i << 8));
}
+
+void local_flush_tlb_all(void)
+{
+ unsigned long flags, status;
+
+ /*
+	 * Flush the whole TLB.
+	 *
+	 * Write to the MMU control register's flush bit:
+	 * the TF bit on SH-3, the TI bit on SH-4.
+	 * It's the same position, bit #2.
+ */
+ local_irq_save(flags);
+ status = __raw_readl(MMUCR);
+ status |= 0x04;
+ __raw_writel(status, MMUCR);
+ ctrl_barrier();
+ local_irq_restore(flags);
+}
diff --git a/arch/sh/mm/tlb-sh4.c b/arch/sh/mm/tlb-sh4.c
index ccac77f504a..cfdf7930d29 100644
--- a/arch/sh/mm/tlb-sh4.c
+++ b/arch/sh/mm/tlb-sh4.c
@@ -80,3 +80,31 @@ void local_flush_tlb_one(unsigned long asid, unsigned long page)
__raw_writel(data, addr);
back_to_cached();
}
+
+void local_flush_tlb_all(void)
+{
+ unsigned long flags, status;
+ int i;
+
+ /*
+	 * Flush the whole TLB.
+ */
+ local_irq_save(flags);
+ jump_to_uncached();
+
+ status = __raw_readl(MMUCR);
+ status = ((status & MMUCR_URB) >> MMUCR_URB_SHIFT);
+
+ if (status == 0)
+ status = MMUCR_URB_NENTRIES;
+
+ for (i = 0; i < status; i++)
+ __raw_writel(0x0, MMU_UTLB_ADDRESS_ARRAY | (i << 8));
+
+ for (i = 0; i < 4; i++)
+ __raw_writel(0x0, MMU_ITLB_ADDRESS_ARRAY | (i << 8));
+
+ back_to_cached();
+ ctrl_barrier();
+ local_irq_restore(flags);
+}
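
The new flush routines for PTEAEX and SH-4 drive the same mechanism: UTLB/ITLB entries are addressed through memory-mapped "address arrays", with the entry index carried in bits 8 and up, hence the `i << 8`. A stand-alone demo of the addressing (constants from the mmu_context.h hunk above, the demo itself is hypothetical):

    #include <stdio.h>

    #define MMU_ITLB_ADDRESS_ARRAY 0xF2000000UL
    #define MMU_UTLB_ADDRESS_ARRAY 0xF6000000UL

    int main(void)
    {
            for (unsigned long i = 0; i < 4; i++)
                    printf("ITLB entry %lu -> %#lx, UTLB entry %lu -> %#lx\n",
                           i, MMU_ITLB_ADDRESS_ARRAY | (i << 8),
                           i, MMU_UTLB_ADDRESS_ARRAY | (i << 8));
            return 0;
    }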
diff --git a/arch/sh/mm/tlb-urb.c b/arch/sh/mm/tlb-urb.c
index bb5b9098956..c92ce20db39 100644
--- a/arch/sh/mm/tlb-urb.c
+++ b/arch/sh/mm/tlb-urb.c
@@ -24,13 +24,9 @@ void tlb_wire_entry(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
local_irq_save(flags);
- /* Load the entry into the TLB */
- __update_tlb(vma, addr, pte);
-
- /* ... and wire it up. */
status = __raw_readl(MMUCR);
urb = (status & MMUCR_URB) >> MMUCR_URB_SHIFT;
- status &= ~MMUCR_URB;
+ status &= ~MMUCR_URC;
/*
* Make sure we're not trying to wire the last TLB entry slot.
@@ -39,7 +35,23 @@ void tlb_wire_entry(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
urb = urb % MMUCR_URB_NENTRIES;
+ /*
+ * Insert this entry into the highest non-wired TLB slot (via
+ * the URC field).
+ */
+ status |= (urb << MMUCR_URC_SHIFT);
+ __raw_writel(status, MMUCR);
+ ctrl_barrier();
+
+ /* Load the entry into the TLB */
+ __update_tlb(vma, addr, pte);
+
+ /* ... and wire it up. */
+ status = __raw_readl(MMUCR);
+
+ status &= ~MMUCR_URB;
status |= (urb << MMUCR_URB_SHIFT);
+
__raw_writel(status, MMUCR);
ctrl_barrier();
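
The reordered sequence above is: point URC at the victim slot, load the entry so it lands exactly there, then raise URB so that slot becomes wired. The MMUCR bitfield arithmetic on its own (starting value invented, no MMIO performed):

    #include <stdio.h>

    #define MMUCR_URB       0x00FC0000UL
    #define MMUCR_URB_SHIFT 18
    #define MMUCR_URC       0x0000FC00UL
    #define MMUCR_URC_SHIFT 10

    int main(void)
    {
            unsigned long mmucr = 0x00140001UL;  /* made-up starting value */
            unsigned long slot  = (mmucr & MMUCR_URB) >> MMUCR_URB_SHIFT;

            /* 1) steer replacement at the slot we want to wire */
            mmucr = (mmucr & ~MMUCR_URC) | (slot << MMUCR_URC_SHIFT);
            printf("load entry with MMUCR=%#lx\n", mmucr);

            /* 2) ...load the TLB entry here... then wire the slot */
            mmucr = (mmucr & ~MMUCR_URB) | (slot << MMUCR_URB_SHIFT);
            printf("wire entry with MMUCR=%#lx\n", mmucr);
            return 0;
    }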
diff --git a/arch/sh/mm/tlbflush_32.c b/arch/sh/mm/tlbflush_32.c
index 004bb3f25b5..3fbe03ce8fe 100644
--- a/arch/sh/mm/tlbflush_32.c
+++ b/arch/sh/mm/tlbflush_32.c
@@ -119,22 +119,3 @@ void local_flush_tlb_mm(struct mm_struct *mm)
local_irq_restore(flags);
}
}
-
-void local_flush_tlb_all(void)
-{
- unsigned long flags, status;
-
- /*
- * Flush all the TLB.
- *
- * Write to the MMU control register's bit:
- * TF-bit for SH-3, TI-bit for SH-4.
- * It's same position, bit #2.
- */
- local_irq_save(flags);
- status = __raw_readl(MMUCR);
- status |= 0x04;
- __raw_writel(status, MMUCR);
- ctrl_barrier();
- local_irq_restore(flags);
-}
diff --git a/arch/sh/mm/uncached.c b/arch/sh/mm/uncached.c
index cf20a5c5136..8a4eca551fc 100644
--- a/arch/sh/mm/uncached.c
+++ b/arch/sh/mm/uncached.c
@@ -1,6 +1,8 @@
#include <linux/init.h>
+#include <linux/module.h>
#include <asm/sizes.h>
#include <asm/page.h>
+#include <asm/addrspace.h>
/*
* This is the offset of the uncached section from its cached alias.
@@ -15,15 +17,22 @@
unsigned long cached_to_uncached = SZ_512M;
unsigned long uncached_size = SZ_512M;
unsigned long uncached_start, uncached_end;
+EXPORT_SYMBOL(uncached_start);
+EXPORT_SYMBOL(uncached_end);
int virt_addr_uncached(unsigned long kaddr)
{
return (kaddr >= uncached_start) && (kaddr < uncached_end);
}
+EXPORT_SYMBOL(virt_addr_uncached);
void __init uncached_init(void)
{
+#ifdef CONFIG_29BIT
+ uncached_start = P2SEG;
+#else
uncached_start = memory_end;
+#endif
uncached_end = uncached_start + uncached_size;
}
diff --git a/arch/sparc/include/asm/stat.h b/arch/sparc/include/asm/stat.h
index 39327d6a57e..a232e9e1f4e 100644
--- a/arch/sparc/include/asm/stat.h
+++ b/arch/sparc/include/asm/stat.h
@@ -53,8 +53,8 @@ struct stat {
ino_t st_ino;
mode_t st_mode;
short st_nlink;
- uid16_t st_uid;
- gid16_t st_gid;
+ unsigned short st_uid;
+ unsigned short st_gid;
unsigned short st_rdev;
off_t st_size;
time_t st_atime;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 68cb9b42088..e2771939341 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1337,7 +1337,7 @@ static void perf_callchain_user_32(struct pt_regs *regs,
callchain_store(entry, PERF_CONTEXT_USER);
callchain_store(entry, regs->tpc);
- ufp = regs->u_regs[UREG_I6];
+ ufp = regs->u_regs[UREG_I6] & 0xffffffffUL;
do {
struct sparc_stackf32 *usf, sf;
unsigned long pc;
diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c
index ca39c606fe8..1eb8b00aed7 100644
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -107,12 +107,12 @@ static unsigned long run_on_cpu(unsigned long cpu,
unsigned long ret;
/* should return -EINVAL to userspace */
- if (set_cpus_allowed(current, cpumask_of_cpu(cpu)))
+ if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
return 0;
ret = func(arg);
- set_cpus_allowed(current, old_affinity);
+ set_cpus_allowed_ptr(current, &old_affinity);
return ret;
}
diff --git a/arch/sparc/kernel/us2e_cpufreq.c b/arch/sparc/kernel/us2e_cpufreq.c
index 791c15138f3..8f982b76c71 100644
--- a/arch/sparc/kernel/us2e_cpufreq.c
+++ b/arch/sparc/kernel/us2e_cpufreq.c
@@ -238,12 +238,12 @@ static unsigned int us2e_freq_get(unsigned int cpu)
return 0;
cpus_allowed = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
clock_tick = sparc64_get_clock_tick(cpu) / 1000;
estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
- set_cpus_allowed(current, cpus_allowed);
+ set_cpus_allowed_ptr(current, &cpus_allowed);
return clock_tick / estar_to_divisor(estar);
}
@@ -259,7 +259,7 @@ static void us2e_set_cpu_divider_index(unsigned int cpu, unsigned int index)
return;
cpus_allowed = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000;
new_bits = index_to_estar_mode(index);
@@ -281,7 +281,7 @@ static void us2e_set_cpu_divider_index(unsigned int cpu, unsigned int index)
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- set_cpus_allowed(current, cpus_allowed);
+ set_cpus_allowed_ptr(current, &cpus_allowed);
}
static int us2e_freq_target(struct cpufreq_policy *policy,
diff --git a/arch/sparc/kernel/us3_cpufreq.c b/arch/sparc/kernel/us3_cpufreq.c
index 365b6464e2c..f35d1e79454 100644
--- a/arch/sparc/kernel/us3_cpufreq.c
+++ b/arch/sparc/kernel/us3_cpufreq.c
@@ -86,12 +86,12 @@ static unsigned int us3_freq_get(unsigned int cpu)
return 0;
cpus_allowed = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
reg = read_safari_cfg();
ret = get_current_freq(cpu, reg);
- set_cpus_allowed(current, cpus_allowed);
+ set_cpus_allowed_ptr(current, &cpus_allowed);
return ret;
}
@@ -106,7 +106,7 @@ static void us3_set_cpu_divider_index(unsigned int cpu, unsigned int index)
return;
cpus_allowed = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
new_freq = sparc64_get_clock_tick(cpu) / 1000;
switch (index) {
@@ -140,7 +140,7 @@ static void us3_set_cpu_divider_index(unsigned int cpu, unsigned int index)
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
- set_cpus_allowed(current, cpus_allowed);
+ set_cpus_allowed_ptr(current, &cpus_allowed);
}
static int us3_freq_target(struct cpufreq_policy *policy,
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 635f03bb499..d07b44f7d1d 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -82,6 +82,9 @@ enum fixed_addresses {
#endif
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+ FIX_OHCI1394_BASE,
+#endif
#ifdef CONFIG_X86_LOCAL_APIC
FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
#endif
@@ -132,9 +135,6 @@ enum fixed_addresses {
(__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
: __end_of_permanent_fixed_addresses,
FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
-#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
- FIX_OHCI1394_BASE,
-#endif
#ifdef CONFIG_X86_32
FIX_WP_TEST,
#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index a929c9ede33..46c0fe05f23 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -133,6 +133,7 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
typedef int vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
+extern void setup_vector_irq(int cpu);
#ifdef CONFIG_X86_IO_APIC
extern void lock_vector_lock(void);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1cd58cdbc03..4604e6a54d3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -105,6 +105,8 @@
#define MSR_AMD64_PATCH_LEVEL 0x0000008b
#define MSR_AMD64_NB_CFG 0xc001001f
#define MSR_AMD64_PATCH_LOADER 0xc0010020
+#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
+#define MSR_AMD64_OSVW_STATUS 0xc0010141
#define MSR_AMD64_IBSFETCHCTL 0xc0011030
#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e4e0ddcb154..463de9a858a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1268,6 +1268,14 @@ void __setup_vector_irq(int cpu)
/* Mark the inuse vectors */
for_each_irq_desc(irq, desc) {
cfg = desc->chip_data;
+
+ /*
+ * If it is a legacy IRQ handled by the legacy PIC, this cpu
+ * will be part of the irq_cfg's domain.
+ */
+ if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
+ cpumask_set_cpu(cpu, cfg->domain);
+
if (!cpumask_test_cpu(cpu, cfg->domain))
continue;
vector = cfg->vector;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 573458f1caf..b87e0b6970c 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -348,10 +348,12 @@ static void amd_pmu_cpu_offline(int cpu)
raw_spin_lock(&amd_nb_lock);
- if (--cpuhw->amd_nb->refcnt == 0)
- kfree(cpuhw->amd_nb);
+ if (cpuhw->amd_nb) {
+ if (--cpuhw->amd_nb->refcnt == 0)
+ kfree(cpuhw->amd_nb);
- cpuhw->amd_nb = NULL;
+ cpuhw->amd_nb = NULL;
+ }
raw_spin_unlock(&amd_nb_lock);
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index adedeef1ded..b2e24603739 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/start_kernel.h>
+#include <linux/mm.h>
#include <asm/setup.h>
#include <asm/sections.h>
@@ -44,9 +45,10 @@ void __init i386_start_kernel(void)
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+ /* Assume only end is not page aligned */
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
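
The same one-line change lands in head32.c, head64.c and setup.c below: the boot loader page-aligns the start of the initrd but not its end, so the reservation must round the end up or the trailing partial page stays unprotected. The arithmetic, runnable on its own (values made up):

    #include <stdio.h>

    #define PAGE_SIZE     4096ULL
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long long image = 0x37c00000ULL;  /* page aligned start */
            unsigned long long size  = 0x002fe140ULL;  /* ends mid-page */
            unsigned long long end   = PAGE_ALIGN(image + size);

            printf("reserve %#llx-%#llx (covers the partial last page)\n",
                   image, end);
            return 0;
    }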
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b5a9896ca1e..7147143fd61 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data)
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+ /* Assume only end is not page aligned */
unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
+ unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
}
#endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index ef257fc2921..f01d390f9c5 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -141,6 +141,28 @@ void __init init_IRQ(void)
x86_init.irqs.intr_init();
}
+/*
+ * Set up the vector-to-irq mappings.
+ */
+void setup_vector_irq(int cpu)
+{
+#ifndef CONFIG_X86_IO_APIC
+ int irq;
+
+ /*
+	 * On most platforms the legacy PIC delivers interrupts to the
+	 * boot cpu, but on some platforms PIC interrupts are delivered
+	 * to multiple cpus. If the legacy IRQ is handled by the legacy
+	 * PIC, set up the static legacy vector-to-irq mapping for the
+	 * cpu that is coming online:
+ */
+ for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
+ per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
+#endif
+
+ __setup_vector_irq(cpu);
+}
+
static void __init smp_intr_init(void)
{
#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ad9540676fc..28ad9f4d8b9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -526,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
}
/*
- * Check for AMD CPUs, which have potentially C1E support
+ * Check for AMD CPUs, where the APIC timer interrupt does not wake the CPU from C1e.
+ * For more information see
+ * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
+ * - Erratum #365 for family 0x11 (not affected because C1e not in use)
*/
static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
{
+ u64 val;
if (c->x86_vendor != X86_VENDOR_AMD)
- return 0;
-
- if (c->x86 < 0x0F)
- return 0;
+ goto no_c1e_idle;
/* Family 0x0f models < rev F do not have C1E */
- if (c->x86 == 0x0f && c->x86_model < 0x40)
- return 0;
+ if (c->x86 == 0x0F && c->x86_model >= 0x40)
+ return 1;
- return 1;
+ if (c->x86 == 0x10) {
+ /*
+ * check OSVW bit for CPUs that are not affected
+ * by erratum #400
+ */
+ rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
+ if (val >= 2) {
+ rdmsrl(MSR_AMD64_OSVW_STATUS, val);
+ if (!(val & BIT(1)))
+ goto no_c1e_idle;
+ }
+ return 1;
+ }
+
+no_c1e_idle:
+ return 0;
}
static cpumask_var_t c1e_mask;
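
The restructured check adds an OSVW (OS-Visible Workaround) escape hatch for family 0x10: if the BIOS publishes at least two OSVW IDs and bit 1 of the status MSR is clear, erratum #400 does not apply and C1E idle handling is skipped. The decision logic alone, with the MSR values passed in (purely illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    bool c1e_affected(unsigned long long osvw_id_length,
                      unsigned long long osvw_status)
    {
            if (osvw_id_length >= 2 && !(osvw_status & (1ULL << 1)))
                    return false;  /* BIOS vouches: erratum #400 absent */
            return true;           /* assume an affected family 0x10 part */
    }

    int main(void)
    {
            printf("%d\n", c1e_affected(3, 0x0));  /* 0: OSVW says safe */
            printf("%d\n", c1e_affected(3, 0x2));  /* 1: status bit 1 set */
            printf("%d\n", c1e_affected(1, 0x0));  /* 1: too few OSVW ids */
            return 0;
    }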
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5d7ba1a449b..d76e18570c6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -314,16 +314,17 @@ static void __init reserve_brk(void)
#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
-
+ /* Assume only end is not page aligned */
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 area_size = PAGE_ALIGN(ramdisk_size);
u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
/* We need to move the initrd down into lowmem */
- ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size,
+ ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,
PAGE_SIZE);
if (ramdisk_here == -1ULL)
@@ -332,7 +333,7 @@ static void __init relocate_initrd(void)
/* Note: this includes all the lowmem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
- reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
+ reserve_early(ramdisk_here, ramdisk_here + area_size,
"NEW RAMDISK");
initrd_start = ramdisk_here + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
@@ -376,9 +377,10 @@ static void __init relocate_initrd(void)
static void __init reserve_initrd(void)
{
+ /* Assume only end is not page aligned */
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
- u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
if (!boot_params.hdr.type_of_loader ||
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a02e80c3c54..06d98ae5a80 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -247,7 +247,7 @@ static void __cpuinit smp_callin(void)
/*
* Need to setup vector mappings before we enable interrupts.
*/
- __setup_vector_irq(smp_processor_id());
+ setup_vector_irq(smp_processor_id());
/*
* Get our bogomips.
*
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 44879df5569..2cc249718c4 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -291,8 +291,8 @@ SECTIONS
.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
__smp_locks = .;
*(.smp_locks)
- __smp_locks_end = .;
. = ALIGN(PAGE_SIZE);
+ __smp_locks_end = .;
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index e71c5cbc8f3..452ee5b8f30 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -331,11 +331,23 @@ int devmem_is_allowed(unsigned long pagenr)
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
- unsigned long addr = begin;
+ unsigned long addr;
+ unsigned long begin_aligned, end_aligned;
- if (addr >= end)
+ /* Make sure boundaries are page aligned */
+ begin_aligned = PAGE_ALIGN(begin);
+ end_aligned = end & PAGE_MASK;
+
+ if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
+ begin = begin_aligned;
+ end = end_aligned;
+ }
+
+ if (begin >= end)
return;
+ addr = begin;
+
/*
* If debugging page accesses then do not free this memory but
* mark them not present - any buggy init-section access will
@@ -343,7 +355,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
*/
#ifdef CONFIG_DEBUG_PAGEALLOC
printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
- begin, PAGE_ALIGN(end));
+ begin, end);
set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
/*
@@ -358,8 +370,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
for (; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
- memset((void *)(addr & ~(PAGE_SIZE-1)),
- POISON_FREE_INITMEM, PAGE_SIZE);
+ memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
free_page(addr);
totalram_pages++;
}
@@ -376,6 +387,15 @@ void free_initmem(void)
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- free_init_pages("initrd memory", start, end);
+ /*
+	 * end may be unaligned, and we cannot align it here: the
+	 * decompressor could be confused by an aligned initrd_end.
+	 * The trailing partial page was already reserved in
+	 * - i386_start_kernel()
+	 * - x86_64_start_kernel()
+	 * - relocate_initrd()
+	 * so PAGE_ALIGN() is safe here and frees that partial page too.
+ */
+ free_init_pages("initrd memory", start, PAGE_ALIGN(end));
}
#endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 6e22454bfaa..e31160216ef 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -122,8 +122,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
struct acpi_resource_address64 addr;
acpi_status status;
unsigned long flags;
- struct resource *root;
- u64 start, end;
+ struct resource *root, *conflict;
+ u64 start, end, max_len;
status = resource_to_addr(acpi_res, &addr);
if (!ACPI_SUCCESS(status))
@@ -140,6 +140,17 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
} else
return AE_OK;
+ max_len = addr.maximum - addr.minimum + 1;
+ if (addr.address_length > max_len) {
+ dev_printk(KERN_DEBUG, &info->bridge->dev,
+ "host bridge window length %#llx doesn't fit in "
+ "%#llx-%#llx, trimming\n",
+ (unsigned long long) addr.address_length,
+ (unsigned long long) addr.minimum,
+ (unsigned long long) addr.maximum);
+ addr.address_length = max_len;
+ }
+
start = addr.minimum + addr.translation_offset;
end = start + addr.address_length - 1;
@@ -157,9 +168,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
return AE_OK;
}
- if (insert_resource(root, res)) {
+ conflict = insert_resource_conflict(root, res);
+ if (conflict) {
dev_err(&info->bridge->dev,
- "can't allocate host bridge window %pR\n", res);
+ "address space collision: host bridge window %pR "
+ "conflicts with %s %pR\n",
+ res, conflict->name, conflict);
} else {
pci_bus_add_resource(info->bus, res, 0);
info->res_num++;
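
Two defensive steps are added in this file: the window length is clipped so it cannot overrun the decoded minimum/maximum range, and insert_resource_conflict() is used so a collision can be reported together with the conflicting resource's name. The length clamp on its own (numbers invented):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long minimum = 0xd0000000ULL;
            unsigned long long maximum = 0xdfffffffULL;
            unsigned long long length  = 0x20000000ULL;  /* claims too much */
            unsigned long long max_len = maximum - minimum + 1;

            if (length > max_len) {
                    printf("trimming %#llx to %#llx\n", length, max_len);
                    length = max_len;
            }
            printf("window %#llx-%#llx\n", minimum, minimum + length - 1);
            return 0;
    }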
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index dece3eb9c90..46fd43f7910 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -127,9 +127,6 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
continue;
if (!r->start ||
pci_claim_resource(dev, idx) < 0) {
- dev_info(&dev->dev,
- "can't reserve window %pR\n",
- r);
/*
* Something is wrong with the region.
* Invalidate the resource to prevent
@@ -181,8 +178,6 @@ static void __init pcibios_allocate_resources(int pass)
"BAR %d: reserving %pr (d=%d, p=%d)\n",
idx, r, disabled, pass);
if (pci_claim_resource(dev, idx) < 0) {
- dev_info(&dev->dev,
- "can't reserve %pR\n", r);
/* We'll assign a new address later */
r->end -= r->start;
r->start = 0;
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index fb7fc24fe72..189cbc2585f 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -8,6 +8,7 @@
#include <linux/acpi.h>
#include <linux/signal.h>
#include <linux/kthread.h>
+#include <linux/dmi.h>
#include <acpi/acpi_drivers.h>
@@ -1032,6 +1033,41 @@ static void acpi_add_id(struct acpi_device *device, const char *dev_id)
list_add_tail(&id->list, &device->pnp.ids);
}
+/*
+ * Old IBM workstations have a DSDT bug wherein the SMBus object
+ * lacks the SMBUS01 HID and the methods do not have the necessary "_"
+ * prefix. Work around this.
+ */
+static int acpi_ibm_smbus_match(struct acpi_device *device)
+{
+ acpi_handle h_dummy;
+ struct acpi_buffer path = {ACPI_ALLOCATE_BUFFER, NULL};
+ int result;
+
+ if (!dmi_name_in_vendors("IBM"))
+ return -ENODEV;
+
+ /* Look for SMBS object */
+ result = acpi_get_name(device->handle, ACPI_SINGLE_NAME, &path);
+ if (result)
+ return result;
+
+ if (strcmp("SMBS", path.pointer)) {
+ result = -ENODEV;
+ goto out;
+ }
+
+ /* Does it have the necessary (but misnamed) methods? */
+ result = -ENODEV;
+ if (ACPI_SUCCESS(acpi_get_handle(device->handle, "SBI", &h_dummy)) &&
+ ACPI_SUCCESS(acpi_get_handle(device->handle, "SBR", &h_dummy)) &&
+ ACPI_SUCCESS(acpi_get_handle(device->handle, "SBW", &h_dummy)))
+ result = 0;
+out:
+ kfree(path.pointer);
+ return result;
+}
+
static void acpi_device_set_id(struct acpi_device *device)
{
acpi_status status;
@@ -1082,6 +1118,8 @@ static void acpi_device_set_id(struct acpi_device *device)
acpi_add_id(device, ACPI_BAY_HID);
else if (ACPI_SUCCESS(acpi_dock_match(device)))
acpi_add_id(device, ACPI_DOCK_HID);
+ else if (!acpi_ibm_smbus_match(device))
+ acpi_add_id(device, ACPI_SMBUS_IBM_HID);
break;
case ACPI_BUS_TYPE_POWER:
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index 561dec2481c..277477251a8 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -1667,6 +1667,7 @@ unsigned int ata_sff_host_intr(struct ata_port *ap,
{
struct ata_eh_info *ehi = &ap->link.eh_info;
u8 status, host_stat = 0;
+ bool bmdma_stopped = false;
VPRINTK("ata%u: protocol %d task_state %d\n",
ap->print_id, qc->tf.protocol, ap->hsm_task_state);
@@ -1699,6 +1700,7 @@ unsigned int ata_sff_host_intr(struct ata_port *ap,
/* before we do anything else, clear DMA-Start bit */
ap->ops->bmdma_stop(qc);
+ bmdma_stopped = true;
if (unlikely(host_stat & ATA_DMA_ERR)) {
/* error when transferring data to/from memory */
@@ -1716,8 +1718,14 @@ unsigned int ata_sff_host_intr(struct ata_port *ap,
/* check main status, clearing INTRQ if needed */
status = ata_sff_irq_status(ap);
- if (status & ATA_BUSY)
- goto idle_irq;
+ if (status & ATA_BUSY) {
+ if (bmdma_stopped) {
+ /* BMDMA engine is already stopped, we're screwed */
+ qc->err_mask |= AC_ERR_HSM;
+ ap->hsm_task_state = HSM_ST_ERR;
+ } else
+ goto idle_irq;
+ }
/* ack bmdma irq events */
ap->ops->sff_irq_clear(ap);
@@ -1762,13 +1770,16 @@ EXPORT_SYMBOL_GPL(ata_sff_host_intr);
irqreturn_t ata_sff_interrupt(int irq, void *dev_instance)
{
struct ata_host *host = dev_instance;
+ bool retried = false;
unsigned int i;
- unsigned int handled = 0, polling = 0;
+ unsigned int handled, idle, polling;
unsigned long flags;
/* TODO: make _irqsave conditional on x86 PCI IDE legacy mode */
spin_lock_irqsave(&host->lock, flags);
+retry:
+ handled = idle = polling = 0;
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
struct ata_queued_cmd *qc;
@@ -1782,7 +1793,8 @@ irqreturn_t ata_sff_interrupt(int irq, void *dev_instance)
handled |= ata_sff_host_intr(ap, qc);
else
polling |= 1 << i;
- }
+ } else
+ idle |= 1 << i;
}
/*
@@ -1790,7 +1802,9 @@ irqreturn_t ata_sff_interrupt(int irq, void *dev_instance)
* asserting IRQ line, nobody cared will ensue. Check IRQ
* pending status if available and clear spurious IRQ.
*/
- if (!handled) {
+ if (!handled && !retried) {
+ bool retry = false;
+
for (i = 0; i < host->n_ports; i++) {
struct ata_port *ap = host->ports[i];
@@ -1805,8 +1819,23 @@ irqreturn_t ata_sff_interrupt(int irq, void *dev_instance)
ata_port_printk(ap, KERN_INFO,
"clearing spurious IRQ\n");
- ap->ops->sff_check_status(ap);
- ap->ops->sff_irq_clear(ap);
+ if (idle & (1 << i)) {
+ ap->ops->sff_check_status(ap);
+ ap->ops->sff_irq_clear(ap);
+ } else {
+ /* clear INTRQ and check if BUSY cleared */
+ if (!(ap->ops->sff_check_status(ap) & ATA_BUSY))
+ retry |= true;
+ /*
+ * With command in flight, we can't do
+ * sff_irq_clear() w/o racing with completion.
+ */
+ }
+ }
+
+ if (retry) {
+ retried = true;
+ goto retry;
}
}
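
Note: the reworked interrupt handler keeps per-port conditions in bitmasks (idle, polling) so the spurious-IRQ pass can treat idle ports and ports with a command in flight differently. A simplified userspace sketch of that bitmask bookkeeping; the port states are invented for illustration:

    #include <stdio.h>

    #define N_PORTS 4

    int main(void)
    {
        unsigned int idle = 0, busy = 0;
        int has_cmd[N_PORTS] = { 1, 0, 0, 1 }; /* illustrative */
        int i;

        for (i = 0; i < N_PORTS; i++) {
            if (has_cmd[i])
                busy |= 1 << i;   /* command in flight, be careful */
            else
                idle |= 1 << i;   /* safe to clear IRQ status */
        }

        for (i = 0; i < N_PORTS; i++)
            printf("port %d: %s\n", i,
                   (idle & (1 << i)) ? "idle" : "busy");
        return 0;
    }
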
diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
index 3059ec017de..c59b40710fb 100644
--- a/drivers/ata/pata_via.c
+++ b/drivers/ata/pata_via.c
@@ -576,6 +576,10 @@ static int via_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
u8 rev = isa->revision;
pci_dev_put(isa);
+ if ((id->device == 0x0415 || id->device == 0x3164) &&
+ (config->id != id->device))
+ continue;
+
if (rev >= config->rev_min && rev <= config->rev_max)
break;
}
@@ -677,6 +681,7 @@ static const struct pci_device_id via[] = {
{ PCI_VDEVICE(VIA, 0x3164), },
{ PCI_VDEVICE(VIA, 0x5324), },
{ PCI_VDEVICE(VIA, 0xC409), VIA_IDFLAG_SINGLE },
+ { PCI_VDEVICE(VIA, 0x9001), VIA_IDFLAG_SINGLE },
{ },
};
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 0147f476b8a..9c6a0d6408e 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -219,6 +219,8 @@ static void class_create_release(struct class *cls)
* This is used to create a struct class pointer that can then be used
* in calls to device_create().
*
+ * Returns &struct class pointer on success, or ERR_PTR() on error.
+ *
* Note, the pointer created here is to be destroyed when finished by
* making a call to class_destroy().
*/
diff --git a/drivers/base/core.c b/drivers/base/core.c
index ef55df34ddd..b56a0ba31d4 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1345,6 +1345,8 @@ static void root_device_release(struct device *dev)
* 'module' symlink which points to the @owner directory
* in sysfs.
*
+ * Returns &struct device pointer on success, or ERR_PTR() on error.
+ *
* Note: You probably want to use root_device_register().
*/
struct device *__root_device_register(const char *name, struct module *owner)
@@ -1432,6 +1434,8 @@ static void device_create_release(struct device *dev)
* Any further sysfs files that might be required can be created using this
* pointer.
*
+ * Returns &struct device pointer on success, or ERR_PTR() on error.
+ *
* Note: the struct class passed to this function must have previously
* been created with a call to class_create().
*/
@@ -1492,6 +1496,8 @@ EXPORT_SYMBOL_GPL(device_create_vargs);
* Any further sysfs files that might be required can be created using this
* pointer.
*
+ * Returns &struct device pointer on success, or ERR_PTR() on error.
+ *
* Note: the struct class passed to this function must have previously
* been created with a call to class_create().
*/
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 7036e8e96ab..b5242e1e8bc 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -79,24 +79,24 @@ void unregister_cpu(struct cpu *cpu)
}
#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
-static ssize_t cpu_probe_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
- const char *buf,
+static ssize_t cpu_probe_store(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr,
+ const char *buf,
size_t count)
{
return arch_cpu_probe(buf, count);
}
-static ssize_t cpu_release_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
- const char *buf,
+static ssize_t cpu_release_store(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr,
+ const char *buf,
size_t count)
{
return arch_cpu_release(buf, count);
}
-static SYSDEV_ATTR(probe, S_IWUSR, NULL, cpu_probe_store);
-static SYSDEV_ATTR(release, S_IWUSR, NULL, cpu_release_store);
+static SYSDEV_CLASS_ATTR(probe, S_IWUSR, NULL, cpu_probe_store);
+static SYSDEV_CLASS_ATTR(release, S_IWUSR, NULL, cpu_release_store);
#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index d0dc26ad538..18518ba13c8 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -78,6 +78,7 @@ firmware_timeout_show(struct class *class,
/**
* firmware_timeout_store - set number of seconds to wait for firmware
* @class: device class pointer
+ * @attr: device attribute pointer
* @buf: buffer to scan for timeout value
* @count: number of bytes in @buf
*
@@ -442,6 +443,7 @@ static int fw_setup_device(struct firmware *fw, struct device **dev_p,
fw_priv = dev_get_drvdata(f_dev);
fw_priv->fw = fw;
+ sysfs_bin_attr_init(&fw_priv->attr_data);
retval = sysfs_create_bin_file(&f_dev->kobj, &fw_priv->attr_data);
if (retval) {
dev_err(device, "%s: sysfs_create_bin_file failed\n", __func__);
diff --git a/drivers/base/node.c b/drivers/base/node.c
index ad43185ec15..93b3ac65c2d 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -165,8 +165,11 @@ static ssize_t node_read_distance(struct sys_device * dev,
int len = 0;
int i;
- /* buf currently PAGE_SIZE, need ~4 chars per node */
- BUILD_BUG_ON(MAX_NUMNODES*4 > PAGE_SIZE/2);
+ /*
+ * buf is currently PAGE_SIZE in length and each node needs 4 chars
+ * at the most (distance + space or newline).
+ */
+ BUILD_BUG_ON(MAX_NUMNODES * 4 > PAGE_SIZE);
for_each_online_node(i)
len += sprintf(buf + len, "%s%d", i ? " " : "", node_distance(nid, i));
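
Note: BUILD_BUG_ON() turns the comment's size reasoning into a compile-time check: the build fails if MAX_NUMNODES * 4 could exceed the page-sized buffer. C11 expresses the same idea with static_assert; a minimal sketch with stand-in constants:

    #include <assert.h>   /* static_assert (C11) */
    #include <stdio.h>

    #define MAX_NUMNODES 64
    #define PAGE_SIZE    4096

    /* Up to 4 chars per node (distance + separator) must fit in one page. */
    static_assert(MAX_NUMNODES * 4 <= PAGE_SIZE,
                  "node distance buffer can overflow");

    int main(void)
    {
        printf("worst case: %d bytes of %d\n", MAX_NUMNODES * 4, PAGE_SIZE);
        return 0;
    }
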
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 1ba9d617d24..4b4b565c835 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -362,6 +362,8 @@ EXPORT_SYMBOL_GPL(platform_device_unregister);
* enumeration tasks, they don't fully conform to the Linux driver model.
* In particular, when such drivers are built as modules, they can't be
* "hotplugged".
+ *
+ * Returns &struct platform_device pointer on success, or ERR_PTR() on error.
*/
struct platform_device *platform_device_register_simple(const char *name,
int id,
@@ -408,6 +410,8 @@ EXPORT_SYMBOL_GPL(platform_device_register_simple);
* allocated for the device allows drivers using such devices to be
* unloaded without waiting for the last reference to the device to be
* dropped.
+ *
+ * Returns &struct platform_device pointer on success, or ERR_PTR() on error.
*/
struct platform_device *platform_device_register_data(
struct device *parent,
@@ -559,6 +563,8 @@ EXPORT_SYMBOL_GPL(platform_driver_probe);
*
* Use this in legacy-style modules that probe hardware directly and
* register a single platform device and corresponding platform driver.
+ *
+ * Returns &struct platform_device pointer on success, or ERR_PTR() on error.
*/
struct platform_device * __init_or_module platform_create_bundle(
struct platform_driver *driver,
@@ -1052,9 +1058,11 @@ static __initdata LIST_HEAD(early_platform_driver_list);
static __initdata LIST_HEAD(early_platform_device_list);
/**
- * early_platform_driver_register
+ * early_platform_driver_register - register early platform driver
* @epdrv: early_platform driver structure
* @buf: string passed from early_param()
+ *
+ * Helper function for early_platform_init() / early_platform_init_buffer()
*/
int __init early_platform_driver_register(struct early_platform_driver *epdrv,
char *buf)
@@ -1106,9 +1114,12 @@ int __init early_platform_driver_register(struct early_platform_driver *epdrv,
}
/**
- * early_platform_add_devices - add a numbers of early platform devices
+ * early_platform_add_devices - adds a number of early platform devices
* @devs: array of early platform devices to add
* @num: number of early platform devices in array
+ *
+ * Used by early architecture code to register early platform devices and
+ * their platform data.
*/
void __init early_platform_add_devices(struct platform_device **devs, int num)
{
@@ -1128,8 +1139,12 @@ void __init early_platform_add_devices(struct platform_device **devs, int num)
}
/**
- * early_platform_driver_register_all
+ * early_platform_driver_register_all - register early platform drivers
* @class_str: string to identify early platform driver class
+ *
+ * Used by architecture code to register all early platform drivers
+ * for a certain class. If omitted then only early platform drivers
+ * with matching kernel command line class parameters will be registered.
*/
void __init early_platform_driver_register_all(char *class_str)
{
@@ -1151,7 +1166,7 @@ void __init early_platform_driver_register_all(char *class_str)
}
/**
- * early_platform_match
+ * early_platform_match - find early platform device matching driver
* @epdrv: early platform driver structure
* @id: id to match against
*/
@@ -1169,7 +1184,7 @@ early_platform_match(struct early_platform_driver *epdrv, int id)
}
/**
- * early_platform_left
+ * early_platform_left - check if early platform driver has matching devices
* @epdrv: early platform driver structure
* @id: return true if id or above exists
*/
@@ -1187,7 +1202,7 @@ static __init int early_platform_left(struct early_platform_driver *epdrv,
}
/**
- * early_platform_driver_probe_id
+ * early_platform_driver_probe_id - probe drivers matching class_str and id
* @class_str: string to identify early platform driver class
* @id: id to match against
* @nr_probe: number of platform devices to successfully probe before exiting
@@ -1257,10 +1272,14 @@ static int __init early_platform_driver_probe_id(char *class_str,
}
/**
- * early_platform_driver_probe
+ * early_platform_driver_probe - probe a class of registered drivers
* @class_str: string to identify early platform driver class
* @nr_probe: number of platform devices to successfully probe before exiting
* @user_only: only probe user specified early platform devices
+ *
+ * Used by architecture code to probe registered early platform drivers
+ * within a certain class. For probe to happen a registered early platform
+ * device matching a registered early platform driver is needed.
*/
int __init early_platform_driver_probe(char *class_str,
int nr_probe,
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index d477f4dc5e5..941fcb87e52 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -439,8 +439,23 @@ static int device_resume_noirq(struct device *dev, pm_message_t state)
if (dev->bus && dev->bus->pm) {
pm_dev_dbg(dev, state, "EARLY ");
error = pm_noirq_op(dev, dev->bus->pm, state);
+ if (error)
+ goto End;
}
+ if (dev->type && dev->type->pm) {
+ pm_dev_dbg(dev, state, "EARLY type ");
+ error = pm_noirq_op(dev, dev->type->pm, state);
+ if (error)
+ goto End;
+ }
+
+ if (dev->class && dev->class->pm) {
+ pm_dev_dbg(dev, state, "EARLY class ");
+ error = pm_noirq_op(dev, dev->class->pm, state);
+ }
+
+End:
TRACE_RESUME(error);
return error;
}
@@ -735,10 +750,26 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state)
{
int error = 0;
+ if (dev->class && dev->class->pm) {
+ pm_dev_dbg(dev, state, "LATE class ");
+ error = pm_noirq_op(dev, dev->class->pm, state);
+ if (error)
+ goto End;
+ }
+
+ if (dev->type && dev->type->pm) {
+ pm_dev_dbg(dev, state, "LATE type ");
+ error = pm_noirq_op(dev, dev->type->pm, state);
+ if (error)
+ goto End;
+ }
+
if (dev->bus && dev->bus->pm) {
pm_dev_dbg(dev, state, "LATE ");
error = pm_noirq_op(dev, dev->bus->pm, state);
}
+
+End:
return error;
}
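
Note: taken together, the two hunks make the noirq phases symmetric: resume calls the bus, then type, then class callbacks, suspend calls them in exact reverse order, and both stop at the first error. A minimal sketch of that mirrored dispatch; the three functions stand in for dev->bus->pm, dev->type->pm and dev->class->pm:

    #include <stdio.h>

    static int bus_op(void)   { printf("bus\n");   return 0; }
    static int type_op(void)  { printf("type\n");  return 0; }
    static int class_op(void) { printf("class\n"); return 0; }

    static int resume_noirq(void)
    {
        int error;

        if ((error = bus_op()))   return error;
        if ((error = type_op()))  return error;
        return class_op();
    }

    static int suspend_noirq(void)
    {
        int error;

        /* Exact reverse of the resume order. */
        if ((error = class_op())) return error;
        if ((error = type_op()))  return error;
        return bus_op();
    }

    int main(void)
    {
        printf("resume:\n");  resume_noirq();
        printf("suspend:\n"); suspend_noirq();
        return 0;
    }
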
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index a3e10dc7cc2..b78d5c381ef 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -97,6 +97,9 @@ EXPORT_SYMBOL(intel_agp_enabled);
#define IS_PINEVIEW (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_PINEVIEW_M_HB || \
agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_PINEVIEW_HB)
+#define IS_SNB (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_SANDYBRIDGE_HB || \
+ agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_SANDYBRIDGE_M_HB)
+
#define IS_G4X (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_EAGLELAKE_HB || \
agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_Q45_HB || \
agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_G45_HB || \
@@ -107,8 +110,7 @@ EXPORT_SYMBOL(intel_agp_enabled);
agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB || \
agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB || \
agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IRONLAKE_MC2_HB || \
- agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_SANDYBRIDGE_HB || \
- agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_SANDYBRIDGE_M_HB)
+ IS_SNB)
extern int agp_memory_reserved;
@@ -175,6 +177,10 @@ extern int agp_memory_reserved;
#define SNB_GMCH_GMS_STOLEN_448M (0xe << 3)
#define SNB_GMCH_GMS_STOLEN_480M (0xf << 3)
#define SNB_GMCH_GMS_STOLEN_512M (0x10 << 3)
+#define SNB_GTT_SIZE_0M (0 << 8)
+#define SNB_GTT_SIZE_1M (1 << 8)
+#define SNB_GTT_SIZE_2M (2 << 8)
+#define SNB_GTT_SIZE_MASK (3 << 8)
static const struct aper_size_info_fixed intel_i810_sizes[] =
{
@@ -1200,6 +1206,9 @@ static void intel_i9xx_setup_flush(void)
if (intel_private.ifp_resource.start)
return;
+ if (IS_SNB)
+ return;
+
/* setup a resource for this object */
intel_private.ifp_resource.name = "Intel Flush Page";
intel_private.ifp_resource.flags = IORESOURCE_MEM;
@@ -1438,6 +1447,8 @@ static unsigned long intel_i965_mask_memory(struct agp_bridge_data *bridge,
static void intel_i965_get_gtt_range(int *gtt_offset, int *gtt_size)
{
+ u16 snb_gmch_ctl;
+
switch (agp_bridge->dev->device) {
case PCI_DEVICE_ID_INTEL_GM45_HB:
case PCI_DEVICE_ID_INTEL_EAGLELAKE_HB:
@@ -1449,9 +1460,26 @@ static void intel_i965_get_gtt_range(int *gtt_offset, int *gtt_size)
case PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB:
case PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB:
case PCI_DEVICE_ID_INTEL_IRONLAKE_MC2_HB:
+ *gtt_offset = *gtt_size = MB(2);
+ break;
case PCI_DEVICE_ID_INTEL_SANDYBRIDGE_HB:
case PCI_DEVICE_ID_INTEL_SANDYBRIDGE_M_HB:
- *gtt_offset = *gtt_size = MB(2);
+ *gtt_offset = MB(2);
+
+ pci_read_config_word(intel_private.pcidev, SNB_GMCH_CTRL, &snb_gmch_ctl);
+ switch (snb_gmch_ctl & SNB_GTT_SIZE_MASK) {
+ default:
+ case SNB_GTT_SIZE_0M:
+ printk(KERN_ERR "Bad GTT size mask: 0x%04x.\n", snb_gmch_ctl);
+ *gtt_size = MB(0);
+ break;
+ case SNB_GTT_SIZE_1M:
+ *gtt_size = MB(1);
+ break;
+ case SNB_GTT_SIZE_2M:
+ *gtt_size = MB(2);
+ break;
+ }
break;
default:
*gtt_offset = *gtt_size = KB(512);
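
Note: the new Sandybridge branch no longer hardcodes the GTT size; it reads the GMCH control word and decodes bits 9:8 with the SNB_GTT_SIZE_* masks added earlier in the patch. A small userspace sketch of the decode; the register value is made up:

    #include <stdio.h>

    #define MB(x) ((x) * 1024 * 1024)

    #define SNB_GTT_SIZE_0M   (0 << 8)
    #define SNB_GTT_SIZE_1M   (1 << 8)
    #define SNB_GTT_SIZE_2M   (2 << 8)
    #define SNB_GTT_SIZE_MASK (3 << 8)

    int main(void)
    {
        unsigned short snb_gmch_ctl = 0x0217; /* illustrative register value */
        int gtt_size;

        switch (snb_gmch_ctl & SNB_GTT_SIZE_MASK) {
        case SNB_GTT_SIZE_1M:
            gtt_size = MB(1);
            break;
        case SNB_GTT_SIZE_2M:
            gtt_size = MB(2);
            break;
        default: /* includes SNB_GTT_SIZE_0M */
            printf("Bad GTT size mask: 0x%04x\n", snb_gmch_ctl);
            gtt_size = 0;
            break;
        }
        printf("gtt_size = %d bytes\n", gtt_size);
        return 0;
    }
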
diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c
index 465185fc0f5..ba55bba151b 100644
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -312,6 +312,7 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
spin_lock_irqsave(&hp->lock, flags);
/* Check and then increment for fast path open. */
if (hp->count++ > 0) {
+ tty_kref_get(tty);
spin_unlock_irqrestore(&hp->lock, flags);
hvc_kick();
return 0;
@@ -319,7 +320,7 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
tty->driver_data = hp;
- hp->tty = tty;
+ hp->tty = tty_kref_get(tty);
spin_unlock_irqrestore(&hp->lock, flags);
@@ -336,6 +337,7 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
spin_lock_irqsave(&hp->lock, flags);
hp->tty = NULL;
spin_unlock_irqrestore(&hp->lock, flags);
+ tty_kref_put(tty);
tty->driver_data = NULL;
kref_put(&hp->kref, destroy_hvc_struct);
printk(KERN_ERR "hvc_open: request_irq failed with rc %d.\n", rc);
@@ -363,13 +365,18 @@ static void hvc_close(struct tty_struct *tty, struct file * filp)
return;
hp = tty->driver_data;
+
spin_lock_irqsave(&hp->lock, flags);
+ tty_kref_get(tty);
if (--hp->count == 0) {
/* We are done with the tty pointer now. */
hp->tty = NULL;
spin_unlock_irqrestore(&hp->lock, flags);
+ /* Put the ref obtained in hvc_open() */
+ tty_kref_put(tty);
+
if (hp->ops->notifier_del)
hp->ops->notifier_del(hp, hp->data);
@@ -389,6 +396,7 @@ static void hvc_close(struct tty_struct *tty, struct file * filp)
spin_unlock_irqrestore(&hp->lock, flags);
}
+ tty_kref_put(tty);
kref_put(&hp->kref, destroy_hvc_struct);
}
@@ -424,10 +432,11 @@ static void hvc_hangup(struct tty_struct *tty)
spin_unlock_irqrestore(&hp->lock, flags);
if (hp->ops->notifier_hangup)
- hp->ops->notifier_hangup(hp, hp->data);
+ hp->ops->notifier_hangup(hp, hp->data);
while(temp_open_count) {
--temp_open_count;
+ tty_kref_put(tty);
kref_put(&hp->kref, destroy_hvc_struct);
}
}
@@ -592,7 +601,7 @@ int hvc_poll(struct hvc_struct *hp)
}
/* No tty attached, just skip */
- tty = hp->tty;
+ tty = tty_kref_get(hp->tty);
if (tty == NULL)
goto bail;
@@ -672,6 +681,8 @@ int hvc_poll(struct hvc_struct *hp)
tty_flip_buffer_push(tty);
}
+ if (tty)
+ tty_kref_put(tty);
return poll_mask;
}
@@ -807,7 +818,7 @@ int hvc_remove(struct hvc_struct *hp)
struct tty_struct *tty;
spin_lock_irqsave(&hp->lock, flags);
- tty = hp->tty;
+ tty = tty_kref_get(hp->tty);
if (hp->index < MAX_NR_HVC_CONSOLES)
vtermnos[hp->index] = -1;
@@ -819,18 +830,18 @@ int hvc_remove(struct hvc_struct *hp)
/*
* We 'put' the instance that was grabbed when the kref instance
* was initialized using kref_init(). Let the last holder of this
- * kref cause it to be removed, which will probably be the tty_hangup
+ * kref cause it to be removed, which will probably be the tty_vhangup
* below.
*/
kref_put(&hp->kref, destroy_hvc_struct);
/*
- * This function call will auto chain call hvc_hangup. The tty should
- * always be valid at this time unless a simultaneous tty close already
- * cleaned up the hvc_struct.
+ * This function call will auto chain call hvc_hangup.
*/
- if (tty)
- tty_hangup(tty);
+ if (tty) {
+ tty_vhangup(tty);
+ tty_kref_put(tty);
+ }
return 0;
}
EXPORT_SYMBOL_GPL(hvc_remove);
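
Note: every place this patch stores or uses hp->tty outside the lock now holds its own reference via a tty_kref_get()/tty_kref_put() pair, so the tty cannot be freed while the console code still touches it. A generic userspace sketch of that get/put discipline; struct obj and its helpers are stand-ins, not the tty API:

    #include <stdio.h>
    #include <stdlib.h>

    struct obj {
        int refcount;
    };

    static struct obj *obj_get(struct obj *o)
    {
        if (o)
            o->refcount++; /* take a reference before using o */
        return o;
    }

    static void obj_put(struct obj *o)
    {
        if (o && --o->refcount == 0) {
            printf("last reference dropped, freeing\n");
            free(o);
        }
    }

    int main(void)
    {
        struct obj *o = malloc(sizeof(*o));
        o->refcount = 1;               /* creator's reference */

        struct obj *user = obj_get(o); /* e.g. hp->tty = tty_kref_get(tty) */
        /* ... use the object safely here ... */
        obj_put(user);                 /* matches the get above */
        obj_put(o);                    /* drop the creator's reference */
        return 0;
    }
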
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index ec5e3f8df64..c6ad4234378 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -2272,42 +2272,52 @@ static int create_files(struct bmc_device *bmc)
bmc->device_id_attr.attr.name = "device_id";
bmc->device_id_attr.attr.mode = S_IRUGO;
bmc->device_id_attr.show = device_id_show;
+ sysfs_attr_init(&bmc->device_id_attr.attr);
bmc->provides_dev_sdrs_attr.attr.name = "provides_device_sdrs";
bmc->provides_dev_sdrs_attr.attr.mode = S_IRUGO;
bmc->provides_dev_sdrs_attr.show = provides_dev_sdrs_show;
+ sysfs_attr_init(&bmc->provides_dev_sdrs_attr.attr);
bmc->revision_attr.attr.name = "revision";
bmc->revision_attr.attr.mode = S_IRUGO;
bmc->revision_attr.show = revision_show;
+ sysfs_attr_init(&bmc->revision_attr.attr);
bmc->firmware_rev_attr.attr.name = "firmware_revision";
bmc->firmware_rev_attr.attr.mode = S_IRUGO;
bmc->firmware_rev_attr.show = firmware_rev_show;
+ sysfs_attr_init(&bmc->firmware_rev_attr.attr);
bmc->version_attr.attr.name = "ipmi_version";
bmc->version_attr.attr.mode = S_IRUGO;
bmc->version_attr.show = ipmi_version_show;
+ sysfs_attr_init(&bmc->version_attr.attr);
bmc->add_dev_support_attr.attr.name = "additional_device_support";
bmc->add_dev_support_attr.attr.mode = S_IRUGO;
bmc->add_dev_support_attr.show = add_dev_support_show;
+ sysfs_attr_init(&bmc->add_dev_support_attr.attr);
bmc->manufacturer_id_attr.attr.name = "manufacturer_id";
bmc->manufacturer_id_attr.attr.mode = S_IRUGO;
bmc->manufacturer_id_attr.show = manufacturer_id_show;
+ sysfs_attr_init(&bmc->manufacturer_id_attr.attr);
bmc->product_id_attr.attr.name = "product_id";
bmc->product_id_attr.attr.mode = S_IRUGO;
bmc->product_id_attr.show = product_id_show;
+ sysfs_attr_init(&bmc->product_id_attr.attr);
bmc->guid_attr.attr.name = "guid";
bmc->guid_attr.attr.mode = S_IRUGO;
bmc->guid_attr.show = guid_show;
+ sysfs_attr_init(&bmc->guid_attr.attr);
bmc->aux_firmware_rev_attr.attr.name = "aux_firmware_revision";
bmc->aux_firmware_rev_attr.attr.mode = S_IRUGO;
bmc->aux_firmware_rev_attr.show = aux_firmware_rev_show;
+ sysfs_attr_init(&bmc->aux_firmware_rev_attr.attr);
err = device_create_file(&bmc->dev->dev,
&bmc->device_id_attr);
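
Note: sysfs_attr_init() exists so a dynamically allocated attribute gets its own lockdep class key; without it, lockdep-enabled kernels warn about an uninitialized key when the file is created. The hunk applies the same pattern to every BMC attribute, roughly (a kernel-style fragment, not a complete driver):

    attr->attr.name = "device_id";
    attr->attr.mode = S_IRUGO;
    attr->show      = device_id_show;
    sysfs_attr_init(&attr->attr);   /* init lockdep key before registration */
    err = device_create_file(dev, attr);
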
diff --git a/drivers/char/tty_buffer.c b/drivers/char/tty_buffer.c
index af8d9771572..7ee52164d47 100644
--- a/drivers/char/tty_buffer.c
+++ b/drivers/char/tty_buffer.c
@@ -248,7 +248,7 @@ int tty_insert_flip_string_fixed_flag(struct tty_struct *tty,
{
int copied = 0;
do {
- int goal = min(size - copied, TTY_BUFFER_PAGE);
+ int goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE);
int space = tty_buffer_request_room(tty, goal);
struct tty_buffer *tb = tty->buf.tail;
/* If there is no space then tb may be NULL */
@@ -285,7 +285,7 @@ int tty_insert_flip_string_flags(struct tty_struct *tty,
{
int copied = 0;
do {
- int goal = min(size - copied, TTY_BUFFER_PAGE);
+ int goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE);
int space = tty_buffer_request_room(tty, goal);
struct tty_buffer *tb = tty->buf.tail;
/* If there is no space then tb may be NULL */
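
Note: the min() to min_t() switch matters because size is a size_t while copied and TTY_BUFFER_PAGE are ints: the kernel's min() warns when the operand types differ, and min_t(size_t, ...) resolves that by casting both sides explicitly. A userspace sketch with a kernel-style min_t; TTY_BUFFER_PAGE's value here is a stand-in:

    #include <stdio.h>
    #include <stddef.h>

    /* Kernel-style min_t: cast both sides to one type, then compare. */
    #define min_t(type, x, y) \
        ((type)(x) < (type)(y) ? (type)(x) : (type)(y))

    #define TTY_BUFFER_PAGE 4000 /* stand-in value */

    int main(void)
    {
        size_t size = 16384;
        int copied = 0;
        int goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE);

        printf("goal = %d\n", goal); /* 4000: the smaller operand wins */
        return 0;
    }
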
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index be492dd6643..a3bd1d0b66c 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(tty_port_tty_set);
static void tty_port_shutdown(struct tty_port *port)
{
mutex_lock(&port->mutex);
- if (port->ops->shutdown &&
+ if (port->ops->shutdown && !port->console &&
test_and_clear_bit(ASYNCB_INITIALIZED, &port->flags))
port->ops->shutdown(port);
mutex_unlock(&port->mutex);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index f404ccfc9c2..44288ce0cb4 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -681,6 +681,10 @@ static void resize_console(struct port *port)
struct virtio_device *vdev;
struct winsize ws;
+ /* The port could have been hot-unplugged */
+ if (!port)
+ return;
+
vdev = port->portdev->vdev;
if (virtio_has_feature(vdev, VIRTIO_CONSOLE_F_SIZE)) {
vdev->config->get(vdev,
@@ -947,11 +951,18 @@ static void handle_control_message(struct ports_device *portdev,
*/
err = sysfs_create_group(&port->dev->kobj,
&port_attribute_group);
- if (err)
+ if (err) {
dev_err(port->dev,
"Error %d creating sysfs device attributes\n",
err);
-
+ } else {
+ /*
+ * Generate a udev event so that appropriate
+ * symlinks can be created based on udev
+ * rules.
+ */
+ kobject_uevent(&port->dev->kobj, KOBJ_CHANGE);
+ }
break;
case VIRTIO_CONSOLE_PORT_REMOVE:
/*
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index 87778dcf872..6aa10284104 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -888,7 +888,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
ret = -EFAULT;
goto out;
}
- if (tmp.mode != VT_AUTO && tmp.mode != VT_PROCESS && tmp.mode != VT_PROCESS_AUTO) {
+ if (tmp.mode != VT_AUTO && tmp.mode != VT_PROCESS) {
ret = -EINVAL;
goto out;
}
@@ -1622,7 +1622,7 @@ static void complete_change_console(struct vc_data *vc)
* telling it that it has acquired. Also check if it has died and
* clean up (similar to logic employed in change_console())
*/
- if (vc->vt_mode.mode == VT_PROCESS || vc->vt_mode.mode == VT_PROCESS_AUTO) {
+ if (vc->vt_mode.mode == VT_PROCESS) {
/*
* Send the signal as privileged - kill_pid() will
* tell us if the process has gone or something else
@@ -1682,7 +1682,7 @@ void change_console(struct vc_data *new_vc)
* vt to auto control.
*/
vc = vc_cons[fg_console].d;
- if (vc->vt_mode.mode == VT_PROCESS || vc->vt_mode.mode == VT_PROCESS_AUTO) {
+ if (vc->vt_mode.mode == VT_PROCESS) {
/*
* Send the signal as privileged - kill_pid() will
* tell us if the process has gone or something else
@@ -1693,28 +1693,27 @@ void change_console(struct vc_data *new_vc)
*/
vc->vt_newvt = new_vc->vc_num;
if (kill_pid(vc->vt_pid, vc->vt_mode.relsig, 1) == 0) {
- if(vc->vt_mode.mode == VT_PROCESS)
- /*
- * It worked. Mark the vt to switch to and
- * return. The process needs to send us a
- * VT_RELDISP ioctl to complete the switch.
- */
- return;
- } else {
/*
- * The controlling process has died, so we revert back to
- * normal operation. In this case, we'll also change back
- * to KD_TEXT mode. I'm not sure if this is strictly correct
- * but it saves the agony when the X server dies and the screen
- * remains blanked due to KD_GRAPHICS! It would be nice to do
- * this outside of VT_PROCESS but there is no single process
- * to account for and tracking tty count may be undesirable.
+ * It worked. Mark the vt to switch to and
+ * return. The process needs to send us a
+ * VT_RELDISP ioctl to complete the switch.
*/
- reset_vc(vc);
+ return;
}
/*
- * Fall through to normal (VT_AUTO and VT_PROCESS_AUTO) handling of the switch...
+ * The controlling process has died, so we revert back to
+ * normal operation. In this case, we'll also change back
+ * to KD_TEXT mode. I'm not sure if this is strictly correct
+ * but it saves the agony when the X server dies and the screen
+ * remains blanked due to KD_GRAPHICS! It would be nice to do
+ * this outside of VT_PROCESS but there is no single process
+ * to account for and tracking tty count may be undesirable.
+ */
+ reset_vc(vc);
+
+ /*
+ * Fall through to normal (VT_AUTO) handling of the switch...
*/
}
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index 8fc91a01962..f5b6d9fe4de 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -316,7 +316,12 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
} else {
- pr_cont(", core: %d\n", fls((regs->nbsh & 0xf) - 1));
+ u8 assoc_cpus = regs->nbsh & 0xf;
+
+ if (assoc_cpus > 0)
+ pr_cont(", core: %d", fls(assoc_cpus) - 1);
+
+ pr_cont("\n");
}
pr_emerg("%s.\n", EXT_ERR_MSG(xec));
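
Note: the fixed branch derives the core number as the index of the highest set bit, fls(assoc_cpus) - 1, and prints nothing when no bits are set; the old expression underflowed for an empty mask. A userspace sketch, with __builtin_clz standing in for the kernel's fls():

    #include <stdio.h>

    /* fls(x): position of the most significant set bit, 1-based; fls(0) == 0. */
    static int fls(unsigned int x)
    {
        return x ? 32 - __builtin_clz(x) : 0;
    }

    int main(void)
    {
        unsigned char assoc_cpus = 0x8; /* illustrative nbsh & 0xf */

        if (assoc_cpus > 0)
            printf("core: %d\n", fls(assoc_cpus) - 1); /* prints 3 */
        else
            printf("no associated core reported\n");
        return 0;
    }
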
diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c
index 5db0518c66d..882472d1e14 100644
--- a/drivers/firewire/core-device.c
+++ b/drivers/firewire/core-device.c
@@ -126,97 +126,74 @@ int fw_csr_string(const u32 *directory, int key, char *buf, size_t size)
}
EXPORT_SYMBOL(fw_csr_string);
-static bool is_fw_unit(struct device *dev);
-
-static int match_unit_directory(const u32 *directory, u32 match_flags,
- const struct ieee1394_device_id *id)
+static void get_ids(const u32 *directory, int *id)
{
struct fw_csr_iterator ci;
- int key, value, match;
+ int key, value;
- match = 0;
fw_csr_iterator_init(&ci, directory);
while (fw_csr_iterator_next(&ci, &key, &value)) {
- if (key == CSR_VENDOR && value == id->vendor_id)
- match |= IEEE1394_MATCH_VENDOR_ID;
- if (key == CSR_MODEL && value == id->model_id)
- match |= IEEE1394_MATCH_MODEL_ID;
- if (key == CSR_SPECIFIER_ID && value == id->specifier_id)
- match |= IEEE1394_MATCH_SPECIFIER_ID;
- if (key == CSR_VERSION && value == id->version)
- match |= IEEE1394_MATCH_VERSION;
+ switch (key) {
+ case CSR_VENDOR: id[0] = value; break;
+ case CSR_MODEL: id[1] = value; break;
+ case CSR_SPECIFIER_ID: id[2] = value; break;
+ case CSR_VERSION: id[3] = value; break;
+ }
}
+}
+
+static void get_modalias_ids(struct fw_unit *unit, int *id)
+{
+ get_ids(&fw_parent_device(unit)->config_rom[5], id);
+ get_ids(unit->directory, id);
+}
+
+static bool match_ids(const struct ieee1394_device_id *id_table, int *id)
+{
+ int match = 0;
+
+ if (id[0] == id_table->vendor_id)
+ match |= IEEE1394_MATCH_VENDOR_ID;
+ if (id[1] == id_table->model_id)
+ match |= IEEE1394_MATCH_MODEL_ID;
+ if (id[2] == id_table->specifier_id)
+ match |= IEEE1394_MATCH_SPECIFIER_ID;
+ if (id[3] == id_table->version)
+ match |= IEEE1394_MATCH_VERSION;
- return (match & match_flags) == match_flags;
+ return (match & id_table->match_flags) == id_table->match_flags;
}
+static bool is_fw_unit(struct device *dev);
+
static int fw_unit_match(struct device *dev, struct device_driver *drv)
{
- struct fw_unit *unit = fw_unit(dev);
- struct fw_device *device;
- const struct ieee1394_device_id *id;
+ const struct ieee1394_device_id *id_table =
+ container_of(drv, struct fw_driver, driver)->id_table;
+ int id[] = {0, 0, 0, 0};
/* We only allow binding to fw_units. */
if (!is_fw_unit(dev))
return 0;
- device = fw_parent_device(unit);
- id = container_of(drv, struct fw_driver, driver)->id_table;
+ get_modalias_ids(fw_unit(dev), id);
- for (; id->match_flags != 0; id++) {
- if (match_unit_directory(unit->directory, id->match_flags, id))
+ for (; id_table->match_flags != 0; id_table++)
+ if (match_ids(id_table, id))
return 1;
- /* Also check vendor ID in the root directory. */
- if ((id->match_flags & IEEE1394_MATCH_VENDOR_ID) &&
- match_unit_directory(&device->config_rom[5],
- IEEE1394_MATCH_VENDOR_ID, id) &&
- match_unit_directory(unit->directory, id->match_flags
- & ~IEEE1394_MATCH_VENDOR_ID, id))
- return 1;
- }
-
return 0;
}
static int get_modalias(struct fw_unit *unit, char *buffer, size_t buffer_size)
{
- struct fw_device *device = fw_parent_device(unit);
- struct fw_csr_iterator ci;
+ int id[] = {0, 0, 0, 0};
- int key, value;
- int vendor = 0;
- int model = 0;
- int specifier_id = 0;
- int version = 0;
-
- fw_csr_iterator_init(&ci, &device->config_rom[5]);
- while (fw_csr_iterator_next(&ci, &key, &value)) {
- switch (key) {
- case CSR_VENDOR:
- vendor = value;
- break;
- case CSR_MODEL:
- model = value;
- break;
- }
- }
-
- fw_csr_iterator_init(&ci, unit->directory);
- while (fw_csr_iterator_next(&ci, &key, &value)) {
- switch (key) {
- case CSR_SPECIFIER_ID:
- specifier_id = value;
- break;
- case CSR_VERSION:
- version = value;
- break;
- }
- }
+ get_modalias_ids(unit, id);
return snprintf(buffer, buffer_size,
"ieee1394:ven%08Xmo%08Xsp%08Xver%08X",
- vendor, model, specifier_id, version);
+ id[0], id[1], id[2], id[3]);
}
static int fw_unit_uevent(struct device *dev, struct kobj_uevent_env *env)
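
Note: the refactor replaces per-key matching with a single id[4] vector (vendor, model, specifier, version) that get_modalias_ids() fills first from the root directory and then from the unit directory, letting unit entries override root ones; matching is then four compares gated by match_flags. A self-contained sketch of that matching step; the flag values and the table entry are illustrative:

    #include <stdio.h>

    #define MATCH_VENDOR_ID    0x0001
    #define MATCH_MODEL_ID     0x0002
    #define MATCH_SPECIFIER_ID 0x0004
    #define MATCH_VERSION      0x0008

    struct device_id {
        unsigned int match_flags;
        int vendor_id, model_id, specifier_id, version;
    };

    static int match_ids(const struct device_id *t, const int *id)
    {
        unsigned int match = 0;

        if (id[0] == t->vendor_id)    match |= MATCH_VENDOR_ID;
        if (id[1] == t->model_id)     match |= MATCH_MODEL_ID;
        if (id[2] == t->specifier_id) match |= MATCH_SPECIFIER_ID;
        if (id[3] == t->version)      match |= MATCH_VERSION;

        /* Only the keys the table entry cares about must match. */
        return (match & t->match_flags) == t->match_flags;
    }

    int main(void)
    {
        int id[4] = { 0x001234, 0, 0x00a02d, 0x010001 };
        struct device_id entry = {
            .match_flags  = MATCH_SPECIFIER_ID | MATCH_VERSION,
            .specifier_id = 0x00a02d,
            .version      = 0x010001,
        };

        printf("match: %d\n", match_ids(&entry, id)); /* prints 1 */
        return 0;
    }
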
diff --git a/drivers/firewire/core-iso.c b/drivers/firewire/core-iso.c
index 1c0b504a42f..99c20f1b613 100644
--- a/drivers/firewire/core-iso.c
+++ b/drivers/firewire/core-iso.c
@@ -331,8 +331,9 @@ void fw_iso_resource_manage(struct fw_card *card, int generation,
if (ret < 0)
*bandwidth = 0;
- if (allocate && ret < 0 && c >= 0) {
- deallocate_channel(card, irm_id, generation, c, buffer);
+ if (allocate && ret < 0) {
+ if (c >= 0)
+ deallocate_channel(card, irm_id, generation, c, buffer);
*channel = ret;
}
}
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 75dc6988cff..e33917bf97d 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -231,6 +231,8 @@ static inline struct fw_ohci *fw_ohci(struct fw_card *card)
static char ohci_driver_name[] = KBUILD_MODNAME;
+#define PCI_DEVICE_ID_TI_TSB12LV22 0x8009
+
#define QUIRK_CYCLE_TIMER 1
#define QUIRK_RESET_PACKET 2
#define QUIRK_BE_HEADERS 4
@@ -239,6 +241,8 @@ static char ohci_driver_name[] = KBUILD_MODNAME;
static const struct {
unsigned short vendor, device, flags;
} ohci_quirks[] = {
+ {PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_TSB12LV22, QUIRK_CYCLE_TIMER |
+ QUIRK_RESET_PACKET},
{PCI_VENDOR_ID_TI, PCI_ANY_ID, QUIRK_RESET_PACKET},
{PCI_VENDOR_ID_AL, PCI_ANY_ID, QUIRK_CYCLE_TIMER},
{PCI_VENDOR_ID_NEC, PCI_ANY_ID, QUIRK_CYCLE_TIMER},
diff --git a/drivers/gpio/max730x.c b/drivers/gpio/max730x.c
index c9bced55f82..4a7d662ff9b 100644
--- a/drivers/gpio/max730x.c
+++ b/drivers/gpio/max730x.c
@@ -242,3 +242,7 @@ int __devexit __max730x_remove(struct device *dev)
return ret;
}
EXPORT_SYMBOL_GPL(__max730x_remove);
+
+MODULE_AUTHOR("Juergen Beisert, Wolfram Sang");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("MAX730x GPIO-Expanders, generic parts");
diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c
index f2aaf39be39..51103aa469f 100644
--- a/drivers/gpu/drm/drm_crtc_helper.c
+++ b/drivers/gpu/drm/drm_crtc_helper.c
@@ -104,6 +104,7 @@ int drm_helper_probe_single_connector_modes(struct drm_connector *connector,
if (connector->status == connector_status_disconnected) {
DRM_DEBUG_KMS("%s is disconnected\n",
drm_get_connector_name(connector));
+ drm_mode_connector_update_edid_property(connector, NULL);
goto prune;
}
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index f97e7c42ac8..7e608f4a0df 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -707,15 +707,6 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_device *dev,
mode->vsync_end = mode->vsync_start + vsync_pulse_width;
mode->vtotal = mode->vdisplay + vblank;
- /* perform the basic check for the detailed timing */
- if (mode->hsync_end > mode->htotal ||
- mode->vsync_end > mode->vtotal) {
- drm_mode_destroy(dev, mode);
- DRM_DEBUG_KMS("Incorrect detailed timing. "
- "Sync is beyond the blank.\n");
- return NULL;
- }
-
/* Some EDIDs have bogus h/vtotal values */
if (mode->hsync_end > mode->htotal)
mode->htotal = mode->hsync_end + 1;
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 50549703584..99487237111 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -283,6 +283,8 @@ static struct sysrq_key_op sysrq_drm_fb_helper_restore_op = {
.help_msg = "force-fb(V)",
.action_msg = "Restore framebuffer console",
};
+#else
+static struct sysrq_key_op sysrq_drm_fb_helper_restore_op = { };
#endif
static void drm_fb_helper_on(struct fb_info *info)
diff --git a/drivers/gpu/drm/drm_fops.c b/drivers/gpu/drm/drm_fops.c
index 08d14df3bb4..4804872f8b1 100644
--- a/drivers/gpu/drm/drm_fops.c
+++ b/drivers/gpu/drm/drm_fops.c
@@ -140,14 +140,16 @@ int drm_open(struct inode *inode, struct file *filp)
spin_unlock(&dev->count_lock);
}
out:
- mutex_lock(&dev->struct_mutex);
- if (minor->type == DRM_MINOR_LEGACY) {
- BUG_ON((dev->dev_mapping != NULL) &&
- (dev->dev_mapping != inode->i_mapping));
- if (dev->dev_mapping == NULL)
- dev->dev_mapping = inode->i_mapping;
+ if (!retcode) {
+ mutex_lock(&dev->struct_mutex);
+ if (minor->type == DRM_MINOR_LEGACY) {
+ if (dev->dev_mapping == NULL)
+ dev->dev_mapping = inode->i_mapping;
+ else if (dev->dev_mapping != inode->i_mapping)
+ retcode = -ENODEV;
+ }
+ mutex_unlock(&dev->struct_mutex);
}
- mutex_unlock(&dev->struct_mutex);
return retcode;
}
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 8bfc0bbf13e..a9f8589490c 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1881,29 +1881,29 @@ struct drm_ioctl_desc i915_ioctls[] = {
DRM_IOCTL_DEF(DRM_I915_GET_VBLANK_PIPE, i915_vblank_pipe_get, DRM_AUTH ),
DRM_IOCTL_DEF(DRM_I915_VBLANK_SWAP, i915_vblank_swap, DRM_AUTH),
DRM_IOCTL_DEF(DRM_I915_HWS_ADDR, i915_set_status_page, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
- DRM_IOCTL_DEF(DRM_I915_GEM_INIT, i915_gem_init_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
- DRM_IOCTL_DEF(DRM_I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH),
- DRM_IOCTL_DEF(DRM_I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH),
- DRM_IOCTL_DEF(DRM_I915_GEM_PIN, i915_gem_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
- DRM_IOCTL_DEF(DRM_I915_GEM_UNPIN, i915_gem_unpin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
- DRM_IOCTL_DEF(DRM_I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH),
- DRM_IOCTL_DEF(DRM_I915_GEM_THROTTLE, i915_gem_throttle_ioctl, DRM_AUTH),
- DRM_IOCTL_DEF(DRM_I915_GEM_ENTERVT, i915_gem_entervt_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
- DRM_IOCTL_DEF(DRM_I915_GEM_LEAVEVT, i915_gem_leavevt_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
- DRM_IOCTL_DEF(DRM_I915_GEM_CREATE, i915_gem_create_ioctl, 0),
- DRM_IOCTL_DEF(DRM_I915_GEM_PREAD, i915_gem_pread_ioctl, 0),
- DRM_IOCTL_DEF(DRM_I915_GEM_PWRITE, i915_gem_pwrite_ioctl, 0),
- DRM_IOCTL_DEF(DRM_I915_GEM_MMAP, i915_gem_mmap_ioctl, 0),
- DRM_IOCTL_DEF(DRM_I915_GEM_MMAP_GTT, i915_gem_mmap_gtt_ioctl, 0),
- DRM_IOCTL_DEF(DRM_I915_GEM_SET_DOMAIN, i915_gem_set_domain_ioctl, 0),
- DRM_IOCTL_DEF(DRM_I915_GEM_SW_FINISH, i915_gem_sw_finish_ioctl, 0),
- DRM_IOCTL_DEF(DRM_I915_GEM_SET_TILING, i915_gem_set_tiling, 0),
- DRM_IOCTL_DEF(DRM_I915_GEM_GET_TILING, i915_gem_get_tiling, 0),
- DRM_IOCTL_DEF(DRM_I915_GEM_GET_APERTURE, i915_gem_get_aperture_ioctl, 0),
- DRM_IOCTL_DEF(DRM_I915_GET_PIPE_FROM_CRTC_ID, intel_get_pipe_from_crtc_id, 0),
- DRM_IOCTL_DEF(DRM_I915_GEM_MADVISE, i915_gem_madvise_ioctl, 0),
- DRM_IOCTL_DEF(DRM_I915_OVERLAY_PUT_IMAGE, intel_overlay_put_image, DRM_MASTER|DRM_CONTROL_ALLOW),
- DRM_IOCTL_DEF(DRM_I915_OVERLAY_ATTRS, intel_overlay_attrs, DRM_MASTER|DRM_CONTROL_ALLOW),
+ DRM_IOCTL_DEF(DRM_I915_GEM_INIT, i915_gem_init_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY|DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH|DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH|DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_PIN, i915_gem_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY|DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_UNPIN, i915_gem_unpin_ioctl, DRM_AUTH|DRM_ROOT_ONLY|DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_THROTTLE, i915_gem_throttle_ioctl, DRM_AUTH|DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_ENTERVT, i915_gem_entervt_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY|DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_LEAVEVT, i915_gem_leavevt_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY|DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_CREATE, i915_gem_create_ioctl, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_PREAD, i915_gem_pread_ioctl, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_PWRITE, i915_gem_pwrite_ioctl, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_MMAP, i915_gem_mmap_ioctl, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_MMAP_GTT, i915_gem_mmap_gtt_ioctl, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_SET_DOMAIN, i915_gem_set_domain_ioctl, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_SW_FINISH, i915_gem_sw_finish_ioctl, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_SET_TILING, i915_gem_set_tiling, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_GET_TILING, i915_gem_get_tiling, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_GET_APERTURE, i915_gem_get_aperture_ioctl, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GET_PIPE_FROM_CRTC_ID, intel_get_pipe_from_crtc_id, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_GEM_MADVISE, i915_gem_madvise_ioctl, DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_OVERLAY_PUT_IMAGE, intel_overlay_put_image, DRM_MASTER|DRM_CONTROL_ALLOW|DRM_UNLOCKED),
+ DRM_IOCTL_DEF(DRM_I915_OVERLAY_ATTRS, intel_overlay_attrs, DRM_MASTER|DRM_CONTROL_ALLOW|DRM_UNLOCKED),
};
int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls);
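
Note: DRM_UNLOCKED marks an ioctl the DRM core may call without first taking its global lock (the big kernel lock in this era); the GEM entry points above rely on their own finer-grained locking instead. A rough userspace sketch of that dispatch shape; the names are illustrative, not the DRM core's:

    #include <pthread.h>
    #include <stdio.h>

    #define FLAG_UNLOCKED 0x1

    static pthread_mutex_t global_lock = PTHREAD_MUTEX_INITIALIZER;

    static int dispatch(int flags, int (*handler)(void))
    {
        int ret;

        if (flags & FLAG_UNLOCKED)
            return handler();       /* handler does its own locking */

        pthread_mutex_lock(&global_lock);
        ret = handler();
        pthread_mutex_unlock(&global_lock);
        return ret;
    }

    static int my_ioctl(void)
    {
        printf("ioctl body\n");
        return 0;
    }

    int main(void)
    {
        dispatch(FLAG_UNLOCKED, my_ioctl);
        dispatch(0, my_ioctl);
        return 0;
    }
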
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 1b2e95455c0..4b26919abdb 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -139,12 +139,12 @@ const static struct intel_device_info intel_ironlake_m_info = {
const static struct intel_device_info intel_sandybridge_d_info = {
.is_i965g = 1, .is_i9xx = 1, .need_gfx_hws = 1,
- .has_hotplug = 1,
+ .has_hotplug = 1, .is_gen6 = 1,
};
const static struct intel_device_info intel_sandybridge_m_info = {
.is_i965g = 1, .is_mobile = 1, .is_i9xx = 1, .need_gfx_hws = 1,
- .has_hotplug = 1,
+ .has_hotplug = 1, .is_gen6 = 1,
};
const static struct pci_device_id pciidlist[] = {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 979439cfb82..aba8260fbc5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -205,6 +205,7 @@ struct intel_device_info {
u8 is_g4x : 1;
u8 is_pineview : 1;
u8 is_ironlake : 1;
+ u8 is_gen6 : 1;
u8 has_fbc : 1;
u8 has_rc6 : 1;
u8 has_pipe_cxsr : 1;
@@ -1084,6 +1085,7 @@ extern int i915_wait_ring(struct drm_device * dev, int n, const char *caller);
#define IS_IRONLAKE_M(dev) ((dev)->pci_device == 0x0046)
#define IS_IRONLAKE(dev) (INTEL_INFO(dev)->is_ironlake)
#define IS_I9XX(dev) (INTEL_INFO(dev)->is_i9xx)
+#define IS_GEN6(dev) (INTEL_INFO(dev)->is_gen6)
#define IS_MOBILE(dev) (INTEL_INFO(dev)->is_mobile)
#define IS_GEN3(dev) (IS_I915G(dev) || \
@@ -1107,8 +1109,6 @@ extern int i915_wait_ring(struct drm_device * dev, int n, const char *caller);
#define I915_NEED_GFX_HWS(dev) (INTEL_INFO(dev)->need_gfx_hws)
-#define IS_GEN6(dev) ((dev)->pci_device == 0x0102)
-
/* With the 945 and later, Y tiling got adjusted so that it was 32 128-byte
* rows, which changed the alignment requirements and fence programming.
*/
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fba37e9f775..933e865a892 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1466,9 +1466,6 @@ i915_gem_object_put_pages(struct drm_gem_object *obj)
obj_priv->dirty = 0;
for (i = 0; i < page_count; i++) {
- if (obj_priv->pages[i] == NULL)
- break;
-
if (obj_priv->dirty)
set_page_dirty(obj_priv->pages[i]);
@@ -2227,11 +2224,6 @@ i915_gem_evict_something(struct drm_device *dev, int min_size)
seqno = i915_add_request(dev, NULL, obj->write_domain);
if (seqno == 0)
return -ENOMEM;
-
- ret = i915_wait_request(dev, seqno);
- if (ret)
- return ret;
-
continue;
}
}
@@ -2256,7 +2248,6 @@ i915_gem_object_get_pages(struct drm_gem_object *obj,
struct address_space *mapping;
struct inode *inode;
struct page *page;
- int ret;
if (obj_priv->pages_refcount++ != 0)
return 0;
@@ -2279,11 +2270,9 @@ i915_gem_object_get_pages(struct drm_gem_object *obj,
mapping_gfp_mask (mapping) |
__GFP_COLD |
gfpmask);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
- i915_gem_object_put_pages(obj);
- return ret;
- }
+ if (IS_ERR(page))
+ goto err_pages;
+
obj_priv->pages[i] = page;
}
@@ -2291,6 +2280,15 @@ i915_gem_object_get_pages(struct drm_gem_object *obj,
i915_gem_object_do_bit_17_swizzle(obj);
return 0;
+
+err_pages:
+ while (i--)
+ page_cache_release(obj_priv->pages[i]);
+
+ drm_free_large(obj_priv->pages);
+ obj_priv->pages = NULL;
+ obj_priv->pages_refcount--;
+ return PTR_ERR(page);
}
static void sandybridge_write_fence_reg(struct drm_i915_fence_reg *reg)
@@ -4730,6 +4728,11 @@ i915_gem_init_ringbuffer(struct drm_device *dev)
ring->space += ring->Size;
}
+ if (IS_I9XX(dev) && !IS_GEN3(dev)) {
+ I915_WRITE(MI_MODE,
+ (VS_TIMER_DISPATCH) << 16 | VS_TIMER_DISPATCH);
+ }
+
return 0;
}
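
Note: the err_pages path added in the i915_gem_object_get_pages() hunk is the standard partial-allocation unwind: on a failure at index i, release the i pages already obtained in reverse, free the array, and undo the refcount. A userspace sketch of the same shape, simulating a failure partway through:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        int count = 8, i;
        void **pages = calloc(count, sizeof(*pages));

        for (i = 0; i < count; i++) {
            pages[i] = (i == 5) ? NULL : malloc(64); /* fail at index 5 */
            if (!pages[i])
                goto err_pages;
        }
        printf("all pages allocated\n");
        return 0;

    err_pages:
        while (i--)                   /* release pages[i-1] .. pages[0] */
            free(pages[i]);
        free(pages);
        printf("rolled back after failure\n");
        return 1;
    }
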
diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c
index b5c55d88ff7..c01c878e51b 100644
--- a/drivers/gpu/drm/i915/i915_gem_tiling.c
+++ b/drivers/gpu/drm/i915/i915_gem_tiling.c
@@ -325,9 +325,12 @@ i915_gem_set_tiling(struct drm_device *dev, void *data,
* need to ensure that any fence register is cleared.
*/
if (!i915_gem_object_fence_offset_ok(obj, args->tiling_mode))
- ret = i915_gem_object_unbind(obj);
+ ret = i915_gem_object_unbind(obj);
+ else if (obj_priv->fence_reg != I915_FENCE_REG_NONE)
+ ret = i915_gem_object_put_fence_reg(obj);
else
- ret = i915_gem_object_put_fence_reg(obj);
+ i915_gem_release_mmap(obj);
+
if (ret != 0) {
WARN(ret != -ERESTARTSYS,
"failed to reset object for tiling switch");
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 3d59862c7cc..cbbf59f56df 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -298,6 +298,10 @@
#define INSTDONE 0x02090
#define NOPID 0x02094
#define HWSTAM 0x02098
+
+#define MI_MODE 0x0209c
+# define VS_TIMER_DISPATCH (1 << 6)
+
#define SCPD0 0x0209c /* 915+ only */
#define IER 0x020a0
#define IIR 0x020a4
@@ -366,7 +370,7 @@
#define FBC_CTL_PERIODIC (1<<30)
#define FBC_CTL_INTERVAL_SHIFT (16)
#define FBC_CTL_UNCOMPRESSIBLE (1<<14)
-#define FBC_C3_IDLE (1<<13)
+#define FBC_CTL_C3_IDLE (1<<13)
#define FBC_CTL_STRIDE_SHIFT (5)
#define FBC_CTL_FENCENO (1<<0)
#define FBC_COMMAND 0x0320c
@@ -2172,6 +2176,14 @@
#define DISPLAY_PORT_PLL_BIOS_1 0x46010
#define DISPLAY_PORT_PLL_BIOS_2 0x46014
+#define PCH_DSPCLK_GATE_D 0x42020
+# define DPFDUNIT_CLOCK_GATE_DISABLE (1 << 7)
+# define DPARBUNIT_CLOCK_GATE_DISABLE (1 << 5)
+
+#define PCH_3DCGDIS0 0x46020
+# define MARIUNIT_CLOCK_GATE_DISABLE (1 << 18)
+# define SVSMUNIT_CLOCK_GATE_DISABLE (1 << 1)
+
#define FDI_PLL_FREQ_CTL 0x46030
#define FDI_PLL_FREQ_CHANGE_REQUEST (1<<24)
#define FDI_PLL_FREQ_LOCK_LIMIT_MASK 0xfff00
diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index 70c9d4ba704..f9ba452f0cb 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -417,8 +417,9 @@ parse_edp(struct drm_i915_private *dev_priv, struct bdb_header *bdb)
edp = find_section(bdb, BDB_EDP);
if (!edp) {
if (SUPPORTS_EDP(dev_priv->dev) && dev_priv->edp_support) {
- DRM_DEBUG_KMS("No eDP BDB found but eDP panel supported,\
- assume 18bpp panel color depth.\n");
+ DRM_DEBUG_KMS("No eDP BDB found but eDP panel "
+ "supported, assume 18bpp panel color "
+ "depth.\n");
dev_priv->edp_bpp = 18;
}
return;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 9cd6de5f990..58fc7fa0eb1 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -1032,7 +1032,7 @@ static void i8xx_enable_fbc(struct drm_crtc *crtc, unsigned long interval)
/* enable it... */
fbc_ctl = FBC_CTL_EN | FBC_CTL_PERIODIC;
if (IS_I945GM(dev))
- fbc_ctl |= FBC_C3_IDLE; /* 945 needs special SR handling */
+ fbc_ctl |= FBC_CTL_C3_IDLE; /* 945 needs special SR handling */
fbc_ctl |= (dev_priv->cfb_pitch & 0xff) << FBC_CTL_STRIDE_SHIFT;
fbc_ctl |= (interval & 0x2fff) << FBC_CTL_INTERVAL_SHIFT;
if (obj_priv->tiling_mode != I915_TILING_NONE)
@@ -4717,6 +4717,20 @@ void intel_init_clock_gating(struct drm_device *dev)
* specs, but enable as much else as we can.
*/
if (HAS_PCH_SPLIT(dev)) {
+ uint32_t dspclk_gate = VRHUNIT_CLOCK_GATE_DISABLE;
+
+ if (IS_IRONLAKE(dev)) {
+ /* Required for FBC */
+ dspclk_gate |= DPFDUNIT_CLOCK_GATE_DISABLE;
+ /* Required for CxSR */
+ dspclk_gate |= DPARBUNIT_CLOCK_GATE_DISABLE;
+
+ I915_WRITE(PCH_3DCGDIS0,
+ MARIUNIT_CLOCK_GATE_DISABLE |
+ SVSMUNIT_CLOCK_GATE_DISABLE);
+ }
+
+ I915_WRITE(PCH_DSPCLK_GATE_D, dspclk_gate);
return;
} else if (IS_G4X(dev)) {
uint32_t dspclk_gate;
diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c
index 14e516fdc2d..2b3fa7a3c02 100644
--- a/drivers/gpu/drm/i915/intel_lvds.c
+++ b/drivers/gpu/drm/i915/intel_lvds.c
@@ -607,53 +607,6 @@ static void intel_lvds_mode_set(struct drm_encoder *encoder,
I915_WRITE(PFIT_CONTROL, lvds_priv->pfit_control);
}
-/* Some lid devices report incorrect lid status, assume they're connected */
-static const struct dmi_system_id bad_lid_status[] = {
- {
- .ident = "Compaq nx9020",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
- DMI_MATCH(DMI_BOARD_NAME, "3084"),
- },
- },
- {
- .ident = "Samsung SX20S",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Samsung Electronics"),
- DMI_MATCH(DMI_BOARD_NAME, "SX20S"),
- },
- },
- {
- .ident = "Aspire One",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Aspire one"),
- },
- },
- {
- .ident = "Aspire 1810T",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
- DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 1810T"),
- },
- },
- {
- .ident = "PC-81005",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "MALATA"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PC-81005"),
- },
- },
- {
- .ident = "Clevo M5x0N",
- .matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "CLEVO Co."),
- DMI_MATCH(DMI_BOARD_NAME, "M5x0N"),
- },
- },
- { }
-};
-
/**
* Detect the LVDS connection.
*
@@ -669,12 +622,9 @@ static enum drm_connector_status intel_lvds_detect(struct drm_connector *connect
/* ACPI lid methods were generally unreliable in this generation, so
* don't even bother.
*/
- if (IS_GEN2(dev))
+ if (IS_GEN2(dev) || IS_GEN3(dev))
return connector_status_connected;
- if (!dmi_check_system(bad_lid_status) && !acpi_lid_open())
- status = connector_status_disconnected;
-
return status;
}
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index d355d1d527e..60595fc26fd 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -1068,14 +1068,18 @@ int intel_overlay_put_image(struct drm_device *dev, void *data,
drmmode_obj = drm_mode_object_find(dev, put_image_rec->crtc_id,
DRM_MODE_OBJECT_CRTC);
- if (!drmmode_obj)
- return -ENOENT;
+ if (!drmmode_obj) {
+ ret = -ENOENT;
+ goto out_free;
+ }
crtc = to_intel_crtc(obj_to_crtc(drmmode_obj));
new_bo = drm_gem_object_lookup(dev, file_priv,
put_image_rec->bo_handle);
- if (!new_bo)
- return -ENOENT;
+ if (!new_bo) {
+ ret = -ENOENT;
+ goto out_free;
+ }
mutex_lock(&dev->mode_config.mutex);
mutex_lock(&dev->struct_mutex);
@@ -1165,6 +1169,7 @@ out_unlock:
mutex_unlock(&dev->struct_mutex);
mutex_unlock(&dev->mode_config.mutex);
drm_gem_object_unreference_unlocked(new_bo);
+out_free:
kfree(params);
return ret;
diff --git a/drivers/gpu/drm/nouveau/Makefile b/drivers/gpu/drm/nouveau/Makefile
index 32db806f3b5..7f0d807a0d0 100644
--- a/drivers/gpu/drm/nouveau/Makefile
+++ b/drivers/gpu/drm/nouveau/Makefile
@@ -12,7 +12,7 @@ nouveau-y := nouveau_drv.o nouveau_state.o nouveau_channel.o nouveau_mem.o \
nouveau_dp.o nouveau_grctx.o \
nv04_timer.o \
nv04_mc.o nv40_mc.o nv50_mc.o \
- nv04_fb.o nv10_fb.o nv40_fb.o \
+ nv04_fb.o nv10_fb.o nv40_fb.o nv50_fb.o \
nv04_fifo.o nv10_fifo.o nv40_fifo.o nv50_fifo.o \
nv04_graph.o nv10_graph.o nv20_graph.o \
nv40_graph.o nv50_graph.o \
diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c
index 75bceee7604..b5a9336a2e8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bios.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bios.c
@@ -5211,6 +5211,21 @@ divine_connector_type(struct nvbios *bios, int index)
}
static void
+apply_dcb_connector_quirks(struct nvbios *bios, int idx)
+{
+ struct dcb_connector_table_entry *cte = &bios->dcb.connector.entry[idx];
+ struct drm_device *dev = bios->dev;
+
+ /* Gigabyte NX85T */
+ if ((dev->pdev->device == 0x0421) &&
+ (dev->pdev->subsystem_vendor == 0x1458) &&
+ (dev->pdev->subsystem_device == 0x344c)) {
+ if (cte->type == DCB_CONNECTOR_HDMI_1)
+ cte->type = DCB_CONNECTOR_DVI_I;
+ }
+}
+
+static void
parse_dcb_connector_table(struct nvbios *bios)
{
struct drm_device *dev = bios->dev;
@@ -5238,13 +5253,14 @@ parse_dcb_connector_table(struct nvbios *bios)
entry = conntab + conntab[1];
cte = &ct->entry[0];
for (i = 0; i < conntab[2]; i++, entry += conntab[3], cte++) {
+ cte->index = i;
if (conntab[3] == 2)
cte->entry = ROM16(entry[0]);
else
cte->entry = ROM32(entry[0]);
cte->type = (cte->entry & 0x000000ff) >> 0;
- cte->index = (cte->entry & 0x00000f00) >> 8;
+ cte->index2 = (cte->entry & 0x00000f00) >> 8;
switch (cte->entry & 0x00033000) {
case 0x00001000:
cte->gpio_tag = 0x07;
@@ -5266,6 +5282,8 @@ parse_dcb_connector_table(struct nvbios *bios)
if (cte->type == 0xff)
continue;
+ apply_dcb_connector_quirks(bios, i);
+
NV_INFO(dev, " %d: 0x%08x: type 0x%02x idx %d tag 0x%02x\n",
i, cte->entry, cte->type, cte->index, cte->gpio_tag);
@@ -5287,10 +5305,16 @@ parse_dcb_connector_table(struct nvbios *bios)
break;
default:
cte->type = divine_connector_type(bios, cte->index);
- NV_WARN(dev, "unknown type, using 0x%02x", cte->type);
+ NV_WARN(dev, "unknown type, using 0x%02x\n", cte->type);
break;
}
+ if (nouveau_override_conntype) {
+ int type = divine_connector_type(bios, cte->index);
+ if (type != cte->type)
+ NV_WARN(dev, " -> type 0x%02x\n", cte->type);
+ }
+
}
}
diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.h b/drivers/gpu/drm/nouveau/nouveau_bios.h
index 9f688aa9a65..4f88e6924d2 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bios.h
+++ b/drivers/gpu/drm/nouveau/nouveau_bios.h
@@ -72,9 +72,10 @@ enum dcb_connector_type {
};
struct dcb_connector_table_entry {
+ uint8_t index;
uint32_t entry;
enum dcb_connector_type type;
- uint8_t index;
+ uint8_t index2;
uint8_t gpio_tag;
};
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 028719fddf7..026612471c9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -439,8 +439,7 @@ nouveau_bo_evict_flags(struct ttm_buffer_object *bo, struct ttm_placement *pl)
switch (bo->mem.mem_type) {
case TTM_PL_VRAM:
- nouveau_bo_placement_set(nvbo, TTM_PL_FLAG_TT |
- TTM_PL_FLAG_SYSTEM);
+ nouveau_bo_placement_set(nvbo, TTM_PL_FLAG_TT);
break;
default:
nouveau_bo_placement_set(nvbo, TTM_PL_FLAG_SYSTEM);
diff --git a/drivers/gpu/drm/nouveau/nouveau_connector.c b/drivers/gpu/drm/nouveau/nouveau_connector.c
index 24327f468c4..14afe1e47e5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_connector.c
+++ b/drivers/gpu/drm/nouveau/nouveau_connector.c
@@ -302,7 +302,7 @@ nouveau_connector_detect(struct drm_connector *connector)
detect_analog:
nv_encoder = find_encoder_by_type(connector, OUTPUT_ANALOG);
- if (!nv_encoder)
+ if (!nv_encoder && !nouveau_tv_disable)
nv_encoder = find_encoder_by_type(connector, OUTPUT_TV);
if (nv_encoder) {
struct drm_encoder *encoder = to_drm_encoder(nv_encoder);
diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.c b/drivers/gpu/drm/nouveau/nouveau_dma.c
index c8482a108a7..65c441a1999 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dma.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dma.c
@@ -190,6 +190,11 @@ nv50_dma_push(struct nouveau_channel *chan, struct nouveau_bo *bo,
nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8);
chan->dma.ib_put = (chan->dma.ib_put + 1) & chan->dma.ib_max;
+
+ DRM_MEMORYBARRIER();
+ /* Flush writes. */
+ nouveau_bo_rd32(pb, 0);
+
nvchan_wr32(chan, 0x8c, chan->dma.ib_put);
chan->dma.ib_free--;
}
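
[Editor's note: the two added lines in nv50_dma_push() implement a common MMIO flush idiom: a memory barrier orders the CPU's pushbuffer writes, and a throwaway read from the same buffer forces any posted writes out before the doorbell write at 0x8c. A generic, hedged sketch of the pattern follows; the accessors and register names are invented and are not nouveau's actual API.]

#include <stdint.h>

/* Hypothetical MMIO accessors, for illustration only. */
static inline void mmio_write32(volatile uint32_t *reg, uint32_t v) { *reg = v; }
static inline uint32_t mmio_read32(volatile uint32_t *reg) { return *reg; }

static void ring_kick(volatile uint32_t *ring, volatile uint32_t *doorbell,
		      uint32_t slot, uint32_t entry)
{
	mmio_write32(&ring[slot], entry);	/* queue the new entry */

	__sync_synchronize();			/* order the writes (DRM_MEMORYBARRIER's role) */
	(void)mmio_read32(&ring[0]);		/* read back to flush posted writes */

	mmio_write32(doorbell, slot + 1);	/* only now tell the device */
}

int main(void)
{
	static uint32_t ring[4], doorbell;

	ring_kick(ring, &doorbell, 0, 0xdeadbeef);
	return 0;
}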
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c
index 30cc09e8a70..1de974acbc6 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
@@ -83,6 +83,14 @@ MODULE_PARM_DESC(nofbaccel, "Disable fbcon acceleration");
int nouveau_nofbaccel = 0;
module_param_named(nofbaccel, nouveau_nofbaccel, int, 0400);
+MODULE_PARM_DESC(override_conntype, "Ignore DCB connector type");
+int nouveau_override_conntype = 0;
+module_param_named(override_conntype, nouveau_override_conntype, int, 0400);
+
+MODULE_PARM_DESC(tv_disable, "Disable TV-out detection");
+int nouveau_tv_disable = 0;
+module_param_named(tv_disable, nouveau_tv_disable, int, 0400);
+
MODULE_PARM_DESC(tv_norm, "Default TV norm.\n"
"\t\tSupported: PAL, PAL-M, PAL-N, PAL-Nc, NTSC-M, NTSC-J,\n"
"\t\t\thd480i, hd480p, hd576i, hd576p, hd720p, hd1080i.\n"
@@ -154,9 +162,11 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state)
if (pm_state.event == PM_EVENT_PRETHAW)
return 0;
+ NV_INFO(dev, "Disabling fbcon acceleration...\n");
fbdev_flags = dev_priv->fbdev_info->flags;
dev_priv->fbdev_info->flags |= FBINFO_HWACCEL_DISABLED;
+ NV_INFO(dev, "Unpinning framebuffer(s)...\n");
list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
struct nouveau_framebuffer *nouveau_fb;
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 4b9aaf2a8d0..d8b55901177 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -681,6 +681,7 @@ extern int nouveau_uscript_tmds;
extern int nouveau_vram_pushbuf;
extern int nouveau_vram_notify;
extern int nouveau_fbpercrtc;
+extern int nouveau_tv_disable;
extern char *nouveau_tv_norm;
extern int nouveau_reg_debug;
extern char *nouveau_vbios;
@@ -688,6 +689,7 @@ extern int nouveau_ctxfw;
extern int nouveau_ignorelid;
extern int nouveau_nofbaccel;
extern int nouveau_noaccel;
+extern int nouveau_override_conntype;
extern int nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state);
extern int nouveau_pci_resume(struct pci_dev *pdev);
@@ -926,6 +928,10 @@ extern void nv40_fb_takedown(struct drm_device *);
extern void nv40_fb_set_region_tiling(struct drm_device *, int, uint32_t,
uint32_t, uint32_t);
+/* nv50_fb.c */
+extern int nv50_fb_init(struct drm_device *);
+extern void nv50_fb_takedown(struct drm_device *);
+
/* nv04_fifo.c */
extern int nv04_fifo_init(struct drm_device *);
extern void nv04_fifo_disable(struct drm_device *);
diff --git a/drivers/gpu/drm/nouveau/nouveau_irq.c b/drivers/gpu/drm/nouveau/nouveau_irq.c
index 95220ddebb4..2bd59a92fee 100644
--- a/drivers/gpu/drm/nouveau/nouveau_irq.c
+++ b/drivers/gpu/drm/nouveau/nouveau_irq.c
@@ -311,6 +311,31 @@ nouveau_print_bitfield_names_(uint32_t value,
#define nouveau_print_bitfield_names(val, namelist) \
nouveau_print_bitfield_names_((val), (namelist), ARRAY_SIZE(namelist))
+struct nouveau_enum_names {
+ uint32_t value;
+ const char *name;
+};
+
+static void
+nouveau_print_enum_names_(uint32_t value,
+ const struct nouveau_enum_names *namelist,
+ const int namelist_len)
+{
+ /*
+ * Caller must have already printed the KERN_* log level for us.
+ * Also the caller is responsible for adding the newline.
+ */
+ int i;
+ for (i = 0; i < namelist_len; ++i) {
+ if (value == namelist[i].value) {
+ printk("%s", namelist[i].name);
+ return;
+ }
+ }
+ printk("unknown value 0x%08x", value);
+}
+#define nouveau_print_enum_names(val, namelist) \
+ nouveau_print_enum_names_((val), (namelist), ARRAY_SIZE(namelist))
static int
nouveau_graph_chid_from_grctx(struct drm_device *dev)
@@ -427,14 +452,16 @@ nouveau_graph_dump_trap_info(struct drm_device *dev, const char *id,
struct drm_nouveau_private *dev_priv = dev->dev_private;
uint32_t nsource = trap->nsource, nstatus = trap->nstatus;
- NV_INFO(dev, "%s - nSource:", id);
- nouveau_print_bitfield_names(nsource, nsource_names);
- printk(", nStatus:");
- if (dev_priv->card_type < NV_10)
- nouveau_print_bitfield_names(nstatus, nstatus_names);
- else
- nouveau_print_bitfield_names(nstatus, nstatus_names_nv10);
- printk("\n");
+ if (dev_priv->card_type < NV_50) {
+ NV_INFO(dev, "%s - nSource:", id);
+ nouveau_print_bitfield_names(nsource, nsource_names);
+ printk(", nStatus:");
+ if (dev_priv->card_type < NV_10)
+ nouveau_print_bitfield_names(nstatus, nstatus_names);
+ else
+ nouveau_print_bitfield_names(nstatus, nstatus_names_nv10);
+ printk("\n");
+ }
NV_INFO(dev, "%s - Ch %d/%d Class 0x%04x Mthd 0x%04x "
"Data 0x%08x:0x%08x\n",
@@ -578,27 +605,502 @@ nouveau_pgraph_irq_handler(struct drm_device *dev)
}
static void
+nv50_pfb_vm_trap(struct drm_device *dev, int display, const char *name)
+{
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+ uint32_t trap[6];
+ int i, ch;
+ uint32_t idx = nv_rd32(dev, 0x100c90);
+ if (idx & 0x80000000) {
+ idx &= 0xffffff;
+ if (display) {
+ for (i = 0; i < 6; i++) {
+ nv_wr32(dev, 0x100c90, idx | i << 24);
+ trap[i] = nv_rd32(dev, 0x100c94);
+ }
+ for (ch = 0; ch < dev_priv->engine.fifo.channels; ch++) {
+ struct nouveau_channel *chan = dev_priv->fifos[ch];
+
+ if (!chan || !chan->ramin)
+ continue;
+
+ if (trap[1] == chan->ramin->instance >> 12)
+ break;
+ }
+ NV_INFO(dev, "%s - VM: Trapped %s at %02x%04x%04x status %08x %08x channel %d\n",
+ name, (trap[5]&0x100?"read":"write"),
+ trap[5]&0xff, trap[4]&0xffff,
+ trap[3]&0xffff, trap[0], trap[2], ch);
+ }
+ nv_wr32(dev, 0x100c90, idx | 0x80000000);
+ } else if (display) {
+ NV_INFO(dev, "%s - no VM fault?\n", name);
+ }
+}
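
[Editor's note: nv50_pfb_vm_trap() above reads the fault record through an indexed window: the register at 0x100c90 selects which of six data words appears at 0x100c94. A self-contained sketch of that index/data access pattern follows; the fake register file, values, and accessor names are all invented for illustration.]

#include <stdint.h>
#include <stdio.h>

#define IDX_REG  0x100c90
#define DATA_REG 0x100c94

/* A tiny fake register file standing in for the hardware. */
static uint32_t fake_idx;
static const uint32_t fake_rec[8] = { 0x8000dead, 0x42, 0, 0xbeef, 0, 0x100, 0, 0 };

static void dev_wr32(uint32_t reg, uint32_t val)
{
	if (reg == IDX_REG)
		fake_idx = val;
}

static uint32_t dev_rd32(uint32_t reg)
{
	if (reg == DATA_REG)
		return fake_rec[(fake_idx >> 24) & 7];
	return 0;
}

/* Pull the six-word fault record out through the index/data pair,
 * mirroring the loop above: the top bits of the index register
 * select which word shows up at DATA_REG. */
static void read_trap_record(uint32_t idx, uint32_t trap[6])
{
	int i;

	for (i = 0; i < 6; i++) {
		dev_wr32(IDX_REG, idx | i << 24);
		trap[i] = dev_rd32(DATA_REG);
	}
}

int main(void)
{
	uint32_t trap[6];

	read_trap_record(0x1234, trap);
	printf("status %08x inst %05x\n", trap[0], trap[1]);
	return 0;
}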
+
+static struct nouveau_enum_names nv50_mp_exec_error_names[] =
+{
+ { 3, "STACK_UNDERFLOW" },
+ { 4, "QUADON_ACTIVE" },
+ { 8, "TIMEOUT" },
+ { 0x10, "INVALID_OPCODE" },
+ { 0x40, "BREAKPOINT" },
+};
+
+static void
+nv50_pgraph_mp_trap(struct drm_device *dev, int tpid, int display)
+{
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+ uint32_t units = nv_rd32(dev, 0x1540);
+ uint32_t addr, mp10, status, pc, oplow, ophigh;
+ int i;
+ int mps = 0;
+ for (i = 0; i < 4; i++) {
+ if (!(units & 1 << (i+24)))
+ continue;
+ if (dev_priv->chipset < 0xa0)
+ addr = 0x408200 + (tpid << 12) + (i << 7);
+ else
+ addr = 0x408100 + (tpid << 11) + (i << 7);
+ mp10 = nv_rd32(dev, addr + 0x10);
+ status = nv_rd32(dev, addr + 0x14);
+ if (!status)
+ continue;
+ if (display) {
+ nv_rd32(dev, addr + 0x20);
+ pc = nv_rd32(dev, addr + 0x24);
+ oplow = nv_rd32(dev, addr + 0x70);
+ ophigh = nv_rd32(dev, addr + 0x74);
+ NV_INFO(dev, "PGRAPH_TRAP_MP_EXEC - "
+ "TP %d MP %d: ", tpid, i);
+ nouveau_print_enum_names(status,
+ nv50_mp_exec_error_names);
+ printk(" at %06x warp %d, opcode %08x %08x\n",
+ pc&0xffffff, pc >> 24,
+ oplow, ophigh);
+ }
+ nv_wr32(dev, addr + 0x10, mp10);
+ nv_wr32(dev, addr + 0x14, 0);
+ mps++;
+ }
+ if (!mps && display)
+ NV_INFO(dev, "PGRAPH_TRAP_MP_EXEC - TP %d: "
+ "No MPs claiming errors?\n", tpid);
+}
+
+static void
+nv50_pgraph_tp_trap(struct drm_device *dev, int type, uint32_t ustatus_old,
+ uint32_t ustatus_new, int display, const char *name)
+{
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+ int tps = 0;
+ uint32_t units = nv_rd32(dev, 0x1540);
+ int i, r;
+ uint32_t ustatus_addr, ustatus;
+ for (i = 0; i < 16; i++) {
+ if (!(units & (1 << i)))
+ continue;
+ if (dev_priv->chipset < 0xa0)
+ ustatus_addr = ustatus_old + (i << 12);
+ else
+ ustatus_addr = ustatus_new + (i << 11);
+ ustatus = nv_rd32(dev, ustatus_addr) & 0x7fffffff;
+ if (!ustatus)
+ continue;
+ tps++;
+ switch (type) {
+ case 6: /* texture error... unknown for now */
+ nv50_pfb_vm_trap(dev, display, name);
+ if (display) {
+ NV_ERROR(dev, "magic set %d:\n", i);
+ for (r = ustatus_addr + 4; r <= ustatus_addr + 0x10; r += 4)
+ NV_ERROR(dev, "\t0x%08x: 0x%08x\n", r,
+ nv_rd32(dev, r));
+ }
+ break;
+ case 7: /* MP error */
+ if (ustatus & 0x00010000) {
+ nv50_pgraph_mp_trap(dev, i, display);
+ ustatus &= ~0x00010000;
+ }
+ break;
+ case 8: /* TPDMA error */
+ {
+ uint32_t e0c = nv_rd32(dev, ustatus_addr + 4);
+ uint32_t e10 = nv_rd32(dev, ustatus_addr + 8);
+ uint32_t e14 = nv_rd32(dev, ustatus_addr + 0xc);
+ uint32_t e18 = nv_rd32(dev, ustatus_addr + 0x10);
+ uint32_t e1c = nv_rd32(dev, ustatus_addr + 0x14);
+ uint32_t e20 = nv_rd32(dev, ustatus_addr + 0x18);
+ uint32_t e24 = nv_rd32(dev, ustatus_addr + 0x1c);
+ nv50_pfb_vm_trap(dev, display, name);
+ /* 2d engine destination */
+ if (ustatus & 0x00000010) {
+ if (display) {
+ NV_INFO(dev, "PGRAPH_TRAP_TPDMA_2D - TP %d - Unknown fault at address %02x%08x\n",
+ i, e14, e10);
+ NV_INFO(dev, "PGRAPH_TRAP_TPDMA_2D - TP %d - e0c: %08x, e18: %08x, e1c: %08x, e20: %08x, e24: %08x\n",
+ i, e0c, e18, e1c, e20, e24);
+ }
+ ustatus &= ~0x00000010;
+ }
+ /* Render target */
+ if (ustatus & 0x00000040) {
+ if (display) {
+ NV_INFO(dev, "PGRAPH_TRAP_TPDMA_RT - TP %d - Unknown fault at address %02x%08x\n",
+ i, e14, e10);
+ NV_INFO(dev, "PGRAPH_TRAP_TPDMA_RT - TP %d - e0c: %08x, e18: %08x, e1c: %08x, e20: %08x, e24: %08x\n",
+ i, e0c, e18, e1c, e20, e24);
+ }
+ ustatus &= ~0x00000040;
+ }
+ /* CUDA memory: l[], g[] or stack. */
+ if (ustatus & 0x00000080) {
+ if (display) {
+ if (e18 & 0x80000000) {
+ /* g[] read fault? */
+ NV_INFO(dev, "PGRAPH_TRAP_TPDMA - TP %d - Global read fault at address %02x%08x\n",
+ i, e14, e10 | ((e18 >> 24) & 0x1f));
+ e18 &= ~0x1f000000;
+ } else if (e18 & 0xc) {
+ /* g[] write fault? */
+ NV_INFO(dev, "PGRAPH_TRAP_TPDMA - TP %d - Global write fault at address %02x%08x\n",
+ i, e14, e10 | ((e18 >> 7) & 0x1f));
+ e18 &= ~0x00000f80;
+ } else {
+ NV_INFO(dev, "PGRAPH_TRAP_TPDMA - TP %d - Unknown CUDA fault at address %02x%08x\n",
+ i, e14, e10);
+ }
+ NV_INFO(dev, "PGRAPH_TRAP_TPDMA - TP %d - e0c: %08x, e18: %08x, e1c: %08x, e20: %08x, e24: %08x\n",
+ i, e0c, e18, e1c, e20, e24);
+ }
+ ustatus &= ~0x00000080;
+ }
+ }
+ break;
+ }
+ if (ustatus) {
+ if (display)
+ NV_INFO(dev, "%s - TP%d: Unhandled ustatus 0x%08x\n", name, i, ustatus);
+ }
+ nv_wr32(dev, ustatus_addr, 0xc0000000);
+ }
+
+ if (!tps && display)
+ NV_INFO(dev, "%s - No TPs claiming errors?\n", name);
+}
+
+static void
+nv50_pgraph_trap_handler(struct drm_device *dev)
+{
+ struct nouveau_pgraph_trap trap;
+ uint32_t status = nv_rd32(dev, 0x400108);
+ uint32_t ustatus;
+ int display = nouveau_ratelimit();
+

+ if (!status && display) {
+ nouveau_graph_trap_info(dev, &trap);
+ nouveau_graph_dump_trap_info(dev, "PGRAPH_TRAP", &trap);
+ NV_INFO(dev, "PGRAPH_TRAP - no units reporting traps?\n");
+ }
+
+ /* DISPATCH: Relays commands to other units and handles NOTIFY,
+ * COND, QUERY. If you get a trap from it, the command is still stuck
+ * in DISPATCH and you need to do something about it. */
+ if (status & 0x001) {
+ ustatus = nv_rd32(dev, 0x400804) & 0x7fffffff;
+ if (!ustatus && display) {
+ NV_INFO(dev, "PGRAPH_TRAP_DISPATCH - no ustatus?\n");
+ }
+
+ /* Known to be triggered by screwed up NOTIFY and COND... */
+ if (ustatus & 0x00000001) {
+ nv50_pfb_vm_trap(dev, display, "PGRAPH_TRAP_DISPATCH_FAULT");
+ nv_wr32(dev, 0x400500, 0);
+ if (nv_rd32(dev, 0x400808) & 0x80000000) {
+ if (display) {
+ if (nouveau_graph_trapped_channel(dev, &trap.channel))
+ trap.channel = -1;
+ trap.class = nv_rd32(dev, 0x400814);
+ trap.mthd = nv_rd32(dev, 0x400808) & 0x1ffc;
+ trap.subc = (nv_rd32(dev, 0x400808) >> 16) & 0x7;
+ trap.data = nv_rd32(dev, 0x40080c);
+ trap.data2 = nv_rd32(dev, 0x400810);
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_TRAP_DISPATCH_FAULT", &trap);
+ NV_INFO(dev, "PGRAPH_TRAP_DISPATCH_FAULT - 400808: %08x\n", nv_rd32(dev, 0x400808));
+ NV_INFO(dev, "PGRAPH_TRAP_DISPATCH_FAULT - 400848: %08x\n", nv_rd32(dev, 0x400848));
+ }
+ nv_wr32(dev, 0x400808, 0);
+ } else if (display) {
+ NV_INFO(dev, "PGRAPH_TRAP_DISPATCH_FAULT - No stuck command?\n");
+ }
+ nv_wr32(dev, 0x4008e8, nv_rd32(dev, 0x4008e8) & 3);
+ nv_wr32(dev, 0x400848, 0);
+ ustatus &= ~0x00000001;
+ }
+ if (ustatus & 0x00000002) {
+ nv50_pfb_vm_trap(dev, display, "PGRAPH_TRAP_DISPATCH_QUERY");
+ nv_wr32(dev, 0x400500, 0);
+ if (nv_rd32(dev, 0x40084c) & 0x80000000) {
+ if (display) {
+ if (nouveau_graph_trapped_channel(dev, &trap.channel))
+ trap.channel = -1;
+ trap.class = nv_rd32(dev, 0x400814);
+ trap.mthd = nv_rd32(dev, 0x40084c) & 0x1ffc;
+ trap.subc = (nv_rd32(dev, 0x40084c) >> 16) & 0x7;
+ trap.data = nv_rd32(dev, 0x40085c);
+ trap.data2 = 0;
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_TRAP_DISPATCH_QUERY", &trap);
+ NV_INFO(dev, "PGRAPH_TRAP_DISPATCH_QUERY - 40084c: %08x\n", nv_rd32(dev, 0x40084c));
+ }
+ nv_wr32(dev, 0x40084c, 0);
+ } else if (display) {
+ NV_INFO(dev, "PGRAPH_TRAP_DISPATCH_QUERY - No stuck command?\n");
+ }
+ ustatus &= ~0x00000002;
+ }
+ if (ustatus && display)
+ NV_INFO(dev, "PGRAPH_TRAP_DISPATCH - Unhandled ustatus 0x%08x\n", ustatus);
+ nv_wr32(dev, 0x400804, 0xc0000000);
+ nv_wr32(dev, 0x400108, 0x001);
+ status &= ~0x001;
+ }
+
+ /* TRAPs other than dispatch use the "normal" trap regs. */
+ if (status && display) {
+ nouveau_graph_trap_info(dev, &trap);
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_TRAP", &trap);
+ }
+
+ /* M2MF: Memory to memory copy engine. */
+ if (status & 0x002) {
+ ustatus = nv_rd32(dev, 0x406800) & 0x7fffffff;
+ if (!ustatus && display) {
+ NV_INFO(dev, "PGRAPH_TRAP_M2MF - no ustatus?\n");
+ }
+ if (ustatus & 0x00000001) {
+ nv50_pfb_vm_trap(dev, display, "PGRAPH_TRAP_M2MF_NOTIFY");
+ ustatus &= ~0x00000001;
+ }
+ if (ustatus & 0x00000002) {
+ nv50_pfb_vm_trap(dev, display, "PGRAPH_TRAP_M2MF_IN");
+ ustatus &= ~0x00000002;
+ }
+ if (ustatus & 0x00000004) {
+ nv50_pfb_vm_trap(dev, display, "PGRAPH_TRAP_M2MF_OUT");
+ ustatus &= ~0x00000004;
+ }
+ NV_INFO (dev, "PGRAPH_TRAP_M2MF - %08x %08x %08x %08x\n",
+ nv_rd32(dev, 0x406804),
+ nv_rd32(dev, 0x406808),
+ nv_rd32(dev, 0x40680c),
+ nv_rd32(dev, 0x406810));
+ if (ustatus && display)
+ NV_INFO(dev, "PGRAPH_TRAP_M2MF - Unhandled ustatus 0x%08x\n", ustatus);
+ /* No sane way found yet -- just reset the bugger. */
+ nv_wr32(dev, 0x400040, 2);
+ nv_wr32(dev, 0x400040, 0);
+ nv_wr32(dev, 0x406800, 0xc0000000);
+ nv_wr32(dev, 0x400108, 0x002);
+ status &= ~0x002;
+ }
+
+ /* VFETCH: Fetches data from vertex buffers. */
+ if (status & 0x004) {
+ ustatus = nv_rd32(dev, 0x400c04) & 0x7fffffff;
+ if (!ustatus && display) {
+ NV_INFO(dev, "PGRAPH_TRAP_VFETCH - no ustatus?\n");
+ }
+ if (ustatus & 0x00000001) {
+ nv50_pfb_vm_trap(dev, display, "PGRAPH_TRAP_VFETCH_FAULT");
+ NV_INFO (dev, "PGRAPH_TRAP_VFETCH_FAULT - %08x %08x %08x %08x\n",
+ nv_rd32(dev, 0x400c00),
+ nv_rd32(dev, 0x400c08),
+ nv_rd32(dev, 0x400c0c),
+ nv_rd32(dev, 0x400c10));
+ ustatus &= ~0x00000001;
+ }
+ if (ustatus && display)
+ NV_INFO(dev, "PGRAPH_TRAP_VFETCH - Unhandled ustatus 0x%08x\n", ustatus);
+ nv_wr32(dev, 0x400c04, 0xc0000000);
+ nv_wr32(dev, 0x400108, 0x004);
+ status &= ~0x004;
+ }
+
+ /* STRMOUT: DirectX streamout / OpenGL transform feedback. */
+ if (status & 0x008) {
+ ustatus = nv_rd32(dev, 0x401800) & 0x7fffffff;
+ if (!ustatus && display) {
+ NV_INFO(dev, "PGRAPH_TRAP_STRMOUT - no ustatus?\n");
+ }
+ if (ustatus & 0x00000001) {
+ nv50_pfb_vm_trap(dev, display, "PGRAPH_TRAP_STRMOUT_FAULT");
+ NV_INFO (dev, "PGRAPH_TRAP_STRMOUT_FAULT - %08x %08x %08x %08x\n",
+ nv_rd32(dev, 0x401804),
+ nv_rd32(dev, 0x401808),
+ nv_rd32(dev, 0x40180c),
+ nv_rd32(dev, 0x401810));
+ ustatus &= ~0x00000001;
+ }
+ if (ustatus && display)
+ NV_INFO(dev, "PGRAPH_TRAP_STRMOUT - Unhandled ustatus 0x%08x\n", ustatus);
+ /* No sane way found yet -- just reset the bugger. */
+ nv_wr32(dev, 0x400040, 0x80);
+ nv_wr32(dev, 0x400040, 0);
+ nv_wr32(dev, 0x401800, 0xc0000000);
+ nv_wr32(dev, 0x400108, 0x008);
+ status &= ~0x008;
+ }
+
+ /* CCACHE: Handles code and c[] caches and fills them. */
+ if (status & 0x010) {
+ ustatus = nv_rd32(dev, 0x405018) & 0x7fffffff;
+ if (!ustatus && display) {
+ NV_INFO(dev, "PGRAPH_TRAP_CCACHE - no ustatus?\n");
+ }
+ if (ustatus & 0x00000001) {
+ nv50_pfb_vm_trap(dev, display, "PGRAPH_TRAP_CCACHE_FAULT");
+ NV_INFO (dev, "PGRAPH_TRAP_CCACHE_FAULT - %08x %08x %08x %08x %08x %08x %08x\n",
+ nv_rd32(dev, 0x405800),
+ nv_rd32(dev, 0x405804),
+ nv_rd32(dev, 0x405808),
+ nv_rd32(dev, 0x40580c),
+ nv_rd32(dev, 0x405810),
+ nv_rd32(dev, 0x405814),
+ nv_rd32(dev, 0x40581c));
+ ustatus &= ~0x00000001;
+ }
+ if (ustatus && display)
+ NV_INFO(dev, "PGRAPH_TRAP_CCACHE - Unhandled ustatus 0x%08x\n", ustatus);
+ nv_wr32(dev, 0x405018, 0xc0000000);
+ nv_wr32(dev, 0x400108, 0x010);
+ status &= ~0x010;
+ }
+
+ /* Unknown, not seen yet... 0x402000 is the only trap status reg
+ * remaining, so try to handle it anyway. Perhaps related to that
+ * unknown DMA slot on tesla? */
+ if (status & 0x20) {
+ nv50_pfb_vm_trap(dev, display, "PGRAPH_TRAP_UNKC04");
+ ustatus = nv_rd32(dev, 0x402000) & 0x7fffffff;
+ if (display)
+ NV_INFO(dev, "PGRAPH_TRAP_UNKC04 - Unhandled ustatus 0x%08x\n", ustatus);
+ nv_wr32(dev, 0x402000, 0xc0000000);
+ /* no status modification on purpose */
+ }
+
+ /* TEXTURE: CUDA texturing units */
+ if (status & 0x040) {
+ nv50_pgraph_tp_trap (dev, 6, 0x408900, 0x408600, display,
+ "PGRAPH_TRAP_TEXTURE");
+ nv_wr32(dev, 0x400108, 0x040);
+ status &= ~0x040;
+ }
+
+ /* MP: CUDA execution engines. */
+ if (status & 0x080) {
+ nv50_pgraph_tp_trap (dev, 7, 0x408314, 0x40831c, display,
+ "PGRAPH_TRAP_MP");
+ nv_wr32(dev, 0x400108, 0x080);
+ status &= ~0x080;
+ }
+
+ /* TPDMA: Handles TP-initiated uncached memory accesses:
+ * l[], g[], stack, 2d surfaces, render targets. */
+ if (status & 0x100) {
+ nv50_pgraph_tp_trap (dev, 8, 0x408e08, 0x408708, display,
+ "PGRAPH_TRAP_TPDMA");
+ nv_wr32(dev, 0x400108, 0x100);
+ status &= ~0x100;
+ }
+
+ if (status) {
+ if (display)
+ NV_INFO(dev, "PGRAPH_TRAP - Unknown trap 0x%08x\n",
+ status);
+ nv_wr32(dev, 0x400108, status);
+ }
+}
+
+/* There must be a *lot* of these. Will take some time to gather them up. */
+static struct nouveau_enum_names nv50_data_error_names[] =
+{
+ { 4, "INVALID_VALUE" },
+ { 5, "INVALID_ENUM" },
+ { 8, "INVALID_OBJECT" },
+ { 0xc, "INVALID_BITFIELD" },
+ { 0x28, "MP_NO_REG_SPACE" },
+ { 0x2b, "MP_BLOCK_SIZE_MISMATCH" },
+};
+
+static void
nv50_pgraph_irq_handler(struct drm_device *dev)
{
+ struct nouveau_pgraph_trap trap;
+ int unhandled = 0;
uint32_t status;
while ((status = nv_rd32(dev, NV03_PGRAPH_INTR))) {
- uint32_t nsource = nv_rd32(dev, NV03_PGRAPH_NSOURCE);
-
+ /* NOTIFY: You've set a NOTIFY on a command and it's done. */
if (status & 0x00000001) {
- nouveau_pgraph_intr_notify(dev, nsource);
+ nouveau_graph_trap_info(dev, &trap);
+ if (nouveau_ratelimit())
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_NOTIFY", &trap);
status &= ~0x00000001;
nv_wr32(dev, NV03_PGRAPH_INTR, 0x00000001);
}
- if (status & 0x00000010) {
- nouveau_pgraph_intr_error(dev, nsource |
- NV03_PGRAPH_NSOURCE_ILLEGAL_MTHD);
+ /* COMPUTE_QUERY: Purpose and exact cause unknown, happens
+ * when you write 0x200 to 0x50c0 method 0x31c. */
+ if (status & 0x00000002) {
+ nouveau_graph_trap_info(dev, &trap);
+ if (nouveau_ratelimit())
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_COMPUTE_QUERY", &trap);
+ status &= ~0x00000002;
+ nv_wr32(dev, NV03_PGRAPH_INTR, 0x00000002);
+ }
+ /* Unknown, never seen: 0x4 */
+
+ /* ILLEGAL_MTHD: You used a wrong method for this class. */
+ if (status & 0x00000010) {
+ nouveau_graph_trap_info(dev, &trap);
+ if (nouveau_pgraph_intr_swmthd(dev, &trap))
+ unhandled = 1;
+ if (unhandled && nouveau_ratelimit())
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_ILLEGAL_MTHD", &trap);
status &= ~0x00000010;
nv_wr32(dev, NV03_PGRAPH_INTR, 0x00000010);
}
+ /* ILLEGAL_CLASS: You used a wrong class. */
+ if (status & 0x00000020) {
+ nouveau_graph_trap_info(dev, &trap);
+ if (nouveau_ratelimit())
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_ILLEGAL_CLASS", &trap);
+ status &= ~0x00000020;
+ nv_wr32(dev, NV03_PGRAPH_INTR, 0x00000020);
+ }
+
+ /* DOUBLE_NOTIFY: You tried to set a NOTIFY on another NOTIFY. */
+ if (status & 0x00000040) {
+ nouveau_graph_trap_info(dev, &trap);
+ if (nouveau_ratelimit())
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_DOUBLE_NOTIFY", &trap);
+ status &= ~0x00000040;
+ nv_wr32(dev, NV03_PGRAPH_INTR, 0x00000040);
+ }
+
+ /* CONTEXT_SWITCH: PGRAPH needs us to load a new context */
if (status & 0x00001000) {
nv_wr32(dev, 0x400500, 0x00000000);
nv_wr32(dev, NV03_PGRAPH_INTR,
@@ -613,49 +1115,59 @@ nv50_pgraph_irq_handler(struct drm_device *dev)
status &= ~NV_PGRAPH_INTR_CONTEXT_SWITCH;
}
- if (status & 0x00100000) {
- nouveau_pgraph_intr_error(dev, nsource |
- NV03_PGRAPH_NSOURCE_DATA_ERROR);
+ /* BUFFER_NOTIFY: Your m2mf transfer finished */
+ if (status & 0x00010000) {
+ nouveau_graph_trap_info(dev, &trap);
+ if (nouveau_ratelimit())
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_BUFFER_NOTIFY", &trap);
+ status &= ~0x00010000;
+ nv_wr32(dev, NV03_PGRAPH_INTR, 0x00010000);
+ }
+ /* DATA_ERROR: Invalid value for this method, or invalid
+ * state in current PGRAPH context for this operation */
+ if (status & 0x00100000) {
+ nouveau_graph_trap_info(dev, &trap);
+ if (nouveau_ratelimit()) {
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_DATA_ERROR", &trap);
+ NV_INFO (dev, "PGRAPH_DATA_ERROR - ");
+ nouveau_print_enum_names(nv_rd32(dev, 0x400110),
+ nv50_data_error_names);
+ printk("\n");
+ }
status &= ~0x00100000;
nv_wr32(dev, NV03_PGRAPH_INTR, 0x00100000);
}
+ /* TRAP: Something bad happened in the middle of command
+ * execution. Has a billion types, subtypes, and even
+ * subsubtypes. */
if (status & 0x00200000) {
- int r;
-
- nouveau_pgraph_intr_error(dev, nsource |
- NV03_PGRAPH_NSOURCE_PROTECTION_ERROR);
-
- NV_ERROR(dev, "magic set 1:\n");
- for (r = 0x408900; r <= 0x408910; r += 4)
- NV_ERROR(dev, "\t0x%08x: 0x%08x\n", r,
- nv_rd32(dev, r));
- nv_wr32(dev, 0x408900,
- nv_rd32(dev, 0x408904) | 0xc0000000);
- for (r = 0x408e08; r <= 0x408e24; r += 4)
- NV_ERROR(dev, "\t0x%08x: 0x%08x\n", r,
- nv_rd32(dev, r));
- nv_wr32(dev, 0x408e08,
- nv_rd32(dev, 0x408e08) | 0xc0000000);
-
- NV_ERROR(dev, "magic set 2:\n");
- for (r = 0x409900; r <= 0x409910; r += 4)
- NV_ERROR(dev, "\t0x%08x: 0x%08x\n", r,
- nv_rd32(dev, r));
- nv_wr32(dev, 0x409900,
- nv_rd32(dev, 0x409904) | 0xc0000000);
- for (r = 0x409e08; r <= 0x409e24; r += 4)
- NV_ERROR(dev, "\t0x%08x: 0x%08x\n", r,
- nv_rd32(dev, r));
- nv_wr32(dev, 0x409e08,
- nv_rd32(dev, 0x409e08) | 0xc0000000);
-
+ nv50_pgraph_trap_handler(dev);
status &= ~0x00200000;
- nv_wr32(dev, NV03_PGRAPH_NSOURCE, nsource);
nv_wr32(dev, NV03_PGRAPH_INTR, 0x00200000);
}
+ /* Unknown, never seen: 0x00400000 */
+
+ /* SINGLE_STEP: Happens on every method if you turned on
+ * single stepping in 40008c */
+ if (status & 0x01000000) {
+ nouveau_graph_trap_info(dev, &trap);
+ if (nouveau_ratelimit())
+ nouveau_graph_dump_trap_info(dev,
+ "PGRAPH_SINGLE_STEP", &trap);
+ status &= ~0x01000000;
+ nv_wr32(dev, NV03_PGRAPH_INTR, 0x01000000);
+ }
+
+ /* 0x02000000 happens when you pause a ctxprog...
+ * but the only way this can happen that I know of is by
+ * poking the relevant MMIO register, and we don't
+ * do that. */
+
if (status) {
NV_INFO(dev, "Unhandled PGRAPH_INTR - 0x%08x\n",
status);
@@ -672,7 +1184,8 @@ nv50_pgraph_irq_handler(struct drm_device *dev)
}
nv_wr32(dev, NV03_PMC_INTR_0, NV_PMC_INTR_0_PGRAPH_PENDING);
- nv_wr32(dev, 0x400824, nv_rd32(dev, 0x400824) & ~(1 << 31));
+ if (nv_rd32(dev, 0x400824) & (1 << 31))
+ nv_wr32(dev, 0x400824, nv_rd32(dev, 0x400824) & ~(1 << 31));
}
static void
diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c
index eb8f084d5f5..58b46807de2 100644
--- a/drivers/gpu/drm/nouveau/nouveau_state.c
+++ b/drivers/gpu/drm/nouveau/nouveau_state.c
@@ -35,7 +35,6 @@
#include "nouveau_drm.h"
#include "nv50_display.h"
-static int nouveau_stub_init(struct drm_device *dev) { return 0; }
static void nouveau_stub_takedown(struct drm_device *dev) {}
static int nouveau_init_engine_ptrs(struct drm_device *dev)
@@ -277,8 +276,8 @@ static int nouveau_init_engine_ptrs(struct drm_device *dev)
engine->timer.init = nv04_timer_init;
engine->timer.read = nv04_timer_read;
engine->timer.takedown = nv04_timer_takedown;
- engine->fb.init = nouveau_stub_init;
- engine->fb.takedown = nouveau_stub_takedown;
+ engine->fb.init = nv50_fb_init;
+ engine->fb.takedown = nv50_fb_takedown;
engine->graph.grclass = nv50_graph_grclass;
engine->graph.init = nv50_graph_init;
engine->graph.takedown = nv50_graph_takedown;
diff --git a/drivers/gpu/drm/nouveau/nv04_crtc.c b/drivers/gpu/drm/nouveau/nv04_crtc.c
index a1d1ebb073d..eba687f1099 100644
--- a/drivers/gpu/drm/nouveau/nv04_crtc.c
+++ b/drivers/gpu/drm/nouveau/nv04_crtc.c
@@ -230,9 +230,9 @@ nv_crtc_mode_set_vga(struct drm_crtc *crtc, struct drm_display_mode *mode)
struct drm_framebuffer *fb = crtc->fb;
/* Calculate our timings */
- int horizDisplay = (mode->crtc_hdisplay >> 3) - 1;
- int horizStart = (mode->crtc_hsync_start >> 3) - 1;
- int horizEnd = (mode->crtc_hsync_end >> 3) - 1;
+ int horizDisplay = (mode->crtc_hdisplay >> 3) - 1;
+ int horizStart = (mode->crtc_hsync_start >> 3) + 1;
+ int horizEnd = (mode->crtc_hsync_end >> 3) + 1;
int horizTotal = (mode->crtc_htotal >> 3) - 5;
int horizBlankStart = (mode->crtc_hdisplay >> 3) - 1;
int horizBlankEnd = (mode->crtc_htotal >> 3) - 1;
diff --git a/drivers/gpu/drm/nouveau/nv04_fbcon.c b/drivers/gpu/drm/nouveau/nv04_fbcon.c
index 3da90c2c4e6..813b25cec72 100644
--- a/drivers/gpu/drm/nouveau/nv04_fbcon.c
+++ b/drivers/gpu/drm/nouveau/nv04_fbcon.c
@@ -118,8 +118,8 @@ nv04_fbcon_imageblit(struct fb_info *info, const struct fb_image *image)
return;
}
- width = ALIGN(image->width, 32);
- dsize = (width * image->height) >> 5;
+ width = ALIGN(image->width, 8);
+ dsize = ALIGN(width * image->height, 32) >> 5;
if (info->fix.visual == FB_VISUAL_TRUECOLOR ||
info->fix.visual == FB_VISUAL_DIRECTCOLOR) {
@@ -136,8 +136,8 @@ nv04_fbcon_imageblit(struct fb_info *info, const struct fb_image *image)
((image->dx + image->width) & 0xffff));
OUT_RING(chan, bg);
OUT_RING(chan, fg);
- OUT_RING(chan, (image->height << 16) | image->width);
OUT_RING(chan, (image->height << 16) | width);
+ OUT_RING(chan, (image->height << 16) | image->width);
OUT_RING(chan, (image->dy << 16) | (image->dx & 0xffff));
while (dsize) {
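
[Editor's note: the sizing fix above is easier to see in isolation. The source glyph bitmap is 1 bit per pixel, scanlines are padded to 8 pixels rather than 32, and the data is streamed to the ring in 32-bit words, so the word count must round the total bit count up to 32. A standalone sketch of the corrected arithmetic, with ALIGN assumed equivalent to the kernel's power-of-two definition:]

#include <stdio.h>
#include <stdint.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))	/* power-of-two align */

int main(void)
{
	uint32_t w = 12, h = 22;			/* e.g. a 12x22 glyph */
	uint32_t width = ALIGN(w, 8);			/* scanline padded to 8 px */
	uint32_t dsize = ALIGN(width * h, 32) >> 5;	/* 32-bit words to send */

	printf("padded width %u px, %u words\n", width, dsize);	/* 16 px, 11 words */
	return 0;
}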
diff --git a/drivers/gpu/drm/nouveau/nv50_display.c b/drivers/gpu/drm/nouveau/nv50_display.c
index 61a89f2dc55..fac6c88a2b1 100644
--- a/drivers/gpu/drm/nouveau/nv50_display.c
+++ b/drivers/gpu/drm/nouveau/nv50_display.c
@@ -522,8 +522,8 @@ int nv50_display_create(struct drm_device *dev)
}
for (i = 0 ; i < dcb->connector.entries; i++) {
- if (i != 0 && dcb->connector.entry[i].index ==
- dcb->connector.entry[i - 1].index)
+ if (i != 0 && dcb->connector.entry[i].index2 ==
+ dcb->connector.entry[i - 1].index2)
continue;
nouveau_connector_create(dev, &dcb->connector.entry[i]);
}
diff --git a/drivers/gpu/drm/nouveau/nv50_fb.c b/drivers/gpu/drm/nouveau/nv50_fb.c
new file mode 100644
index 00000000000..a95e6941ba8
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nv50_fb.c
@@ -0,0 +1,32 @@
+#include "drmP.h"
+#include "drm.h"
+#include "nouveau_drv.h"
+#include "nouveau_drm.h"
+
+int
+nv50_fb_init(struct drm_device *dev)
+{
+ /* This is needed to get meaningful information from 100c90
+ * on traps. No idea what these values mean exactly. */
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+ switch (dev_priv->chipset) {
+ case 0x50:
+ nv_wr32(dev, 0x100c90, 0x0707ff);
+ break;
+ case 0xa5:
+ case 0xa8:
+ nv_wr32(dev, 0x100c90, 0x0d0fff);
+ break;
+ default:
+ nv_wr32(dev, 0x100c90, 0x1d07ff);
+ break;
+ }
+
+ return 0;
+}
+
+void
+nv50_fb_takedown(struct drm_device *dev)
+{
+}
diff --git a/drivers/gpu/drm/nouveau/nv50_fbcon.c b/drivers/gpu/drm/nouveau/nv50_fbcon.c
index 993c7126fbd..25a3cd8794f 100644
--- a/drivers/gpu/drm/nouveau/nv50_fbcon.c
+++ b/drivers/gpu/drm/nouveau/nv50_fbcon.c
@@ -233,7 +233,7 @@ nv50_fbcon_accel_init(struct fb_info *info)
BEGIN_RING(chan, NvSub2D, 0x0808, 3);
OUT_RING(chan, 0);
OUT_RING(chan, 0);
- OUT_RING(chan, 0);
+ OUT_RING(chan, 1);
BEGIN_RING(chan, NvSub2D, 0x081c, 1);
OUT_RING(chan, 1);
BEGIN_RING(chan, NvSub2D, 0x0840, 4);
diff --git a/drivers/gpu/drm/nouveau/nv50_graph.c b/drivers/gpu/drm/nouveau/nv50_graph.c
index 857a09671a3..c62b33a02f8 100644
--- a/drivers/gpu/drm/nouveau/nv50_graph.c
+++ b/drivers/gpu/drm/nouveau/nv50_graph.c
@@ -56,6 +56,10 @@ nv50_graph_init_intr(struct drm_device *dev)
static void
nv50_graph_init_regs__nv(struct drm_device *dev)
{
+ struct drm_nouveau_private *dev_priv = dev->dev_private;
+ uint32_t units = nv_rd32(dev, 0x1540);
+ int i;
+
NV_DEBUG(dev, "\n");
nv_wr32(dev, 0x400804, 0xc0000000);
@@ -65,6 +69,20 @@ nv50_graph_init_regs__nv(struct drm_device *dev)
nv_wr32(dev, 0x405018, 0xc0000000);
nv_wr32(dev, 0x402000, 0xc0000000);
+ for (i = 0; i < 16; i++) {
+ if (units & 1 << i) {
+ if (dev_priv->chipset < 0xa0) {
+ nv_wr32(dev, 0x408900 + (i << 12), 0xc0000000);
+ nv_wr32(dev, 0x408e08 + (i << 12), 0xc0000000);
+ nv_wr32(dev, 0x408314 + (i << 12), 0xc0000000);
+ } else {
+ nv_wr32(dev, 0x408600 + (i << 11), 0xc0000000);
+ nv_wr32(dev, 0x408708 + (i << 11), 0xc0000000);
+ nv_wr32(dev, 0x40831c + (i << 11), 0xc0000000);
+ }
+ }
+ }
+
nv_wr32(dev, 0x400108, 0xffffffff);
nv_wr32(dev, 0x400824, 0x00004000);
@@ -229,10 +247,6 @@ nv50_graph_create_context(struct nouveau_channel *chan)
nouveau_grctx_vals_load(dev, ctx);
}
nv_wo32(dev, ctx, 0x00000/4, chan->ramin->instance >> 12);
- if ((dev_priv->chipset & 0xf0) == 0xa0)
- nv_wo32(dev, ctx, 0x00004/4, 0x00000000);
- else
- nv_wo32(dev, ctx, 0x0011c/4, 0x00000000);
dev_priv->engine.instmem.finish_access(dev);
return 0;
diff --git a/drivers/gpu/drm/nouveau/nv50_grctx.c b/drivers/gpu/drm/nouveau/nv50_grctx.c
index d105fcd42ca..546b31949a3 100644
--- a/drivers/gpu/drm/nouveau/nv50_grctx.c
+++ b/drivers/gpu/drm/nouveau/nv50_grctx.c
@@ -64,6 +64,9 @@
#define CP_FLAG_ALWAYS ((2 * 32) + 13)
#define CP_FLAG_ALWAYS_FALSE 0
#define CP_FLAG_ALWAYS_TRUE 1
+#define CP_FLAG_INTR ((2 * 32) + 15)
+#define CP_FLAG_INTR_NOT_PENDING 0
+#define CP_FLAG_INTR_PENDING 1
#define CP_CTX 0x00100000
#define CP_CTX_COUNT 0x000f0000
@@ -214,6 +217,8 @@ nv50_grctx_init(struct nouveau_grctx *ctx)
cp_name(ctx, cp_setup_save);
cp_set (ctx, UNK1D, SET);
cp_wait(ctx, STATUS, BUSY);
+ cp_wait(ctx, INTR, PENDING);
+ cp_bra (ctx, STATUS, BUSY, cp_setup_save);
cp_set (ctx, UNK01, SET);
cp_set (ctx, SWAP_DIRECTION, SAVE);
@@ -269,7 +274,7 @@ nv50_graph_construct_mmio(struct nouveau_grctx *ctx)
int offset, base;
uint32_t units = nv_rd32 (ctx->dev, 0x1540);
- /* 0800 */
+ /* 0800: DISPATCH */
cp_ctx(ctx, 0x400808, 7);
gr_def(ctx, 0x400814, 0x00000030);
cp_ctx(ctx, 0x400834, 0x32);
@@ -300,7 +305,7 @@ nv50_graph_construct_mmio(struct nouveau_grctx *ctx)
gr_def(ctx, 0x400b20, 0x0001629d);
}
- /* 0C00 */
+ /* 0C00: VFETCH */
cp_ctx(ctx, 0x400c08, 0x2);
gr_def(ctx, 0x400c08, 0x0000fe0c);
@@ -326,7 +331,7 @@ nv50_graph_construct_mmio(struct nouveau_grctx *ctx)
cp_ctx(ctx, 0x401540, 0x5);
gr_def(ctx, 0x401550, 0x00001018);
- /* 1800 */
+ /* 1800: STREAMOUT */
cp_ctx(ctx, 0x401814, 0x1);
gr_def(ctx, 0x401814, 0x000000ff);
if (dev_priv->chipset == 0x50) {
@@ -641,7 +646,7 @@ nv50_graph_construct_mmio(struct nouveau_grctx *ctx)
if (dev_priv->chipset == 0x50)
cp_ctx(ctx, 0x4063e0, 0x1);
- /* 6800 */
+ /* 6800: M2MF */
if (dev_priv->chipset < 0x90) {
cp_ctx(ctx, 0x406814, 0x2b);
gr_def(ctx, 0x406818, 0x00000f80);
diff --git a/drivers/gpu/drm/radeon/Makefile b/drivers/gpu/drm/radeon/Makefile
index ed38262d998..3c91312dea9 100644
--- a/drivers/gpu/drm/radeon/Makefile
+++ b/drivers/gpu/drm/radeon/Makefile
@@ -50,7 +50,7 @@ $(obj)/r600_cs.o: $(obj)/r600_reg_safe.h
radeon-y := radeon_drv.o radeon_cp.o radeon_state.o radeon_mem.o \
radeon_irq.o r300_cmdbuf.o r600_cp.o
# add KMS driver
-radeon-y += radeon_device.o radeon_kms.o \
+radeon-y += radeon_device.o radeon_asic.o radeon_kms.o \
radeon_atombios.o radeon_agp.o atombios_crtc.o radeon_combios.o \
atom.o radeon_fence.o radeon_ttm.o radeon_object.o radeon_gart.o \
radeon_legacy_crtc.o radeon_legacy_encoders.o radeon_connectors.o \
diff --git a/drivers/gpu/drm/radeon/atom.c b/drivers/gpu/drm/radeon/atom.c
index d75788feac6..247f8ee7e94 100644
--- a/drivers/gpu/drm/radeon/atom.c
+++ b/drivers/gpu/drm/radeon/atom.c
@@ -52,15 +52,17 @@
typedef struct {
struct atom_context *ctx;
-
uint32_t *ps, *ws;
int ps_shift;
uint16_t start;
+ unsigned last_jump;
+ unsigned long last_jump_jiffies;
+ bool abort;
} atom_exec_context;
int atom_debug = 0;
-static void atom_execute_table_locked(struct atom_context *ctx, int index, uint32_t * params);
-void atom_execute_table(struct atom_context *ctx, int index, uint32_t * params);
+static int atom_execute_table_locked(struct atom_context *ctx, int index, uint32_t * params);
+int atom_execute_table(struct atom_context *ctx, int index, uint32_t * params);
static uint32_t atom_arg_mask[8] =
{ 0xFFFFFFFF, 0xFFFF, 0xFFFF00, 0xFFFF0000, 0xFF, 0xFF00, 0xFF0000,
@@ -604,12 +606,17 @@ static void atom_op_beep(atom_exec_context *ctx, int *ptr, int arg)
static void atom_op_calltable(atom_exec_context *ctx, int *ptr, int arg)
{
int idx = U8((*ptr)++);
+ int r = 0;
+
if (idx < ATOM_TABLE_NAMES_CNT)
SDEBUG(" table: %d (%s)\n", idx, atom_table_names[idx]);
else
SDEBUG(" table: %d\n", idx);
if (U16(ctx->ctx->cmd_table + 4 + 2 * idx))
- atom_execute_table_locked(ctx->ctx, idx, ctx->ps + ctx->ps_shift);
+ r = atom_execute_table_locked(ctx->ctx, idx, ctx->ps + ctx->ps_shift);
+ if (r) {
+ ctx->abort = true;
+ }
}
static void atom_op_clear(atom_exec_context *ctx, int *ptr, int arg)
@@ -673,6 +680,8 @@ static void atom_op_eot(atom_exec_context *ctx, int *ptr, int arg)
static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
{
int execute = 0, target = U16(*ptr);
+ unsigned long cjiffies;
+
(*ptr) += 2;
switch (arg) {
case ATOM_COND_ABOVE:
@@ -700,8 +709,25 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
if (arg != ATOM_COND_ALWAYS)
SDEBUG(" taken: %s\n", execute ? "yes" : "no");
SDEBUG(" target: 0x%04X\n", target);
- if (execute)
+ if (execute) {
+ if (ctx->last_jump == (ctx->start + target)) {
+ cjiffies = jiffies;
+ if (time_after(cjiffies, ctx->last_jump_jiffies)) {
+ cjiffies -= ctx->last_jump_jiffies;
+ if ((jiffies_to_msecs(cjiffies) > 1000)) {
+ DRM_ERROR("atombios stuck in loop for more than 1sec aborting\n");
+ ctx->abort = true;
+ }
+ } else {
+ /* jiffies wrapped around; we will just wait a little longer */
+ ctx->last_jump_jiffies = jiffies;
+ }
+ } else {
+ ctx->last_jump = ctx->start + target;
+ ctx->last_jump_jiffies = jiffies;
+ }
*ptr = ctx->start + target;
+ }
}
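
[Editor's note: the watchdog added to atom_op_jump() only trips when the interpreter keeps taking the same jump target for more than a second, which catches BIOS tables that spin forever. Ported out of the kernel as a hedged sketch (time() in place of jiffies, all names invented), the logic looks like this:]

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct exec_ctx {
	unsigned last_jump;	/* last taken jump target */
	time_t last_jump_time;	/* when we first saw it */
	bool abort;
};

/* Returns true if the jump may be taken, false once the same target
 * has been spinning for more than a second. Illustration only. */
static bool check_jump(struct exec_ctx *ctx, unsigned target)
{
	time_t now = time(NULL);

	if (ctx->last_jump == target) {
		if (now - ctx->last_jump_time > 1) {
			fprintf(stderr, "stuck in loop, aborting\n");
			ctx->abort = true;
			return false;
		}
	} else {
		ctx->last_jump = target;
		ctx->last_jump_time = now;
	}
	return true;
}

int main(void)
{
	struct exec_ctx ctx = { 0, 0, false };

	while (check_jump(&ctx, 0x40))
		;	/* spins until the watchdog trips (~1s) */
	return ctx.abort ? 1 : 0;
}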
static void atom_op_mask(atom_exec_context *ctx, int *ptr, int arg)
@@ -1104,7 +1130,7 @@ static struct {
atom_op_shr, ATOM_ARG_MC}, {
atom_op_debug, 0},};
-static void atom_execute_table_locked(struct atom_context *ctx, int index, uint32_t * params)
+static int atom_execute_table_locked(struct atom_context *ctx, int index, uint32_t * params)
{
int base = CU16(ctx->cmd_table + 4 + 2 * index);
int len, ws, ps, ptr;
@@ -1112,7 +1138,7 @@ static void atom_execute_table_locked(struct atom_context *ctx, int index, uint3
atom_exec_context ectx;
if (!base)
- return;
+ return -EINVAL;
len = CU16(base + ATOM_CT_SIZE_PTR);
ws = CU8(base + ATOM_CT_WS_PTR);
@@ -1125,6 +1151,8 @@ static void atom_execute_table_locked(struct atom_context *ctx, int index, uint3
ectx.ps_shift = ps / 4;
ectx.start = base;
ectx.ps = params;
+ ectx.abort = false;
+ ectx.last_jump = 0;
if (ws)
ectx.ws = kzalloc(4 * ws, GFP_KERNEL);
else
@@ -1137,6 +1165,11 @@ static void atom_execute_table_locked(struct atom_context *ctx, int index, uint3
SDEBUG("%s @ 0x%04X\n", atom_op_names[op], ptr - 1);
else
SDEBUG("[%d] @ 0x%04X\n", op, ptr - 1);
+ if (ectx.abort) {
+ DRM_ERROR("atombios stuck executing %04X (len %d, WS %d, PS %d) @ 0x%04X\n",
+ base, len, ws, ps, ptr - 1);
+ return -EINVAL;
+ }
if (op < ATOM_OP_CNT && op > 0)
opcode_table[op].func(&ectx, &ptr,
@@ -1152,10 +1185,13 @@ static void atom_execute_table_locked(struct atom_context *ctx, int index, uint3
if (ws)
kfree(ectx.ws);
+ return 0;
}
-void atom_execute_table(struct atom_context *ctx, int index, uint32_t * params)
+int atom_execute_table(struct atom_context *ctx, int index, uint32_t * params)
{
+ int r;
+
mutex_lock(&ctx->mutex);
/* reset reg block */
ctx->reg_block = 0;
@@ -1163,8 +1199,9 @@ void atom_execute_table(struct atom_context *ctx, int index, uint32_t * params)
ctx->fb_base = 0;
/* reset io mode */
ctx->io_mode = ATOM_IO_MM;
- atom_execute_table_locked(ctx, index, params);
+ r = atom_execute_table_locked(ctx, index, params);
mutex_unlock(&ctx->mutex);
+ return r;
}
static int atom_iio_len[] = { 1, 2, 3, 3, 3, 3, 4, 4, 4, 3 };
@@ -1248,9 +1285,7 @@ int atom_asic_init(struct atom_context *ctx)
if (!CU16(ctx->cmd_table + 4 + 2 * ATOM_CMD_INIT))
return 1;
- atom_execute_table(ctx, ATOM_CMD_INIT, ps);
-
- return 0;
+ return atom_execute_table(ctx, ATOM_CMD_INIT, ps);
}
void atom_destroy(struct atom_context *ctx)
@@ -1260,12 +1295,16 @@ void atom_destroy(struct atom_context *ctx)
kfree(ctx);
}
-void atom_parse_data_header(struct atom_context *ctx, int index,
+bool atom_parse_data_header(struct atom_context *ctx, int index,
uint16_t * size, uint8_t * frev, uint8_t * crev,
uint16_t * data_start)
{
int offset = index * 2 + 4;
int idx = CU16(ctx->data_table + offset);
+ u16 *mdt = (u16 *)(ctx->bios + ctx->data_table + 4);
+
+ if (!mdt[index])
+ return false;
if (size)
*size = CU16(idx);
@@ -1274,38 +1313,42 @@ void atom_parse_data_header(struct atom_context *ctx, int index,
if (crev)
*crev = CU8(idx + 3);
*data_start = idx;
- return;
+ return true;
}
-void atom_parse_cmd_header(struct atom_context *ctx, int index, uint8_t * frev,
+bool atom_parse_cmd_header(struct atom_context *ctx, int index, uint8_t * frev,
uint8_t * crev)
{
int offset = index * 2 + 4;
int idx = CU16(ctx->cmd_table + offset);
+ u16 *mct = (u16 *)(ctx->bios + ctx->cmd_table + 4);
+
+ if (!mct[index])
+ return false;
if (frev)
*frev = CU8(idx + 2);
if (crev)
*crev = CU8(idx + 3);
- return;
+ return true;
}
int atom_allocate_fb_scratch(struct atom_context *ctx)
{
int index = GetIndexIntoMasterTable(DATA, VRAM_UsageByFirmware);
uint16_t data_offset;
- int usage_bytes;
+ int usage_bytes = 0;
struct _ATOM_VRAM_USAGE_BY_FIRMWARE *firmware_usage;
- atom_parse_data_header(ctx, index, NULL, NULL, NULL, &data_offset);
+ if (atom_parse_data_header(ctx, index, NULL, NULL, NULL, &data_offset)) {
+ firmware_usage = (struct _ATOM_VRAM_USAGE_BY_FIRMWARE *)(ctx->bios + data_offset);
- firmware_usage = (struct _ATOM_VRAM_USAGE_BY_FIRMWARE *)(ctx->bios + data_offset);
+ DRM_DEBUG("atom firmware requested %08x %dkb\n",
+ firmware_usage->asFirmwareVramReserveInfo[0].ulStartAddrUsedByFirmware,
+ firmware_usage->asFirmwareVramReserveInfo[0].usFirmwareUseInKb);
- DRM_DEBUG("atom firmware requested %08x %dkb\n",
- firmware_usage->asFirmwareVramReserveInfo[0].ulStartAddrUsedByFirmware,
- firmware_usage->asFirmwareVramReserveInfo[0].usFirmwareUseInKb);
-
- usage_bytes = firmware_usage->asFirmwareVramReserveInfo[0].usFirmwareUseInKb * 1024;
+ usage_bytes = firmware_usage->asFirmwareVramReserveInfo[0].usFirmwareUseInKb * 1024;
+ }
if (usage_bytes == 0)
usage_bytes = 20 * 1024;
/* allocate some scratch memory */
diff --git a/drivers/gpu/drm/radeon/atom.h b/drivers/gpu/drm/radeon/atom.h
index bc73781423a..cd1b64ab5ca 100644
--- a/drivers/gpu/drm/radeon/atom.h
+++ b/drivers/gpu/drm/radeon/atom.h
@@ -140,11 +140,13 @@ struct atom_context {
extern int atom_debug;
struct atom_context *atom_parse(struct card_info *, void *);
-void atom_execute_table(struct atom_context *, int, uint32_t *);
+int atom_execute_table(struct atom_context *, int, uint32_t *);
int atom_asic_init(struct atom_context *);
void atom_destroy(struct atom_context *);
-void atom_parse_data_header(struct atom_context *ctx, int index, uint16_t *size, uint8_t *frev, uint8_t *crev, uint16_t *data_start);
-void atom_parse_cmd_header(struct atom_context *ctx, int index, uint8_t *frev, uint8_t *crev);
+bool atom_parse_data_header(struct atom_context *ctx, int index, uint16_t *size,
+ uint8_t *frev, uint8_t *crev, uint16_t *data_start);
+bool atom_parse_cmd_header(struct atom_context *ctx, int index,
+ uint8_t *frev, uint8_t *crev);
int atom_allocate_fb_scratch(struct atom_context *ctx);
#include "atom-types.h"
#include "atombios.h"
diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c
index dd9fdf56061..fd4ef6d1884 100644
--- a/drivers/gpu/drm/radeon/atombios_crtc.c
+++ b/drivers/gpu/drm/radeon/atombios_crtc.c
@@ -353,12 +353,55 @@ static void atombios_crtc_set_timing(struct drm_crtc *crtc,
atom_execute_table(rdev->mode_info.atom_context, index, (uint32_t *)&args);
}
+static void atombios_disable_ss(struct drm_crtc *crtc)
+{
+ struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc);
+ struct drm_device *dev = crtc->dev;
+ struct radeon_device *rdev = dev->dev_private;
+ u32 ss_cntl;
+
+ if (ASIC_IS_DCE4(rdev)) {
+ switch (radeon_crtc->pll_id) {
+ case ATOM_PPLL1:
+ ss_cntl = RREG32(EVERGREEN_P1PLL_SS_CNTL);
+ ss_cntl &= ~EVERGREEN_PxPLL_SS_EN;
+ WREG32(EVERGREEN_P1PLL_SS_CNTL, ss_cntl);
+ break;
+ case ATOM_PPLL2:
+ ss_cntl = RREG32(EVERGREEN_P2PLL_SS_CNTL);
+ ss_cntl &= ~EVERGREEN_PxPLL_SS_EN;
+ WREG32(EVERGREEN_P2PLL_SS_CNTL, ss_cntl);
+ break;
+ case ATOM_DCPLL:
+ case ATOM_PPLL_INVALID:
+ return;
+ }
+ } else if (ASIC_IS_AVIVO(rdev)) {
+ switch (radeon_crtc->pll_id) {
+ case ATOM_PPLL1:
+ ss_cntl = RREG32(AVIVO_P1PLL_INT_SS_CNTL);
+ ss_cntl &= ~1;
+ WREG32(AVIVO_P1PLL_INT_SS_CNTL, ss_cntl);
+ break;
+ case ATOM_PPLL2:
+ ss_cntl = RREG32(AVIVO_P2PLL_INT_SS_CNTL);
+ ss_cntl &= ~1;
+ WREG32(AVIVO_P2PLL_INT_SS_CNTL, ss_cntl);
+ break;
+ case ATOM_DCPLL:
+ case ATOM_PPLL_INVALID:
+ return;
+ }
+ }
+}
+
union atom_enable_ss {
ENABLE_LVDS_SS_PARAMETERS legacy;
ENABLE_SPREAD_SPECTRUM_ON_PPLL_PS_ALLOCATION v1;
};
-static void atombios_set_ss(struct drm_crtc *crtc, int enable)
+static void atombios_enable_ss(struct drm_crtc *crtc)
{
struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc);
struct drm_device *dev = crtc->dev;
@@ -387,9 +430,9 @@ static void atombios_set_ss(struct drm_crtc *crtc, int enable)
step = dig->ss->step;
delay = dig->ss->delay;
range = dig->ss->range;
- } else if (enable)
+ } else
return;
- } else if (enable)
+ } else
return;
break;
}
@@ -406,13 +449,13 @@ static void atombios_set_ss(struct drm_crtc *crtc, int enable)
args.v1.ucSpreadSpectrumDelay = delay;
args.v1.ucSpreadSpectrumRange = range;
args.v1.ucPpll = radeon_crtc->crtc_id ? ATOM_PPLL2 : ATOM_PPLL1;
- args.v1.ucEnable = enable;
+ args.v1.ucEnable = ATOM_ENABLE;
} else {
args.legacy.usSpreadSpectrumPercentage = cpu_to_le16(percentage);
args.legacy.ucSpreadSpectrumType = type;
args.legacy.ucSpreadSpectrumStepSize_Delay = (step & 3) << 2;
args.legacy.ucSpreadSpectrumStepSize_Delay |= (delay & 7) << 4;
- args.legacy.ucEnable = enable;
+ args.legacy.ucEnable = ATOM_ENABLE;
}
atom_execute_table(rdev->mode_info.atom_context, index, (uint32_t *)&args);
}
@@ -478,11 +521,6 @@ static u32 atombios_adjust_pll(struct drm_crtc *crtc,
/* DVO wants 2x pixel clock if the DVO chip is in 12 bit mode */
if (radeon_encoder->encoder_id == ENCODER_OBJECT_ID_INTERNAL_KLDSCP_DVO1)
adjusted_clock = mode->clock * 2;
- /* LVDS PLL quirks */
- if (encoder->encoder_type == DRM_MODE_ENCODER_LVDS) {
- struct radeon_encoder_atom_dig *dig = radeon_encoder->enc_priv;
- pll->algo = dig->pll_algo;
- }
} else {
if (encoder->encoder_type != DRM_MODE_ENCODER_DAC)
pll->flags |= RADEON_PLL_NO_ODD_POST_DIV;
@@ -503,8 +541,9 @@ static u32 atombios_adjust_pll(struct drm_crtc *crtc,
int index;
index = GetIndexIntoMasterTable(COMMAND, AdjustDisplayPll);
- atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev,
- &crev);
+ if (!atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev,
+ &crev))
+ return adjusted_clock;
memset(&args, 0, sizeof(args));
@@ -542,11 +581,16 @@ static u32 atombios_adjust_pll(struct drm_crtc *crtc,
}
} else if (radeon_encoder->devices & (ATOM_DEVICE_LCD_SUPPORT)) {
/* may want to enable SS on DP/eDP eventually */
- args.v3.sInput.ucDispPllConfig |=
- DISPPLL_CONFIG_SS_ENABLE;
- if (mode->clock > 165000)
+ /*args.v3.sInput.ucDispPllConfig |=
+ DISPPLL_CONFIG_SS_ENABLE;*/
+ if (encoder_mode == ATOM_ENCODER_MODE_DP)
args.v3.sInput.ucDispPllConfig |=
- DISPPLL_CONFIG_DUAL_LINK;
+ DISPPLL_CONFIG_COHERENT_MODE;
+ else {
+ if (mode->clock > 165000)
+ args.v3.sInput.ucDispPllConfig |=
+ DISPPLL_CONFIG_DUAL_LINK;
+ }
}
atom_execute_table(rdev->mode_info.atom_context,
index, (uint32_t *)&args);
@@ -592,8 +636,9 @@ static void atombios_crtc_set_dcpll(struct drm_crtc *crtc)
memset(&args, 0, sizeof(args));
index = GetIndexIntoMasterTable(COMMAND, SetPixelClock);
- atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev,
- &crev);
+ if (!atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev,
+ &crev))
+ return;
switch (frev) {
case 1:
@@ -667,8 +712,9 @@ static void atombios_crtc_set_pll(struct drm_crtc *crtc, struct drm_display_mode
&ref_div, &post_div);
index = GetIndexIntoMasterTable(COMMAND, SetPixelClock);
- atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev,
- &crev);
+ if (!atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev,
+ &crev))
+ return;
switch (frev) {
case 1:
@@ -1083,15 +1129,12 @@ int atombios_crtc_mode_set(struct drm_crtc *crtc,
/* TODO color tiling */
- /* pick pll */
- radeon_crtc->pll_id = radeon_atom_pick_pll(crtc);
-
- atombios_set_ss(crtc, 0);
+ atombios_disable_ss(crtc);
/* always set DCPLL */
if (ASIC_IS_DCE4(rdev))
atombios_crtc_set_dcpll(crtc);
atombios_crtc_set_pll(crtc, adjusted_mode);
- atombios_set_ss(crtc, 1);
+ atombios_enable_ss(crtc);
if (ASIC_IS_DCE4(rdev))
atombios_set_crtc_dtd_timing(crtc, adjusted_mode);
@@ -1120,6 +1163,11 @@ static bool atombios_crtc_mode_fixup(struct drm_crtc *crtc,
static void atombios_crtc_prepare(struct drm_crtc *crtc)
{
+ struct radeon_crtc *radeon_crtc = to_radeon_crtc(crtc);
+
+ /* pick pll */
+ radeon_crtc->pll_id = radeon_atom_pick_pll(crtc);
+
atombios_lock_crtc(crtc, ATOM_ENABLE);
atombios_crtc_dpms(crtc, DRM_MODE_DPMS_OFF);
}
diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c
index 8a133bda00a..28b31c64f48 100644
--- a/drivers/gpu/drm/radeon/atombios_dp.c
+++ b/drivers/gpu/drm/radeon/atombios_dp.c
@@ -745,14 +745,14 @@ void dp_link_train(struct drm_encoder *encoder,
>> DP_TRAIN_PRE_EMPHASIS_SHIFT);
/* disable the training pattern on the sink */
+ dp_set_training(radeon_connector, DP_TRAINING_PATTERN_DISABLE);
+
+ /* disable the training pattern on the source */
if (ASIC_IS_DCE4(rdev))
atombios_dig_encoder_setup(encoder, ATOM_ENCODER_CMD_DP_LINK_TRAINING_COMPLETE);
else
radeon_dp_encoder_service(rdev, ATOM_DP_ACTION_TRAINING_COMPLETE,
dig_connector->dp_clock, enc_id, 0);
-
- radeon_dp_encoder_service(rdev, ATOM_DP_ACTION_TRAINING_COMPLETE,
- dig_connector->dp_clock, enc_id, 0);
}
int radeon_dp_i2c_aux_ch(struct i2c_adapter *adapter, int mode,
diff --git a/drivers/gpu/drm/radeon/evergreen.c b/drivers/gpu/drm/radeon/evergreen.c
index bd2e7aa85c1..647a0efdc35 100644
--- a/drivers/gpu/drm/radeon/evergreen.c
+++ b/drivers/gpu/drm/radeon/evergreen.c
@@ -25,6 +25,7 @@
#include <linux/platform_device.h>
#include "drmP.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "radeon_drm.h"
#include "rv770d.h"
#include "atom.h"
@@ -436,7 +437,6 @@ static void evergreen_gpu_init(struct radeon_device *rdev)
int evergreen_mc_init(struct radeon_device *rdev)
{
- fixed20_12 a;
u32 tmp;
int chansize, numchan;
@@ -481,12 +481,8 @@ int evergreen_mc_init(struct radeon_device *rdev)
rdev->mc.real_vram_size = rdev->mc.aper_size;
}
r600_vram_gtt_location(rdev, &rdev->mc);
- /* FIXME: we should enforce default clock in case GPU is not in
- * default setup
- */
- a.full = rfixed_const(100);
- rdev->pm.sclk.full = rfixed_const(rdev->clock.default_sclk);
- rdev->pm.sclk.full = rfixed_div(rdev->pm.sclk, a);
+ radeon_update_bandwidth_info(rdev);
+
return 0;
}
@@ -746,6 +742,7 @@ int evergreen_init(struct radeon_device *rdev)
void evergreen_fini(struct radeon_device *rdev)
{
+ radeon_pm_fini(rdev);
evergreen_suspend(rdev);
#if 0
r600_blit_fini(rdev);
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 91eb762eb3f..3ae51ada1ab 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -31,6 +31,7 @@
#include "radeon_drm.h"
#include "radeon_reg.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "r100d.h"
#include "rs100d.h"
#include "rv200d.h"
@@ -235,9 +236,9 @@ int r100_pci_gart_set_page(struct radeon_device *rdev, int i, uint64_t addr)
void r100_pci_gart_fini(struct radeon_device *rdev)
{
+ radeon_gart_fini(rdev);
r100_pci_gart_disable(rdev);
radeon_gart_table_ram_free(rdev);
- radeon_gart_fini(rdev);
}
int r100_irq_set(struct radeon_device *rdev)
@@ -312,10 +313,12 @@ int r100_irq_process(struct radeon_device *rdev)
/* Vertical blank interrupts */
if (status & RADEON_CRTC_VBLANK_STAT) {
drm_handle_vblank(rdev->ddev, 0);
+ rdev->pm.vblank_sync = true;
wake_up(&rdev->irq.vblank_queue);
}
if (status & RADEON_CRTC2_VBLANK_STAT) {
drm_handle_vblank(rdev->ddev, 1);
+ rdev->pm.vblank_sync = true;
wake_up(&rdev->irq.vblank_queue);
}
if (status & RADEON_FP_DETECT_STAT) {
@@ -741,6 +744,8 @@ int r100_cp_init(struct radeon_device *rdev, unsigned ring_size)
udelay(10);
rdev->cp.rptr = RREG32(RADEON_CP_RB_RPTR);
rdev->cp.wptr = RREG32(RADEON_CP_RB_WPTR);
+ /* protect against crazy HW on resume */
+ rdev->cp.wptr &= rdev->cp.ptr_mask;
/* Set cp mode to bus mastering & enable cp*/
WREG32(RADEON_CP_CSQ_MODE,
REG_SET(RADEON_INDIRECT2_START, indirect2_start) |
@@ -1804,6 +1809,7 @@ void r100_set_common_regs(struct radeon_device *rdev)
{
struct drm_device *dev = rdev->ddev;
bool force_dac2 = false;
+ u32 tmp;
/* set these so they don't interfere with anything */
WREG32(RADEON_OV0_SCALE_CNTL, 0);
@@ -1875,6 +1881,12 @@ void r100_set_common_regs(struct radeon_device *rdev)
WREG32(RADEON_DISP_HW_DEBUG, disp_hw_debug);
WREG32(RADEON_DAC_CNTL2, dac2_cntl);
}
+
+ /* switch PM block to ACPI mode */
+ tmp = RREG32_PLL(RADEON_PLL_PWRMGT_CNTL);
+ tmp &= ~RADEON_PM_MODE_SEL;
+ WREG32_PLL(RADEON_PLL_PWRMGT_CNTL, tmp);
+
}
/*
@@ -2022,6 +2034,7 @@ void r100_mc_init(struct radeon_device *rdev)
radeon_vram_location(rdev, &rdev->mc, base);
if (!(rdev->flags & RADEON_IS_AGP))
radeon_gtt_location(rdev, &rdev->mc);
+ radeon_update_bandwidth_info(rdev);
}
@@ -2385,6 +2398,8 @@ void r100_bandwidth_update(struct radeon_device *rdev)
uint32_t pixel_bytes1 = 0;
uint32_t pixel_bytes2 = 0;
+ radeon_update_display_priority(rdev);
+
if (rdev->mode_info.crtcs[0]->base.enabled) {
mode1 = &rdev->mode_info.crtcs[0]->base.mode;
pixel_bytes1 = rdev->mode_info.crtcs[0]->base.fb->bits_per_pixel / 8;
@@ -2413,11 +2428,8 @@ void r100_bandwidth_update(struct radeon_device *rdev)
/*
* determine is there is enough bw for current mode
*/
- mclk_ff.full = rfixed_const(rdev->clock.default_mclk);
- temp_ff.full = rfixed_const(100);
- mclk_ff.full = rfixed_div(mclk_ff, temp_ff);
- sclk_ff.full = rfixed_const(rdev->clock.default_sclk);
- sclk_ff.full = rfixed_div(sclk_ff, temp_ff);
+ sclk_ff = rdev->pm.sclk;
+ mclk_ff = rdev->pm.mclk;
temp = (rdev->mc.vram_width / 8) * (rdev->mc.vram_is_ddr ? 2 : 1);
temp_ff.full = rfixed_const(temp);
@@ -3440,6 +3452,7 @@ int r100_suspend(struct radeon_device *rdev)
void r100_fini(struct radeon_device *rdev)
{
+ radeon_pm_fini(rdev);
r100_cp_fini(rdev);
r100_wb_fini(rdev);
r100_ib_fini(rdev);
diff --git a/drivers/gpu/drm/radeon/r200.c b/drivers/gpu/drm/radeon/r200.c
index 1146c9909c2..85617c31121 100644
--- a/drivers/gpu/drm/radeon/r200.c
+++ b/drivers/gpu/drm/radeon/r200.c
@@ -30,6 +30,7 @@
#include "radeon_drm.h"
#include "radeon_reg.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "r100d.h"
#include "r200_reg_safe.h"
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 4cef90cd74e..1023eeb6587 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -30,6 +30,7 @@
#include "drm.h"
#include "radeon_reg.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "radeon_drm.h"
#include "r100_track.h"
#include "r300d.h"
@@ -164,9 +165,9 @@ void rv370_pcie_gart_disable(struct radeon_device *rdev)
void rv370_pcie_gart_fini(struct radeon_device *rdev)
{
+ radeon_gart_fini(rdev);
rv370_pcie_gart_disable(rdev);
radeon_gart_table_vram_free(rdev);
- radeon_gart_fini(rdev);
}
void r300_fence_ring_emit(struct radeon_device *rdev,
@@ -481,6 +482,7 @@ void r300_mc_init(struct radeon_device *rdev)
radeon_vram_location(rdev, &rdev->mc, base);
if (!(rdev->flags & RADEON_IS_AGP))
radeon_gtt_location(rdev, &rdev->mc);
+ radeon_update_bandwidth_info(rdev);
}
void rv370_set_pcie_lanes(struct radeon_device *rdev, int lanes)
@@ -1334,6 +1336,7 @@ int r300_suspend(struct radeon_device *rdev)
void r300_fini(struct radeon_device *rdev)
{
+ radeon_pm_fini(rdev);
r100_cp_fini(rdev);
r100_wb_fini(rdev);
r100_ib_fini(rdev);
diff --git a/drivers/gpu/drm/radeon/r420.c b/drivers/gpu/drm/radeon/r420.c
index c7593b8f58e..0b8603ca697 100644
--- a/drivers/gpu/drm/radeon/r420.c
+++ b/drivers/gpu/drm/radeon/r420.c
@@ -29,6 +29,7 @@
#include "drmP.h"
#include "radeon_reg.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "atom.h"
#include "r100d.h"
#include "r420d.h"
@@ -266,6 +267,7 @@ int r420_suspend(struct radeon_device *rdev)
void r420_fini(struct radeon_device *rdev)
{
+ radeon_pm_fini(rdev);
r100_cp_fini(rdev);
r100_wb_fini(rdev);
r100_ib_fini(rdev);
diff --git a/drivers/gpu/drm/radeon/r520.c b/drivers/gpu/drm/radeon/r520.c
index 2b8a5dd1351..3c44b8d3931 100644
--- a/drivers/gpu/drm/radeon/r520.c
+++ b/drivers/gpu/drm/radeon/r520.c
@@ -27,6 +27,7 @@
*/
#include "drmP.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "atom.h"
#include "r520d.h"
@@ -121,19 +122,13 @@ static void r520_vram_get_type(struct radeon_device *rdev)
void r520_mc_init(struct radeon_device *rdev)
{
- fixed20_12 a;
r520_vram_get_type(rdev);
r100_vram_init_sizes(rdev);
radeon_vram_location(rdev, &rdev->mc, 0);
if (!(rdev->flags & RADEON_IS_AGP))
radeon_gtt_location(rdev, &rdev->mc);
- /* FIXME: we should enforce default clock in case GPU is not in
- * default setup
- */
- a.full = rfixed_const(100);
- rdev->pm.sclk.full = rfixed_const(rdev->clock.default_sclk);
- rdev->pm.sclk.full = rfixed_div(rdev->pm.sclk, a);
+ radeon_update_bandwidth_info(rdev);
}
void r520_mc_program(struct radeon_device *rdev)
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index c5229019729..5509354c7c8 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -31,6 +31,7 @@
#include "drmP.h"
#include "radeon_drm.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "radeon_mode.h"
#include "r600d.h"
#include "atom.h"
@@ -491,9 +492,9 @@ void r600_pcie_gart_disable(struct radeon_device *rdev)
void r600_pcie_gart_fini(struct radeon_device *rdev)
{
+ radeon_gart_fini(rdev);
r600_pcie_gart_disable(rdev);
radeon_gart_table_vram_free(rdev);
- radeon_gart_fini(rdev);
}
void r600_agp_enable(struct radeon_device *rdev)
@@ -675,7 +676,6 @@ void r600_vram_gtt_location(struct radeon_device *rdev, struct radeon_mc *mc)
int r600_mc_init(struct radeon_device *rdev)
{
- fixed20_12 a;
u32 tmp;
int chansize, numchan;
@@ -719,14 +719,10 @@ int r600_mc_init(struct radeon_device *rdev)
rdev->mc.real_vram_size = rdev->mc.aper_size;
}
r600_vram_gtt_location(rdev, &rdev->mc);
- /* FIXME: we should enforce default clock in case GPU is not in
- * default setup
- */
- a.full = rfixed_const(100);
- rdev->pm.sclk.full = rfixed_const(rdev->clock.default_sclk);
- rdev->pm.sclk.full = rfixed_div(rdev->pm.sclk, a);
+
if (rdev->flags & RADEON_IS_IGP)
rdev->mc.igp_sideport_enabled = radeon_atombios_sideport_present(rdev);
+ radeon_update_bandwidth_info(rdev);
return 0;
}
@@ -1132,6 +1128,7 @@ void r600_gpu_init(struct radeon_device *rdev)
/* Setup pipes */
WREG32(CC_RB_BACKEND_DISABLE, cc_rb_backend_disable);
WREG32(CC_GC_SHADER_PIPE_CONFIG, cc_gc_shader_pipe_config);
+ WREG32(GC_USER_SHADER_PIPE_CONFIG, cc_gc_shader_pipe_config);
tmp = R6XX_MAX_PIPES - r600_count_pipe_bits((cc_gc_shader_pipe_config & INACTIVE_QD_PIPES_MASK) >> 8);
WREG32(VGT_OUT_DEALLOC_CNTL, (tmp * 4) & DEALLOC_DIST_MASK);
@@ -2119,6 +2116,7 @@ int r600_init(struct radeon_device *rdev)
void r600_fini(struct radeon_device *rdev)
{
+ radeon_pm_fini(rdev);
r600_audio_fini(rdev);
r600_blit_fini(rdev);
r600_cp_fini(rdev);
@@ -2398,19 +2396,19 @@ static void r600_disable_interrupt_state(struct radeon_device *rdev)
WREG32(DC_HPD4_INT_CONTROL, tmp);
if (ASIC_IS_DCE32(rdev)) {
tmp = RREG32(DC_HPD5_INT_CONTROL) & DC_HPDx_INT_POLARITY;
- WREG32(DC_HPD5_INT_CONTROL, 0);
+ WREG32(DC_HPD5_INT_CONTROL, tmp);
tmp = RREG32(DC_HPD6_INT_CONTROL) & DC_HPDx_INT_POLARITY;
- WREG32(DC_HPD6_INT_CONTROL, 0);
+ WREG32(DC_HPD6_INT_CONTROL, tmp);
}
} else {
WREG32(DACA_AUTODETECT_INT_CONTROL, 0);
WREG32(DACB_AUTODETECT_INT_CONTROL, 0);
tmp = RREG32(DC_HOT_PLUG_DETECT1_INT_CONTROL) & DC_HOT_PLUG_DETECTx_INT_POLARITY;
- WREG32(DC_HOT_PLUG_DETECT1_INT_CONTROL, 0);
+ WREG32(DC_HOT_PLUG_DETECT1_INT_CONTROL, tmp);
tmp = RREG32(DC_HOT_PLUG_DETECT2_INT_CONTROL) & DC_HOT_PLUG_DETECTx_INT_POLARITY;
- WREG32(DC_HOT_PLUG_DETECT2_INT_CONTROL, 0);
+ WREG32(DC_HOT_PLUG_DETECT2_INT_CONTROL, tmp);
tmp = RREG32(DC_HOT_PLUG_DETECT3_INT_CONTROL) & DC_HOT_PLUG_DETECTx_INT_POLARITY;
- WREG32(DC_HOT_PLUG_DETECT3_INT_CONTROL, 0);
+ WREG32(DC_HOT_PLUG_DETECT3_INT_CONTROL, tmp);
}
}
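
The old code in this hunk computed the polarity bits into tmp and then threw them away by writing a literal 0, so disabling interrupts also reset the configured hotplug polarity. The fix writes tmp back. A minimal sketch of the preserved read-modify-write, reusing the RREG32/WREG32 helpers and the DC_HPDx_INT_POLARITY mask from this file:

/* keep only the polarity field; all interrupt enable bits end up cleared */
static void hpd_int_disable_keep_polarity(struct radeon_device *rdev, u32 reg)
{
	u32 tmp = RREG32(reg) & DC_HPDx_INT_POLARITY;

	WREG32(reg, tmp);
}
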
@@ -2765,6 +2763,7 @@ restart_ih:
case 0: /* D1 vblank */
if (disp_int & LB_D1_VBLANK_INTERRUPT) {
drm_handle_vblank(rdev->ddev, 0);
+ rdev->pm.vblank_sync = true;
wake_up(&rdev->irq.vblank_queue);
disp_int &= ~LB_D1_VBLANK_INTERRUPT;
DRM_DEBUG("IH: D1 vblank\n");
@@ -2786,6 +2785,7 @@ restart_ih:
case 0: /* D2 vblank */
if (disp_int & LB_D2_VBLANK_INTERRUPT) {
drm_handle_vblank(rdev->ddev, 1);
+ rdev->pm.vblank_sync = true;
wake_up(&rdev->irq.vblank_queue);
disp_int &= ~LB_D2_VBLANK_INTERRUPT;
DRM_DEBUG("IH: D2 vblank\n");
@@ -2834,14 +2834,14 @@ restart_ih:
break;
case 10:
if (disp_int_cont2 & DC_HPD5_INTERRUPT) {
- disp_int_cont &= ~DC_HPD5_INTERRUPT;
+ disp_int_cont2 &= ~DC_HPD5_INTERRUPT;
queue_hotplug = true;
DRM_DEBUG("IH: HPD5\n");
}
break;
case 12:
if (disp_int_cont2 & DC_HPD6_INTERRUPT) {
- disp_int_cont &= ~DC_HPD6_INTERRUPT;
+ disp_int_cont2 &= ~DC_HPD6_INTERRUPT;
queue_hotplug = true;
DRM_DEBUG("IH: HPD6\n");
}
diff --git a/drivers/gpu/drm/radeon/r600_audio.c b/drivers/gpu/drm/radeon/r600_audio.c
index db928016d03..dac7042b797 100644
--- a/drivers/gpu/drm/radeon/r600_audio.c
+++ b/drivers/gpu/drm/radeon/r600_audio.c
@@ -182,41 +182,6 @@ int r600_audio_init(struct radeon_device *rdev)
}
/*
- * determin how the encoders and audio interface is wired together
- */
-int r600_audio_tmds_index(struct drm_encoder *encoder)
-{
- struct drm_device *dev = encoder->dev;
- struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder);
- struct drm_encoder *other;
-
- switch (radeon_encoder->encoder_id) {
- case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_TMDS1:
- case ENCODER_OBJECT_ID_INTERNAL_UNIPHY:
- case ENCODER_OBJECT_ID_INTERNAL_UNIPHY1:
- return 0;
-
- case ENCODER_OBJECT_ID_INTERNAL_LVTM1:
- /* special case check if an TMDS1 is present */
- list_for_each_entry(other, &dev->mode_config.encoder_list, head) {
- if (to_radeon_encoder(other)->encoder_id ==
- ENCODER_OBJECT_ID_INTERNAL_TMDS1)
- return 1;
- }
- return 0;
-
- case ENCODER_OBJECT_ID_INTERNAL_UNIPHY2:
- case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_LVTMA:
- return 1;
-
- default:
- DRM_ERROR("Unsupported encoder type 0x%02X\n",
- radeon_encoder->encoder_id);
- return -1;
- }
-}
-
-/*
 * attach the audio codec to the clock source of the encoder
*/
void r600_audio_set_clock(struct drm_encoder *encoder, int clock)
@@ -224,6 +189,7 @@ void r600_audio_set_clock(struct drm_encoder *encoder, int clock)
struct drm_device *dev = encoder->dev;
struct radeon_device *rdev = dev->dev_private;
struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder);
+ struct radeon_encoder_atom_dig *dig = radeon_encoder->enc_priv;
int base_rate = 48000;
switch (radeon_encoder->encoder_id) {
@@ -231,32 +197,34 @@ void r600_audio_set_clock(struct drm_encoder *encoder, int clock)
case ENCODER_OBJECT_ID_INTERNAL_LVTM1:
WREG32_P(R600_AUDIO_TIMING, 0, ~0x301);
break;
-
case ENCODER_OBJECT_ID_INTERNAL_UNIPHY:
case ENCODER_OBJECT_ID_INTERNAL_UNIPHY1:
case ENCODER_OBJECT_ID_INTERNAL_UNIPHY2:
case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_LVTMA:
WREG32_P(R600_AUDIO_TIMING, 0x100, ~0x301);
break;
-
default:
DRM_ERROR("Unsupported encoder type 0x%02X\n",
radeon_encoder->encoder_id);
return;
}
- switch (r600_audio_tmds_index(encoder)) {
+ switch (dig->dig_encoder) {
case 0:
- WREG32(R600_AUDIO_PLL1_MUL, base_rate*50);
- WREG32(R600_AUDIO_PLL1_DIV, clock*100);
+ WREG32(R600_AUDIO_PLL1_MUL, base_rate * 50);
+ WREG32(R600_AUDIO_PLL1_DIV, clock * 100);
WREG32(R600_AUDIO_CLK_SRCSEL, 0);
break;
case 1:
- WREG32(R600_AUDIO_PLL2_MUL, base_rate*50);
- WREG32(R600_AUDIO_PLL2_DIV, clock*100);
+ WREG32(R600_AUDIO_PLL2_MUL, base_rate * 50);
+ WREG32(R600_AUDIO_PLL2_DIV, clock * 100);
WREG32(R600_AUDIO_CLK_SRCSEL, 1);
break;
+ default:
+ dev_err(rdev->dev, "Unsupported DIG on encoder 0x%02X\n",
+ radeon_encoder->encoder_id);
+ return;
}
}
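
With r600_audio_tmds_index() removed, the audio clock source now simply follows the DIG encoder assigned during mode set: DIG0 uses the PLL1 register pair, DIG1 uses PLL2, and anything else is rejected. A table-driven sketch of the same dispatch, under the assumption that only these two PLL pairs exist on the affected parts:

struct audio_pll { u32 mul_reg, div_reg, srcsel; };

static const struct audio_pll audio_plls[] = {
	{ R600_AUDIO_PLL1_MUL, R600_AUDIO_PLL1_DIV, 0 },
	{ R600_AUDIO_PLL2_MUL, R600_AUDIO_PLL2_DIV, 1 },
};

static int set_audio_clock_sketch(struct radeon_device *rdev,
				  int dig_encoder, int clock)
{
	const int base_rate = 48000;	/* 48 kHz audio base rate, as above */

	if (dig_encoder < 0 || dig_encoder >= (int)ARRAY_SIZE(audio_plls))
		return -EINVAL;		/* mirrors the dev_err() default case */
	WREG32(audio_plls[dig_encoder].mul_reg, base_rate * 50);
	WREG32(audio_plls[dig_encoder].div_reg, clock * 100);
	WREG32(R600_AUDIO_CLK_SRCSEL, audio_plls[dig_encoder].srcsel);
	return 0;
}
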
diff --git a/drivers/gpu/drm/radeon/r600_blit_shaders.c b/drivers/gpu/drm/radeon/r600_blit_shaders.c
index a112c59f9d8..0271b53fa2d 100644
--- a/drivers/gpu/drm/radeon/r600_blit_shaders.c
+++ b/drivers/gpu/drm/radeon/r600_blit_shaders.c
@@ -1,7 +1,42 @@
+/*
+ * Copyright 2009 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Alex Deucher <alexander.deucher@amd.com>
+ */
#include <linux/types.h>
#include <linux/kernel.h>
+/*
+ * R6xx+ cards need to use the 3D engine to blit data which requires
+ * quite a bit of hw state setup. Rather than pull the whole 3D driver
+ * (which normally generates the 3D state) into the DRM, we opt to use
+ * statically generated state tables. The register state and shaders
+ * were hand generated to support blitting functionality. See the 3D
+ * driver or documentation for descriptions of the registers and
+ * shader instructions.
+ */
+
const u32 r6xx_default_state[] =
{
0xc0002400,
diff --git a/drivers/gpu/drm/radeon/r600_cp.c b/drivers/gpu/drm/radeon/r600_cp.c
index 40416c068d9..68e6f434930 100644
--- a/drivers/gpu/drm/radeon/r600_cp.c
+++ b/drivers/gpu/drm/radeon/r600_cp.c
@@ -1548,10 +1548,13 @@ static void r700_gfx_init(struct drm_device *dev,
RADEON_WRITE(R600_CC_RB_BACKEND_DISABLE, cc_rb_backend_disable);
RADEON_WRITE(R600_CC_GC_SHADER_PIPE_CONFIG, cc_gc_shader_pipe_config);
+ RADEON_WRITE(R600_GC_USER_SHADER_PIPE_CONFIG, cc_gc_shader_pipe_config);
RADEON_WRITE(R700_CC_SYS_RB_BACKEND_DISABLE, cc_rb_backend_disable);
RADEON_WRITE(R700_CGTS_SYS_TCC_DISABLE, 0);
RADEON_WRITE(R700_CGTS_TCC_DISABLE, 0);
+ RADEON_WRITE(R700_CGTS_USER_SYS_TCC_DISABLE, 0);
+ RADEON_WRITE(R700_CGTS_USER_TCC_DISABLE, 0);
num_qd_pipes =
R7XX_MAX_PIPES - r600_count_pipe_bits((cc_gc_shader_pipe_config & R600_INACTIVE_QD_PIPES_MASK) >> 8);
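
This hunk mirrors the kernel-programmed pipe configuration into the GC_USER/CGTS_USER registers so the user-visible state matches what was actually enabled. The num_qd_pipes computation above subtracts the number of disabled pipes; a sketch of that computation, assuming r600_count_pipe_bits() is effectively a 32-bit popcount (hweight32 from linux/bitops.h):

static unsigned int active_qd_pipes(u32 cc_gc_shader_pipe_config,
				    unsigned int max_pipes)
{
	u32 inactive = (cc_gc_shader_pipe_config &
			R600_INACTIVE_QD_PIPES_MASK) >> 8;

	return max_pipes - hweight32(inactive);
}
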
diff --git a/drivers/gpu/drm/radeon/r600_cs.c b/drivers/gpu/drm/radeon/r600_cs.c
index cd2c63bce50..c39c1bc1301 100644
--- a/drivers/gpu/drm/radeon/r600_cs.c
+++ b/drivers/gpu/drm/radeon/r600_cs.c
@@ -45,6 +45,7 @@ struct r600_cs_track {
u32 nbanks;
u32 npipes;
/* value we track */
+ u32 sq_config;
u32 nsamples;
u32 cb_color_base_last[8];
struct radeon_bo *cb_color_bo[8];
@@ -141,6 +142,8 @@ static void r600_cs_track_init(struct r600_cs_track *track)
{
int i;
+ /* assume DX9 mode */
+ track->sq_config = DX9_CONSTS;
for (i = 0; i < 8; i++) {
track->cb_color_base_last[i] = 0;
track->cb_color_size[i] = 0;
@@ -715,6 +718,9 @@ static inline int r600_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx
tmp = radeon_get_ib_value(p, idx);
ib[idx] = 0;
break;
+ case SQ_CONFIG:
+ track->sq_config = radeon_get_ib_value(p, idx);
+ break;
case R_028800_DB_DEPTH_CONTROL:
track->db_depth_control = radeon_get_ib_value(p, idx);
break;
@@ -869,6 +875,54 @@ static inline int r600_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx
case SQ_PGM_START_VS:
case SQ_PGM_START_GS:
case SQ_PGM_START_PS:
+ case SQ_ALU_CONST_CACHE_GS_0:
+ case SQ_ALU_CONST_CACHE_GS_1:
+ case SQ_ALU_CONST_CACHE_GS_2:
+ case SQ_ALU_CONST_CACHE_GS_3:
+ case SQ_ALU_CONST_CACHE_GS_4:
+ case SQ_ALU_CONST_CACHE_GS_5:
+ case SQ_ALU_CONST_CACHE_GS_6:
+ case SQ_ALU_CONST_CACHE_GS_7:
+ case SQ_ALU_CONST_CACHE_GS_8:
+ case SQ_ALU_CONST_CACHE_GS_9:
+ case SQ_ALU_CONST_CACHE_GS_10:
+ case SQ_ALU_CONST_CACHE_GS_11:
+ case SQ_ALU_CONST_CACHE_GS_12:
+ case SQ_ALU_CONST_CACHE_GS_13:
+ case SQ_ALU_CONST_CACHE_GS_14:
+ case SQ_ALU_CONST_CACHE_GS_15:
+ case SQ_ALU_CONST_CACHE_PS_0:
+ case SQ_ALU_CONST_CACHE_PS_1:
+ case SQ_ALU_CONST_CACHE_PS_2:
+ case SQ_ALU_CONST_CACHE_PS_3:
+ case SQ_ALU_CONST_CACHE_PS_4:
+ case SQ_ALU_CONST_CACHE_PS_5:
+ case SQ_ALU_CONST_CACHE_PS_6:
+ case SQ_ALU_CONST_CACHE_PS_7:
+ case SQ_ALU_CONST_CACHE_PS_8:
+ case SQ_ALU_CONST_CACHE_PS_9:
+ case SQ_ALU_CONST_CACHE_PS_10:
+ case SQ_ALU_CONST_CACHE_PS_11:
+ case SQ_ALU_CONST_CACHE_PS_12:
+ case SQ_ALU_CONST_CACHE_PS_13:
+ case SQ_ALU_CONST_CACHE_PS_14:
+ case SQ_ALU_CONST_CACHE_PS_15:
+ case SQ_ALU_CONST_CACHE_VS_0:
+ case SQ_ALU_CONST_CACHE_VS_1:
+ case SQ_ALU_CONST_CACHE_VS_2:
+ case SQ_ALU_CONST_CACHE_VS_3:
+ case SQ_ALU_CONST_CACHE_VS_4:
+ case SQ_ALU_CONST_CACHE_VS_5:
+ case SQ_ALU_CONST_CACHE_VS_6:
+ case SQ_ALU_CONST_CACHE_VS_7:
+ case SQ_ALU_CONST_CACHE_VS_8:
+ case SQ_ALU_CONST_CACHE_VS_9:
+ case SQ_ALU_CONST_CACHE_VS_10:
+ case SQ_ALU_CONST_CACHE_VS_11:
+ case SQ_ALU_CONST_CACHE_VS_12:
+ case SQ_ALU_CONST_CACHE_VS_13:
+ case SQ_ALU_CONST_CACHE_VS_14:
+ case SQ_ALU_CONST_CACHE_VS_15:
r = r600_cs_packet_next_reloc(p, &reloc);
if (r) {
dev_warn(p->dev, "bad SET_CONTEXT_REG "
@@ -1226,13 +1280,15 @@ static int r600_packet3_check(struct radeon_cs_parser *p,
}
break;
case PACKET3_SET_ALU_CONST:
- start_reg = (idx_value << 2) + PACKET3_SET_ALU_CONST_OFFSET;
- end_reg = 4 * pkt->count + start_reg - 4;
- if ((start_reg < PACKET3_SET_ALU_CONST_OFFSET) ||
- (start_reg >= PACKET3_SET_ALU_CONST_END) ||
- (end_reg >= PACKET3_SET_ALU_CONST_END)) {
- DRM_ERROR("bad SET_ALU_CONST\n");
- return -EINVAL;
+ if (track->sq_config & DX9_CONSTS) {
+ start_reg = (idx_value << 2) + PACKET3_SET_ALU_CONST_OFFSET;
+ end_reg = 4 * pkt->count + start_reg - 4;
+ if ((start_reg < PACKET3_SET_ALU_CONST_OFFSET) ||
+ (start_reg >= PACKET3_SET_ALU_CONST_END) ||
+ (end_reg >= PACKET3_SET_ALU_CONST_END)) {
+ DRM_ERROR("bad SET_ALU_CONST\n");
+ return -EINVAL;
+ }
}
break;
case PACKET3_SET_BOOL_CONST:
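
SET_ALU_CONST is now only range-checked when the tracked SQ_CONFIG has DX9_CONSTS set: in DX9 constant mode the packet writes the inline ALU constant registers and must stay inside that window, while in DX10-style mode the constants come from the SQ_ALU_CONST_CACHE_* buffers relocated earlier in this patch, so there is no register range to validate. The check, factored out as a standalone sketch:

static int check_alu_const_range(u32 idx_value, unsigned count)
{
	u32 start_reg = (idx_value << 2) + PACKET3_SET_ALU_CONST_OFFSET;
	u32 end_reg = 4 * count + start_reg - 4;

	if (start_reg < PACKET3_SET_ALU_CONST_OFFSET ||
	    start_reg >= PACKET3_SET_ALU_CONST_END ||
	    end_reg >= PACKET3_SET_ALU_CONST_END)
		return -EINVAL;	/* packet touches registers outside the window */
	return 0;
}
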
diff --git a/drivers/gpu/drm/radeon/r600_hdmi.c b/drivers/gpu/drm/radeon/r600_hdmi.c
index fcc949df0e5..029fa1406d1 100644
--- a/drivers/gpu/drm/radeon/r600_hdmi.c
+++ b/drivers/gpu/drm/radeon/r600_hdmi.c
@@ -42,13 +42,13 @@ enum r600_hdmi_color_format {
*/
enum r600_hdmi_iec_status_bits {
AUDIO_STATUS_DIG_ENABLE = 0x01,
- AUDIO_STATUS_V = 0x02,
- AUDIO_STATUS_VCFG = 0x04,
+ AUDIO_STATUS_V = 0x02,
+ AUDIO_STATUS_VCFG = 0x04,
AUDIO_STATUS_EMPHASIS = 0x08,
AUDIO_STATUS_COPYRIGHT = 0x10,
AUDIO_STATUS_NONAUDIO = 0x20,
AUDIO_STATUS_PROFESSIONAL = 0x40,
- AUDIO_STATUS_LEVEL = 0x80
+ AUDIO_STATUS_LEVEL = 0x80
};
struct {
@@ -85,7 +85,7 @@ struct {
static void r600_hdmi_calc_CTS(uint32_t clock, int *CTS, int N, int freq)
{
if (*CTS == 0)
- *CTS = clock*N/(128*freq)*1000;
+ *CTS = clock * N / (128 * freq) * 1000;
DRM_DEBUG("Using ACR timing N=%d CTS=%d for frequency %d\n",
N, *CTS, freq);
}
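
When no explicit CTS is supplied, the driver derives it from the ACR relation CTS = clock * N / (128 * freq) * 1000. Assuming clock is the TMDS clock in kHz and freq the audio sample rate in Hz (units inferred from the expression, not stated in this hunk), a 27 MHz mode at 48 kHz with the standard N of 6144 gives

	CTS = 27000 * 6144 / (128 * 48000) * 1000
	    = 165888000 / 6144000 * 1000
	    = 27 * 1000 = 27000,

which matches the HDMI specification's recommended CTS for that clock/rate pair. Note the integer division: for clocks that are not a multiple of 1 MHz the fractional part is truncated before the final * 1000.
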
@@ -131,11 +131,11 @@ static void r600_hdmi_infoframe_checksum(uint8_t packetType,
uint8_t length,
uint8_t *frame)
{
- int i;
- frame[0] = packetType + versionNumber + length;
- for (i = 1; i <= length; i++)
- frame[0] += frame[i];
- frame[0] = 0x100 - frame[0];
+ int i;
+ frame[0] = packetType + versionNumber + length;
+ for (i = 1; i <= length; i++)
+ frame[0] += frame[i];
+ frame[0] = 0x100 - frame[0];
}
/*
@@ -417,90 +417,141 @@ void r600_hdmi_update_audio_settings(struct drm_encoder *encoder,
WREG32_P(offset+R600_HDMI_CNTL, 0x04000000, ~0x04000000);
}
-/*
- * enable/disable the HDMI engine
- */
-void r600_hdmi_enable(struct drm_encoder *encoder, int enable)
+static int r600_hdmi_find_free_block(struct drm_device *dev)
+{
+ struct radeon_device *rdev = dev->dev_private;
+ struct drm_encoder *encoder;
+ struct radeon_encoder *radeon_encoder;
+ bool free_blocks[3] = { true, true, true };
+
+ list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) {
+ radeon_encoder = to_radeon_encoder(encoder);
+ switch (radeon_encoder->hdmi_offset) {
+ case R600_HDMI_BLOCK1:
+ free_blocks[0] = false;
+ break;
+ case R600_HDMI_BLOCK2:
+ free_blocks[1] = false;
+ break;
+ case R600_HDMI_BLOCK3:
+ free_blocks[2] = false;
+ break;
+ }
+ }
+
+ if (rdev->family == CHIP_RS600 || rdev->family == CHIP_RS690) {
+ return free_blocks[0] ? R600_HDMI_BLOCK1 : 0;
+ } else if (rdev->family >= CHIP_R600) {
+ if (free_blocks[0])
+ return R600_HDMI_BLOCK1;
+ else if (free_blocks[1])
+ return R600_HDMI_BLOCK2;
+ }
+ return 0;
+}
+
+static void r600_hdmi_assign_block(struct drm_encoder *encoder)
{
struct drm_device *dev = encoder->dev;
struct radeon_device *rdev = dev->dev_private;
struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder);
- uint32_t offset = to_radeon_encoder(encoder)->hdmi_offset;
+ struct radeon_encoder_atom_dig *dig = radeon_encoder->enc_priv;
- if (!offset)
+ if (!dig) {
+ dev_err(rdev->dev, "Enabling HDMI on non-dig encoder\n");
return;
+ }
- DRM_DEBUG("%s HDMI interface @ 0x%04X\n", enable ? "Enabling" : "Disabling", offset);
-
- /* some version of atombios ignore the enable HDMI flag
- * so enabling/disabling HDMI was moved here for TMDS1+2 */
- switch (radeon_encoder->encoder_id) {
- case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_TMDS1:
- WREG32_P(AVIVO_TMDSA_CNTL, enable ? 0x4 : 0x0, ~0x4);
- WREG32(offset+R600_HDMI_ENABLE, enable ? 0x101 : 0x0);
- break;
-
- case ENCODER_OBJECT_ID_INTERNAL_LVTM1:
- WREG32_P(AVIVO_LVTMA_CNTL, enable ? 0x4 : 0x0, ~0x4);
- WREG32(offset+R600_HDMI_ENABLE, enable ? 0x105 : 0x0);
- break;
-
- case ENCODER_OBJECT_ID_INTERNAL_UNIPHY:
- case ENCODER_OBJECT_ID_INTERNAL_UNIPHY1:
- case ENCODER_OBJECT_ID_INTERNAL_UNIPHY2:
- case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_LVTMA:
- /* This part is doubtfull in my opinion */
- WREG32(offset+R600_HDMI_ENABLE, enable ? 0x110 : 0x0);
- break;
-
- default:
- DRM_ERROR("unknown HDMI output type\n");
- break;
+ if (ASIC_IS_DCE4(rdev)) {
+ /* TODO */
+ } else if (ASIC_IS_DCE3(rdev)) {
+ radeon_encoder->hdmi_offset = dig->dig_encoder ?
+ R600_HDMI_BLOCK3 : R600_HDMI_BLOCK1;
+ if (ASIC_IS_DCE32(rdev))
+ radeon_encoder->hdmi_config_offset = dig->dig_encoder ?
+ R600_HDMI_CONFIG2 : R600_HDMI_CONFIG1;
+ } else if (rdev->family >= CHIP_R600) {
+ radeon_encoder->hdmi_offset = r600_hdmi_find_free_block(dev);
}
}
/*
- * determin at which register offset the HDMI encoder is
+ * enable the HDMI engine
*/
-void r600_hdmi_init(struct drm_encoder *encoder)
+void r600_hdmi_enable(struct drm_encoder *encoder)
{
+ struct drm_device *dev = encoder->dev;
+ struct radeon_device *rdev = dev->dev_private;
struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder);
- switch (radeon_encoder->encoder_id) {
- case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_TMDS1:
- case ENCODER_OBJECT_ID_INTERNAL_UNIPHY:
- case ENCODER_OBJECT_ID_INTERNAL_UNIPHY1:
- radeon_encoder->hdmi_offset = R600_HDMI_TMDS1;
- break;
-
- case ENCODER_OBJECT_ID_INTERNAL_LVTM1:
- switch (r600_audio_tmds_index(encoder)) {
- case 0:
- radeon_encoder->hdmi_offset = R600_HDMI_TMDS1;
+ if (!radeon_encoder->hdmi_offset) {
+ r600_hdmi_assign_block(encoder);
+ if (!radeon_encoder->hdmi_offset) {
+ dev_warn(rdev->dev, "Could not find HDMI block for "
+ "0x%x encoder\n", radeon_encoder->encoder_id);
+ return;
+ }
+ }
+
+ if (ASIC_IS_DCE32(rdev) && !ASIC_IS_DCE4(rdev)) {
+ WREG32_P(radeon_encoder->hdmi_config_offset + 0x4, 0x1, ~0x1);
+ } else if (rdev->family >= CHIP_R600 && !ASIC_IS_DCE3(rdev)) {
+ int offset = radeon_encoder->hdmi_offset;
+ switch (radeon_encoder->encoder_id) {
+ case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_TMDS1:
+ WREG32_P(AVIVO_TMDSA_CNTL, 0x4, ~0x4);
+ WREG32(offset + R600_HDMI_ENABLE, 0x101);
break;
- case 1:
- radeon_encoder->hdmi_offset = R600_HDMI_TMDS2;
+ case ENCODER_OBJECT_ID_INTERNAL_LVTM1:
+ WREG32_P(AVIVO_LVTMA_CNTL, 0x4, ~0x4);
+ WREG32(offset + R600_HDMI_ENABLE, 0x105);
break;
default:
- radeon_encoder->hdmi_offset = 0;
+ dev_err(rdev->dev, "Unknown HDMI output type\n");
break;
}
- case ENCODER_OBJECT_ID_INTERNAL_UNIPHY2:
- radeon_encoder->hdmi_offset = R600_HDMI_TMDS2;
- break;
+ }
- case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_LVTMA:
- radeon_encoder->hdmi_offset = R600_HDMI_DIG;
- break;
+ DRM_DEBUG("Enabling HDMI interface @ 0x%04X for encoder 0x%x\n",
+ radeon_encoder->hdmi_offset, radeon_encoder->encoder_id);
+}
- default:
- radeon_encoder->hdmi_offset = 0;
- break;
+/*
+ * disable the HDMI engine
+ */
+void r600_hdmi_disable(struct drm_encoder *encoder)
+{
+ struct drm_device *dev = encoder->dev;
+ struct radeon_device *rdev = dev->dev_private;
+ struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder);
+
+ if (!radeon_encoder->hdmi_offset) {
+ dev_err(rdev->dev, "Disabling not enabled HDMI\n");
+ return;
}
- DRM_DEBUG("using HDMI engine at offset 0x%04X for encoder 0x%x\n",
- radeon_encoder->hdmi_offset, radeon_encoder->encoder_id);
+ DRM_DEBUG("Disabling HDMI interface @ 0x%04X for encoder 0x%x\n",
+ radeon_encoder->hdmi_offset, radeon_encoder->encoder_id);
+
+ if (ASIC_IS_DCE32(rdev) && !ASIC_IS_DCE4(rdev)) {
+ WREG32_P(radeon_encoder->hdmi_config_offset + 0x4, 0, ~0x1);
+ } else if (rdev->family >= CHIP_R600 && !ASIC_IS_DCE3(rdev)) {
+ int offset = radeon_encoder->hdmi_offset;
+ switch (radeon_encoder->encoder_id) {
+ case ENCODER_OBJECT_ID_INTERNAL_KLDSCP_TMDS1:
+ WREG32_P(AVIVO_TMDSA_CNTL, 0, ~0x4);
+ WREG32(offset + R600_HDMI_ENABLE, 0);
+ break;
+ case ENCODER_OBJECT_ID_INTERNAL_LVTM1:
+ WREG32_P(AVIVO_LVTMA_CNTL, 0, ~0x4);
+ WREG32(offset + R600_HDMI_ENABLE, 0);
+ break;
+ default:
+ dev_err(rdev->dev, "Unknown HDMI output type\n");
+ break;
+ }
+ }
- /* TODO: make this configureable */
- radeon_encoder->hdmi_audio_workaround = 0;
+ radeon_encoder->hdmi_offset = 0;
+ radeon_encoder->hdmi_config_offset = 0;
}
diff --git a/drivers/gpu/drm/radeon/r600_reg.h b/drivers/gpu/drm/radeon/r600_reg.h
index d0e28ffdeda..7b1d22370f6 100644
--- a/drivers/gpu/drm/radeon/r600_reg.h
+++ b/drivers/gpu/drm/radeon/r600_reg.h
@@ -152,9 +152,9 @@
#define R600_AUDIO_STATUS_BITS 0x73d8
/* HDMI base register addresses */
-#define R600_HDMI_TMDS1 0x7400
-#define R600_HDMI_TMDS2 0x7700
-#define R600_HDMI_DIG 0x7800
+#define R600_HDMI_BLOCK1 0x7400
+#define R600_HDMI_BLOCK2 0x7700
+#define R600_HDMI_BLOCK3 0x7800
/* HDMI registers */
#define R600_HDMI_ENABLE 0x00
@@ -185,4 +185,8 @@
#define R600_HDMI_AUDIO_DEBUG_2 0xe8
#define R600_HDMI_AUDIO_DEBUG_3 0xec
+/* HDMI additional config base register addresses */
+#define R600_HDMI_CONFIG1 0x7600
+#define R600_HDMI_CONFIG2 0x7a00
+
#endif
diff --git a/drivers/gpu/drm/radeon/r600d.h b/drivers/gpu/drm/radeon/r600d.h
index 5b2e4d44282..59c1f8793e6 100644
--- a/drivers/gpu/drm/radeon/r600d.h
+++ b/drivers/gpu/drm/radeon/r600d.h
@@ -77,6 +77,55 @@
#define CB_COLOR0_FRAG 0x280e0
#define CB_COLOR0_MASK 0x28100
+#define SQ_ALU_CONST_CACHE_PS_0 0x28940
+#define SQ_ALU_CONST_CACHE_PS_1 0x28944
+#define SQ_ALU_CONST_CACHE_PS_2 0x28948
+#define SQ_ALU_CONST_CACHE_PS_3 0x2894c
+#define SQ_ALU_CONST_CACHE_PS_4 0x28950
+#define SQ_ALU_CONST_CACHE_PS_5 0x28954
+#define SQ_ALU_CONST_CACHE_PS_6 0x28958
+#define SQ_ALU_CONST_CACHE_PS_7 0x2895c
+#define SQ_ALU_CONST_CACHE_PS_8 0x28960
+#define SQ_ALU_CONST_CACHE_PS_9 0x28964
+#define SQ_ALU_CONST_CACHE_PS_10 0x28968
+#define SQ_ALU_CONST_CACHE_PS_11 0x2896c
+#define SQ_ALU_CONST_CACHE_PS_12 0x28970
+#define SQ_ALU_CONST_CACHE_PS_13 0x28974
+#define SQ_ALU_CONST_CACHE_PS_14 0x28978
+#define SQ_ALU_CONST_CACHE_PS_15 0x2897c
+#define SQ_ALU_CONST_CACHE_VS_0 0x28980
+#define SQ_ALU_CONST_CACHE_VS_1 0x28984
+#define SQ_ALU_CONST_CACHE_VS_2 0x28988
+#define SQ_ALU_CONST_CACHE_VS_3 0x2898c
+#define SQ_ALU_CONST_CACHE_VS_4 0x28990
+#define SQ_ALU_CONST_CACHE_VS_5 0x28994
+#define SQ_ALU_CONST_CACHE_VS_6 0x28998
+#define SQ_ALU_CONST_CACHE_VS_7 0x2899c
+#define SQ_ALU_CONST_CACHE_VS_8 0x289a0
+#define SQ_ALU_CONST_CACHE_VS_9 0x289a4
+#define SQ_ALU_CONST_CACHE_VS_10 0x289a8
+#define SQ_ALU_CONST_CACHE_VS_11 0x289ac
+#define SQ_ALU_CONST_CACHE_VS_12 0x289b0
+#define SQ_ALU_CONST_CACHE_VS_13 0x289b4
+#define SQ_ALU_CONST_CACHE_VS_14 0x289b8
+#define SQ_ALU_CONST_CACHE_VS_15 0x289bc
+#define SQ_ALU_CONST_CACHE_GS_0 0x289c0
+#define SQ_ALU_CONST_CACHE_GS_1 0x289c4
+#define SQ_ALU_CONST_CACHE_GS_2 0x289c8
+#define SQ_ALU_CONST_CACHE_GS_3 0x289cc
+#define SQ_ALU_CONST_CACHE_GS_4 0x289d0
+#define SQ_ALU_CONST_CACHE_GS_5 0x289d4
+#define SQ_ALU_CONST_CACHE_GS_6 0x289d8
+#define SQ_ALU_CONST_CACHE_GS_7 0x289dc
+#define SQ_ALU_CONST_CACHE_GS_8 0x289e0
+#define SQ_ALU_CONST_CACHE_GS_9 0x289e4
+#define SQ_ALU_CONST_CACHE_GS_10 0x289e8
+#define SQ_ALU_CONST_CACHE_GS_11 0x289ec
+#define SQ_ALU_CONST_CACHE_GS_12 0x289f0
+#define SQ_ALU_CONST_CACHE_GS_13 0x289f4
+#define SQ_ALU_CONST_CACHE_GS_14 0x289f8
+#define SQ_ALU_CONST_CACHE_GS_15 0x289fc
+
#define CONFIG_MEMSIZE 0x5428
#define CONFIG_CNTL 0x5424
#define CP_STAT 0x8680
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 829e26e8a4b..034218c3dbb 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -91,6 +91,8 @@ extern int radeon_tv;
extern int radeon_new_pll;
extern int radeon_dynpm;
extern int radeon_audio;
+extern int radeon_disp_priority;
+extern int radeon_hw_i2c;
/*
* Copy from radeon_drv.h so we don't have to include both and have conflicting
@@ -168,6 +170,7 @@ struct radeon_clock {
* Power management
*/
int radeon_pm_init(struct radeon_device *rdev);
+void radeon_pm_fini(struct radeon_device *rdev);
void radeon_pm_compute_clocks(struct radeon_device *rdev);
void radeon_combios_get_power_modes(struct radeon_device *rdev);
void radeon_atombios_get_power_modes(struct radeon_device *rdev);
@@ -687,6 +690,7 @@ struct radeon_pm {
bool downclocked;
int active_crtcs;
int req_vblank;
+ bool vblank_sync;
fixed20_12 max_bandwidth;
fixed20_12 igp_sideport_mclk;
fixed20_12 igp_system_mclk;
@@ -697,6 +701,7 @@ struct radeon_pm {
fixed20_12 ht_bandwidth;
fixed20_12 core_bandwidth;
fixed20_12 sclk;
+ fixed20_12 mclk;
fixed20_12 needed_bandwidth;
/* XXX: use a define for num power modes */
struct radeon_power_state power_state[8];
@@ -707,6 +712,7 @@ struct radeon_pm {
struct radeon_power_state *requested_power_state;
struct radeon_pm_clock_info *requested_clock_mode;
struct radeon_power_state *default_power_state;
+ struct radeon_i2c_chan *i2c_bus;
};
@@ -729,8 +735,6 @@ int radeon_debugfs_add_files(struct radeon_device *rdev,
struct drm_info_list *files,
unsigned nfiles);
int radeon_debugfs_fence_init(struct radeon_device *rdev);
-int r100_debugfs_rbbm_init(struct radeon_device *rdev);
-int r100_debugfs_cp_init(struct radeon_device *rdev);
/*
@@ -782,7 +786,7 @@ struct radeon_asic {
int (*set_surface_reg)(struct radeon_device *rdev, int reg,
uint32_t tiling_flags, uint32_t pitch,
uint32_t offset, uint32_t obj_size);
- int (*clear_surface_reg)(struct radeon_device *rdev, int reg);
+ void (*clear_surface_reg)(struct radeon_device *rdev, int reg);
void (*bandwidth_update)(struct radeon_device *rdev);
void (*hpd_init)(struct radeon_device *rdev);
void (*hpd_fini)(struct radeon_device *rdev);
@@ -862,6 +866,12 @@ union radeon_asic_config {
struct rv770_asic rv770;
};
+/*
+ * asic initialization from radeon_asic.c
+ */
+void radeon_agp_disable(struct radeon_device *rdev);
+int radeon_asic_init(struct radeon_device *rdev);
+
/*
* IOCTL.
@@ -1172,6 +1182,8 @@ extern void radeon_gart_restore(struct radeon_device *rdev);
extern int radeon_modeset_init(struct radeon_device *rdev);
extern void radeon_modeset_fini(struct radeon_device *rdev);
extern bool radeon_card_posted(struct radeon_device *rdev);
+extern void radeon_update_bandwidth_info(struct radeon_device *rdev);
+extern void radeon_update_display_priority(struct radeon_device *rdev);
extern bool radeon_boot_test_post_card(struct radeon_device *rdev);
extern int radeon_clocks_init(struct radeon_device *rdev);
extern void radeon_clocks_fini(struct radeon_device *rdev);
@@ -1188,51 +1200,6 @@ extern int radeon_resume_kms(struct drm_device *dev);
extern int radeon_suspend_kms(struct drm_device *dev, pm_message_t state);
/* r100,rv100,rs100,rv200,rs200,r200,rv250,rs300,rv280 */
-struct r100_mc_save {
- u32 GENMO_WT;
- u32 CRTC_EXT_CNTL;
- u32 CRTC_GEN_CNTL;
- u32 CRTC2_GEN_CNTL;
- u32 CUR_OFFSET;
- u32 CUR2_OFFSET;
-};
-extern void r100_cp_disable(struct radeon_device *rdev);
-extern int r100_cp_init(struct radeon_device *rdev, unsigned ring_size);
-extern void r100_cp_fini(struct radeon_device *rdev);
-extern void r100_pci_gart_tlb_flush(struct radeon_device *rdev);
-extern int r100_pci_gart_init(struct radeon_device *rdev);
-extern void r100_pci_gart_fini(struct radeon_device *rdev);
-extern int r100_pci_gart_enable(struct radeon_device *rdev);
-extern void r100_pci_gart_disable(struct radeon_device *rdev);
-extern int r100_pci_gart_set_page(struct radeon_device *rdev, int i, uint64_t addr);
-extern int r100_debugfs_mc_info_init(struct radeon_device *rdev);
-extern int r100_gui_wait_for_idle(struct radeon_device *rdev);
-extern void r100_ib_fini(struct radeon_device *rdev);
-extern int r100_ib_init(struct radeon_device *rdev);
-extern void r100_irq_disable(struct radeon_device *rdev);
-extern int r100_irq_set(struct radeon_device *rdev);
-extern void r100_mc_stop(struct radeon_device *rdev, struct r100_mc_save *save);
-extern void r100_mc_resume(struct radeon_device *rdev, struct r100_mc_save *save);
-extern void r100_vram_init_sizes(struct radeon_device *rdev);
-extern void r100_wb_disable(struct radeon_device *rdev);
-extern void r100_wb_fini(struct radeon_device *rdev);
-extern int r100_wb_init(struct radeon_device *rdev);
-extern void r100_hdp_reset(struct radeon_device *rdev);
-extern int r100_rb2d_reset(struct radeon_device *rdev);
-extern int r100_cp_reset(struct radeon_device *rdev);
-extern void r100_vga_render_disable(struct radeon_device *rdev);
-extern int r100_cs_track_check_pkt3_indx_buffer(struct radeon_cs_parser *p,
- struct radeon_cs_packet *pkt,
- struct radeon_bo *robj);
-extern int r100_cs_parse_packet0(struct radeon_cs_parser *p,
- struct radeon_cs_packet *pkt,
- const unsigned *auth, unsigned n,
- radeon_packet0_check_t check);
-extern int r100_cs_packet_parse(struct radeon_cs_parser *p,
- struct radeon_cs_packet *pkt,
- unsigned idx);
-extern void r100_enable_bm(struct radeon_device *rdev);
-extern void r100_set_common_regs(struct radeon_device *rdev);
/* rv200,rv250,rv280 */
extern void r200_set_safe_registers(struct radeon_device *rdev);
@@ -1322,7 +1289,8 @@ extern int r600_audio_tmds_index(struct drm_encoder *encoder);
extern void r600_audio_set_clock(struct drm_encoder *encoder, int clock);
extern void r600_audio_fini(struct radeon_device *rdev);
extern void r600_hdmi_init(struct drm_encoder *encoder);
-extern void r600_hdmi_enable(struct drm_encoder *encoder, int enable);
+extern void r600_hdmi_enable(struct drm_encoder *encoder);
+extern void r600_hdmi_disable(struct drm_encoder *encoder);
extern void r600_hdmi_setmode(struct drm_encoder *encoder, struct drm_display_mode *mode);
extern int r600_hdmi_buffer_status_changed(struct drm_encoder *encoder);
extern void r600_hdmi_update_audio_settings(struct drm_encoder *encoder,
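
The block of r100_* declarations removed above (and re-added to radeon_asic.h further down) accompanies the real point of this patch: the static struct radeon_asic tables move out of the header and into the new radeon_asic.c. A static definition in a header is instantiated once per including translation unit; a minimal illustration with hypothetical names:

/* before -- in a shared header, so each includer owns a distinct 'vtbl': */
struct vtbl { int (*init)(void); };
static const struct vtbl vtbl = { .init = 0 };

/* after -- defined once in a .c file and published through a function,
 * whose declaration is all the header needs to carry: */
extern const struct vtbl *get_vtbl(void);
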
diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c
new file mode 100644
index 00000000000..a4b4bc9fa32
--- /dev/null
+++ b/drivers/gpu/drm/radeon/radeon_asic.c
@@ -0,0 +1,772 @@
+/*
+ * Copyright 2008 Advanced Micro Devices, Inc.
+ * Copyright 2008 Red Hat Inc.
+ * Copyright 2009 Jerome Glisse.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Dave Airlie
+ * Alex Deucher
+ * Jerome Glisse
+ */
+
+#include <linux/console.h>
+#include <drm/drmP.h>
+#include <drm/drm_crtc_helper.h>
+#include <drm/radeon_drm.h>
+#include <linux/vgaarb.h>
+#include <linux/vga_switcheroo.h>
+#include "radeon_reg.h"
+#include "radeon.h"
+#include "radeon_asic.h"
+#include "atom.h"
+
+/*
+ * Register accessor functions.
+ */
+static uint32_t radeon_invalid_rreg(struct radeon_device *rdev, uint32_t reg)
+{
+ DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
+ BUG_ON(1);
+ return 0;
+}
+
+static void radeon_invalid_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
+{
+ DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
+ reg, v);
+ BUG_ON(1);
+}
+
+static void radeon_register_accessor_init(struct radeon_device *rdev)
+{
+ rdev->mc_rreg = &radeon_invalid_rreg;
+ rdev->mc_wreg = &radeon_invalid_wreg;
+ rdev->pll_rreg = &radeon_invalid_rreg;
+ rdev->pll_wreg = &radeon_invalid_wreg;
+ rdev->pciep_rreg = &radeon_invalid_rreg;
+ rdev->pciep_wreg = &radeon_invalid_wreg;
+
+ /* Don't change order as we are overriding accessors. */
+ if (rdev->family < CHIP_RV515) {
+ rdev->pcie_reg_mask = 0xff;
+ } else {
+ rdev->pcie_reg_mask = 0x7ff;
+ }
+ /* FIXME: not sure here */
+ if (rdev->family <= CHIP_R580) {
+ rdev->pll_rreg = &r100_pll_rreg;
+ rdev->pll_wreg = &r100_pll_wreg;
+ }
+ if (rdev->family >= CHIP_R420) {
+ rdev->mc_rreg = &r420_mc_rreg;
+ rdev->mc_wreg = &r420_mc_wreg;
+ }
+ if (rdev->family >= CHIP_RV515) {
+ rdev->mc_rreg = &rv515_mc_rreg;
+ rdev->mc_wreg = &rv515_mc_wreg;
+ }
+ if (rdev->family == CHIP_RS400 || rdev->family == CHIP_RS480) {
+ rdev->mc_rreg = &rs400_mc_rreg;
+ rdev->mc_wreg = &rs400_mc_wreg;
+ }
+ if (rdev->family == CHIP_RS690 || rdev->family == CHIP_RS740) {
+ rdev->mc_rreg = &rs690_mc_rreg;
+ rdev->mc_wreg = &rs690_mc_wreg;
+ }
+ if (rdev->family == CHIP_RS600) {
+ rdev->mc_rreg = &rs600_mc_rreg;
+ rdev->mc_wreg = &rs600_mc_wreg;
+ }
+ if ((rdev->family >= CHIP_R600) && (rdev->family <= CHIP_RV740)) {
+ rdev->pciep_rreg = &r600_pciep_rreg;
+ rdev->pciep_wreg = &r600_pciep_wreg;
+ }
+}
+
+
+/* helper to disable agp */
+void radeon_agp_disable(struct radeon_device *rdev)
+{
+ rdev->flags &= ~RADEON_IS_AGP;
+ if (rdev->family >= CHIP_R600) {
+ DRM_INFO("Forcing AGP to PCIE mode\n");
+ rdev->flags |= RADEON_IS_PCIE;
+ } else if (rdev->family >= CHIP_RV515 ||
+ rdev->family == CHIP_RV380 ||
+ rdev->family == CHIP_RV410 ||
+ rdev->family == CHIP_R423) {
+ DRM_INFO("Forcing AGP to PCIE mode\n");
+ rdev->flags |= RADEON_IS_PCIE;
+ rdev->asic->gart_tlb_flush = &rv370_pcie_gart_tlb_flush;
+ rdev->asic->gart_set_page = &rv370_pcie_gart_set_page;
+ } else {
+ DRM_INFO("Forcing AGP to PCI mode\n");
+ rdev->flags |= RADEON_IS_PCI;
+ rdev->asic->gart_tlb_flush = &r100_pci_gart_tlb_flush;
+ rdev->asic->gart_set_page = &r100_pci_gart_set_page;
+ }
+ rdev->mc.gtt_size = radeon_gart_size * 1024 * 1024;
+}
+
+/*
+ * ASIC
+ */
+static struct radeon_asic r100_asic = {
+ .init = &r100_init,
+ .fini = &r100_fini,
+ .suspend = &r100_suspend,
+ .resume = &r100_resume,
+ .vga_set_state = &r100_vga_set_state,
+ .gpu_reset = &r100_gpu_reset,
+ .gart_tlb_flush = &r100_pci_gart_tlb_flush,
+ .gart_set_page = &r100_pci_gart_set_page,
+ .cp_commit = &r100_cp_commit,
+ .ring_start = &r100_ring_start,
+ .ring_test = &r100_ring_test,
+ .ring_ib_execute = &r100_ring_ib_execute,
+ .irq_set = &r100_irq_set,
+ .irq_process = &r100_irq_process,
+ .get_vblank_counter = &r100_get_vblank_counter,
+ .fence_ring_emit = &r100_fence_ring_emit,
+ .cs_parse = &r100_cs_parse,
+ .copy_blit = &r100_copy_blit,
+ .copy_dma = NULL,
+ .copy = &r100_copy_blit,
+ .get_engine_clock = &radeon_legacy_get_engine_clock,
+ .set_engine_clock = &radeon_legacy_set_engine_clock,
+ .get_memory_clock = &radeon_legacy_get_memory_clock,
+ .set_memory_clock = NULL,
+ .get_pcie_lanes = NULL,
+ .set_pcie_lanes = NULL,
+ .set_clock_gating = &radeon_legacy_set_clock_gating,
+ .set_surface_reg = r100_set_surface_reg,
+ .clear_surface_reg = r100_clear_surface_reg,
+ .bandwidth_update = &r100_bandwidth_update,
+ .hpd_init = &r100_hpd_init,
+ .hpd_fini = &r100_hpd_fini,
+ .hpd_sense = &r100_hpd_sense,
+ .hpd_set_polarity = &r100_hpd_set_polarity,
+ .ioctl_wait_idle = NULL,
+};
+
+static struct radeon_asic r200_asic = {
+ .init = &r100_init,
+ .fini = &r100_fini,
+ .suspend = &r100_suspend,
+ .resume = &r100_resume,
+ .vga_set_state = &r100_vga_set_state,
+ .gpu_reset = &r100_gpu_reset,
+ .gart_tlb_flush = &r100_pci_gart_tlb_flush,
+ .gart_set_page = &r100_pci_gart_set_page,
+ .cp_commit = &r100_cp_commit,
+ .ring_start = &r100_ring_start,
+ .ring_test = &r100_ring_test,
+ .ring_ib_execute = &r100_ring_ib_execute,
+ .irq_set = &r100_irq_set,
+ .irq_process = &r100_irq_process,
+ .get_vblank_counter = &r100_get_vblank_counter,
+ .fence_ring_emit = &r100_fence_ring_emit,
+ .cs_parse = &r100_cs_parse,
+ .copy_blit = &r100_copy_blit,
+ .copy_dma = &r200_copy_dma,
+ .copy = &r100_copy_blit,
+ .get_engine_clock = &radeon_legacy_get_engine_clock,
+ .set_engine_clock = &radeon_legacy_set_engine_clock,
+ .get_memory_clock = &radeon_legacy_get_memory_clock,
+ .set_memory_clock = NULL,
+ .set_pcie_lanes = NULL,
+ .set_clock_gating = &radeon_legacy_set_clock_gating,
+ .set_surface_reg = r100_set_surface_reg,
+ .clear_surface_reg = r100_clear_surface_reg,
+ .bandwidth_update = &r100_bandwidth_update,
+ .hpd_init = &r100_hpd_init,
+ .hpd_fini = &r100_hpd_fini,
+ .hpd_sense = &r100_hpd_sense,
+ .hpd_set_polarity = &r100_hpd_set_polarity,
+ .ioctl_wait_idle = NULL,
+};
+
+static struct radeon_asic r300_asic = {
+ .init = &r300_init,
+ .fini = &r300_fini,
+ .suspend = &r300_suspend,
+ .resume = &r300_resume,
+ .vga_set_state = &r100_vga_set_state,
+ .gpu_reset = &r300_gpu_reset,
+ .gart_tlb_flush = &r100_pci_gart_tlb_flush,
+ .gart_set_page = &r100_pci_gart_set_page,
+ .cp_commit = &r100_cp_commit,
+ .ring_start = &r300_ring_start,
+ .ring_test = &r100_ring_test,
+ .ring_ib_execute = &r100_ring_ib_execute,
+ .irq_set = &r100_irq_set,
+ .irq_process = &r100_irq_process,
+ .get_vblank_counter = &r100_get_vblank_counter,
+ .fence_ring_emit = &r300_fence_ring_emit,
+ .cs_parse = &r300_cs_parse,
+ .copy_blit = &r100_copy_blit,
+ .copy_dma = &r200_copy_dma,
+ .copy = &r100_copy_blit,
+ .get_engine_clock = &radeon_legacy_get_engine_clock,
+ .set_engine_clock = &radeon_legacy_set_engine_clock,
+ .get_memory_clock = &radeon_legacy_get_memory_clock,
+ .set_memory_clock = NULL,
+ .get_pcie_lanes = &rv370_get_pcie_lanes,
+ .set_pcie_lanes = &rv370_set_pcie_lanes,
+ .set_clock_gating = &radeon_legacy_set_clock_gating,
+ .set_surface_reg = r100_set_surface_reg,
+ .clear_surface_reg = r100_clear_surface_reg,
+ .bandwidth_update = &r100_bandwidth_update,
+ .hpd_init = &r100_hpd_init,
+ .hpd_fini = &r100_hpd_fini,
+ .hpd_sense = &r100_hpd_sense,
+ .hpd_set_polarity = &r100_hpd_set_polarity,
+ .ioctl_wait_idle = NULL,
+};
+
+static struct radeon_asic r300_asic_pcie = {
+ .init = &r300_init,
+ .fini = &r300_fini,
+ .suspend = &r300_suspend,
+ .resume = &r300_resume,
+ .vga_set_state = &r100_vga_set_state,
+ .gpu_reset = &r300_gpu_reset,
+ .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
+ .gart_set_page = &rv370_pcie_gart_set_page,
+ .cp_commit = &r100_cp_commit,
+ .ring_start = &r300_ring_start,
+ .ring_test = &r100_ring_test,
+ .ring_ib_execute = &r100_ring_ib_execute,
+ .irq_set = &r100_irq_set,
+ .irq_process = &r100_irq_process,
+ .get_vblank_counter = &r100_get_vblank_counter,
+ .fence_ring_emit = &r300_fence_ring_emit,
+ .cs_parse = &r300_cs_parse,
+ .copy_blit = &r100_copy_blit,
+ .copy_dma = &r200_copy_dma,
+ .copy = &r100_copy_blit,
+ .get_engine_clock = &radeon_legacy_get_engine_clock,
+ .set_engine_clock = &radeon_legacy_set_engine_clock,
+ .get_memory_clock = &radeon_legacy_get_memory_clock,
+ .set_memory_clock = NULL,
+ .set_pcie_lanes = &rv370_set_pcie_lanes,
+ .set_clock_gating = &radeon_legacy_set_clock_gating,
+ .set_surface_reg = r100_set_surface_reg,
+ .clear_surface_reg = r100_clear_surface_reg,
+ .bandwidth_update = &r100_bandwidth_update,
+ .hpd_init = &r100_hpd_init,
+ .hpd_fini = &r100_hpd_fini,
+ .hpd_sense = &r100_hpd_sense,
+ .hpd_set_polarity = &r100_hpd_set_polarity,
+ .ioctl_wait_idle = NULL,
+};
+
+static struct radeon_asic r420_asic = {
+ .init = &r420_init,
+ .fini = &r420_fini,
+ .suspend = &r420_suspend,
+ .resume = &r420_resume,
+ .vga_set_state = &r100_vga_set_state,
+ .gpu_reset = &r300_gpu_reset,
+ .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
+ .gart_set_page = &rv370_pcie_gart_set_page,
+ .cp_commit = &r100_cp_commit,
+ .ring_start = &r300_ring_start,
+ .ring_test = &r100_ring_test,
+ .ring_ib_execute = &r100_ring_ib_execute,
+ .irq_set = &r100_irq_set,
+ .irq_process = &r100_irq_process,
+ .get_vblank_counter = &r100_get_vblank_counter,
+ .fence_ring_emit = &r300_fence_ring_emit,
+ .cs_parse = &r300_cs_parse,
+ .copy_blit = &r100_copy_blit,
+ .copy_dma = &r200_copy_dma,
+ .copy = &r100_copy_blit,
+ .get_engine_clock = &radeon_atom_get_engine_clock,
+ .set_engine_clock = &radeon_atom_set_engine_clock,
+ .get_memory_clock = &radeon_atom_get_memory_clock,
+ .set_memory_clock = &radeon_atom_set_memory_clock,
+ .get_pcie_lanes = &rv370_get_pcie_lanes,
+ .set_pcie_lanes = &rv370_set_pcie_lanes,
+ .set_clock_gating = &radeon_atom_set_clock_gating,
+ .set_surface_reg = r100_set_surface_reg,
+ .clear_surface_reg = r100_clear_surface_reg,
+ .bandwidth_update = &r100_bandwidth_update,
+ .hpd_init = &r100_hpd_init,
+ .hpd_fini = &r100_hpd_fini,
+ .hpd_sense = &r100_hpd_sense,
+ .hpd_set_polarity = &r100_hpd_set_polarity,
+ .ioctl_wait_idle = NULL,
+};
+
+static struct radeon_asic rs400_asic = {
+ .init = &rs400_init,
+ .fini = &rs400_fini,
+ .suspend = &rs400_suspend,
+ .resume = &rs400_resume,
+ .vga_set_state = &r100_vga_set_state,
+ .gpu_reset = &r300_gpu_reset,
+ .gart_tlb_flush = &rs400_gart_tlb_flush,
+ .gart_set_page = &rs400_gart_set_page,
+ .cp_commit = &r100_cp_commit,
+ .ring_start = &r300_ring_start,
+ .ring_test = &r100_ring_test,
+ .ring_ib_execute = &r100_ring_ib_execute,
+ .irq_set = &r100_irq_set,
+ .irq_process = &r100_irq_process,
+ .get_vblank_counter = &r100_get_vblank_counter,
+ .fence_ring_emit = &r300_fence_ring_emit,
+ .cs_parse = &r300_cs_parse,
+ .copy_blit = &r100_copy_blit,
+ .copy_dma = &r200_copy_dma,
+ .copy = &r100_copy_blit,
+ .get_engine_clock = &radeon_legacy_get_engine_clock,
+ .set_engine_clock = &radeon_legacy_set_engine_clock,
+ .get_memory_clock = &radeon_legacy_get_memory_clock,
+ .set_memory_clock = NULL,
+ .get_pcie_lanes = NULL,
+ .set_pcie_lanes = NULL,
+ .set_clock_gating = &radeon_legacy_set_clock_gating,
+ .set_surface_reg = r100_set_surface_reg,
+ .clear_surface_reg = r100_clear_surface_reg,
+ .bandwidth_update = &r100_bandwidth_update,
+ .hpd_init = &r100_hpd_init,
+ .hpd_fini = &r100_hpd_fini,
+ .hpd_sense = &r100_hpd_sense,
+ .hpd_set_polarity = &r100_hpd_set_polarity,
+ .ioctl_wait_idle = NULL,
+};
+
+static struct radeon_asic rs600_asic = {
+ .init = &rs600_init,
+ .fini = &rs600_fini,
+ .suspend = &rs600_suspend,
+ .resume = &rs600_resume,
+ .vga_set_state = &r100_vga_set_state,
+ .gpu_reset = &r300_gpu_reset,
+ .gart_tlb_flush = &rs600_gart_tlb_flush,
+ .gart_set_page = &rs600_gart_set_page,
+ .cp_commit = &r100_cp_commit,
+ .ring_start = &r300_ring_start,
+ .ring_test = &r100_ring_test,
+ .ring_ib_execute = &r100_ring_ib_execute,
+ .irq_set = &rs600_irq_set,
+ .irq_process = &rs600_irq_process,
+ .get_vblank_counter = &rs600_get_vblank_counter,
+ .fence_ring_emit = &r300_fence_ring_emit,
+ .cs_parse = &r300_cs_parse,
+ .copy_blit = &r100_copy_blit,
+ .copy_dma = &r200_copy_dma,
+ .copy = &r100_copy_blit,
+ .get_engine_clock = &radeon_atom_get_engine_clock,
+ .set_engine_clock = &radeon_atom_set_engine_clock,
+ .get_memory_clock = &radeon_atom_get_memory_clock,
+ .set_memory_clock = &radeon_atom_set_memory_clock,
+ .get_pcie_lanes = NULL,
+ .set_pcie_lanes = NULL,
+ .set_clock_gating = &radeon_atom_set_clock_gating,
+ .set_surface_reg = r100_set_surface_reg,
+ .clear_surface_reg = r100_clear_surface_reg,
+ .bandwidth_update = &rs600_bandwidth_update,
+ .hpd_init = &rs600_hpd_init,
+ .hpd_fini = &rs600_hpd_fini,
+ .hpd_sense = &rs600_hpd_sense,
+ .hpd_set_polarity = &rs600_hpd_set_polarity,
+ .ioctl_wait_idle = NULL,
+};
+
+static struct radeon_asic rs690_asic = {
+ .init = &rs690_init,
+ .fini = &rs690_fini,
+ .suspend = &rs690_suspend,
+ .resume = &rs690_resume,
+ .vga_set_state = &r100_vga_set_state,
+ .gpu_reset = &r300_gpu_reset,
+ .gart_tlb_flush = &rs400_gart_tlb_flush,
+ .gart_set_page = &rs400_gart_set_page,
+ .cp_commit = &r100_cp_commit,
+ .ring_start = &r300_ring_start,
+ .ring_test = &r100_ring_test,
+ .ring_ib_execute = &r100_ring_ib_execute,
+ .irq_set = &rs600_irq_set,
+ .irq_process = &rs600_irq_process,
+ .get_vblank_counter = &rs600_get_vblank_counter,
+ .fence_ring_emit = &r300_fence_ring_emit,
+ .cs_parse = &r300_cs_parse,
+ .copy_blit = &r100_copy_blit,
+ .copy_dma = &r200_copy_dma,
+ .copy = &r200_copy_dma,
+ .get_engine_clock = &radeon_atom_get_engine_clock,
+ .set_engine_clock = &radeon_atom_set_engine_clock,
+ .get_memory_clock = &radeon_atom_get_memory_clock,
+ .set_memory_clock = &radeon_atom_set_memory_clock,
+ .get_pcie_lanes = NULL,
+ .set_pcie_lanes = NULL,
+ .set_clock_gating = &radeon_atom_set_clock_gating,
+ .set_surface_reg = r100_set_surface_reg,
+ .clear_surface_reg = r100_clear_surface_reg,
+ .bandwidth_update = &rs690_bandwidth_update,
+ .hpd_init = &rs600_hpd_init,
+ .hpd_fini = &rs600_hpd_fini,
+ .hpd_sense = &rs600_hpd_sense,
+ .hpd_set_polarity = &rs600_hpd_set_polarity,
+ .ioctl_wait_idle = NULL,
+};
+
+static struct radeon_asic rv515_asic = {
+ .init = &rv515_init,
+ .fini = &rv515_fini,
+ .suspend = &rv515_suspend,
+ .resume = &rv515_resume,
+ .vga_set_state = &r100_vga_set_state,
+ .gpu_reset = &rv515_gpu_reset,
+ .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
+ .gart_set_page = &rv370_pcie_gart_set_page,
+ .cp_commit = &r100_cp_commit,
+ .ring_start = &rv515_ring_start,
+ .ring_test = &r100_ring_test,
+ .ring_ib_execute = &r100_ring_ib_execute,
+ .irq_set = &rs600_irq_set,
+ .irq_process = &rs600_irq_process,
+ .get_vblank_counter = &rs600_get_vblank_counter,
+ .fence_ring_emit = &r300_fence_ring_emit,
+ .cs_parse = &r300_cs_parse,
+ .copy_blit = &r100_copy_blit,
+ .copy_dma = &r200_copy_dma,
+ .copy = &r100_copy_blit,
+ .get_engine_clock = &radeon_atom_get_engine_clock,
+ .set_engine_clock = &radeon_atom_set_engine_clock,
+ .get_memory_clock = &radeon_atom_get_memory_clock,
+ .set_memory_clock = &radeon_atom_set_memory_clock,
+ .get_pcie_lanes = &rv370_get_pcie_lanes,
+ .set_pcie_lanes = &rv370_set_pcie_lanes,
+ .set_clock_gating = &radeon_atom_set_clock_gating,
+ .set_surface_reg = r100_set_surface_reg,
+ .clear_surface_reg = r100_clear_surface_reg,
+ .bandwidth_update = &rv515_bandwidth_update,
+ .hpd_init = &rs600_hpd_init,
+ .hpd_fini = &rs600_hpd_fini,
+ .hpd_sense = &rs600_hpd_sense,
+ .hpd_set_polarity = &rs600_hpd_set_polarity,
+ .ioctl_wait_idle = NULL,
+};
+
+static struct radeon_asic r520_asic = {
+ .init = &r520_init,
+ .fini = &rv515_fini,
+ .suspend = &rv515_suspend,
+ .resume = &r520_resume,
+ .vga_set_state = &r100_vga_set_state,
+ .gpu_reset = &rv515_gpu_reset,
+ .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
+ .gart_set_page = &rv370_pcie_gart_set_page,
+ .cp_commit = &r100_cp_commit,
+ .ring_start = &rv515_ring_start,
+ .ring_test = &r100_ring_test,
+ .ring_ib_execute = &r100_ring_ib_execute,
+ .irq_set = &rs600_irq_set,
+ .irq_process = &rs600_irq_process,
+ .get_vblank_counter = &rs600_get_vblank_counter,
+ .fence_ring_emit = &r300_fence_ring_emit,
+ .cs_parse = &r300_cs_parse,
+ .copy_blit = &r100_copy_blit,
+ .copy_dma = &r200_copy_dma,
+ .copy = &r100_copy_blit,
+ .get_engine_clock = &radeon_atom_get_engine_clock,
+ .set_engine_clock = &radeon_atom_set_engine_clock,
+ .get_memory_clock = &radeon_atom_get_memory_clock,
+ .set_memory_clock = &radeon_atom_set_memory_clock,
+ .get_pcie_lanes = &rv370_get_pcie_lanes,
+ .set_pcie_lanes = &rv370_set_pcie_lanes,
+ .set_clock_gating = &radeon_atom_set_clock_gating,
+ .set_surface_reg = r100_set_surface_reg,
+ .clear_surface_reg = r100_clear_surface_reg,
+ .bandwidth_update = &rv515_bandwidth_update,
+ .hpd_init = &rs600_hpd_init,
+ .hpd_fini = &rs600_hpd_fini,
+ .hpd_sense = &rs600_hpd_sense,
+ .hpd_set_polarity = &rs600_hpd_set_polarity,
+ .ioctl_wait_idle = NULL,
+};
+
+static struct radeon_asic r600_asic = {
+ .init = &r600_init,
+ .fini = &r600_fini,
+ .suspend = &r600_suspend,
+ .resume = &r600_resume,
+ .cp_commit = &r600_cp_commit,
+ .vga_set_state = &r600_vga_set_state,
+ .gpu_reset = &r600_gpu_reset,
+ .gart_tlb_flush = &r600_pcie_gart_tlb_flush,
+ .gart_set_page = &rs600_gart_set_page,
+ .ring_test = &r600_ring_test,
+ .ring_ib_execute = &r600_ring_ib_execute,
+ .irq_set = &r600_irq_set,
+ .irq_process = &r600_irq_process,
+ .get_vblank_counter = &rs600_get_vblank_counter,
+ .fence_ring_emit = &r600_fence_ring_emit,
+ .cs_parse = &r600_cs_parse,
+ .copy_blit = &r600_copy_blit,
+ .copy_dma = &r600_copy_blit,
+ .copy = &r600_copy_blit,
+ .get_engine_clock = &radeon_atom_get_engine_clock,
+ .set_engine_clock = &radeon_atom_set_engine_clock,
+ .get_memory_clock = &radeon_atom_get_memory_clock,
+ .set_memory_clock = &radeon_atom_set_memory_clock,
+ .get_pcie_lanes = &rv370_get_pcie_lanes,
+ .set_pcie_lanes = NULL,
+ .set_clock_gating = NULL,
+ .set_surface_reg = r600_set_surface_reg,
+ .clear_surface_reg = r600_clear_surface_reg,
+ .bandwidth_update = &rv515_bandwidth_update,
+ .hpd_init = &r600_hpd_init,
+ .hpd_fini = &r600_hpd_fini,
+ .hpd_sense = &r600_hpd_sense,
+ .hpd_set_polarity = &r600_hpd_set_polarity,
+ .ioctl_wait_idle = r600_ioctl_wait_idle,
+};
+
+static struct radeon_asic rs780_asic = {
+ .init = &r600_init,
+ .fini = &r600_fini,
+ .suspend = &r600_suspend,
+ .resume = &r600_resume,
+ .cp_commit = &r600_cp_commit,
+ .vga_set_state = &r600_vga_set_state,
+ .gpu_reset = &r600_gpu_reset,
+ .gart_tlb_flush = &r600_pcie_gart_tlb_flush,
+ .gart_set_page = &rs600_gart_set_page,
+ .ring_test = &r600_ring_test,
+ .ring_ib_execute = &r600_ring_ib_execute,
+ .irq_set = &r600_irq_set,
+ .irq_process = &r600_irq_process,
+ .get_vblank_counter = &rs600_get_vblank_counter,
+ .fence_ring_emit = &r600_fence_ring_emit,
+ .cs_parse = &r600_cs_parse,
+ .copy_blit = &r600_copy_blit,
+ .copy_dma = &r600_copy_blit,
+ .copy = &r600_copy_blit,
+ .get_engine_clock = &radeon_atom_get_engine_clock,
+ .set_engine_clock = &radeon_atom_set_engine_clock,
+ .get_memory_clock = NULL,
+ .set_memory_clock = NULL,
+ .get_pcie_lanes = NULL,
+ .set_pcie_lanes = NULL,
+ .set_clock_gating = NULL,
+ .set_surface_reg = r600_set_surface_reg,
+ .clear_surface_reg = r600_clear_surface_reg,
+ .bandwidth_update = &rs690_bandwidth_update,
+ .hpd_init = &r600_hpd_init,
+ .hpd_fini = &r600_hpd_fini,
+ .hpd_sense = &r600_hpd_sense,
+ .hpd_set_polarity = &r600_hpd_set_polarity,
+ .ioctl_wait_idle = r600_ioctl_wait_idle,
+};
+
+static struct radeon_asic rv770_asic = {
+ .init = &rv770_init,
+ .fini = &rv770_fini,
+ .suspend = &rv770_suspend,
+ .resume = &rv770_resume,
+ .cp_commit = &r600_cp_commit,
+ .gpu_reset = &rv770_gpu_reset,
+ .vga_set_state = &r600_vga_set_state,
+ .gart_tlb_flush = &r600_pcie_gart_tlb_flush,
+ .gart_set_page = &rs600_gart_set_page,
+ .ring_test = &r600_ring_test,
+ .ring_ib_execute = &r600_ring_ib_execute,
+ .irq_set = &r600_irq_set,
+ .irq_process = &r600_irq_process,
+ .get_vblank_counter = &rs600_get_vblank_counter,
+ .fence_ring_emit = &r600_fence_ring_emit,
+ .cs_parse = &r600_cs_parse,
+ .copy_blit = &r600_copy_blit,
+ .copy_dma = &r600_copy_blit,
+ .copy = &r600_copy_blit,
+ .get_engine_clock = &radeon_atom_get_engine_clock,
+ .set_engine_clock = &radeon_atom_set_engine_clock,
+ .get_memory_clock = &radeon_atom_get_memory_clock,
+ .set_memory_clock = &radeon_atom_set_memory_clock,
+ .get_pcie_lanes = &rv370_get_pcie_lanes,
+ .set_pcie_lanes = NULL,
+ .set_clock_gating = &radeon_atom_set_clock_gating,
+ .set_surface_reg = r600_set_surface_reg,
+ .clear_surface_reg = r600_clear_surface_reg,
+ .bandwidth_update = &rv515_bandwidth_update,
+ .hpd_init = &r600_hpd_init,
+ .hpd_fini = &r600_hpd_fini,
+ .hpd_sense = &r600_hpd_sense,
+ .hpd_set_polarity = &r600_hpd_set_polarity,
+ .ioctl_wait_idle = r600_ioctl_wait_idle,
+};
+
+static struct radeon_asic evergreen_asic = {
+ .init = &evergreen_init,
+ .fini = &evergreen_fini,
+ .suspend = &evergreen_suspend,
+ .resume = &evergreen_resume,
+ .cp_commit = NULL,
+ .gpu_reset = &evergreen_gpu_reset,
+ .vga_set_state = &r600_vga_set_state,
+ .gart_tlb_flush = &r600_pcie_gart_tlb_flush,
+ .gart_set_page = &rs600_gart_set_page,
+ .ring_test = NULL,
+ .ring_ib_execute = NULL,
+ .irq_set = NULL,
+ .irq_process = NULL,
+ .get_vblank_counter = NULL,
+ .fence_ring_emit = NULL,
+ .cs_parse = NULL,
+ .copy_blit = NULL,
+ .copy_dma = NULL,
+ .copy = NULL,
+ .get_engine_clock = &radeon_atom_get_engine_clock,
+ .set_engine_clock = &radeon_atom_set_engine_clock,
+ .get_memory_clock = &radeon_atom_get_memory_clock,
+ .set_memory_clock = &radeon_atom_set_memory_clock,
+ .set_pcie_lanes = NULL,
+ .set_clock_gating = NULL,
+ .set_surface_reg = r600_set_surface_reg,
+ .clear_surface_reg = r600_clear_surface_reg,
+ .bandwidth_update = &evergreen_bandwidth_update,
+ .hpd_init = &evergreen_hpd_init,
+ .hpd_fini = &evergreen_hpd_fini,
+ .hpd_sense = &evergreen_hpd_sense,
+ .hpd_set_polarity = &evergreen_hpd_set_polarity,
+};
+
+int radeon_asic_init(struct radeon_device *rdev)
+{
+ radeon_register_accessor_init(rdev);
+ switch (rdev->family) {
+ case CHIP_R100:
+ case CHIP_RV100:
+ case CHIP_RS100:
+ case CHIP_RV200:
+ case CHIP_RS200:
+ rdev->asic = &r100_asic;
+ break;
+ case CHIP_R200:
+ case CHIP_RV250:
+ case CHIP_RS300:
+ case CHIP_RV280:
+ rdev->asic = &r200_asic;
+ break;
+ case CHIP_R300:
+ case CHIP_R350:
+ case CHIP_RV350:
+ case CHIP_RV380:
+ if (rdev->flags & RADEON_IS_PCIE)
+ rdev->asic = &r300_asic_pcie;
+ else
+ rdev->asic = &r300_asic;
+ break;
+ case CHIP_R420:
+ case CHIP_R423:
+ case CHIP_RV410:
+ rdev->asic = &r420_asic;
+ break;
+ case CHIP_RS400:
+ case CHIP_RS480:
+ rdev->asic = &rs400_asic;
+ break;
+ case CHIP_RS600:
+ rdev->asic = &rs600_asic;
+ break;
+ case CHIP_RS690:
+ case CHIP_RS740:
+ rdev->asic = &rs690_asic;
+ break;
+ case CHIP_RV515:
+ rdev->asic = &rv515_asic;
+ break;
+ case CHIP_R520:
+ case CHIP_RV530:
+ case CHIP_RV560:
+ case CHIP_RV570:
+ case CHIP_R580:
+ rdev->asic = &r520_asic;
+ break;
+ case CHIP_R600:
+ case CHIP_RV610:
+ case CHIP_RV630:
+ case CHIP_RV620:
+ case CHIP_RV635:
+ case CHIP_RV670:
+ rdev->asic = &r600_asic;
+ break;
+ case CHIP_RS780:
+ case CHIP_RS880:
+ rdev->asic = &rs780_asic;
+ break;
+ case CHIP_RV770:
+ case CHIP_RV730:
+ case CHIP_RV710:
+ case CHIP_RV740:
+ rdev->asic = &rv770_asic;
+ break;
+ case CHIP_CEDAR:
+ case CHIP_REDWOOD:
+ case CHIP_JUNIPER:
+ case CHIP_CYPRESS:
+ case CHIP_HEMLOCK:
+ rdev->asic = &evergreen_asic;
+ break;
+ default:
+ /* FIXME: not supported yet */
+ return -EINVAL;
+ }
+
+ if (rdev->flags & RADEON_IS_IGP) {
+ rdev->asic->get_memory_clock = NULL;
+ rdev->asic->set_memory_clock = NULL;
+ }
+
+ /* set the number of crtcs */
+ if (rdev->flags & RADEON_SINGLE_CRTC)
+ rdev->num_crtc = 1;
+ else {
+ if (ASIC_IS_DCE4(rdev))
+ rdev->num_crtc = 6;
+ else
+ rdev->num_crtc = 2;
+ }
+
+ return 0;
+}
+
+/*
+ * Wrapper around modesetting bits. Move to radeon_clocks.c?
+ */
+int radeon_clocks_init(struct radeon_device *rdev)
+{
+ int r;
+
+ r = radeon_static_clocks_init(rdev->ddev);
+ if (r) {
+ return r;
+ }
+ DRM_INFO("Clocks initialized !\n");
+ return 0;
+}
+
+void radeon_clocks_fini(struct radeon_device *rdev)
+{
+}
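
Everything above is consumed through the asic vtable: radeon_asic_init() binds rdev->asic to the per-family table, and the rest of the driver dispatches through it. radeon.h wraps the indirect calls in macros; a hedged sketch of that pattern (the exact macro set is assumed, not shown in this diff):

#define radeon_init(rdev)		((rdev)->asic->init((rdev)))
#define radeon_suspend(rdev)		((rdev)->asic->suspend((rdev)))
#define radeon_bandwidth_update(rdev)	((rdev)->asic->bandwidth_update((rdev)))

/* generic code then stays chip-agnostic: */
static int device_start_sketch(struct radeon_device *rdev)
{
	int r = radeon_asic_init(rdev);	/* pick r100_asic ... evergreen_asic */

	if (r)
		return r;		/* unknown family: -EINVAL from above */
	return radeon_init(rdev);	/* dispatches to r100_init, r600_init, ... */
}
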
diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h
index d3a157b2bcb..a0b8280663d 100644
--- a/drivers/gpu/drm/radeon/radeon_asic.h
+++ b/drivers/gpu/drm/radeon/radeon_asic.h
@@ -45,10 +45,18 @@ void radeon_atom_set_clock_gating(struct radeon_device *rdev, int enable);
/*
* r100,rv100,rs100,rv200,rs200
*/
-extern int r100_init(struct radeon_device *rdev);
-extern void r100_fini(struct radeon_device *rdev);
-extern int r100_suspend(struct radeon_device *rdev);
-extern int r100_resume(struct radeon_device *rdev);
+struct r100_mc_save {
+ u32 GENMO_WT;
+ u32 CRTC_EXT_CNTL;
+ u32 CRTC_GEN_CNTL;
+ u32 CRTC2_GEN_CNTL;
+ u32 CUR_OFFSET;
+ u32 CUR2_OFFSET;
+};
+int r100_init(struct radeon_device *rdev);
+void r100_fini(struct radeon_device *rdev);
+int r100_suspend(struct radeon_device *rdev);
+int r100_resume(struct radeon_device *rdev);
uint32_t r100_mm_rreg(struct radeon_device *rdev, uint32_t reg);
void r100_mm_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
void r100_vga_set_state(struct radeon_device *rdev, bool state);
@@ -73,7 +81,7 @@ int r100_copy_blit(struct radeon_device *rdev,
int r100_set_surface_reg(struct radeon_device *rdev, int reg,
uint32_t tiling_flags, uint32_t pitch,
uint32_t offset, uint32_t obj_size);
-int r100_clear_surface_reg(struct radeon_device *rdev, int reg);
+void r100_clear_surface_reg(struct radeon_device *rdev, int reg);
void r100_bandwidth_update(struct radeon_device *rdev);
void r100_ring_ib_execute(struct radeon_device *rdev, struct radeon_ib *ib);
int r100_ring_test(struct radeon_device *rdev);
@@ -82,44 +90,42 @@ void r100_hpd_fini(struct radeon_device *rdev);
bool r100_hpd_sense(struct radeon_device *rdev, enum radeon_hpd_id hpd);
void r100_hpd_set_polarity(struct radeon_device *rdev,
enum radeon_hpd_id hpd);
-
-static struct radeon_asic r100_asic = {
- .init = &r100_init,
- .fini = &r100_fini,
- .suspend = &r100_suspend,
- .resume = &r100_resume,
- .vga_set_state = &r100_vga_set_state,
- .gpu_reset = &r100_gpu_reset,
- .gart_tlb_flush = &r100_pci_gart_tlb_flush,
- .gart_set_page = &r100_pci_gart_set_page,
- .cp_commit = &r100_cp_commit,
- .ring_start = &r100_ring_start,
- .ring_test = &r100_ring_test,
- .ring_ib_execute = &r100_ring_ib_execute,
- .irq_set = &r100_irq_set,
- .irq_process = &r100_irq_process,
- .get_vblank_counter = &r100_get_vblank_counter,
- .fence_ring_emit = &r100_fence_ring_emit,
- .cs_parse = &r100_cs_parse,
- .copy_blit = &r100_copy_blit,
- .copy_dma = NULL,
- .copy = &r100_copy_blit,
- .get_engine_clock = &radeon_legacy_get_engine_clock,
- .set_engine_clock = &radeon_legacy_set_engine_clock,
- .get_memory_clock = &radeon_legacy_get_memory_clock,
- .set_memory_clock = NULL,
- .get_pcie_lanes = NULL,
- .set_pcie_lanes = NULL,
- .set_clock_gating = &radeon_legacy_set_clock_gating,
- .set_surface_reg = r100_set_surface_reg,
- .clear_surface_reg = r100_clear_surface_reg,
- .bandwidth_update = &r100_bandwidth_update,
- .hpd_init = &r100_hpd_init,
- .hpd_fini = &r100_hpd_fini,
- .hpd_sense = &r100_hpd_sense,
- .hpd_set_polarity = &r100_hpd_set_polarity,
- .ioctl_wait_idle = NULL,
-};
+int r100_debugfs_rbbm_init(struct radeon_device *rdev);
+int r100_debugfs_cp_init(struct radeon_device *rdev);
+void r100_cp_disable(struct radeon_device *rdev);
+int r100_cp_init(struct radeon_device *rdev, unsigned ring_size);
+void r100_cp_fini(struct radeon_device *rdev);
+int r100_pci_gart_init(struct radeon_device *rdev);
+void r100_pci_gart_fini(struct radeon_device *rdev);
+int r100_pci_gart_enable(struct radeon_device *rdev);
+void r100_pci_gart_disable(struct radeon_device *rdev);
+int r100_debugfs_mc_info_init(struct radeon_device *rdev);
+int r100_gui_wait_for_idle(struct radeon_device *rdev);
+void r100_ib_fini(struct radeon_device *rdev);
+int r100_ib_init(struct radeon_device *rdev);
+void r100_irq_disable(struct radeon_device *rdev);
+void r100_mc_stop(struct radeon_device *rdev, struct r100_mc_save *save);
+void r100_mc_resume(struct radeon_device *rdev, struct r100_mc_save *save);
+void r100_vram_init_sizes(struct radeon_device *rdev);
+void r100_wb_disable(struct radeon_device *rdev);
+void r100_wb_fini(struct radeon_device *rdev);
+int r100_wb_init(struct radeon_device *rdev);
+void r100_hdp_reset(struct radeon_device *rdev);
+int r100_rb2d_reset(struct radeon_device *rdev);
+int r100_cp_reset(struct radeon_device *rdev);
+void r100_vga_render_disable(struct radeon_device *rdev);
+int r100_cs_track_check_pkt3_indx_buffer(struct radeon_cs_parser *p,
+ struct radeon_cs_packet *pkt,
+ struct radeon_bo *robj);
+int r100_cs_parse_packet0(struct radeon_cs_parser *p,
+ struct radeon_cs_packet *pkt,
+ const unsigned *auth, unsigned n,
+ radeon_packet0_check_t check);
+int r100_cs_packet_parse(struct radeon_cs_parser *p,
+ struct radeon_cs_packet *pkt,
+ unsigned idx);
+void r100_enable_bm(struct radeon_device *rdev);
+void r100_set_common_regs(struct radeon_device *rdev);
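The r100_mc_save structure added above captures the CRTC and cursor state that must survive a memory-controller reprogram: r100_mc_stop() fills it while quiescing scanout, and r100_mc_resume() restores it afterwards. A minimal sketch of a caller, where the wrapper function and the register writes in the middle are hypothetical and only the struct and the two helpers come from this header:

/* Hedged sketch: example_mc_reprogram() is made up; r100_mc_stop(),
 * r100_mc_resume() and struct r100_mc_save are the real declarations above. */
static void example_mc_reprogram(struct radeon_device *rdev)
{
	struct r100_mc_save save;

	r100_mc_stop(rdev, &save);	/* park CRTCs; save GENMO_WT, CRTC*_CNTL, CUR*_OFFSET */
	/* ... reprogram the MC aperture registers here ... */
	r100_mc_resume(rdev, &save);	/* restore scanout from the saved state */
}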
/*
* r200,rv250,rs300,rv280
@@ -129,43 +135,6 @@ extern int r200_copy_dma(struct radeon_device *rdev,
uint64_t dst_offset,
unsigned num_pages,
struct radeon_fence *fence);
-static struct radeon_asic r200_asic = {
- .init = &r100_init,
- .fini = &r100_fini,
- .suspend = &r100_suspend,
- .resume = &r100_resume,
- .vga_set_state = &r100_vga_set_state,
- .gpu_reset = &r100_gpu_reset,
- .gart_tlb_flush = &r100_pci_gart_tlb_flush,
- .gart_set_page = &r100_pci_gart_set_page,
- .cp_commit = &r100_cp_commit,
- .ring_start = &r100_ring_start,
- .ring_test = &r100_ring_test,
- .ring_ib_execute = &r100_ring_ib_execute,
- .irq_set = &r100_irq_set,
- .irq_process = &r100_irq_process,
- .get_vblank_counter = &r100_get_vblank_counter,
- .fence_ring_emit = &r100_fence_ring_emit,
- .cs_parse = &r100_cs_parse,
- .copy_blit = &r100_copy_blit,
- .copy_dma = &r200_copy_dma,
- .copy = &r100_copy_blit,
- .get_engine_clock = &radeon_legacy_get_engine_clock,
- .set_engine_clock = &radeon_legacy_set_engine_clock,
- .get_memory_clock = &radeon_legacy_get_memory_clock,
- .set_memory_clock = NULL,
- .set_pcie_lanes = NULL,
- .set_clock_gating = &radeon_legacy_set_clock_gating,
- .set_surface_reg = r100_set_surface_reg,
- .clear_surface_reg = r100_clear_surface_reg,
- .bandwidth_update = &r100_bandwidth_update,
- .hpd_init = &r100_hpd_init,
- .hpd_fini = &r100_hpd_fini,
- .hpd_sense = &r100_hpd_sense,
- .hpd_set_polarity = &r100_hpd_set_polarity,
- .ioctl_wait_idle = NULL,
-};
-
/*
* r300,r350,rv350,rv380
@@ -186,82 +155,6 @@ extern void rv370_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v
extern void rv370_set_pcie_lanes(struct radeon_device *rdev, int lanes);
extern int rv370_get_pcie_lanes(struct radeon_device *rdev);
-static struct radeon_asic r300_asic = {
- .init = &r300_init,
- .fini = &r300_fini,
- .suspend = &r300_suspend,
- .resume = &r300_resume,
- .vga_set_state = &r100_vga_set_state,
- .gpu_reset = &r300_gpu_reset,
- .gart_tlb_flush = &r100_pci_gart_tlb_flush,
- .gart_set_page = &r100_pci_gart_set_page,
- .cp_commit = &r100_cp_commit,
- .ring_start = &r300_ring_start,
- .ring_test = &r100_ring_test,
- .ring_ib_execute = &r100_ring_ib_execute,
- .irq_set = &r100_irq_set,
- .irq_process = &r100_irq_process,
- .get_vblank_counter = &r100_get_vblank_counter,
- .fence_ring_emit = &r300_fence_ring_emit,
- .cs_parse = &r300_cs_parse,
- .copy_blit = &r100_copy_blit,
- .copy_dma = &r200_copy_dma,
- .copy = &r100_copy_blit,
- .get_engine_clock = &radeon_legacy_get_engine_clock,
- .set_engine_clock = &radeon_legacy_set_engine_clock,
- .get_memory_clock = &radeon_legacy_get_memory_clock,
- .set_memory_clock = NULL,
- .get_pcie_lanes = &rv370_get_pcie_lanes,
- .set_pcie_lanes = &rv370_set_pcie_lanes,
- .set_clock_gating = &radeon_legacy_set_clock_gating,
- .set_surface_reg = r100_set_surface_reg,
- .clear_surface_reg = r100_clear_surface_reg,
- .bandwidth_update = &r100_bandwidth_update,
- .hpd_init = &r100_hpd_init,
- .hpd_fini = &r100_hpd_fini,
- .hpd_sense = &r100_hpd_sense,
- .hpd_set_polarity = &r100_hpd_set_polarity,
- .ioctl_wait_idle = NULL,
-};
-
-
-static struct radeon_asic r300_asic_pcie = {
- .init = &r300_init,
- .fini = &r300_fini,
- .suspend = &r300_suspend,
- .resume = &r300_resume,
- .vga_set_state = &r100_vga_set_state,
- .gpu_reset = &r300_gpu_reset,
- .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
- .gart_set_page = &rv370_pcie_gart_set_page,
- .cp_commit = &r100_cp_commit,
- .ring_start = &r300_ring_start,
- .ring_test = &r100_ring_test,
- .ring_ib_execute = &r100_ring_ib_execute,
- .irq_set = &r100_irq_set,
- .irq_process = &r100_irq_process,
- .get_vblank_counter = &r100_get_vblank_counter,
- .fence_ring_emit = &r300_fence_ring_emit,
- .cs_parse = &r300_cs_parse,
- .copy_blit = &r100_copy_blit,
- .copy_dma = &r200_copy_dma,
- .copy = &r100_copy_blit,
- .get_engine_clock = &radeon_legacy_get_engine_clock,
- .set_engine_clock = &radeon_legacy_set_engine_clock,
- .get_memory_clock = &radeon_legacy_get_memory_clock,
- .set_memory_clock = NULL,
- .set_pcie_lanes = &rv370_set_pcie_lanes,
- .set_clock_gating = &radeon_legacy_set_clock_gating,
- .set_surface_reg = r100_set_surface_reg,
- .clear_surface_reg = r100_clear_surface_reg,
- .bandwidth_update = &r100_bandwidth_update,
- .hpd_init = &r100_hpd_init,
- .hpd_fini = &r100_hpd_fini,
- .hpd_sense = &r100_hpd_sense,
- .hpd_set_polarity = &r100_hpd_set_polarity,
- .ioctl_wait_idle = NULL,
-};
-
/*
* r420,r423,rv410
*/
@@ -269,44 +162,6 @@ extern int r420_init(struct radeon_device *rdev);
extern void r420_fini(struct radeon_device *rdev);
extern int r420_suspend(struct radeon_device *rdev);
extern int r420_resume(struct radeon_device *rdev);
-static struct radeon_asic r420_asic = {
- .init = &r420_init,
- .fini = &r420_fini,
- .suspend = &r420_suspend,
- .resume = &r420_resume,
- .vga_set_state = &r100_vga_set_state,
- .gpu_reset = &r300_gpu_reset,
- .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
- .gart_set_page = &rv370_pcie_gart_set_page,
- .cp_commit = &r100_cp_commit,
- .ring_start = &r300_ring_start,
- .ring_test = &r100_ring_test,
- .ring_ib_execute = &r100_ring_ib_execute,
- .irq_set = &r100_irq_set,
- .irq_process = &r100_irq_process,
- .get_vblank_counter = &r100_get_vblank_counter,
- .fence_ring_emit = &r300_fence_ring_emit,
- .cs_parse = &r300_cs_parse,
- .copy_blit = &r100_copy_blit,
- .copy_dma = &r200_copy_dma,
- .copy = &r100_copy_blit,
- .get_engine_clock = &radeon_atom_get_engine_clock,
- .set_engine_clock = &radeon_atom_set_engine_clock,
- .get_memory_clock = &radeon_atom_get_memory_clock,
- .set_memory_clock = &radeon_atom_set_memory_clock,
- .get_pcie_lanes = &rv370_get_pcie_lanes,
- .set_pcie_lanes = &rv370_set_pcie_lanes,
- .set_clock_gating = &radeon_atom_set_clock_gating,
- .set_surface_reg = r100_set_surface_reg,
- .clear_surface_reg = r100_clear_surface_reg,
- .bandwidth_update = &r100_bandwidth_update,
- .hpd_init = &r100_hpd_init,
- .hpd_fini = &r100_hpd_fini,
- .hpd_sense = &r100_hpd_sense,
- .hpd_set_polarity = &r100_hpd_set_polarity,
- .ioctl_wait_idle = NULL,
-};
-
/*
* rs400,rs480
@@ -319,44 +174,6 @@ void rs400_gart_tlb_flush(struct radeon_device *rdev);
int rs400_gart_set_page(struct radeon_device *rdev, int i, uint64_t addr);
uint32_t rs400_mc_rreg(struct radeon_device *rdev, uint32_t reg);
void rs400_mc_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
-static struct radeon_asic rs400_asic = {
- .init = &rs400_init,
- .fini = &rs400_fini,
- .suspend = &rs400_suspend,
- .resume = &rs400_resume,
- .vga_set_state = &r100_vga_set_state,
- .gpu_reset = &r300_gpu_reset,
- .gart_tlb_flush = &rs400_gart_tlb_flush,
- .gart_set_page = &rs400_gart_set_page,
- .cp_commit = &r100_cp_commit,
- .ring_start = &r300_ring_start,
- .ring_test = &r100_ring_test,
- .ring_ib_execute = &r100_ring_ib_execute,
- .irq_set = &r100_irq_set,
- .irq_process = &r100_irq_process,
- .get_vblank_counter = &r100_get_vblank_counter,
- .fence_ring_emit = &r300_fence_ring_emit,
- .cs_parse = &r300_cs_parse,
- .copy_blit = &r100_copy_blit,
- .copy_dma = &r200_copy_dma,
- .copy = &r100_copy_blit,
- .get_engine_clock = &radeon_legacy_get_engine_clock,
- .set_engine_clock = &radeon_legacy_set_engine_clock,
- .get_memory_clock = &radeon_legacy_get_memory_clock,
- .set_memory_clock = NULL,
- .get_pcie_lanes = NULL,
- .set_pcie_lanes = NULL,
- .set_clock_gating = &radeon_legacy_set_clock_gating,
- .set_surface_reg = r100_set_surface_reg,
- .clear_surface_reg = r100_clear_surface_reg,
- .bandwidth_update = &r100_bandwidth_update,
- .hpd_init = &r100_hpd_init,
- .hpd_fini = &r100_hpd_fini,
- .hpd_sense = &r100_hpd_sense,
- .hpd_set_polarity = &r100_hpd_set_polarity,
- .ioctl_wait_idle = NULL,
-};
-
/*
* rs600.
@@ -379,45 +196,6 @@ bool rs600_hpd_sense(struct radeon_device *rdev, enum radeon_hpd_id hpd);
void rs600_hpd_set_polarity(struct radeon_device *rdev,
enum radeon_hpd_id hpd);
-static struct radeon_asic rs600_asic = {
- .init = &rs600_init,
- .fini = &rs600_fini,
- .suspend = &rs600_suspend,
- .resume = &rs600_resume,
- .vga_set_state = &r100_vga_set_state,
- .gpu_reset = &r300_gpu_reset,
- .gart_tlb_flush = &rs600_gart_tlb_flush,
- .gart_set_page = &rs600_gart_set_page,
- .cp_commit = &r100_cp_commit,
- .ring_start = &r300_ring_start,
- .ring_test = &r100_ring_test,
- .ring_ib_execute = &r100_ring_ib_execute,
- .irq_set = &rs600_irq_set,
- .irq_process = &rs600_irq_process,
- .get_vblank_counter = &rs600_get_vblank_counter,
- .fence_ring_emit = &r300_fence_ring_emit,
- .cs_parse = &r300_cs_parse,
- .copy_blit = &r100_copy_blit,
- .copy_dma = &r200_copy_dma,
- .copy = &r100_copy_blit,
- .get_engine_clock = &radeon_atom_get_engine_clock,
- .set_engine_clock = &radeon_atom_set_engine_clock,
- .get_memory_clock = &radeon_atom_get_memory_clock,
- .set_memory_clock = &radeon_atom_set_memory_clock,
- .get_pcie_lanes = NULL,
- .set_pcie_lanes = NULL,
- .set_clock_gating = &radeon_atom_set_clock_gating,
- .set_surface_reg = r100_set_surface_reg,
- .clear_surface_reg = r100_clear_surface_reg,
- .bandwidth_update = &rs600_bandwidth_update,
- .hpd_init = &rs600_hpd_init,
- .hpd_fini = &rs600_hpd_fini,
- .hpd_sense = &rs600_hpd_sense,
- .hpd_set_polarity = &rs600_hpd_set_polarity,
- .ioctl_wait_idle = NULL,
-};
-
-
/*
* rs690,rs740
*/
@@ -428,44 +206,6 @@ int rs690_suspend(struct radeon_device *rdev);
uint32_t rs690_mc_rreg(struct radeon_device *rdev, uint32_t reg);
void rs690_mc_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
void rs690_bandwidth_update(struct radeon_device *rdev);
-static struct radeon_asic rs690_asic = {
- .init = &rs690_init,
- .fini = &rs690_fini,
- .suspend = &rs690_suspend,
- .resume = &rs690_resume,
- .vga_set_state = &r100_vga_set_state,
- .gpu_reset = &r300_gpu_reset,
- .gart_tlb_flush = &rs400_gart_tlb_flush,
- .gart_set_page = &rs400_gart_set_page,
- .cp_commit = &r100_cp_commit,
- .ring_start = &r300_ring_start,
- .ring_test = &r100_ring_test,
- .ring_ib_execute = &r100_ring_ib_execute,
- .irq_set = &rs600_irq_set,
- .irq_process = &rs600_irq_process,
- .get_vblank_counter = &rs600_get_vblank_counter,
- .fence_ring_emit = &r300_fence_ring_emit,
- .cs_parse = &r300_cs_parse,
- .copy_blit = &r100_copy_blit,
- .copy_dma = &r200_copy_dma,
- .copy = &r200_copy_dma,
- .get_engine_clock = &radeon_atom_get_engine_clock,
- .set_engine_clock = &radeon_atom_set_engine_clock,
- .get_memory_clock = &radeon_atom_get_memory_clock,
- .set_memory_clock = &radeon_atom_set_memory_clock,
- .get_pcie_lanes = NULL,
- .set_pcie_lanes = NULL,
- .set_clock_gating = &radeon_atom_set_clock_gating,
- .set_surface_reg = r100_set_surface_reg,
- .clear_surface_reg = r100_clear_surface_reg,
- .bandwidth_update = &rs690_bandwidth_update,
- .hpd_init = &rs600_hpd_init,
- .hpd_fini = &rs600_hpd_fini,
- .hpd_sense = &rs600_hpd_sense,
- .hpd_set_polarity = &rs600_hpd_set_polarity,
- .ioctl_wait_idle = NULL,
-};
-
/*
* rv515
@@ -481,87 +221,12 @@ void rv515_pcie_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v);
void rv515_bandwidth_update(struct radeon_device *rdev);
int rv515_resume(struct radeon_device *rdev);
int rv515_suspend(struct radeon_device *rdev);
-static struct radeon_asic rv515_asic = {
- .init = &rv515_init,
- .fini = &rv515_fini,
- .suspend = &rv515_suspend,
- .resume = &rv515_resume,
- .vga_set_state = &r100_vga_set_state,
- .gpu_reset = &rv515_gpu_reset,
- .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
- .gart_set_page = &rv370_pcie_gart_set_page,
- .cp_commit = &r100_cp_commit,
- .ring_start = &rv515_ring_start,
- .ring_test = &r100_ring_test,
- .ring_ib_execute = &r100_ring_ib_execute,
- .irq_set = &rs600_irq_set,
- .irq_process = &rs600_irq_process,
- .get_vblank_counter = &rs600_get_vblank_counter,
- .fence_ring_emit = &r300_fence_ring_emit,
- .cs_parse = &r300_cs_parse,
- .copy_blit = &r100_copy_blit,
- .copy_dma = &r200_copy_dma,
- .copy = &r100_copy_blit,
- .get_engine_clock = &radeon_atom_get_engine_clock,
- .set_engine_clock = &radeon_atom_set_engine_clock,
- .get_memory_clock = &radeon_atom_get_memory_clock,
- .set_memory_clock = &radeon_atom_set_memory_clock,
- .get_pcie_lanes = &rv370_get_pcie_lanes,
- .set_pcie_lanes = &rv370_set_pcie_lanes,
- .set_clock_gating = &radeon_atom_set_clock_gating,
- .set_surface_reg = r100_set_surface_reg,
- .clear_surface_reg = r100_clear_surface_reg,
- .bandwidth_update = &rv515_bandwidth_update,
- .hpd_init = &rs600_hpd_init,
- .hpd_fini = &rs600_hpd_fini,
- .hpd_sense = &rs600_hpd_sense,
- .hpd_set_polarity = &rs600_hpd_set_polarity,
- .ioctl_wait_idle = NULL,
-};
-
/*
* r520,rv530,rv560,rv570,r580
*/
int r520_init(struct radeon_device *rdev);
int r520_resume(struct radeon_device *rdev);
-static struct radeon_asic r520_asic = {
- .init = &r520_init,
- .fini = &rv515_fini,
- .suspend = &rv515_suspend,
- .resume = &r520_resume,
- .vga_set_state = &r100_vga_set_state,
- .gpu_reset = &rv515_gpu_reset,
- .gart_tlb_flush = &rv370_pcie_gart_tlb_flush,
- .gart_set_page = &rv370_pcie_gart_set_page,
- .cp_commit = &r100_cp_commit,
- .ring_start = &rv515_ring_start,
- .ring_test = &r100_ring_test,
- .ring_ib_execute = &r100_ring_ib_execute,
- .irq_set = &rs600_irq_set,
- .irq_process = &rs600_irq_process,
- .get_vblank_counter = &rs600_get_vblank_counter,
- .fence_ring_emit = &r300_fence_ring_emit,
- .cs_parse = &r300_cs_parse,
- .copy_blit = &r100_copy_blit,
- .copy_dma = &r200_copy_dma,
- .copy = &r100_copy_blit,
- .get_engine_clock = &radeon_atom_get_engine_clock,
- .set_engine_clock = &radeon_atom_set_engine_clock,
- .get_memory_clock = &radeon_atom_get_memory_clock,
- .set_memory_clock = &radeon_atom_set_memory_clock,
- .get_pcie_lanes = &rv370_get_pcie_lanes,
- .set_pcie_lanes = &rv370_set_pcie_lanes,
- .set_clock_gating = &radeon_atom_set_clock_gating,
- .set_surface_reg = r100_set_surface_reg,
- .clear_surface_reg = r100_clear_surface_reg,
- .bandwidth_update = &rv515_bandwidth_update,
- .hpd_init = &rs600_hpd_init,
- .hpd_fini = &rs600_hpd_fini,
- .hpd_sense = &rs600_hpd_sense,
- .hpd_set_polarity = &rs600_hpd_set_polarity,
- .ioctl_wait_idle = NULL,
-};
/*
* r600,rv610,rv630,rv620,rv635,rv670,rs780,rs880
@@ -591,7 +256,7 @@ int r600_gpu_reset(struct radeon_device *rdev);
int r600_set_surface_reg(struct radeon_device *rdev, int reg,
uint32_t tiling_flags, uint32_t pitch,
uint32_t offset, uint32_t obj_size);
-int r600_clear_surface_reg(struct radeon_device *rdev, int reg);
+void r600_clear_surface_reg(struct radeon_device *rdev, int reg);
void r600_ring_ib_execute(struct radeon_device *rdev, struct radeon_ib *ib);
int r600_ring_test(struct radeon_device *rdev);
int r600_copy_blit(struct radeon_device *rdev,
@@ -604,43 +269,6 @@ void r600_hpd_set_polarity(struct radeon_device *rdev,
enum radeon_hpd_id hpd);
extern void r600_ioctl_wait_idle(struct radeon_device *rdev, struct radeon_bo *bo);
-static struct radeon_asic r600_asic = {
- .init = &r600_init,
- .fini = &r600_fini,
- .suspend = &r600_suspend,
- .resume = &r600_resume,
- .cp_commit = &r600_cp_commit,
- .vga_set_state = &r600_vga_set_state,
- .gpu_reset = &r600_gpu_reset,
- .gart_tlb_flush = &r600_pcie_gart_tlb_flush,
- .gart_set_page = &rs600_gart_set_page,
- .ring_test = &r600_ring_test,
- .ring_ib_execute = &r600_ring_ib_execute,
- .irq_set = &r600_irq_set,
- .irq_process = &r600_irq_process,
- .get_vblank_counter = &rs600_get_vblank_counter,
- .fence_ring_emit = &r600_fence_ring_emit,
- .cs_parse = &r600_cs_parse,
- .copy_blit = &r600_copy_blit,
- .copy_dma = &r600_copy_blit,
- .copy = &r600_copy_blit,
- .get_engine_clock = &radeon_atom_get_engine_clock,
- .set_engine_clock = &radeon_atom_set_engine_clock,
- .get_memory_clock = &radeon_atom_get_memory_clock,
- .set_memory_clock = &radeon_atom_set_memory_clock,
- .get_pcie_lanes = &rv370_get_pcie_lanes,
- .set_pcie_lanes = NULL,
- .set_clock_gating = NULL,
- .set_surface_reg = r600_set_surface_reg,
- .clear_surface_reg = r600_clear_surface_reg,
- .bandwidth_update = &rv515_bandwidth_update,
- .hpd_init = &r600_hpd_init,
- .hpd_fini = &r600_hpd_fini,
- .hpd_sense = &r600_hpd_sense,
- .hpd_set_polarity = &r600_hpd_set_polarity,
- .ioctl_wait_idle = r600_ioctl_wait_idle,
-};
-
/*
* rv770,rv730,rv710,rv740
*/
@@ -650,43 +278,6 @@ int rv770_suspend(struct radeon_device *rdev);
int rv770_resume(struct radeon_device *rdev);
int rv770_gpu_reset(struct radeon_device *rdev);
-static struct radeon_asic rv770_asic = {
- .init = &rv770_init,
- .fini = &rv770_fini,
- .suspend = &rv770_suspend,
- .resume = &rv770_resume,
- .cp_commit = &r600_cp_commit,
- .gpu_reset = &rv770_gpu_reset,
- .vga_set_state = &r600_vga_set_state,
- .gart_tlb_flush = &r600_pcie_gart_tlb_flush,
- .gart_set_page = &rs600_gart_set_page,
- .ring_test = &r600_ring_test,
- .ring_ib_execute = &r600_ring_ib_execute,
- .irq_set = &r600_irq_set,
- .irq_process = &r600_irq_process,
- .get_vblank_counter = &rs600_get_vblank_counter,
- .fence_ring_emit = &r600_fence_ring_emit,
- .cs_parse = &r600_cs_parse,
- .copy_blit = &r600_copy_blit,
- .copy_dma = &r600_copy_blit,
- .copy = &r600_copy_blit,
- .get_engine_clock = &radeon_atom_get_engine_clock,
- .set_engine_clock = &radeon_atom_set_engine_clock,
- .get_memory_clock = &radeon_atom_get_memory_clock,
- .set_memory_clock = &radeon_atom_set_memory_clock,
- .get_pcie_lanes = &rv370_get_pcie_lanes,
- .set_pcie_lanes = NULL,
- .set_clock_gating = &radeon_atom_set_clock_gating,
- .set_surface_reg = r600_set_surface_reg,
- .clear_surface_reg = r600_clear_surface_reg,
- .bandwidth_update = &rv515_bandwidth_update,
- .hpd_init = &r600_hpd_init,
- .hpd_fini = &r600_hpd_fini,
- .hpd_sense = &r600_hpd_sense,
- .hpd_set_polarity = &r600_hpd_set_polarity,
- .ioctl_wait_idle = r600_ioctl_wait_idle,
-};
-
/*
* evergreen
*/
@@ -701,40 +292,4 @@ void evergreen_hpd_fini(struct radeon_device *rdev);
bool evergreen_hpd_sense(struct radeon_device *rdev, enum radeon_hpd_id hpd);
void evergreen_hpd_set_polarity(struct radeon_device *rdev,
enum radeon_hpd_id hpd);
-
-static struct radeon_asic evergreen_asic = {
- .init = &evergreen_init,
- .fini = &evergreen_fini,
- .suspend = &evergreen_suspend,
- .resume = &evergreen_resume,
- .cp_commit = NULL,
- .gpu_reset = &evergreen_gpu_reset,
- .vga_set_state = &r600_vga_set_state,
- .gart_tlb_flush = &r600_pcie_gart_tlb_flush,
- .gart_set_page = &rs600_gart_set_page,
- .ring_test = NULL,
- .ring_ib_execute = NULL,
- .irq_set = NULL,
- .irq_process = NULL,
- .get_vblank_counter = NULL,
- .fence_ring_emit = NULL,
- .cs_parse = NULL,
- .copy_blit = NULL,
- .copy_dma = NULL,
- .copy = NULL,
- .get_engine_clock = &radeon_atom_get_engine_clock,
- .set_engine_clock = &radeon_atom_set_engine_clock,
- .get_memory_clock = &radeon_atom_get_memory_clock,
- .set_memory_clock = &radeon_atom_set_memory_clock,
- .set_pcie_lanes = NULL,
- .set_clock_gating = NULL,
- .set_surface_reg = r600_set_surface_reg,
- .clear_surface_reg = r600_clear_surface_reg,
- .bandwidth_update = &evergreen_bandwidth_update,
- .hpd_init = &evergreen_hpd_init,
- .hpd_fini = &evergreen_hpd_fini,
- .hpd_sense = &evergreen_hpd_sense,
- .hpd_set_polarity = &evergreen_hpd_set_polarity,
-};
-
#endif
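All of the static radeon_asic tables deleted from this header share one shape: a per-family struct of function pointers that the core dispatches through, with NULL marking a hook the family does not implement. A reduced sketch of that vtable pattern, assuming a cut-down ops struct (the real one carries roughly thirty hooks, and the copy_dma parameter names follow r200_copy_dma):

/* Illustrative ops table only; struct example_asic_ops is made up,
 * while the r100 entry points are the real prototypes from this header. */
struct example_asic_ops {
	int (*init)(struct radeon_device *rdev);
	void (*fini)(struct radeon_device *rdev);
	int (*suspend)(struct radeon_device *rdev);
	int (*resume)(struct radeon_device *rdev);
	int (*copy_dma)(struct radeon_device *rdev, uint64_t src_offset,
			uint64_t dst_offset, unsigned num_pages,
			struct radeon_fence *fence);
};

static const struct example_asic_ops example_r100_ops = {
	.init = &r100_init,
	.fini = &r100_fini,
	.suspend = &r100_suspend,
	.resume = &r100_resume,
	.copy_dma = NULL,	/* r100 has no DMA copy; callers must check for NULL */
};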
diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c
index 93783b15c81..1fff95505cf 100644
--- a/drivers/gpu/drm/radeon/radeon_atombios.c
+++ b/drivers/gpu/drm/radeon/radeon_atombios.c
@@ -75,46 +75,45 @@ static inline struct radeon_i2c_bus_rec radeon_lookup_i2c_gpio(struct radeon_dev
memset(&i2c, 0, sizeof(struct radeon_i2c_bus_rec));
i2c.valid = false;
- atom_parse_data_header(ctx, index, NULL, NULL, NULL, &data_offset);
-
- i2c_info = (struct _ATOM_GPIO_I2C_INFO *)(ctx->bios + data_offset);
-
-
- for (i = 0; i < ATOM_MAX_SUPPORTED_DEVICE; i++) {
- gpio = &i2c_info->asGPIO_Info[i];
-
- if (gpio->sucI2cId.ucAccess == id) {
- i2c.mask_clk_reg = le16_to_cpu(gpio->usClkMaskRegisterIndex) * 4;
- i2c.mask_data_reg = le16_to_cpu(gpio->usDataMaskRegisterIndex) * 4;
- i2c.en_clk_reg = le16_to_cpu(gpio->usClkEnRegisterIndex) * 4;
- i2c.en_data_reg = le16_to_cpu(gpio->usDataEnRegisterIndex) * 4;
- i2c.y_clk_reg = le16_to_cpu(gpio->usClkY_RegisterIndex) * 4;
- i2c.y_data_reg = le16_to_cpu(gpio->usDataY_RegisterIndex) * 4;
- i2c.a_clk_reg = le16_to_cpu(gpio->usClkA_RegisterIndex) * 4;
- i2c.a_data_reg = le16_to_cpu(gpio->usDataA_RegisterIndex) * 4;
- i2c.mask_clk_mask = (1 << gpio->ucClkMaskShift);
- i2c.mask_data_mask = (1 << gpio->ucDataMaskShift);
- i2c.en_clk_mask = (1 << gpio->ucClkEnShift);
- i2c.en_data_mask = (1 << gpio->ucDataEnShift);
- i2c.y_clk_mask = (1 << gpio->ucClkY_Shift);
- i2c.y_data_mask = (1 << gpio->ucDataY_Shift);
- i2c.a_clk_mask = (1 << gpio->ucClkA_Shift);
- i2c.a_data_mask = (1 << gpio->ucDataA_Shift);
-
- if (gpio->sucI2cId.sbfAccess.bfHW_Capable)
- i2c.hw_capable = true;
- else
- i2c.hw_capable = false;
-
- if (gpio->sucI2cId.ucAccess == 0xa0)
- i2c.mm_i2c = true;
- else
- i2c.mm_i2c = false;
-
- i2c.i2c_id = gpio->sucI2cId.ucAccess;
-
- i2c.valid = true;
- break;
+ if (atom_parse_data_header(ctx, index, NULL, NULL, NULL, &data_offset)) {
+ i2c_info = (struct _ATOM_GPIO_I2C_INFO *)(ctx->bios + data_offset);
+
+ for (i = 0; i < ATOM_MAX_SUPPORTED_DEVICE; i++) {
+ gpio = &i2c_info->asGPIO_Info[i];
+
+ if (gpio->sucI2cId.ucAccess == id) {
+ i2c.mask_clk_reg = le16_to_cpu(gpio->usClkMaskRegisterIndex) * 4;
+ i2c.mask_data_reg = le16_to_cpu(gpio->usDataMaskRegisterIndex) * 4;
+ i2c.en_clk_reg = le16_to_cpu(gpio->usClkEnRegisterIndex) * 4;
+ i2c.en_data_reg = le16_to_cpu(gpio->usDataEnRegisterIndex) * 4;
+ i2c.y_clk_reg = le16_to_cpu(gpio->usClkY_RegisterIndex) * 4;
+ i2c.y_data_reg = le16_to_cpu(gpio->usDataY_RegisterIndex) * 4;
+ i2c.a_clk_reg = le16_to_cpu(gpio->usClkA_RegisterIndex) * 4;
+ i2c.a_data_reg = le16_to_cpu(gpio->usDataA_RegisterIndex) * 4;
+ i2c.mask_clk_mask = (1 << gpio->ucClkMaskShift);
+ i2c.mask_data_mask = (1 << gpio->ucDataMaskShift);
+ i2c.en_clk_mask = (1 << gpio->ucClkEnShift);
+ i2c.en_data_mask = (1 << gpio->ucDataEnShift);
+ i2c.y_clk_mask = (1 << gpio->ucClkY_Shift);
+ i2c.y_data_mask = (1 << gpio->ucDataY_Shift);
+ i2c.a_clk_mask = (1 << gpio->ucClkA_Shift);
+ i2c.a_data_mask = (1 << gpio->ucDataA_Shift);
+
+ if (gpio->sucI2cId.sbfAccess.bfHW_Capable)
+ i2c.hw_capable = true;
+ else
+ i2c.hw_capable = false;
+
+ if (gpio->sucI2cId.ucAccess == 0xa0)
+ i2c.mm_i2c = true;
+ else
+ i2c.mm_i2c = false;
+
+ i2c.i2c_id = gpio->sucI2cId.ucAccess;
+
+ i2c.valid = true;
+ break;
+ }
}
}
@@ -135,20 +134,21 @@ static inline struct radeon_gpio_rec radeon_lookup_gpio(struct radeon_device *rd
memset(&gpio, 0, sizeof(struct radeon_gpio_rec));
gpio.valid = false;
- atom_parse_data_header(ctx, index, &size, NULL, NULL, &data_offset);
+ if (atom_parse_data_header(ctx, index, &size, NULL, NULL, &data_offset)) {
+ gpio_info = (struct _ATOM_GPIO_PIN_LUT *)(ctx->bios + data_offset);
- gpio_info = (struct _ATOM_GPIO_PIN_LUT *)(ctx->bios + data_offset);
+ num_indices = (size - sizeof(ATOM_COMMON_TABLE_HEADER)) /
+ sizeof(ATOM_GPIO_PIN_ASSIGNMENT);
- num_indices = (size - sizeof(ATOM_COMMON_TABLE_HEADER)) / sizeof(ATOM_GPIO_PIN_ASSIGNMENT);
-
- for (i = 0; i < num_indices; i++) {
- pin = &gpio_info->asGPIO_Pin[i];
- if (id == pin->ucGPIO_ID) {
- gpio.id = pin->ucGPIO_ID;
- gpio.reg = pin->usGpioPin_AIndex * 4;
- gpio.mask = (1 << pin->ucGpioPinBitShift);
- gpio.valid = true;
- break;
+ for (i = 0; i < num_indices; i++) {
+ pin = &gpio_info->asGPIO_Pin[i];
+ if (id == pin->ucGPIO_ID) {
+ gpio.id = pin->ucGPIO_ID;
+ gpio.reg = pin->usGpioPin_AIndex * 4;
+ gpio.mask = (1 << pin->ucGpioPinBitShift);
+ gpio.valid = true;
+ break;
+ }
}
}
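The recurring change through radeon_atombios.c is visible in the two hunks above: atom_parse_data_header() used to be called only for its data_offset side effect, so a missing BIOS table left the parser dereferencing ctx->bios + 0. The return value is now treated as "table present". A sketch of the guarded shape, assuming the function returns true only when the header was found:

/* Sketch of the guard pattern; parse_table_entries() stands in for the
 * per-table parsing code and is not a real function. */
uint16_t data_offset;

if (atom_parse_data_header(ctx, index, NULL, NULL, NULL, &data_offset)) {
	void *table = ctx->bios + data_offset;	/* valid: header was found */
	parse_table_entries(table);
}
/* on failure, fall through and keep the zeroed defaults */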
@@ -264,6 +264,8 @@ static bool radeon_atom_apply_quirks(struct drm_device *dev,
if ((supported_device == ATOM_DEVICE_CRT1_SUPPORT) ||
(supported_device == ATOM_DEVICE_DFP2_SUPPORT))
return false;
+ if (supported_device == ATOM_DEVICE_CRT2_SUPPORT)
+ *line_mux = 0x90;
}
/* ASUS HD 3600 XT board lists the DVI port as HDMI */
@@ -395,9 +397,7 @@ bool radeon_get_atom_connector_info_from_object_table(struct drm_device *dev)
struct radeon_gpio_rec gpio;
struct radeon_hpd hpd;
- atom_parse_data_header(ctx, index, &size, &frev, &crev, &data_offset);
-
- if (data_offset == 0)
+ if (!atom_parse_data_header(ctx, index, &size, &frev, &crev, &data_offset))
return false;
if (crev < 2)
@@ -449,37 +449,43 @@ bool radeon_get_atom_connector_info_from_object_table(struct drm_device *dev)
GetIndexIntoMasterTable(DATA,
IntegratedSystemInfo);
- atom_parse_data_header(ctx, index, &size, &frev,
- &crev, &igp_offset);
-
- if (crev >= 2) {
- igp_obj =
- (ATOM_INTEGRATED_SYSTEM_INFO_V2
- *) (ctx->bios + igp_offset);
-
- if (igp_obj) {
- uint32_t slot_config, ct;
-
- if (con_obj_num == 1)
- slot_config =
- igp_obj->
- ulDDISlot1Config;
- else
- slot_config =
- igp_obj->
- ulDDISlot2Config;
-
- ct = (slot_config >> 16) & 0xff;
- connector_type =
- object_connector_convert
- [ct];
- connector_object_id = ct;
- igp_lane_info =
- slot_config & 0xffff;
+ if (atom_parse_data_header(ctx, index, &size, &frev,
+ &crev, &igp_offset)) {
+
+ if (crev >= 2) {
+ igp_obj =
+ (ATOM_INTEGRATED_SYSTEM_INFO_V2
+ *) (ctx->bios + igp_offset);
+
+ if (igp_obj) {
+ uint32_t slot_config, ct;
+
+ if (con_obj_num == 1)
+ slot_config =
+ igp_obj->
+ ulDDISlot1Config;
+ else
+ slot_config =
+ igp_obj->
+ ulDDISlot2Config;
+
+ ct = (slot_config >> 16) & 0xff;
+ connector_type =
+ object_connector_convert
+ [ct];
+ connector_object_id = ct;
+ igp_lane_info =
+ slot_config & 0xffff;
+ } else
+ continue;
} else
continue;
- } else
- continue;
+ } else {
+ igp_lane_info = 0;
+ connector_type =
+ object_connector_convert[con_obj_id];
+ connector_object_id = con_obj_id;
+ }
} else {
igp_lane_info = 0;
connector_type =
@@ -627,20 +633,23 @@ static uint16_t atombios_get_connector_object_id(struct drm_device *dev,
uint8_t frev, crev;
ATOM_XTMDS_INFO *xtmds;
- atom_parse_data_header(ctx, index, &size, &frev, &crev, &data_offset);
- xtmds = (ATOM_XTMDS_INFO *)(ctx->bios + data_offset);
+ if (atom_parse_data_header(ctx, index, &size, &frev, &crev, &data_offset)) {
+ xtmds = (ATOM_XTMDS_INFO *)(ctx->bios + data_offset);
- if (xtmds->ucSupportedLink & ATOM_XTMDS_SUPPORTED_DUALLINK) {
- if (connector_type == DRM_MODE_CONNECTOR_DVII)
- return CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_I;
- else
- return CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_D;
- } else {
- if (connector_type == DRM_MODE_CONNECTOR_DVII)
- return CONNECTOR_OBJECT_ID_SINGLE_LINK_DVI_I;
- else
- return CONNECTOR_OBJECT_ID_SINGLE_LINK_DVI_D;
- }
+ if (xtmds->ucSupportedLink & ATOM_XTMDS_SUPPORTED_DUALLINK) {
+ if (connector_type == DRM_MODE_CONNECTOR_DVII)
+ return CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_I;
+ else
+ return CONNECTOR_OBJECT_ID_DUAL_LINK_DVI_D;
+ } else {
+ if (connector_type == DRM_MODE_CONNECTOR_DVII)
+ return CONNECTOR_OBJECT_ID_SINGLE_LINK_DVI_I;
+ else
+ return CONNECTOR_OBJECT_ID_SINGLE_LINK_DVI_D;
+ }
+ } else
+ return supported_devices_connector_object_id_convert
+ [connector_type];
} else {
return supported_devices_connector_object_id_convert
[connector_type];
@@ -672,7 +681,8 @@ bool radeon_get_atom_connector_info_from_supported_devices_table(struct
int i, j, max_device;
struct bios_connector bios_connectors[ATOM_MAX_SUPPORTED_DEVICE];
- atom_parse_data_header(ctx, index, &size, &frev, &crev, &data_offset);
+ if (!atom_parse_data_header(ctx, index, &size, &frev, &crev, &data_offset))
+ return false;
supported_devices =
(union atom_supported_devices *)(ctx->bios + data_offset);
@@ -865,14 +875,11 @@ bool radeon_atom_get_clock_info(struct drm_device *dev)
struct radeon_pll *mpll = &rdev->clock.mpll;
uint16_t data_offset;
- atom_parse_data_header(mode_info->atom_context, index, NULL, &frev,
- &crev, &data_offset);
-
- firmware_info =
- (union firmware_info *)(mode_info->atom_context->bios +
- data_offset);
-
- if (firmware_info) {
+ if (atom_parse_data_header(mode_info->atom_context, index, NULL,
+ &frev, &crev, &data_offset)) {
+ firmware_info =
+ (union firmware_info *)(mode_info->atom_context->bios +
+ data_offset);
/* pixel clocks */
p1pll->reference_freq =
le16_to_cpu(firmware_info->info.usReferenceClock);
@@ -887,6 +894,20 @@ bool radeon_atom_get_clock_info(struct drm_device *dev)
p1pll->pll_out_max =
le32_to_cpu(firmware_info->info.ulMaxPixelClockPLL_Output);
+ if (crev >= 4) {
+ p1pll->lcd_pll_out_min =
+ le16_to_cpu(firmware_info->info_14.usLcdMinPixelClockPLL_Output) * 100;
+ if (p1pll->lcd_pll_out_min == 0)
+ p1pll->lcd_pll_out_min = p1pll->pll_out_min;
+ p1pll->lcd_pll_out_max =
+ le16_to_cpu(firmware_info->info_14.usLcdMaxPixelClockPLL_Output) * 100;
+ if (p1pll->lcd_pll_out_max == 0)
+ p1pll->lcd_pll_out_max = p1pll->pll_out_max;
+ } else {
+ p1pll->lcd_pll_out_min = p1pll->pll_out_min;
+ p1pll->lcd_pll_out_max = p1pll->pll_out_max;
+ }
+
if (p1pll->pll_out_min == 0) {
if (ASIC_IS_AVIVO(rdev))
p1pll->pll_out_min = 64800;
@@ -992,13 +1013,10 @@ bool radeon_atombios_sideport_present(struct radeon_device *rdev)
u8 frev, crev;
u16 data_offset;
- atom_parse_data_header(mode_info->atom_context, index, NULL, &frev,
- &crev, &data_offset);
-
- igp_info = (union igp_info *)(mode_info->atom_context->bios +
+ if (atom_parse_data_header(mode_info->atom_context, index, NULL,
+ &frev, &crev, &data_offset)) {
+ igp_info = (union igp_info *)(mode_info->atom_context->bios +
data_offset);
-
- if (igp_info) {
switch (crev) {
case 1:
if (igp_info->info.ucMemoryType & 0xf0)
@@ -1029,14 +1047,12 @@ bool radeon_atombios_get_tmds_info(struct radeon_encoder *encoder,
uint16_t maxfreq;
int i;
- atom_parse_data_header(mode_info->atom_context, index, NULL, &frev,
- &crev, &data_offset);
-
- tmds_info =
- (struct _ATOM_TMDS_INFO *)(mode_info->atom_context->bios +
- data_offset);
+ if (atom_parse_data_header(mode_info->atom_context, index, NULL,
+ &frev, &crev, &data_offset)) {
+ tmds_info =
+ (struct _ATOM_TMDS_INFO *)(mode_info->atom_context->bios +
+ data_offset);
- if (tmds_info) {
maxfreq = le16_to_cpu(tmds_info->usMaxFrequency);
for (i = 0; i < 4; i++) {
tmds->tmds_pll[i].freq =
@@ -1085,13 +1101,11 @@ static struct radeon_atom_ss *radeon_atombios_get_ss_info(struct
if (id > ATOM_MAX_SS_ENTRY)
return NULL;
- atom_parse_data_header(mode_info->atom_context, index, NULL, &frev,
- &crev, &data_offset);
+ if (atom_parse_data_header(mode_info->atom_context, index, NULL,
+ &frev, &crev, &data_offset)) {
+ ss_info =
+ (struct _ATOM_SPREAD_SPECTRUM_INFO *)(mode_info->atom_context->bios + data_offset);
- ss_info =
- (struct _ATOM_SPREAD_SPECTRUM_INFO *)(mode_info->atom_context->bios + data_offset);
-
- if (ss_info) {
ss =
kzalloc(sizeof(struct radeon_atom_ss), GFP_KERNEL);
@@ -1114,30 +1128,6 @@ static struct radeon_atom_ss *radeon_atombios_get_ss_info(struct
return ss;
}
-static void radeon_atom_apply_lvds_quirks(struct drm_device *dev,
- struct radeon_encoder_atom_dig *lvds)
-{
-
- /* Toshiba A300-1BU laptop panel doesn't like new pll divider algo */
- if ((dev->pdev->device == 0x95c4) &&
- (dev->pdev->subsystem_vendor == 0x1179) &&
- (dev->pdev->subsystem_device == 0xff50)) {
- if ((lvds->native_mode.hdisplay == 1280) &&
- (lvds->native_mode.vdisplay == 800))
- lvds->pll_algo = PLL_ALGO_LEGACY;
- }
-
- /* Dell Studio 15 laptop panel doesn't like new pll divider algo */
- if ((dev->pdev->device == 0x95c4) &&
- (dev->pdev->subsystem_vendor == 0x1028) &&
- (dev->pdev->subsystem_device == 0x029f)) {
- if ((lvds->native_mode.hdisplay == 1280) &&
- (lvds->native_mode.vdisplay == 800))
- lvds->pll_algo = PLL_ALGO_LEGACY;
- }
-
-}
-
union lvds_info {
struct _ATOM_LVDS_INFO info;
struct _ATOM_LVDS_INFO_V12 info_12;
@@ -1156,13 +1146,10 @@ struct radeon_encoder_atom_dig *radeon_atombios_get_lvds_info(struct
uint8_t frev, crev;
struct radeon_encoder_atom_dig *lvds = NULL;
- atom_parse_data_header(mode_info->atom_context, index, NULL, &frev,
- &crev, &data_offset);
-
- lvds_info =
- (union lvds_info *)(mode_info->atom_context->bios + data_offset);
-
- if (lvds_info) {
+ if (atom_parse_data_header(mode_info->atom_context, index, NULL,
+ &frev, &crev, &data_offset)) {
+ lvds_info =
+ (union lvds_info *)(mode_info->atom_context->bios + data_offset);
lvds =
kzalloc(sizeof(struct radeon_encoder_atom_dig), GFP_KERNEL);
@@ -1220,9 +1207,6 @@ struct radeon_encoder_atom_dig *radeon_atombios_get_lvds_info(struct
lvds->pll_algo = PLL_ALGO_LEGACY;
}
- /* LVDS quirks */
- radeon_atom_apply_lvds_quirks(dev, lvds);
-
encoder->native_mode = lvds->native_mode;
}
return lvds;
@@ -1241,11 +1225,11 @@ radeon_atombios_get_primary_dac_info(struct radeon_encoder *encoder)
uint8_t bg, dac;
struct radeon_encoder_primary_dac *p_dac = NULL;
- atom_parse_data_header(mode_info->atom_context, index, NULL, &frev, &crev, &data_offset);
-
- dac_info = (struct _COMPASSIONATE_DATA *)(mode_info->atom_context->bios + data_offset);
+ if (atom_parse_data_header(mode_info->atom_context, index, NULL,
+ &frev, &crev, &data_offset)) {
+ dac_info = (struct _COMPASSIONATE_DATA *)
+ (mode_info->atom_context->bios + data_offset);
- if (dac_info) {
p_dac = kzalloc(sizeof(struct radeon_encoder_primary_dac), GFP_KERNEL);
if (!p_dac)
@@ -1270,7 +1254,9 @@ bool radeon_atom_get_tv_timings(struct radeon_device *rdev, int index,
u8 frev, crev;
u16 data_offset, misc;
- atom_parse_data_header(mode_info->atom_context, data_index, NULL, &frev, &crev, &data_offset);
+ if (!atom_parse_data_header(mode_info->atom_context, data_index, NULL,
+ &frev, &crev, &data_offset))
+ return false;
switch (crev) {
case 1:
@@ -1362,47 +1348,50 @@ radeon_atombios_get_tv_info(struct radeon_device *rdev)
struct _ATOM_ANALOG_TV_INFO *tv_info;
enum radeon_tv_std tv_std = TV_STD_NTSC;
- atom_parse_data_header(mode_info->atom_context, index, NULL, &frev, &crev, &data_offset);
+ if (atom_parse_data_header(mode_info->atom_context, index, NULL,
+ &frev, &crev, &data_offset)) {
- tv_info = (struct _ATOM_ANALOG_TV_INFO *)(mode_info->atom_context->bios + data_offset);
+ tv_info = (struct _ATOM_ANALOG_TV_INFO *)
+ (mode_info->atom_context->bios + data_offset);
- switch (tv_info->ucTV_BootUpDefaultStandard) {
- case ATOM_TV_NTSC:
- tv_std = TV_STD_NTSC;
- DRM_INFO("Default TV standard: NTSC\n");
- break;
- case ATOM_TV_NTSCJ:
- tv_std = TV_STD_NTSC_J;
- DRM_INFO("Default TV standard: NTSC-J\n");
- break;
- case ATOM_TV_PAL:
- tv_std = TV_STD_PAL;
- DRM_INFO("Default TV standard: PAL\n");
- break;
- case ATOM_TV_PALM:
- tv_std = TV_STD_PAL_M;
- DRM_INFO("Default TV standard: PAL-M\n");
- break;
- case ATOM_TV_PALN:
- tv_std = TV_STD_PAL_N;
- DRM_INFO("Default TV standard: PAL-N\n");
- break;
- case ATOM_TV_PALCN:
- tv_std = TV_STD_PAL_CN;
- DRM_INFO("Default TV standard: PAL-CN\n");
- break;
- case ATOM_TV_PAL60:
- tv_std = TV_STD_PAL_60;
- DRM_INFO("Default TV standard: PAL-60\n");
- break;
- case ATOM_TV_SECAM:
- tv_std = TV_STD_SECAM;
- DRM_INFO("Default TV standard: SECAM\n");
- break;
- default:
- tv_std = TV_STD_NTSC;
- DRM_INFO("Unknown TV standard; defaulting to NTSC\n");
- break;
+ switch (tv_info->ucTV_BootUpDefaultStandard) {
+ case ATOM_TV_NTSC:
+ tv_std = TV_STD_NTSC;
+ DRM_INFO("Default TV standard: NTSC\n");
+ break;
+ case ATOM_TV_NTSCJ:
+ tv_std = TV_STD_NTSC_J;
+ DRM_INFO("Default TV standard: NTSC-J\n");
+ break;
+ case ATOM_TV_PAL:
+ tv_std = TV_STD_PAL;
+ DRM_INFO("Default TV standard: PAL\n");
+ break;
+ case ATOM_TV_PALM:
+ tv_std = TV_STD_PAL_M;
+ DRM_INFO("Default TV standard: PAL-M\n");
+ break;
+ case ATOM_TV_PALN:
+ tv_std = TV_STD_PAL_N;
+ DRM_INFO("Default TV standard: PAL-N\n");
+ break;
+ case ATOM_TV_PALCN:
+ tv_std = TV_STD_PAL_CN;
+ DRM_INFO("Default TV standard: PAL-CN\n");
+ break;
+ case ATOM_TV_PAL60:
+ tv_std = TV_STD_PAL_60;
+ DRM_INFO("Default TV standard: PAL-60\n");
+ break;
+ case ATOM_TV_SECAM:
+ tv_std = TV_STD_SECAM;
+ DRM_INFO("Default TV standard: SECAM\n");
+ break;
+ default:
+ tv_std = TV_STD_NTSC;
+ DRM_INFO("Unknown TV standard; defaulting to NTSC\n");
+ break;
+ }
}
return tv_std;
}
@@ -1420,11 +1409,12 @@ radeon_atombios_get_tv_dac_info(struct radeon_encoder *encoder)
uint8_t bg, dac;
struct radeon_encoder_tv_dac *tv_dac = NULL;
- atom_parse_data_header(mode_info->atom_context, index, NULL, &frev, &crev, &data_offset);
+ if (atom_parse_data_header(mode_info->atom_context, index, NULL,
+ &frev, &crev, &data_offset)) {
- dac_info = (struct _COMPASSIONATE_DATA *)(mode_info->atom_context->bios + data_offset);
+ dac_info = (struct _COMPASSIONATE_DATA *)
+ (mode_info->atom_context->bios + data_offset);
- if (dac_info) {
tv_dac = kzalloc(sizeof(struct radeon_encoder_tv_dac), GFP_KERNEL);
if (!tv_dac)
@@ -1447,6 +1437,30 @@ radeon_atombios_get_tv_dac_info(struct radeon_encoder *encoder)
return tv_dac;
}
+static const char *thermal_controller_names[] = {
+ "NONE",
+ "LM63",
+ "ADM1032",
+ "ADM1030",
+ "MUA6649",
+ "LM64",
+ "F75375",
+ "ASC7512",
+};
+
+static const char *pp_lib_thermal_controller_names[] = {
+ "NONE",
+ "LM63",
+ "ADM1032",
+ "ADM1030",
+ "MUA6649",
+ "LM64",
+ "F75375",
+ "RV6xx",
+ "RV770",
+ "ADT7473",
+};
+
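Both name tables are indexed directly with the BIOS-supplied controller type in radeon_atombios_get_power_modes() below; the code trusts the firmware value to stay inside the table. A defensive lookup would bounds-check first, as in this hedged sketch (example_tc_name is hypothetical):

/* Bounds-checked variant of the direct table index used below. */
static const char *example_tc_name(u8 type)
{
	if (type < ARRAY_SIZE(pp_lib_thermal_controller_names))
		return pp_lib_thermal_controller_names[type];
	return "unknown";
}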
union power_info {
struct _ATOM_POWERPLAY_INFO info;
struct _ATOM_POWERPLAY_INFO_V2 info_2;
@@ -1466,15 +1480,22 @@ void radeon_atombios_get_power_modes(struct radeon_device *rdev)
struct _ATOM_PPLIB_STATE *power_state;
int num_modes = 0, i, j;
int state_index = 0, mode_index = 0;
-
- atom_parse_data_header(mode_info->atom_context, index, NULL, &frev, &crev, &data_offset);
-
- power_info = (union power_info *)(mode_info->atom_context->bios + data_offset);
+ struct radeon_i2c_bus_rec i2c_bus;
rdev->pm.default_power_state = NULL;
- if (power_info) {
+ if (atom_parse_data_header(mode_info->atom_context, index, NULL,
+ &frev, &crev, &data_offset)) {
+ power_info = (union power_info *)(mode_info->atom_context->bios + data_offset);
if (frev < 4) {
+ /* add the i2c bus for thermal/fan chip */
+ if (power_info->info.ucOverdriveThermalController > 0) {
+ DRM_INFO("Possible %s thermal controller at 0x%02x\n",
+ thermal_controller_names[power_info->info.ucOverdriveThermalController],
+ power_info->info.ucOverdriveControllerAddress >> 1);
+ i2c_bus = radeon_lookup_i2c_gpio(rdev, power_info->info.ucOverdriveI2cLine);
+ rdev->pm.i2c_bus = radeon_i2c_create(rdev->ddev, &i2c_bus, "Thermal");
+ }
num_modes = power_info->info.ucNumOfPowerModeEntries;
if (num_modes > ATOM_MAX_NUMBEROF_POWER_BLOCK)
num_modes = ATOM_MAX_NUMBEROF_POWER_BLOCK;
@@ -1684,6 +1705,24 @@ void radeon_atombios_get_power_modes(struct radeon_device *rdev)
}
}
} else if (frev == 4) {
+ /* add the i2c bus for thermal/fan chip */
+ /* no support for internal controller yet */
+ if (power_info->info_4.sThermalController.ucType > 0) {
+ if ((power_info->info_4.sThermalController.ucType == ATOM_PP_THERMALCONTROLLER_RV6xx) ||
+ (power_info->info_4.sThermalController.ucType == ATOM_PP_THERMALCONTROLLER_RV770)) {
+ DRM_INFO("Internal thermal controller %s fan control\n",
+ (power_info->info_4.sThermalController.ucFanParameters &
+ ATOM_PP_FANPARAMETERS_NOFAN) ? "without" : "with");
+ } else {
+ DRM_INFO("Possible %s thermal controller at 0x%02x %s fan control\n",
+ pp_lib_thermal_controller_names[power_info->info_4.sThermalController.ucType],
+ power_info->info_4.sThermalController.ucI2cAddress >> 1,
+ (power_info->info_4.sThermalController.ucFanParameters &
+ ATOM_PP_FANPARAMETERS_NOFAN) ? "without" : "with");
+ i2c_bus = radeon_lookup_i2c_gpio(rdev, power_info->info_4.sThermalController.ucI2cLine);
+ rdev->pm.i2c_bus = radeon_i2c_create(rdev->ddev, &i2c_bus, "Thermal");
+ }
+ }
for (i = 0; i < power_info->info_4.ucNumStates; i++) {
mode_index = 0;
power_state = (struct _ATOM_PPLIB_STATE *)
diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c
index e9ea38ece37..2becdeda68a 100644
--- a/drivers/gpu/drm/radeon/radeon_combios.c
+++ b/drivers/gpu/drm/radeon/radeon_combios.c
@@ -531,10 +531,7 @@ static struct radeon_i2c_bus_rec combios_setup_i2c_bus(struct radeon_device *rde
case CHIP_RS300:
switch (ddc_line) {
case RADEON_GPIO_DVI_DDC:
- /* in theory this should be hw capable,
- * but it doesn't seem to work
- */
- i2c.hw_capable = false;
+ i2c.hw_capable = true;
break;
default:
i2c.hw_capable = false;
@@ -633,6 +630,8 @@ bool radeon_combios_get_clock_info(struct drm_device *dev)
p1pll->reference_div = RBIOS16(pll_info + 0x10);
p1pll->pll_out_min = RBIOS32(pll_info + 0x12);
p1pll->pll_out_max = RBIOS32(pll_info + 0x16);
+ p1pll->lcd_pll_out_min = p1pll->pll_out_min;
+ p1pll->lcd_pll_out_max = p1pll->pll_out_max;
if (rev > 9) {
p1pll->pll_in_min = RBIOS32(pll_info + 0x36);
diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c
index ee0083f982d..60d59816b94 100644
--- a/drivers/gpu/drm/radeon/radeon_connectors.c
+++ b/drivers/gpu/drm/radeon/radeon_connectors.c
@@ -940,7 +940,7 @@ static void radeon_dp_connector_destroy(struct drm_connector *connector)
if (radeon_connector->edid)
kfree(radeon_connector->edid);
if (radeon_dig_connector->dp_i2c_bus)
- radeon_i2c_destroy_dp(radeon_dig_connector->dp_i2c_bus);
+ radeon_i2c_destroy(radeon_dig_connector->dp_i2c_bus);
kfree(radeon_connector->con_priv);
drm_sysfs_connector_remove(connector);
drm_connector_cleanup(connector);
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 70ba02ed772..f9b0fe002c0 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -193,9 +193,11 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error)
radeon_bo_list_fence(&parser->validated, parser->ib->fence);
}
radeon_bo_list_unreserve(&parser->validated);
- for (i = 0; i < parser->nrelocs; i++) {
- if (parser->relocs[i].gobj)
- drm_gem_object_unreference_unlocked(parser->relocs[i].gobj);
+ if (parser->relocs != NULL) {
+ for (i = 0; i < parser->nrelocs; i++) {
+ if (parser->relocs[i].gobj)
+ drm_gem_object_unreference_unlocked(parser->relocs[i].gobj);
+ }
}
kfree(parser->track);
kfree(parser->relocs);
@@ -243,7 +245,8 @@ int radeon_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
}
r = radeon_cs_parser_relocs(&parser);
if (r) {
- DRM_ERROR("Failed to parse relocation !\n");
+ if (r != -ERESTARTSYS)
+ DRM_ERROR("Failed to parse relocation %d!\n", r);
radeon_cs_parser_fini(&parser, r);
mutex_unlock(&rdev->cs_mutex);
return r;
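-ERESTARTSYS is not a real failure here: it means the ioctl was interrupted by a signal and will be transparently restarted, so logging it would only produce noise. The filter keeps DRM_ERROR for genuine relocation failures; the same check condenses to a single condition:

/* Sketch of the error-filtering idiom from the hunk above. */
r = radeon_cs_parser_relocs(&parser);
if (r && r != -ERESTARTSYS)
	DRM_ERROR("Failed to parse relocation %d!\n", r);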
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index e28e4ed5f72..60ec47b7164 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -33,7 +33,6 @@
#include <linux/vga_switcheroo.h>
#include "radeon_reg.h"
#include "radeon.h"
-#include "radeon_asic.h"
#include "atom.h"
/*
@@ -242,6 +241,36 @@ bool radeon_card_posted(struct radeon_device *rdev)
}
+void radeon_update_bandwidth_info(struct radeon_device *rdev)
+{
+ fixed20_12 a;
+ u32 sclk, mclk;
+
+ if (rdev->flags & RADEON_IS_IGP) {
+ sclk = radeon_get_engine_clock(rdev);
+ mclk = rdev->clock.default_mclk;
+
+ a.full = rfixed_const(100);
+ rdev->pm.sclk.full = rfixed_const(sclk);
+ rdev->pm.sclk.full = rfixed_div(rdev->pm.sclk, a);
+ rdev->pm.mclk.full = rfixed_const(mclk);
+ rdev->pm.mclk.full = rfixed_div(rdev->pm.mclk, a);
+
+ a.full = rfixed_const(16);
+ /* core_bandwidth = sclk(MHz) * 16 */
+ rdev->pm.core_bandwidth.full = rfixed_div(rdev->pm.sclk, a);
+ } else {
+ sclk = radeon_get_engine_clock(rdev);
+ mclk = radeon_get_memory_clock(rdev);
+
+ a.full = rfixed_const(100);
+ rdev->pm.sclk.full = rfixed_const(sclk);
+ rdev->pm.sclk.full = rfixed_div(rdev->pm.sclk, a);
+ rdev->pm.mclk.full = rfixed_const(mclk);
+ rdev->pm.mclk.full = rfixed_div(rdev->pm.mclk, a);
+ }
+}
+
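radeon_update_bandwidth_info() works in fixed20_12, a 20.12 fixed-point format where rfixed_const(x) is x shifted left by 12 bits. Assuming the clock getters report units of 10 kHz (the usual radeon convention), the divide by 100 converts to MHz while keeping 12 fractional bits. A worked sketch:

/* Worked example of the 20.12 fixed-point conversion; the 10 kHz unit
 * for radeon_get_engine_clock() is an assumption stated above. */
fixed20_12 hundred, sclk_fx;
u32 sclk = 30000;			/* 30000 x 10 kHz = 300 MHz */

hundred.full = rfixed_const(100);	/* 100 << 12 */
sclk_fx.full = rfixed_const(sclk);	/* 30000 << 12 */
sclk_fx.full = rfixed_div(sclk_fx, hundred);
/* sclk_fx now holds 300.0: integer part 300, zero fraction */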
bool radeon_boot_test_post_card(struct radeon_device *rdev)
{
if (radeon_card_posted(rdev))
@@ -288,181 +317,6 @@ void radeon_dummy_page_fini(struct radeon_device *rdev)
}
-/*
- * Registers accessors functions.
- */
-uint32_t radeon_invalid_rreg(struct radeon_device *rdev, uint32_t reg)
-{
- DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
- BUG_ON(1);
- return 0;
-}
-
-void radeon_invalid_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v)
-{
- DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
- reg, v);
- BUG_ON(1);
-}
-
-void radeon_register_accessor_init(struct radeon_device *rdev)
-{
- rdev->mc_rreg = &radeon_invalid_rreg;
- rdev->mc_wreg = &radeon_invalid_wreg;
- rdev->pll_rreg = &radeon_invalid_rreg;
- rdev->pll_wreg = &radeon_invalid_wreg;
- rdev->pciep_rreg = &radeon_invalid_rreg;
- rdev->pciep_wreg = &radeon_invalid_wreg;
-
- /* Don't change order as we are overridding accessor. */
- if (rdev->family < CHIP_RV515) {
- rdev->pcie_reg_mask = 0xff;
- } else {
- rdev->pcie_reg_mask = 0x7ff;
- }
- /* FIXME: not sure here */
- if (rdev->family <= CHIP_R580) {
- rdev->pll_rreg = &r100_pll_rreg;
- rdev->pll_wreg = &r100_pll_wreg;
- }
- if (rdev->family >= CHIP_R420) {
- rdev->mc_rreg = &r420_mc_rreg;
- rdev->mc_wreg = &r420_mc_wreg;
- }
- if (rdev->family >= CHIP_RV515) {
- rdev->mc_rreg = &rv515_mc_rreg;
- rdev->mc_wreg = &rv515_mc_wreg;
- }
- if (rdev->family == CHIP_RS400 || rdev->family == CHIP_RS480) {
- rdev->mc_rreg = &rs400_mc_rreg;
- rdev->mc_wreg = &rs400_mc_wreg;
- }
- if (rdev->family == CHIP_RS690 || rdev->family == CHIP_RS740) {
- rdev->mc_rreg = &rs690_mc_rreg;
- rdev->mc_wreg = &rs690_mc_wreg;
- }
- if (rdev->family == CHIP_RS600) {
- rdev->mc_rreg = &rs600_mc_rreg;
- rdev->mc_wreg = &rs600_mc_wreg;
- }
- if ((rdev->family >= CHIP_R600) && (rdev->family <= CHIP_RV740)) {
- rdev->pciep_rreg = &r600_pciep_rreg;
- rdev->pciep_wreg = &r600_pciep_wreg;
- }
-}
-
-
-/*
- * ASIC
- */
-int radeon_asic_init(struct radeon_device *rdev)
-{
- radeon_register_accessor_init(rdev);
- switch (rdev->family) {
- case CHIP_R100:
- case CHIP_RV100:
- case CHIP_RS100:
- case CHIP_RV200:
- case CHIP_RS200:
- rdev->asic = &r100_asic;
- break;
- case CHIP_R200:
- case CHIP_RV250:
- case CHIP_RS300:
- case CHIP_RV280:
- rdev->asic = &r200_asic;
- break;
- case CHIP_R300:
- case CHIP_R350:
- case CHIP_RV350:
- case CHIP_RV380:
- if (rdev->flags & RADEON_IS_PCIE)
- rdev->asic = &r300_asic_pcie;
- else
- rdev->asic = &r300_asic;
- break;
- case CHIP_R420:
- case CHIP_R423:
- case CHIP_RV410:
- rdev->asic = &r420_asic;
- break;
- case CHIP_RS400:
- case CHIP_RS480:
- rdev->asic = &rs400_asic;
- break;
- case CHIP_RS600:
- rdev->asic = &rs600_asic;
- break;
- case CHIP_RS690:
- case CHIP_RS740:
- rdev->asic = &rs690_asic;
- break;
- case CHIP_RV515:
- rdev->asic = &rv515_asic;
- break;
- case CHIP_R520:
- case CHIP_RV530:
- case CHIP_RV560:
- case CHIP_RV570:
- case CHIP_R580:
- rdev->asic = &r520_asic;
- break;
- case CHIP_R600:
- case CHIP_RV610:
- case CHIP_RV630:
- case CHIP_RV620:
- case CHIP_RV635:
- case CHIP_RV670:
- case CHIP_RS780:
- case CHIP_RS880:
- rdev->asic = &r600_asic;
- break;
- case CHIP_RV770:
- case CHIP_RV730:
- case CHIP_RV710:
- case CHIP_RV740:
- rdev->asic = &rv770_asic;
- break;
- case CHIP_CEDAR:
- case CHIP_REDWOOD:
- case CHIP_JUNIPER:
- case CHIP_CYPRESS:
- case CHIP_HEMLOCK:
- rdev->asic = &evergreen_asic;
- break;
- default:
- /* FIXME: not supported yet */
- return -EINVAL;
- }
-
- if (rdev->flags & RADEON_IS_IGP) {
- rdev->asic->get_memory_clock = NULL;
- rdev->asic->set_memory_clock = NULL;
- }
-
- return 0;
-}
-
-
-/*
- * Wrapper around modesetting bits.
- */
-int radeon_clocks_init(struct radeon_device *rdev)
-{
- int r;
-
- r = radeon_static_clocks_init(rdev->ddev);
- if (r) {
- return r;
- }
- DRM_INFO("Clocks initialized !\n");
- return 0;
-}
-
-void radeon_clocks_fini(struct radeon_device *rdev)
-{
-}
-
/* ATOM accessor methods */
static uint32_t cail_pll_read(struct card_info *info, uint32_t reg)
{
@@ -567,29 +421,6 @@ static unsigned int radeon_vga_set_decode(void *cookie, bool state)
return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}
-void radeon_agp_disable(struct radeon_device *rdev)
-{
- rdev->flags &= ~RADEON_IS_AGP;
- if (rdev->family >= CHIP_R600) {
- DRM_INFO("Forcing AGP to PCIE mode\n");
- rdev->flags |= RADEON_IS_PCIE;
- } else if (rdev->family >= CHIP_RV515 ||
- rdev->family == CHIP_RV380 ||
- rdev->family == CHIP_RV410 ||
- rdev->family == CHIP_R423) {
- DRM_INFO("Forcing AGP to PCIE mode\n");
- rdev->flags |= RADEON_IS_PCIE;
- rdev->asic->gart_tlb_flush = &rv370_pcie_gart_tlb_flush;
- rdev->asic->gart_set_page = &rv370_pcie_gart_set_page;
- } else {
- DRM_INFO("Forcing AGP to PCI mode\n");
- rdev->flags |= RADEON_IS_PCI;
- rdev->asic->gart_tlb_flush = &r100_pci_gart_tlb_flush;
- rdev->asic->gart_set_page = &r100_pci_gart_set_page;
- }
- rdev->mc.gtt_size = radeon_gart_size * 1024 * 1024;
-}
-
void radeon_check_arguments(struct radeon_device *rdev)
{
/* vramlimit must be a power of two */
@@ -731,6 +562,14 @@ int radeon_device_init(struct radeon_device *rdev,
return r;
radeon_check_arguments(rdev);
+ /* all of the newer IGP chips have an internal gart
+ * However some rs4xx report as AGP, so remove that here.
+ */
+ if ((rdev->family >= CHIP_RS400) &&
+ (rdev->flags & RADEON_IS_IGP)) {
+ rdev->flags &= ~RADEON_IS_AGP;
+ }
+
if (rdev->flags & RADEON_IS_AGP && radeon_agpmode == -1) {
radeon_agp_disable(rdev);
}
diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c
index ba8d806dcf3..b8d67282824 100644
--- a/drivers/gpu/drm/radeon/radeon_display.c
+++ b/drivers/gpu/drm/radeon/radeon_display.c
@@ -368,10 +368,9 @@ static bool radeon_setup_enc_conn(struct drm_device *dev)
if (rdev->bios) {
if (rdev->is_atom_bios) {
- if (rdev->family >= CHIP_R600)
+ ret = radeon_get_atom_connector_info_from_supported_devices_table(dev);
+ if (ret == false)
ret = radeon_get_atom_connector_info_from_object_table(dev);
- else
- ret = radeon_get_atom_connector_info_from_supported_devices_table(dev);
} else {
ret = radeon_get_legacy_connector_info_from_bios(dev);
if (ret == false)
@@ -469,10 +468,19 @@ static void radeon_compute_pll_legacy(struct radeon_pll *pll,
uint32_t best_error = 0xffffffff;
uint32_t best_vco_diff = 1;
uint32_t post_div;
+ u32 pll_out_min, pll_out_max;
DRM_DEBUG("PLL freq %llu %u %u\n", freq, pll->min_ref_div, pll->max_ref_div);
freq = freq * 1000;
+ if (pll->flags & RADEON_PLL_IS_LCD) {
+ pll_out_min = pll->lcd_pll_out_min;
+ pll_out_max = pll->lcd_pll_out_max;
+ } else {
+ pll_out_min = pll->pll_out_min;
+ pll_out_max = pll->pll_out_max;
+ }
+
if (pll->flags & RADEON_PLL_USE_REF_DIV)
min_ref_div = max_ref_div = pll->reference_div;
else {
@@ -536,10 +544,10 @@ static void radeon_compute_pll_legacy(struct radeon_pll *pll,
tmp = (uint64_t)pll->reference_freq * feedback_div;
vco = radeon_div(tmp, ref_div);
- if (vco < pll->pll_out_min) {
+ if (vco < pll_out_min) {
min_feed_div = feedback_div + 1;
continue;
- } else if (vco > pll->pll_out_max) {
+ } else if (vco > pll_out_max) {
max_feed_div = feedback_div;
continue;
}
@@ -675,6 +683,15 @@ calc_fb_ref_div(struct radeon_pll *pll,
{
fixed20_12 ffreq, max_error, error, pll_out, a;
u32 vco;
+ u32 pll_out_min, pll_out_max;
+
+ if (pll->flags & RADEON_PLL_IS_LCD) {
+ pll_out_min = pll->lcd_pll_out_min;
+ pll_out_max = pll->lcd_pll_out_max;
+ } else {
+ pll_out_min = pll->pll_out_min;
+ pll_out_max = pll->pll_out_max;
+ }
ffreq.full = rfixed_const(freq);
/* max_error = ffreq * 0.0025; */
@@ -686,7 +703,7 @@ calc_fb_ref_div(struct radeon_pll *pll,
vco = pll->reference_freq * (((*fb_div) * 10) + (*fb_div_frac));
vco = vco / ((*ref_div) * 10);
- if ((vco < pll->pll_out_min) || (vco > pll->pll_out_max))
+ if ((vco < pll_out_min) || (vco > pll_out_max))
continue;
/* pll_out = vco / post_div; */
@@ -714,6 +731,15 @@ static void radeon_compute_pll_new(struct radeon_pll *pll,
{
u32 fb_div = 0, fb_div_frac = 0, post_div = 0, ref_div = 0;
u32 best_freq = 0, vco_frequency;
+ u32 pll_out_min, pll_out_max;
+
+ if (pll->flags & RADEON_PLL_IS_LCD) {
+ pll_out_min = pll->lcd_pll_out_min;
+ pll_out_max = pll->lcd_pll_out_max;
+ } else {
+ pll_out_min = pll->pll_out_min;
+ pll_out_max = pll->pll_out_max;
+ }
/* freq = freq / 10; */
do_div(freq, 10);
@@ -724,7 +750,7 @@ static void radeon_compute_pll_new(struct radeon_pll *pll,
goto done;
vco_frequency = freq * post_div;
- if ((vco_frequency < pll->pll_out_min) || (vco_frequency > pll->pll_out_max))
+ if ((vco_frequency < pll_out_min) || (vco_frequency > pll_out_max))
goto done;
if (pll->flags & RADEON_PLL_USE_REF_DIV) {
@@ -749,7 +775,7 @@ static void radeon_compute_pll_new(struct radeon_pll *pll,
continue;
vco_frequency = freq * post_div;
- if ((vco_frequency < pll->pll_out_min) || (vco_frequency > pll->pll_out_max))
+ if ((vco_frequency < pll_out_min) || (vco_frequency > pll_out_max))
continue;
if (pll->flags & RADEON_PLL_USE_REF_DIV) {
ref_div = pll->reference_div;
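Each PLL computation path touched above now selects its VCO limits once, up front, based on RADEON_PLL_IS_LCD, and every later range check compares against the local pll_out_min/pll_out_max instead of the generic pll->pll_out_* fields. The selection logic, factored into a sketch helper (example_pll_limits is not a real function; the field and flag names are from the patch):

/* Hedged sketch of the limit selection repeated in the hunks above. */
static void example_pll_limits(struct radeon_pll *pll,
			       u32 *out_min, u32 *out_max)
{
	if (pll->flags & RADEON_PLL_IS_LCD) {
		*out_min = pll->lcd_pll_out_min;	/* panel-specific range */
		*out_max = pll->lcd_pll_out_max;
	} else {
		*out_min = pll->pll_out_min;		/* generic VCO range */
		*out_max = pll->pll_out_max;
	}
}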
@@ -945,6 +971,23 @@ static int radeon_modeset_create_props(struct radeon_device *rdev)
return 0;
}
+void radeon_update_display_priority(struct radeon_device *rdev)
+{
+ /* adjustment options for the display watermarks */
+ if ((radeon_disp_priority == 0) || (radeon_disp_priority > 2)) {
+ /* set display priority to high for r3xx, rv515 chips
+ * this avoids flickering due to underflow to the
+ * display controllers during heavy acceleration.
+ */
+ if (ASIC_IS_R300(rdev) || (rdev->family == CHIP_RV515))
+ rdev->disp_priority = 2;
+ else
+ rdev->disp_priority = 0;
+ } else
+ rdev->disp_priority = radeon_disp_priority;
+
+}
+
int radeon_modeset_init(struct radeon_device *rdev)
{
int i;
@@ -976,15 +1019,6 @@ int radeon_modeset_init(struct radeon_device *rdev)
radeon_combios_check_hardcoded_edid(rdev);
}
- if (rdev->flags & RADEON_SINGLE_CRTC)
- rdev->num_crtc = 1;
- else {
- if (ASIC_IS_DCE4(rdev))
- rdev->num_crtc = 6;
- else
- rdev->num_crtc = 2;
- }
-
/* allocate crtcs */
for (i = 0; i < rdev->num_crtc; i++) {
radeon_crtc_init(rdev->ddev, i);
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index 6eec0ece6a6..055a51732dc 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -42,9 +42,10 @@
* KMS wrapper.
* - 2.0.0 - initial interface
* - 2.1.0 - add square tiling interface
+ * - 2.2.0 - add r6xx/r7xx const buffer support
*/
#define KMS_DRIVER_MAJOR 2
-#define KMS_DRIVER_MINOR 1
+#define KMS_DRIVER_MINOR 2
#define KMS_DRIVER_PATCHLEVEL 0
int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags);
int radeon_driver_unload_kms(struct drm_device *dev);
@@ -91,6 +92,8 @@ int radeon_tv = 1;
int radeon_new_pll = -1;
int radeon_dynpm = -1;
int radeon_audio = 1;
+int radeon_disp_priority = 0;
+int radeon_hw_i2c = 0;
MODULE_PARM_DESC(no_wb, "Disable AGP writeback for scratch registers");
module_param_named(no_wb, radeon_no_wb, int, 0444);
@@ -134,6 +137,12 @@ module_param_named(dynpm, radeon_dynpm, int, 0444);
MODULE_PARM_DESC(audio, "Audio enable (0 = disable)");
module_param_named(audio, radeon_audio, int, 0444);
+MODULE_PARM_DESC(disp_priority, "Display Priority (0 = auto, 1 = normal, 2 = high)");
+module_param_named(disp_priority, radeon_disp_priority, int, 0444);
+
+MODULE_PARM_DESC(hw_i2c, "hw i2c engine enable (0 = disable)");
+module_param_named(hw_i2c, radeon_hw_i2c, int, 0444);
+
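The two new knobs follow the standard module-parameter pattern: a file-scope int, a MODULE_PARM_DESC string reported by modinfo, and module_param_named() exposing it read-only (mode 0444) under /sys/module/radeon/parameters/. A sketch with a made-up name:

/* Hypothetical parameter, for illustration only; 0444 makes the sysfs
 * node world-readable and writable by nobody. */
static int radeon_example_knob;
MODULE_PARM_DESC(example_knob, "Example knob (0 = off)");
module_param_named(example_knob, radeon_example_knob, int, 0444);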
static int radeon_suspend(struct drm_device *dev, pm_message_t state)
{
drm_radeon_private_t *dev_priv = dev->dev_private;
diff --git a/drivers/gpu/drm/radeon/radeon_drv.h b/drivers/gpu/drm/radeon/radeon_drv.h
index ec55f2b23c2..448eba89d1e 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.h
+++ b/drivers/gpu/drm/radeon/radeon_drv.h
@@ -107,9 +107,10 @@
* 1.30- Add support for occlusion queries
* 1.31- Add support for num Z pipes from GET_PARAM
* 1.32- fixes for rv740 setup
+ * 1.33- Add r6xx/r7xx const buffer support
*/
#define DRIVER_MAJOR 1
-#define DRIVER_MINOR 32
+#define DRIVER_MINOR 33
#define DRIVER_PATCHLEVEL 0
enum radeon_cp_microcode_version {
diff --git a/drivers/gpu/drm/radeon/radeon_encoders.c b/drivers/gpu/drm/radeon/radeon_encoders.c
index bc926ea0a53..52d6f96f274 100644
--- a/drivers/gpu/drm/radeon/radeon_encoders.c
+++ b/drivers/gpu/drm/radeon/radeon_encoders.c
@@ -302,7 +302,7 @@ static bool radeon_atom_mode_fixup(struct drm_encoder *encoder,
}
if (ASIC_IS_DCE3(rdev) &&
- (radeon_encoder->active_device & (ATOM_DEVICE_DFP_SUPPORT))) {
+ (radeon_encoder->active_device & (ATOM_DEVICE_DFP_SUPPORT | ATOM_DEVICE_LCD_SUPPORT))) {
struct drm_connector *connector = radeon_get_connector_for_encoder(encoder);
radeon_dp_set_link_config(connector, mode);
}
@@ -519,7 +519,8 @@ atombios_digital_setup(struct drm_encoder *encoder, int action)
break;
}
- atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev, &crev);
+ if (!atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev, &crev))
+ return;
switch (frev) {
case 1:
@@ -593,7 +594,6 @@ atombios_digital_setup(struct drm_encoder *encoder, int action)
}
atom_execute_table(rdev->mode_info.atom_context, index, (uint32_t *)&args);
- r600_hdmi_enable(encoder, hdmi_detected);
}
int
@@ -708,7 +708,7 @@ atombios_dig_encoder_setup(struct drm_encoder *encoder, int action)
struct radeon_connector_atom_dig *dig_connector =
radeon_get_atom_connector_priv_from_encoder(encoder);
union dig_encoder_control args;
- int index = 0, num = 0;
+ int index = 0;
uint8_t frev, crev;
if (!dig || !dig_connector)
@@ -724,9 +724,9 @@ atombios_dig_encoder_setup(struct drm_encoder *encoder, int action)
else
index = GetIndexIntoMasterTable(COMMAND, DIG1EncoderControl);
}
- num = dig->dig_encoder + 1;
- atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev, &crev);
+ if (!atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev, &crev))
+ return;
args.v1.ucAction = action;
args.v1.usPixelClock = cpu_to_le16(radeon_encoder->pixel_clock / 10);
@@ -785,7 +785,7 @@ atombios_dig_transmitter_setup(struct drm_encoder *encoder, int action, uint8_t
struct drm_connector *connector;
struct radeon_connector *radeon_connector;
union dig_transmitter_control args;
- int index = 0, num = 0;
+ int index = 0;
uint8_t frev, crev;
bool is_dp = false;
int pll_id = 0;
@@ -814,7 +814,8 @@ atombios_dig_transmitter_setup(struct drm_encoder *encoder, int action, uint8_t
}
}
- atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev, &crev);
+ if (!atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev, &crev))
+ return;
args.v1.ucAction = action;
if (action == ATOM_TRANSMITTER_ACTION_INIT) {
@@ -860,15 +861,12 @@ atombios_dig_transmitter_setup(struct drm_encoder *encoder, int action, uint8_t
switch (radeon_encoder->encoder_id) {
case ENCODER_OBJECT_ID_INTERNAL_UNIPHY:
args.v3.acConfig.ucTransmitterSel = 0;
- num = 0;
break;
case ENCODER_OBJECT_ID_INTERNAL_UNIPHY1:
args.v3.acConfig.ucTransmitterSel = 1;
- num = 1;
break;
case ENCODER_OBJECT_ID_INTERNAL_UNIPHY2:
args.v3.acConfig.ucTransmitterSel = 2;
- num = 2;
break;
}
@@ -879,23 +877,19 @@ atombios_dig_transmitter_setup(struct drm_encoder *encoder, int action, uint8_t
args.v3.acConfig.fCoherentMode = 1;
}
} else if (ASIC_IS_DCE32(rdev)) {
- if (dig->dig_encoder == 1)
- args.v2.acConfig.ucEncoderSel = 1;
+ args.v2.acConfig.ucEncoderSel = dig->dig_encoder;
if (dig_connector->linkb)
args.v2.acConfig.ucLinkSel = 1;
switch (radeon_encoder->encoder_id) {
case ENCODER_OBJECT_ID_INTERNAL_UNIPHY:
args.v2.acConfig.ucTransmitterSel = 0;
- num = 0;
break;
case ENCODER_OBJECT_ID_INTERNAL_UNIPHY1:
args.v2.acConfig.ucTransmitterSel = 1;
- num = 1;
break;
case ENCODER_OBJECT_ID_INTERNAL_UNIPHY2:
args.v2.acConfig.ucTransmitterSel = 2;
- num = 2;
break;
}
@@ -913,31 +907,25 @@ atombios_dig_transmitter_setup(struct drm_encoder *encoder, int action, uint8_t
else
args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_DIG1_ENCODER;
- switch (radeon_encoder->encoder_id) {
- case ENCODER_OBJECT_ID_INTERNAL_UNIPHY:
- if (rdev->flags & RADEON_IS_IGP) {
- if (radeon_encoder->pixel_clock > 165000) {
- if (dig_connector->igp_lane_info & 0x3)
- args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_0_7;
- else if (dig_connector->igp_lane_info & 0xc)
- args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_8_15;
- } else {
- if (dig_connector->igp_lane_info & 0x1)
- args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_0_3;
- else if (dig_connector->igp_lane_info & 0x2)
- args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_4_7;
- else if (dig_connector->igp_lane_info & 0x4)
- args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_8_11;
- else if (dig_connector->igp_lane_info & 0x8)
- args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_12_15;
- }
+ if ((rdev->flags & RADEON_IS_IGP) &&
+ (radeon_encoder->encoder_id == ENCODER_OBJECT_ID_INTERNAL_UNIPHY)) {
+ if (is_dp || (radeon_encoder->pixel_clock <= 165000)) {
+ if (dig_connector->igp_lane_info & 0x1)
+ args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_0_3;
+ else if (dig_connector->igp_lane_info & 0x2)
+ args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_4_7;
+ else if (dig_connector->igp_lane_info & 0x4)
+ args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_8_11;
+ else if (dig_connector->igp_lane_info & 0x8)
+ args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_12_15;
+ } else {
+ if (dig_connector->igp_lane_info & 0x3)
+ args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_0_7;
+ else if (dig_connector->igp_lane_info & 0xc)
+ args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LANE_8_15;
}
- break;
}
- if (radeon_encoder->pixel_clock > 165000)
- args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_8LANE_LINK;
-
if (dig_connector->linkb)
args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_LINKB;
else
@@ -948,6 +936,8 @@ atombios_dig_transmitter_setup(struct drm_encoder *encoder, int action, uint8_t
else if (radeon_encoder->devices & (ATOM_DEVICE_DFP_SUPPORT)) {
if (dig->coherent_mode)
args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_COHERENT;
+ if (radeon_encoder->pixel_clock > 165000)
+ args.v1.ucConfig |= ATOM_TRANSMITTER_CONFIG_8LANE_LINK;
}
}
@@ -1054,16 +1044,25 @@ radeon_atom_encoder_dpms(struct drm_encoder *encoder, int mode)
if (is_dig) {
switch (mode) {
case DRM_MODE_DPMS_ON:
- atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_ENABLE_OUTPUT, 0, 0);
- {
+ if (atombios_get_encoder_mode(encoder) == ATOM_ENCODER_MODE_DP) {
struct drm_connector *connector = radeon_get_connector_for_encoder(encoder);
+
dp_link_train(encoder, connector);
+ if (ASIC_IS_DCE4(rdev))
+ atombios_dig_encoder_setup(encoder, ATOM_ENCODER_CMD_DP_VIDEO_ON);
}
+ if (!ASIC_IS_DCE4(rdev))
+ atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_ENABLE_OUTPUT, 0, 0);
break;
case DRM_MODE_DPMS_STANDBY:
case DRM_MODE_DPMS_SUSPEND:
case DRM_MODE_DPMS_OFF:
- atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE_OUTPUT, 0, 0);
+ if (!ASIC_IS_DCE4(rdev))
+ atombios_dig_transmitter_setup(encoder, ATOM_TRANSMITTER_ACTION_DISABLE_OUTPUT, 0, 0);
+ if (atombios_get_encoder_mode(encoder) == ATOM_ENCODER_MODE_DP) {
+ if (ASIC_IS_DCE4(rdev))
+ atombios_dig_encoder_setup(encoder, ATOM_ENCODER_CMD_DP_VIDEO_OFF);
+ }
break;
}
} else {
@@ -1104,7 +1103,8 @@ atombios_set_encoder_crtc_source(struct drm_encoder *encoder)
memset(&args, 0, sizeof(args));
- atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev, &crev);
+ if (!atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev, &crev))
+ return;
switch (frev) {
case 1:
@@ -1216,6 +1216,9 @@ atombios_set_encoder_crtc_source(struct drm_encoder *encoder)
}
atom_execute_table(rdev->mode_info.atom_context, index, (uint32_t *)&args);
+
+ /* update scratch regs with new routing */
+ radeon_atombios_encoder_crtc_scratch_regs(encoder, radeon_crtc->crtc_id);
}
static void
@@ -1326,19 +1329,9 @@ radeon_atom_encoder_mode_set(struct drm_encoder *encoder,
struct drm_device *dev = encoder->dev;
struct radeon_device *rdev = dev->dev_private;
struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder);
- struct radeon_crtc *radeon_crtc = to_radeon_crtc(encoder->crtc);
- if (radeon_encoder->active_device &
- (ATOM_DEVICE_DFP_SUPPORT | ATOM_DEVICE_LCD_SUPPORT)) {
- struct radeon_encoder_atom_dig *dig = radeon_encoder->enc_priv;
- if (dig)
- dig->dig_encoder = radeon_atom_pick_dig_encoder(encoder);
- }
radeon_encoder->pixel_clock = adjusted_mode->clock;
- radeon_atombios_encoder_crtc_scratch_regs(encoder, radeon_crtc->crtc_id);
- atombios_set_encoder_crtc_source(encoder);
-
if (ASIC_IS_AVIVO(rdev)) {
if (radeon_encoder->active_device & (ATOM_DEVICE_CV_SUPPORT | ATOM_DEVICE_TV_SUPPORT))
atombios_yuv_setup(encoder, true);
@@ -1396,9 +1389,10 @@ radeon_atom_encoder_mode_set(struct drm_encoder *encoder,
}
atombios_apply_encoder_quirks(encoder, adjusted_mode);
- /* XXX */
- if (!ASIC_IS_DCE4(rdev))
+ if (atombios_get_encoder_mode(encoder) == ATOM_ENCODER_MODE_HDMI) {
+ r600_hdmi_enable(encoder);
r600_hdmi_setmode(encoder, adjusted_mode);
+ }
}
static bool
@@ -1418,7 +1412,8 @@ atombios_dac_load_detect(struct drm_encoder *encoder, struct drm_connector *conn
memset(&args, 0, sizeof(args));
- atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev, &crev);
+ if (!atom_parse_cmd_header(rdev->mode_info.atom_context, index, &frev, &crev))
+ return false;
args.sDacload.ucMisc = 0;
@@ -1492,8 +1487,20 @@ radeon_atom_dac_detect(struct drm_encoder *encoder, struct drm_connector *connec
static void radeon_atom_encoder_prepare(struct drm_encoder *encoder)
{
+ struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder);
+
+ if (radeon_encoder->active_device &
+ (ATOM_DEVICE_DFP_SUPPORT | ATOM_DEVICE_LCD_SUPPORT)) {
+ struct radeon_encoder_atom_dig *dig = radeon_encoder->enc_priv;
+ if (dig)
+ dig->dig_encoder = radeon_atom_pick_dig_encoder(encoder);
+ }
+
radeon_atom_output_lock(encoder, true);
radeon_atom_encoder_dpms(encoder, DRM_MODE_DPMS_OFF);
+
+ /* this is needed for the pll/ss setup to work correctly in some cases */
+ atombios_set_encoder_crtc_source(encoder);
}
static void radeon_atom_encoder_commit(struct drm_encoder *encoder)
@@ -1509,6 +1516,8 @@ static void radeon_atom_encoder_disable(struct drm_encoder *encoder)
radeon_atom_encoder_dpms(encoder, DRM_MODE_DPMS_OFF);
if (radeon_encoder_is_digital(encoder)) {
+ if (atombios_get_encoder_mode(encoder) == ATOM_ENCODER_MODE_HDMI)
+ r600_hdmi_disable(encoder);
dig = radeon_encoder->enc_priv;
dig->dig_encoder = -1;
}
@@ -1659,6 +1668,4 @@ radeon_add_atom_encoder(struct drm_device *dev, uint32_t encoder_id, uint32_t su
drm_encoder_helper_add(encoder, &radeon_atom_dig_helper_funcs);
break;
}
-
- r600_hdmi_init(encoder);
}
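All of the atom_parse_cmd_header() changes in this file apply one guard pattern: check the return value so that frev/crev from a failed command-table lookup are never consumed. As a standalone sketch (same calls as in the hunks above):

    uint8_t frev, crev;

    if (!atom_parse_cmd_header(rdev->mode_info.atom_context, index,
                               &frev, &crev))
            return; /* table missing or unparsable: bail out rather than
                     * switch on garbage frev/crev values */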
diff --git a/drivers/gpu/drm/radeon/radeon_i2c.c b/drivers/gpu/drm/radeon/radeon_i2c.c
index 4ae50c19589..5def6f5dff3 100644
--- a/drivers/gpu/drm/radeon/radeon_i2c.c
+++ b/drivers/gpu/drm/radeon/radeon_i2c.c
@@ -59,6 +59,7 @@ bool radeon_ddc_probe(struct radeon_connector *radeon_connector)
return false;
}
+/* bit banging i2c */
static void radeon_i2c_do_lock(struct radeon_i2c_chan *i2c, int lock_state)
{
@@ -181,13 +182,30 @@ static void set_data(void *i2c_priv, int data)
WREG32(rec->en_data_reg, val);
}
+static int pre_xfer(struct i2c_adapter *i2c_adap)
+{
+ struct radeon_i2c_chan *i2c = i2c_get_adapdata(i2c_adap);
+
+ radeon_i2c_do_lock(i2c, 1);
+
+ return 0;
+}
+
+static void post_xfer(struct i2c_adapter *i2c_adap)
+{
+ struct radeon_i2c_chan *i2c = i2c_get_adapdata(i2c_adap);
+
+ radeon_i2c_do_lock(i2c, 0);
+}
+
+/* hw i2c */
+
static u32 radeon_get_i2c_prescale(struct radeon_device *rdev)
{
- struct radeon_pll *spll = &rdev->clock.spll;
u32 sclk = radeon_get_engine_clock(rdev);
u32 prescale = 0;
- u32 n, m;
- u8 loop;
+ u32 nm;
+ u8 n, m, loop;
int i2c_clock;
switch (rdev->family) {
@@ -203,13 +221,15 @@ static u32 radeon_get_i2c_prescale(struct radeon_device *rdev)
case CHIP_R300:
case CHIP_R350:
case CHIP_RV350:
- n = (spll->reference_freq) / (4 * 6);
+ i2c_clock = 60;
+ nm = (sclk * 10) / (i2c_clock * 4);
for (loop = 1; loop < 255; loop++) {
- if ((loop * (loop - 1)) > n)
+ if ((nm / loop) < loop)
break;
}
- m = loop - 1;
- prescale = m | (loop << 8);
+ n = loop - 1;
+ m = loop - 2;
+ prescale = m | (n << 8);
break;
case CHIP_RV380:
case CHIP_RS400:
@@ -217,7 +237,6 @@ static u32 radeon_get_i2c_prescale(struct radeon_device *rdev)
case CHIP_R420:
case CHIP_R423:
case CHIP_RV410:
- sclk = radeon_get_engine_clock(rdev);
prescale = (((sclk * 10)/(4 * 128 * 100) + 1) << 8) + 128;
break;
case CHIP_RS600:
@@ -232,7 +251,6 @@ static u32 radeon_get_i2c_prescale(struct radeon_device *rdev)
case CHIP_RV570:
case CHIP_R580:
i2c_clock = 50;
- sclk = radeon_get_engine_clock(rdev);
if (rdev->family == CHIP_R520)
prescale = (127 << 8) + ((sclk * 10) / (4 * 127 * i2c_clock));
else
@@ -291,6 +309,7 @@ static int r100_hw_i2c_xfer(struct i2c_adapter *i2c_adap,
prescale = radeon_get_i2c_prescale(rdev);
reg = ((prescale << RADEON_I2C_PRESCALE_SHIFT) |
+ RADEON_I2C_DRIVE_EN |
RADEON_I2C_START |
RADEON_I2C_STOP |
RADEON_I2C_GO);
@@ -757,26 +776,13 @@ done:
return ret;
}
-static int radeon_sw_i2c_xfer(struct i2c_adapter *i2c_adap,
+static int radeon_hw_i2c_xfer(struct i2c_adapter *i2c_adap,
struct i2c_msg *msgs, int num)
{
struct radeon_i2c_chan *i2c = i2c_get_adapdata(i2c_adap);
- int ret;
-
- radeon_i2c_do_lock(i2c, 1);
- ret = i2c_transfer(&i2c->algo.radeon.bit_adapter, msgs, num);
- radeon_i2c_do_lock(i2c, 0);
-
- return ret;
-}
-
-static int radeon_i2c_xfer(struct i2c_adapter *i2c_adap,
- struct i2c_msg *msgs, int num)
-{
- struct radeon_i2c_chan *i2c = i2c_get_adapdata(i2c_adap);
struct radeon_device *rdev = i2c->dev->dev_private;
struct radeon_i2c_bus_rec *rec = &i2c->rec;
- int ret;
+ int ret = 0;
switch (rdev->family) {
case CHIP_R100:
@@ -797,16 +803,12 @@ static int radeon_i2c_xfer(struct i2c_adapter *i2c_adap,
case CHIP_RV410:
case CHIP_RS400:
case CHIP_RS480:
- if (rec->hw_capable)
- ret = r100_hw_i2c_xfer(i2c_adap, msgs, num);
- else
- ret = radeon_sw_i2c_xfer(i2c_adap, msgs, num);
+ ret = r100_hw_i2c_xfer(i2c_adap, msgs, num);
break;
case CHIP_RS600:
case CHIP_RS690:
case CHIP_RS740:
/* XXX fill in hw i2c implementation */
- ret = radeon_sw_i2c_xfer(i2c_adap, msgs, num);
break;
case CHIP_RV515:
case CHIP_R520:
@@ -814,20 +816,16 @@ static int radeon_i2c_xfer(struct i2c_adapter *i2c_adap,
case CHIP_RV560:
case CHIP_RV570:
case CHIP_R580:
- if (rec->hw_capable) {
- if (rec->mm_i2c)
- ret = r100_hw_i2c_xfer(i2c_adap, msgs, num);
- else
- ret = r500_hw_i2c_xfer(i2c_adap, msgs, num);
- } else
- ret = radeon_sw_i2c_xfer(i2c_adap, msgs, num);
+ if (rec->mm_i2c)
+ ret = r100_hw_i2c_xfer(i2c_adap, msgs, num);
+ else
+ ret = r500_hw_i2c_xfer(i2c_adap, msgs, num);
break;
case CHIP_R600:
case CHIP_RV610:
case CHIP_RV630:
case CHIP_RV670:
/* XXX fill in hw i2c implementation */
- ret = radeon_sw_i2c_xfer(i2c_adap, msgs, num);
break;
case CHIP_RV620:
case CHIP_RV635:
@@ -838,7 +836,6 @@ static int radeon_i2c_xfer(struct i2c_adapter *i2c_adap,
case CHIP_RV710:
case CHIP_RV740:
/* XXX fill in hw i2c implementation */
- ret = radeon_sw_i2c_xfer(i2c_adap, msgs, num);
break;
case CHIP_CEDAR:
case CHIP_REDWOOD:
@@ -846,7 +843,6 @@ static int radeon_i2c_xfer(struct i2c_adapter *i2c_adap,
case CHIP_CYPRESS:
case CHIP_HEMLOCK:
/* XXX fill in hw i2c implementation */
- ret = radeon_sw_i2c_xfer(i2c_adap, msgs, num);
break;
default:
DRM_ERROR("i2c: unhandled radeon chip\n");
@@ -857,20 +853,21 @@ static int radeon_i2c_xfer(struct i2c_adapter *i2c_adap,
return ret;
}
-static u32 radeon_i2c_func(struct i2c_adapter *adap)
+static u32 radeon_hw_i2c_func(struct i2c_adapter *adap)
{
return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL;
}
static const struct i2c_algorithm radeon_i2c_algo = {
- .master_xfer = radeon_i2c_xfer,
- .functionality = radeon_i2c_func,
+ .master_xfer = radeon_hw_i2c_xfer,
+ .functionality = radeon_hw_i2c_func,
};
struct radeon_i2c_chan *radeon_i2c_create(struct drm_device *dev,
struct radeon_i2c_bus_rec *rec,
const char *name)
{
+ struct radeon_device *rdev = dev->dev_private;
struct radeon_i2c_chan *i2c;
int ret;
@@ -878,37 +875,43 @@ struct radeon_i2c_chan *radeon_i2c_create(struct drm_device *dev,
if (i2c == NULL)
return NULL;
- /* set the internal bit adapter */
- i2c->algo.radeon.bit_adapter.owner = THIS_MODULE;
- i2c_set_adapdata(&i2c->algo.radeon.bit_adapter, i2c);
- sprintf(i2c->algo.radeon.bit_adapter.name, "Radeon internal i2c bit bus %s", name);
- i2c->algo.radeon.bit_adapter.algo_data = &i2c->algo.radeon.bit_data;
- i2c->algo.radeon.bit_data.setsda = set_data;
- i2c->algo.radeon.bit_data.setscl = set_clock;
- i2c->algo.radeon.bit_data.getsda = get_data;
- i2c->algo.radeon.bit_data.getscl = get_clock;
- i2c->algo.radeon.bit_data.udelay = 20;
- /* vesa says 2.2 ms is enough, 1 jiffy doesn't seem to always
- * make this, 2 jiffies is a lot more reliable */
- i2c->algo.radeon.bit_data.timeout = 2;
- i2c->algo.radeon.bit_data.data = i2c;
- ret = i2c_bit_add_bus(&i2c->algo.radeon.bit_adapter);
- if (ret) {
- DRM_ERROR("Failed to register internal bit i2c %s\n", name);
- goto out_free;
- }
- /* set the radeon i2c adapter */
- i2c->dev = dev;
i2c->rec = *rec;
i2c->adapter.owner = THIS_MODULE;
+ i2c->dev = dev;
i2c_set_adapdata(&i2c->adapter, i2c);
- sprintf(i2c->adapter.name, "Radeon i2c %s", name);
- i2c->adapter.algo_data = &i2c->algo.radeon;
- i2c->adapter.algo = &radeon_i2c_algo;
- ret = i2c_add_adapter(&i2c->adapter);
- if (ret) {
- DRM_ERROR("Failed to register i2c %s\n", name);
- goto out_free;
+ if (rec->mm_i2c ||
+ (rec->hw_capable &&
+ radeon_hw_i2c &&
+ ((rdev->family <= CHIP_RS480) ||
+ ((rdev->family >= CHIP_RV515) && (rdev->family <= CHIP_R580))))) {
+ /* set the radeon hw i2c adapter */
+ sprintf(i2c->adapter.name, "Radeon i2c hw bus %s", name);
+ i2c->adapter.algo = &radeon_i2c_algo;
+ ret = i2c_add_adapter(&i2c->adapter);
+ if (ret) {
+ DRM_ERROR("Failed to register hw i2c %s\n", name);
+ goto out_free;
+ }
+ } else {
+ /* set the radeon bit adapter */
+ sprintf(i2c->adapter.name, "Radeon i2c bit bus %s", name);
+ i2c->adapter.algo_data = &i2c->algo.bit;
+ i2c->algo.bit.pre_xfer = pre_xfer;
+ i2c->algo.bit.post_xfer = post_xfer;
+ i2c->algo.bit.setsda = set_data;
+ i2c->algo.bit.setscl = set_clock;
+ i2c->algo.bit.getsda = get_data;
+ i2c->algo.bit.getscl = get_clock;
+ i2c->algo.bit.udelay = 20;
+ /* vesa says 2.2 ms is enough; 1 jiffy doesn't always meet this,
+ * so 2 jiffies is a lot more reliable */
+ i2c->algo.bit.timeout = 2;
+ i2c->algo.bit.data = i2c;
+ ret = i2c_bit_add_bus(&i2c->adapter);
+ if (ret) {
+ DRM_ERROR("Failed to register bit i2c %s\n", name);
+ goto out_free;
+ }
}
return i2c;
@@ -953,16 +956,6 @@ void radeon_i2c_destroy(struct radeon_i2c_chan *i2c)
{
if (!i2c)
return;
- i2c_del_adapter(&i2c->algo.radeon.bit_adapter);
- i2c_del_adapter(&i2c->adapter);
- kfree(i2c);
-}
-
-void radeon_i2c_destroy_dp(struct radeon_i2c_chan *i2c)
-{
- if (!i2c)
- return;
-
i2c_del_adapter(&i2c->adapter);
kfree(i2c);
}
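A worked example of the reworked r3xx prescale computation above, assuming (as elsewhere in this driver) that radeon_get_engine_clock() reports the engine clock in units of 10 kHz; the numbers are illustrative only:

    /* sclk = 30000 (300 MHz), i2c_clock = 60 */
    nm = (30000 * 10) / (60 * 4);  /* = 1250 */
    /* the loop exits at the first value with nm / loop < loop: loop = 36 */
    n = 36 - 1;                    /* = 35 */
    m = 36 - 2;                    /* = 34 */
    prescale = m | (n << 8);       /* = 0x2322 */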
diff --git a/drivers/gpu/drm/radeon/radeon_irq_kms.c b/drivers/gpu/drm/radeon/radeon_irq_kms.c
index 3cfd60fd008..a212041e8b0 100644
--- a/drivers/gpu/drm/radeon/radeon_irq_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_irq_kms.c
@@ -67,9 +67,10 @@ void radeon_driver_irq_preinstall_kms(struct drm_device *dev)
/* Disable *all* interrupts */
rdev->irq.sw_int = false;
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < rdev->num_crtc; i++)
rdev->irq.crtc_vblank_int[i] = false;
- }
+ for (i = 0; i < 6; i++)
+ rdev->irq.hpd[i] = false;
radeon_irq_set(rdev);
/* Clear bits */
radeon_irq_process(rdev);
@@ -95,34 +96,29 @@ void radeon_driver_irq_uninstall_kms(struct drm_device *dev)
}
/* Disable *all* interrupts */
rdev->irq.sw_int = false;
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < rdev->num_crtc; i++)
rdev->irq.crtc_vblank_int[i] = false;
+ for (i = 0; i < 6; i++)
rdev->irq.hpd[i] = false;
- }
radeon_irq_set(rdev);
}
int radeon_irq_kms_init(struct radeon_device *rdev)
{
int r = 0;
- int num_crtc = 2;
- if (rdev->flags & RADEON_SINGLE_CRTC)
- num_crtc = 1;
spin_lock_init(&rdev->irq.sw_lock);
- r = drm_vblank_init(rdev->ddev, num_crtc);
+ r = drm_vblank_init(rdev->ddev, rdev->num_crtc);
if (r) {
return r;
}
/* enable msi */
rdev->msi_enabled = 0;
- /* MSIs don't seem to work on my rs780;
- * not sure about rs880 or other rs780s.
- * Needs more investigation.
+ /* MSIs don't seem to work reliably on all IGP
+ * chips. Disable MSI on them for now.
*/
if ((rdev->family >= CHIP_RV380) &&
- (rdev->family != CHIP_RS780) &&
- (rdev->family != CHIP_RS880)) {
+ (!(rdev->flags & RADEON_IS_IGP))) {
int ret = pci_enable_msi(rdev->pdev);
if (!ret) {
rdev->msi_enabled = 1;
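The MSI whitelist change above reduces to a single predicate; a sketch with a hypothetical helper name:

    static bool example_can_use_msi(struct radeon_device *rdev)
    {
            /* rv380 and newer discrete parts are fine; IGPs are excluded
             * wholesale instead of blacklisting rs780/rs880 one by one */
            return (rdev->family >= CHIP_RV380) &&
                   !(rdev->flags & RADEON_IS_IGP);
    }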
diff --git a/drivers/gpu/drm/radeon/radeon_legacy_crtc.c b/drivers/gpu/drm/radeon/radeon_legacy_crtc.c
index df23d6a01d0..88865e38fe3 100644
--- a/drivers/gpu/drm/radeon/radeon_legacy_crtc.c
+++ b/drivers/gpu/drm/radeon/radeon_legacy_crtc.c
@@ -603,6 +603,10 @@ static bool radeon_set_crtc_timing(struct drm_crtc *crtc, struct drm_display_mod
? RADEON_CRTC2_INTERLACE_EN
: 0));
+ /* rs4xx chips seem to like to have the crtc enabled when the timing is set */
+ if ((rdev->family == CHIP_RS400) || (rdev->family == CHIP_RS480))
+ crtc2_gen_cntl |= RADEON_CRTC2_EN;
+
disp2_merge_cntl = RREG32(RADEON_DISP2_MERGE_CNTL);
disp2_merge_cntl &= ~RADEON_DISP2_RGB_OFFSET_EN;
@@ -630,6 +634,10 @@ static bool radeon_set_crtc_timing(struct drm_crtc *crtc, struct drm_display_mod
? RADEON_CRTC_INTERLACE_EN
: 0));
+ /* rs4xx chips seem to like to have the crtc enabled when the timing is set */
+ if ((rdev->family == CHIP_RS400) || (rdev->family == CHIP_RS480))
+ crtc_gen_cntl |= RADEON_CRTC_EN;
+
crtc_ext_cntl = RREG32(RADEON_CRTC_EXT_CNTL);
crtc_ext_cntl |= (RADEON_XCRT_CNT_EN |
RADEON_CRTC_VSYNC_DIS |
diff --git a/drivers/gpu/drm/radeon/radeon_legacy_tv.c b/drivers/gpu/drm/radeon/radeon_legacy_tv.c
index 417684daef4..f2ed27c8055 100644
--- a/drivers/gpu/drm/radeon/radeon_legacy_tv.c
+++ b/drivers/gpu/drm/radeon/radeon_legacy_tv.c
@@ -57,6 +57,10 @@
#define NTSC_TV_PLL_N_14 693
#define NTSC_TV_PLL_P_14 7
+#define PAL_TV_PLL_M_14 19
+#define PAL_TV_PLL_N_14 353
+#define PAL_TV_PLL_P_14 5
+
#define VERT_LEAD_IN_LINES 2
#define FRAC_BITS 0xe
#define FRAC_MASK 0x3fff
@@ -205,9 +209,24 @@ static const struct radeon_tv_mode_constants available_tv_modes[] = {
630627, /* defRestart */
347, /* crtcPLL_N */
14, /* crtcPLL_M */
- 8, /* crtcPLL_postDiv */
+ 8, /* crtcPLL_postDiv */
1022, /* pixToTV */
},
+ { /* PAL timing for 14 MHz ref clk */
+ 800, /* horResolution */
+ 600, /* verResolution */
+ TV_STD_PAL, /* standard */
+ 1131, /* horTotal */
+ 742, /* verTotal */
+ 813, /* horStart */
+ 840, /* horSyncStart */
+ 633, /* verSyncStart */
+ 708369, /* defRestart */
+ 211, /* crtcPLL_N */
+ 9, /* crtcPLL_M */
+ 8, /* crtcPLL_postDiv */
+ 759, /* pixToTV */
+ },
};
#define N_AVAILABLE_MODES ARRAY_SIZE(available_tv_modes)
@@ -242,7 +261,7 @@ static const struct radeon_tv_mode_constants *radeon_legacy_tv_get_std_mode(stru
if (pll->reference_freq == 2700)
const_ptr = &available_tv_modes[1];
else
- const_ptr = &available_tv_modes[1]; /* FIX ME */
+ const_ptr = &available_tv_modes[3];
}
return const_ptr;
}
@@ -685,9 +704,9 @@ void radeon_legacy_tv_mode_set(struct drm_encoder *encoder,
n = PAL_TV_PLL_N_27;
p = PAL_TV_PLL_P_27;
} else {
- m = PAL_TV_PLL_M_27;
- n = PAL_TV_PLL_N_27;
- p = PAL_TV_PLL_P_27;
+ m = PAL_TV_PLL_M_14;
+ n = PAL_TV_PLL_N_14;
+ p = PAL_TV_PLL_P_14;
}
}
diff --git a/drivers/gpu/drm/radeon/radeon_mode.h b/drivers/gpu/drm/radeon/radeon_mode.h
index 1702b820aa4..0b8e32776b1 100644
--- a/drivers/gpu/drm/radeon/radeon_mode.h
+++ b/drivers/gpu/drm/radeon/radeon_mode.h
@@ -129,6 +129,7 @@ struct radeon_tmds_pll {
#define RADEON_PLL_USE_FRAC_FB_DIV (1 << 10)
#define RADEON_PLL_PREFER_CLOSEST_LOWER (1 << 11)
#define RADEON_PLL_USE_POST_DIV (1 << 12)
+#define RADEON_PLL_IS_LCD (1 << 13)
/* pll algo */
enum radeon_pll_algo {
@@ -149,6 +150,8 @@ struct radeon_pll {
uint32_t pll_in_max;
uint32_t pll_out_min;
uint32_t pll_out_max;
+ uint32_t lcd_pll_out_min;
+ uint32_t lcd_pll_out_max;
uint32_t best_vco;
/* divider limits */
@@ -170,17 +173,12 @@ struct radeon_pll {
enum radeon_pll_algo algo;
};
-struct i2c_algo_radeon_data {
- struct i2c_adapter bit_adapter;
- struct i2c_algo_bit_data bit_data;
-};
-
struct radeon_i2c_chan {
struct i2c_adapter adapter;
struct drm_device *dev;
union {
+ struct i2c_algo_bit_data bit;
struct i2c_algo_dp_aux_data dp;
- struct i2c_algo_radeon_data radeon;
} algo;
struct radeon_i2c_bus_rec rec;
};
@@ -342,6 +340,7 @@ struct radeon_encoder {
struct drm_display_mode native_mode;
void *enc_priv;
int hdmi_offset;
+ int hdmi_config_offset;
int hdmi_audio_workaround;
int hdmi_buffer_status;
};
@@ -431,7 +430,6 @@ extern struct radeon_i2c_chan *radeon_i2c_create(struct drm_device *dev,
struct radeon_i2c_bus_rec *rec,
const char *name);
extern void radeon_i2c_destroy(struct radeon_i2c_chan *i2c);
-extern void radeon_i2c_destroy_dp(struct radeon_i2c_chan *i2c);
extern void radeon_i2c_get_byte(struct radeon_i2c_chan *i2c_bus,
u8 slave_addr,
u8 addr,
diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c
index fc9d00ac6b1..dc7e3f44913 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -185,8 +185,10 @@ int radeon_bo_pin(struct radeon_bo *bo, u32 domain, u64 *gpu_addr)
return 0;
}
radeon_ttm_placement_from_domain(bo, domain);
- /* force to pin into visible video ram */
- bo->placement.lpfn = bo->rdev->mc.visible_vram_size >> PAGE_SHIFT;
+ if (domain == RADEON_GEM_DOMAIN_VRAM) {
+ /* force to pin into visible video ram */
+ bo->placement.lpfn = bo->rdev->mc.visible_vram_size >> PAGE_SHIFT;
+ }
for (i = 0; i < bo->placement.num_placement; i++)
bo->placements[i] |= TTM_PL_FLAG_NO_EVICT;
r = ttm_bo_validate(&bo->tbo, &bo->placement, false, false);
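The pin fix above applies the visible-VRAM clamp only to VRAM pins, so a GTT pin no longer inherits a VRAM-sized upper bound. The resulting rule, assuming TTM's convention that lpfn == 0 means no upper limit:

    if (domain == RADEON_GEM_DOMAIN_VRAM)
            /* keep pinned buffers CPU-visible */
            bo->placement.lpfn = bo->rdev->mc.visible_vram_size >> PAGE_SHIFT;
    /* for GTT/CPU domains, leave bo->placement.lpfn untouched */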
diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c
index d4d1c39a0e9..a4b57493aa7 100644
--- a/drivers/gpu/drm/radeon/radeon_pm.c
+++ b/drivers/gpu/drm/radeon/radeon_pm.c
@@ -28,6 +28,7 @@
#define RADEON_RECLOCK_DELAY_MS 200
#define RADEON_WAIT_VBLANK_TIMEOUT 200
+static bool radeon_pm_debug_check_in_vbl(struct radeon_device *rdev, bool finish);
static void radeon_pm_set_clocks_locked(struct radeon_device *rdev);
static void radeon_pm_set_clocks(struct radeon_device *rdev);
static void radeon_pm_idle_work_handler(struct work_struct *work);
@@ -179,6 +180,16 @@ static void radeon_get_power_state(struct radeon_device *rdev,
rdev->pm.requested_power_state->non_clock_info.pcie_lanes);
}
+static inline void radeon_sync_with_vblank(struct radeon_device *rdev)
+{
+ if (rdev->pm.active_crtcs) {
+ rdev->pm.vblank_sync = false;
+ wait_event_timeout(
+ rdev->irq.vblank_queue, rdev->pm.vblank_sync,
+ msecs_to_jiffies(RADEON_WAIT_VBLANK_TIMEOUT));
+ }
+}
+
static void radeon_set_power_state(struct radeon_device *rdev)
{
/* if *_clock_mode are the same, *_power_state are as well */
@@ -189,11 +200,28 @@ static void radeon_set_power_state(struct radeon_device *rdev)
rdev->pm.requested_clock_mode->sclk,
rdev->pm.requested_clock_mode->mclk,
rdev->pm.requested_power_state->non_clock_info.pcie_lanes);
+
/* set pcie lanes */
+ /* TODO */
+
/* set voltage */
+ /* TODO */
+
/* set engine clock */
+ radeon_sync_with_vblank(rdev);
+ radeon_pm_debug_check_in_vbl(rdev, false);
radeon_set_engine_clock(rdev, rdev->pm.requested_clock_mode->sclk);
+ radeon_pm_debug_check_in_vbl(rdev, true);
+
+#if 0
/* set memory clock */
+ if (rdev->asic->set_memory_clock) {
+ radeon_sync_with_vblank(rdev);
+ radeon_pm_debug_check_in_vbl(rdev, false);
+ radeon_set_memory_clock(rdev, rdev->pm.requested_clock_mode->mclk);
+ radeon_pm_debug_check_in_vbl(rdev, true);
+ }
+#endif
rdev->pm.current_power_state = rdev->pm.requested_power_state;
rdev->pm.current_clock_mode = rdev->pm.requested_clock_mode;
@@ -229,6 +257,12 @@ int radeon_pm_init(struct radeon_device *rdev)
return 0;
}
+void radeon_pm_fini(struct radeon_device *rdev)
+{
+ if (rdev->pm.i2c_bus)
+ radeon_i2c_destroy(rdev->pm.i2c_bus);
+}
+
void radeon_pm_compute_clocks(struct radeon_device *rdev)
{
struct drm_device *ddev = rdev->ddev;
@@ -245,7 +279,8 @@ void radeon_pm_compute_clocks(struct radeon_device *rdev)
list_for_each_entry(connector,
&ddev->mode_config.connector_list, head) {
if (connector->encoder &&
- connector->dpms != DRM_MODE_DPMS_OFF) {
+ connector->encoder->crtc &&
+ connector->dpms != DRM_MODE_DPMS_OFF) {
radeon_crtc = to_radeon_crtc(connector->encoder->crtc);
rdev->pm.active_crtcs |= (1 << radeon_crtc->crtc_id);
++count;
@@ -333,10 +368,7 @@ static void radeon_pm_set_clocks_locked(struct radeon_device *rdev)
break;
}
- /* check if we are in vblank */
- radeon_pm_debug_check_in_vbl(rdev, false);
radeon_set_power_state(rdev);
- radeon_pm_debug_check_in_vbl(rdev, true);
rdev->pm.planned_action = PM_ACTION_NONE;
}
@@ -353,10 +385,7 @@ static void radeon_pm_set_clocks(struct radeon_device *rdev)
rdev->pm.req_vblank |= (1 << 1);
drm_vblank_get(rdev->ddev, 1);
}
- if (rdev->pm.active_crtcs)
- wait_event_interruptible_timeout(
- rdev->irq.vblank_queue, 0,
- msecs_to_jiffies(RADEON_WAIT_VBLANK_TIMEOUT));
+ radeon_pm_set_clocks_locked(rdev);
if (rdev->pm.req_vblank & (1 << 0)) {
rdev->pm.req_vblank &= ~(1 << 0);
drm_vblank_put(rdev->ddev, 0);
@@ -366,7 +395,6 @@ static void radeon_pm_set_clocks(struct radeon_device *rdev)
drm_vblank_put(rdev->ddev, 1);
}
- radeon_pm_set_clocks_locked(rdev);
mutex_unlock(&rdev->cp.mutex);
}
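Worth noting the bug radeon_sync_with_vblank() fixes: the removed code passed a constant-false condition to wait_event_interruptible_timeout(), so it always slept for the full timeout instead of waking on vblank. The corrected pattern, as added above:

    rdev->pm.vblank_sync = false;  /* set true by the vblank IRQ handler */
    wait_event_timeout(rdev->irq.vblank_queue, rdev->pm.vblank_sync,
                       msecs_to_jiffies(RADEON_WAIT_VBLANK_TIMEOUT));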
diff --git a/drivers/gpu/drm/radeon/radeon_reg.h b/drivers/gpu/drm/radeon/radeon_reg.h
index 5c0dc082d33..eabbc9cf30a 100644
--- a/drivers/gpu/drm/radeon/radeon_reg.h
+++ b/drivers/gpu/drm/radeon/radeon_reg.h
@@ -346,6 +346,7 @@
# define RADEON_TVPLL_PWRMGT_OFF (1 << 30)
# define RADEON_TVCLK_TURNOFF (1 << 31)
#define RADEON_PLL_PWRMGT_CNTL 0x0015 /* PLL */
+# define RADEON_PM_MODE_SEL (1 << 13)
# define RADEON_TCL_BYPASS_DISABLE (1 << 20)
#define RADEON_CLR_CMP_CLR_3D 0x1a24
#define RADEON_CLR_CMP_CLR_DST 0x15c8
diff --git a/drivers/gpu/drm/radeon/reg_srcs/r600 b/drivers/gpu/drm/radeon/reg_srcs/r600
index 8f414a5f520..af0da4ae3f5 100644
--- a/drivers/gpu/drm/radeon/reg_srcs/r600
+++ b/drivers/gpu/drm/radeon/reg_srcs/r600
@@ -26,20 +26,16 @@ r600 0x9400
0x00028408 VGT_INDX_OFFSET
0x00028AA0 VGT_INSTANCE_STEP_RATE_0
0x00028AA4 VGT_INSTANCE_STEP_RATE_1
-0x000088C0 VGT_LAST_COPY_STATE
0x00028400 VGT_MAX_VTX_INDX
-0x000088D8 VGT_MC_LAT_CNTL
0x00028404 VGT_MIN_VTX_INDX
0x00028A94 VGT_MULTI_PRIM_IB_RESET_EN
0x0002840C VGT_MULTI_PRIM_IB_RESET_INDX
0x00008970 VGT_NUM_INDICES
0x00008974 VGT_NUM_INSTANCES
0x00028A10 VGT_OUTPUT_PATH_CNTL
-0x00028C5C VGT_OUT_DEALLOC_CNTL
0x00028A84 VGT_PRIMITIVEID_EN
0x00008958 VGT_PRIMITIVE_TYPE
0x00028AB4 VGT_REUSE_OFF
-0x00028C58 VGT_VERTEX_REUSE_BLOCK_CNTL
0x00028AB8 VGT_VTX_CNT_EN
0x000088B0 VGT_VTX_VECT_EJECT_REG
0x00028810 PA_CL_CLIP_CNTL
@@ -280,7 +276,6 @@ r600 0x9400
0x00028E00 PA_SU_POLY_OFFSET_FRONT_SCALE
0x00028814 PA_SU_SC_MODE_CNTL
0x00028C08 PA_SU_VTX_CNTL
-0x00008C00 SQ_CONFIG
0x00008C04 SQ_GPR_RESOURCE_MGMT_1
0x00008C08 SQ_GPR_RESOURCE_MGMT_2
0x00008C10 SQ_STACK_RESOURCE_MGMT_1
@@ -320,18 +315,6 @@ r600 0x9400
0x000283FC SQ_VTX_SEMANTIC_31
0x000288E0 SQ_VTX_SEMANTIC_CLEAR
0x0003CFF4 SQ_VTX_START_INST_LOC
-0x0003C000 SQ_TEX_SAMPLER_WORD0_0
-0x0003C004 SQ_TEX_SAMPLER_WORD1_0
-0x0003C008 SQ_TEX_SAMPLER_WORD2_0
-0x00030000 SQ_ALU_CONSTANT0_0
-0x00030004 SQ_ALU_CONSTANT1_0
-0x00030008 SQ_ALU_CONSTANT2_0
-0x0003000C SQ_ALU_CONSTANT3_0
-0x0003E380 SQ_BOOL_CONST_0
-0x0003E384 SQ_BOOL_CONST_1
-0x0003E388 SQ_BOOL_CONST_2
-0x0003E200 SQ_LOOP_CONST_0
-0x0003E200 SQ_LOOP_CONST_DX10_0
0x000281C0 SQ_ALU_CONST_BUFFER_SIZE_GS_0
0x000281C4 SQ_ALU_CONST_BUFFER_SIZE_GS_1
0x000281C8 SQ_ALU_CONST_BUFFER_SIZE_GS_2
@@ -380,54 +363,6 @@ r600 0x9400
0x000281B4 SQ_ALU_CONST_BUFFER_SIZE_VS_13
0x000281B8 SQ_ALU_CONST_BUFFER_SIZE_VS_14
0x000281BC SQ_ALU_CONST_BUFFER_SIZE_VS_15
-0x000289C0 SQ_ALU_CONST_CACHE_GS_0
-0x000289C4 SQ_ALU_CONST_CACHE_GS_1
-0x000289C8 SQ_ALU_CONST_CACHE_GS_2
-0x000289CC SQ_ALU_CONST_CACHE_GS_3
-0x000289D0 SQ_ALU_CONST_CACHE_GS_4
-0x000289D4 SQ_ALU_CONST_CACHE_GS_5
-0x000289D8 SQ_ALU_CONST_CACHE_GS_6
-0x000289DC SQ_ALU_CONST_CACHE_GS_7
-0x000289E0 SQ_ALU_CONST_CACHE_GS_8
-0x000289E4 SQ_ALU_CONST_CACHE_GS_9
-0x000289E8 SQ_ALU_CONST_CACHE_GS_10
-0x000289EC SQ_ALU_CONST_CACHE_GS_11
-0x000289F0 SQ_ALU_CONST_CACHE_GS_12
-0x000289F4 SQ_ALU_CONST_CACHE_GS_13
-0x000289F8 SQ_ALU_CONST_CACHE_GS_14
-0x000289FC SQ_ALU_CONST_CACHE_GS_15
-0x00028940 SQ_ALU_CONST_CACHE_PS_0
-0x00028944 SQ_ALU_CONST_CACHE_PS_1
-0x00028948 SQ_ALU_CONST_CACHE_PS_2
-0x0002894C SQ_ALU_CONST_CACHE_PS_3
-0x00028950 SQ_ALU_CONST_CACHE_PS_4
-0x00028954 SQ_ALU_CONST_CACHE_PS_5
-0x00028958 SQ_ALU_CONST_CACHE_PS_6
-0x0002895C SQ_ALU_CONST_CACHE_PS_7
-0x00028960 SQ_ALU_CONST_CACHE_PS_8
-0x00028964 SQ_ALU_CONST_CACHE_PS_9
-0x00028968 SQ_ALU_CONST_CACHE_PS_10
-0x0002896C SQ_ALU_CONST_CACHE_PS_11
-0x00028970 SQ_ALU_CONST_CACHE_PS_12
-0x00028974 SQ_ALU_CONST_CACHE_PS_13
-0x00028978 SQ_ALU_CONST_CACHE_PS_14
-0x0002897C SQ_ALU_CONST_CACHE_PS_15
-0x00028980 SQ_ALU_CONST_CACHE_VS_0
-0x00028984 SQ_ALU_CONST_CACHE_VS_1
-0x00028988 SQ_ALU_CONST_CACHE_VS_2
-0x0002898C SQ_ALU_CONST_CACHE_VS_3
-0x00028990 SQ_ALU_CONST_CACHE_VS_4
-0x00028994 SQ_ALU_CONST_CACHE_VS_5
-0x00028998 SQ_ALU_CONST_CACHE_VS_6
-0x0002899C SQ_ALU_CONST_CACHE_VS_7
-0x000289A0 SQ_ALU_CONST_CACHE_VS_8
-0x000289A4 SQ_ALU_CONST_CACHE_VS_9
-0x000289A8 SQ_ALU_CONST_CACHE_VS_10
-0x000289AC SQ_ALU_CONST_CACHE_VS_11
-0x000289B0 SQ_ALU_CONST_CACHE_VS_12
-0x000289B4 SQ_ALU_CONST_CACHE_VS_13
-0x000289B8 SQ_ALU_CONST_CACHE_VS_14
-0x000289BC SQ_ALU_CONST_CACHE_VS_15
0x000288D8 SQ_PGM_CF_OFFSET_ES
0x000288DC SQ_PGM_CF_OFFSET_FS
0x000288D4 SQ_PGM_CF_OFFSET_GS
@@ -494,12 +429,7 @@ r600 0x9400
0x00028438 SX_ALPHA_REF
0x00028410 SX_ALPHA_TEST_CONTROL
0x00028350 SX_MISC
-0x0000A020 SMX_DC_CTL0
-0x0000A024 SMX_DC_CTL1
-0x0000A028 SMX_DC_CTL2
-0x00009608 TC_CNTL
0x00009604 TC_INVALIDATE
-0x00009490 TD_CNTL
0x00009400 TD_FILTER4
0x00009404 TD_FILTER4_1
0x00009408 TD_FILTER4_2
@@ -824,14 +754,9 @@ r600 0x9400
0x00028428 CB_FOG_GREEN
0x00028424 CB_FOG_RED
0x00008040 WAIT_UNTIL
-0x00008950 CC_GC_SHADER_PIPE_CONFIG
-0x00008954 GC_USER_SHADER_PIPE_CONFIG
0x00009714 VC_ENHANCE
0x00009830 DB_DEBUG
0x00009838 DB_WATERMARKS
0x00028D28 DB_SRESULTS_COMPARE_STATE0
0x00028D44 DB_ALPHA_TO_MASK
-0x00009504 TA_CNTL
0x00009700 VC_CNTL
-0x00009718 VC_CONFIG
-0x0000A02C SMX_DC_MC_INTF_CTL
diff --git a/drivers/gpu/drm/radeon/rs400.c b/drivers/gpu/drm/radeon/rs400.c
index 626d51891ee..626aaf082b1 100644
--- a/drivers/gpu/drm/radeon/rs400.c
+++ b/drivers/gpu/drm/radeon/rs400.c
@@ -28,6 +28,7 @@
#include <linux/seq_file.h>
#include <drm/drmP.h>
#include "radeon.h"
+#include "radeon_asic.h"
#include "rs400d.h"
/* This files gather functions specifics to : rs400,rs480 */
@@ -202,9 +203,9 @@ void rs400_gart_disable(struct radeon_device *rdev)
void rs400_gart_fini(struct radeon_device *rdev)
{
+ radeon_gart_fini(rdev);
rs400_gart_disable(rdev);
radeon_gart_table_ram_free(rdev);
- radeon_gart_fini(rdev);
}
int rs400_gart_set_page(struct radeon_device *rdev, int i, uint64_t addr)
@@ -264,6 +265,7 @@ void rs400_mc_init(struct radeon_device *rdev)
base = (RREG32(RADEON_NB_TOM) & 0xffff) << 16;
radeon_vram_location(rdev, &rdev->mc, base);
radeon_gtt_location(rdev, &rdev->mc);
+ radeon_update_bandwidth_info(rdev);
}
uint32_t rs400_mc_rreg(struct radeon_device *rdev, uint32_t reg)
@@ -388,6 +390,8 @@ static int rs400_startup(struct radeon_device *rdev)
{
int r;
+ r100_set_common_regs(rdev);
+
rs400_mc_program(rdev);
/* Resume clock */
r300_clock_startup(rdev);
@@ -453,6 +457,7 @@ int rs400_suspend(struct radeon_device *rdev)
void rs400_fini(struct radeon_device *rdev)
{
+ radeon_pm_fini(rdev);
r100_cp_fini(rdev);
r100_wb_fini(rdev);
r100_ib_fini(rdev);
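The rs400_gart_fini() reordering above (repeated for rs600 and the rv770 PCIE GART later in this diff) settles on one teardown order: generic GART bookkeeping first, then hardware disable, then freeing the table memory.

    radeon_gart_fini(rdev);           /* unbind pages, drop bookkeeping */
    rs400_gart_disable(rdev);         /* turn the hardware GART off */
    radeon_gart_table_ram_free(rdev); /* release the page-table memory */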
diff --git a/drivers/gpu/drm/radeon/rs600.c b/drivers/gpu/drm/radeon/rs600.c
index 47f046b78c6..abf824c2123 100644
--- a/drivers/gpu/drm/radeon/rs600.c
+++ b/drivers/gpu/drm/radeon/rs600.c
@@ -37,6 +37,7 @@
*/
#include "drmP.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "atom.h"
#include "rs600d.h"
@@ -267,9 +268,9 @@ void rs600_gart_disable(struct radeon_device *rdev)
void rs600_gart_fini(struct radeon_device *rdev)
{
+ radeon_gart_fini(rdev);
rs600_gart_disable(rdev);
radeon_gart_table_vram_free(rdev);
- radeon_gart_fini(rdev);
}
#define R600_PTE_VALID (1 << 0)
@@ -392,10 +393,12 @@ int rs600_irq_process(struct radeon_device *rdev)
/* Vertical blank interrupts */
if (G_007EDC_LB_D1_VBLANK_INTERRUPT(r500_disp_int)) {
drm_handle_vblank(rdev->ddev, 0);
+ rdev->pm.vblank_sync = true;
wake_up(&rdev->irq.vblank_queue);
}
if (G_007EDC_LB_D2_VBLANK_INTERRUPT(r500_disp_int)) {
drm_handle_vblank(rdev->ddev, 1);
+ rdev->pm.vblank_sync = true;
wake_up(&rdev->irq.vblank_queue);
}
if (G_007EDC_DC_HOT_PLUG_DETECT1_INTERRUPT(r500_disp_int)) {
@@ -472,13 +475,38 @@ void rs600_mc_init(struct radeon_device *rdev)
rdev->mc.igp_sideport_enabled = radeon_atombios_sideport_present(rdev);
base = RREG32_MC(R_000004_MC_FB_LOCATION);
base = G_000004_MC_FB_START(base) << 16;
radeon_vram_location(rdev, &rdev->mc, base);
radeon_gtt_location(rdev, &rdev->mc);
+ radeon_update_bandwidth_info(rdev);
}
void rs600_bandwidth_update(struct radeon_device *rdev)
{
- /* FIXME: implement, should this be like rs690 ? */
+ struct drm_display_mode *mode0 = NULL;
+ struct drm_display_mode *mode1 = NULL;
+ u32 d1mode_priority_a_cnt, d2mode_priority_a_cnt;
+ /* FIXME: implement full support */
+
+ radeon_update_display_priority(rdev);
+
+ if (rdev->mode_info.crtcs[0]->base.enabled)
+ mode0 = &rdev->mode_info.crtcs[0]->base.mode;
+ if (rdev->mode_info.crtcs[1]->base.enabled)
+ mode1 = &rdev->mode_info.crtcs[1]->base.mode;
+
+ rs690_line_buffer_adjust(rdev, mode0, mode1);
+
+ if (rdev->disp_priority == 2) {
+ d1mode_priority_a_cnt = RREG32(R_006548_D1MODE_PRIORITY_A_CNT);
+ d2mode_priority_a_cnt = RREG32(R_006D48_D2MODE_PRIORITY_A_CNT);
+ d1mode_priority_a_cnt |= S_006548_D1MODE_PRIORITY_A_ALWAYS_ON(1);
+ d2mode_priority_a_cnt |= S_006D48_D2MODE_PRIORITY_A_ALWAYS_ON(1);
+ WREG32(R_006548_D1MODE_PRIORITY_A_CNT, d1mode_priority_a_cnt);
+ WREG32(R_00654C_D1MODE_PRIORITY_B_CNT, d1mode_priority_a_cnt);
+ WREG32(R_006D48_D2MODE_PRIORITY_A_CNT, d2mode_priority_a_cnt);
+ WREG32(R_006D4C_D2MODE_PRIORITY_B_CNT, d2mode_priority_a_cnt);
+ }
}
uint32_t rs600_mc_rreg(struct radeon_device *rdev, uint32_t reg)
@@ -598,6 +626,7 @@ int rs600_suspend(struct radeon_device *rdev)
void rs600_fini(struct radeon_device *rdev)
{
+ radeon_pm_fini(rdev);
r100_cp_fini(rdev);
r100_wb_fini(rdev);
r100_ib_fini(rdev);
diff --git a/drivers/gpu/drm/radeon/rs600d.h b/drivers/gpu/drm/radeon/rs600d.h
index c1c8f5885cb..e52d2695510 100644
--- a/drivers/gpu/drm/radeon/rs600d.h
+++ b/drivers/gpu/drm/radeon/rs600d.h
@@ -535,4 +535,57 @@
#define G_00016C_INVALIDATE_L1_TLB(x) (((x) >> 20) & 0x1)
#define C_00016C_INVALIDATE_L1_TLB 0xFFEFFFFF
+#define R_006548_D1MODE_PRIORITY_A_CNT 0x006548
+#define S_006548_D1MODE_PRIORITY_MARK_A(x) (((x) & 0x7FFF) << 0)
+#define G_006548_D1MODE_PRIORITY_MARK_A(x) (((x) >> 0) & 0x7FFF)
+#define C_006548_D1MODE_PRIORITY_MARK_A 0xFFFF8000
+#define S_006548_D1MODE_PRIORITY_A_OFF(x) (((x) & 0x1) << 16)
+#define G_006548_D1MODE_PRIORITY_A_OFF(x) (((x) >> 16) & 0x1)
+#define C_006548_D1MODE_PRIORITY_A_OFF 0xFFFEFFFF
+#define S_006548_D1MODE_PRIORITY_A_ALWAYS_ON(x) (((x) & 0x1) << 20)
+#define G_006548_D1MODE_PRIORITY_A_ALWAYS_ON(x) (((x) >> 20) & 0x1)
+#define C_006548_D1MODE_PRIORITY_A_ALWAYS_ON 0xFFEFFFFF
+#define S_006548_D1MODE_PRIORITY_A_FORCE_MASK(x) (((x) & 0x1) << 24)
+#define G_006548_D1MODE_PRIORITY_A_FORCE_MASK(x) (((x) >> 24) & 0x1)
+#define C_006548_D1MODE_PRIORITY_A_FORCE_MASK 0xFEFFFFFF
+#define R_00654C_D1MODE_PRIORITY_B_CNT 0x00654C
+#define S_00654C_D1MODE_PRIORITY_MARK_B(x) (((x) & 0x7FFF) << 0)
+#define G_00654C_D1MODE_PRIORITY_MARK_B(x) (((x) >> 0) & 0x7FFF)
+#define C_00654C_D1MODE_PRIORITY_MARK_B 0xFFFF8000
+#define S_00654C_D1MODE_PRIORITY_B_OFF(x) (((x) & 0x1) << 16)
+#define G_00654C_D1MODE_PRIORITY_B_OFF(x) (((x) >> 16) & 0x1)
+#define C_00654C_D1MODE_PRIORITY_B_OFF 0xFFFEFFFF
+#define S_00654C_D1MODE_PRIORITY_B_ALWAYS_ON(x) (((x) & 0x1) << 20)
+#define G_00654C_D1MODE_PRIORITY_B_ALWAYS_ON(x) (((x) >> 20) & 0x1)
+#define C_00654C_D1MODE_PRIORITY_B_ALWAYS_ON 0xFFEFFFFF
+#define S_00654C_D1MODE_PRIORITY_B_FORCE_MASK(x) (((x) & 0x1) << 24)
+#define G_00654C_D1MODE_PRIORITY_B_FORCE_MASK(x) (((x) >> 24) & 0x1)
+#define C_00654C_D1MODE_PRIORITY_B_FORCE_MASK 0xFEFFFFFF
+#define R_006D48_D2MODE_PRIORITY_A_CNT 0x006D48
+#define S_006D48_D2MODE_PRIORITY_MARK_A(x) (((x) & 0x7FFF) << 0)
+#define G_006D48_D2MODE_PRIORITY_MARK_A(x) (((x) >> 0) & 0x7FFF)
+#define C_006D48_D2MODE_PRIORITY_MARK_A 0xFFFF8000
+#define S_006D48_D2MODE_PRIORITY_A_OFF(x) (((x) & 0x1) << 16)
+#define G_006D48_D2MODE_PRIORITY_A_OFF(x) (((x) >> 16) & 0x1)
+#define C_006D48_D2MODE_PRIORITY_A_OFF 0xFFFEFFFF
+#define S_006D48_D2MODE_PRIORITY_A_ALWAYS_ON(x) (((x) & 0x1) << 20)
+#define G_006D48_D2MODE_PRIORITY_A_ALWAYS_ON(x) (((x) >> 20) & 0x1)
+#define C_006D48_D2MODE_PRIORITY_A_ALWAYS_ON 0xFFEFFFFF
+#define S_006D48_D2MODE_PRIORITY_A_FORCE_MASK(x) (((x) & 0x1) << 24)
+#define G_006D48_D2MODE_PRIORITY_A_FORCE_MASK(x) (((x) >> 24) & 0x1)
+#define C_006D48_D2MODE_PRIORITY_A_FORCE_MASK 0xFEFFFFFF
+#define R_006D4C_D2MODE_PRIORITY_B_CNT 0x006D4C
+#define S_006D4C_D2MODE_PRIORITY_MARK_B(x) (((x) & 0x7FFF) << 0)
+#define G_006D4C_D2MODE_PRIORITY_MARK_B(x) (((x) >> 0) & 0x7FFF)
+#define C_006D4C_D2MODE_PRIORITY_MARK_B 0xFFFF8000
+#define S_006D4C_D2MODE_PRIORITY_B_OFF(x) (((x) & 0x1) << 16)
+#define G_006D4C_D2MODE_PRIORITY_B_OFF(x) (((x) >> 16) & 0x1)
+#define C_006D4C_D2MODE_PRIORITY_B_OFF 0xFFFEFFFF
+#define S_006D4C_D2MODE_PRIORITY_B_ALWAYS_ON(x) (((x) & 0x1) << 20)
+#define G_006D4C_D2MODE_PRIORITY_B_ALWAYS_ON(x) (((x) >> 20) & 0x1)
+#define C_006D4C_D2MODE_PRIORITY_B_ALWAYS_ON 0xFFEFFFFF
+#define S_006D4C_D2MODE_PRIORITY_B_FORCE_MASK(x) (((x) & 0x1) << 24)
+#define G_006D4C_D2MODE_PRIORITY_B_FORCE_MASK(x) (((x) >> 24) & 0x1)
+#define C_006D4C_D2MODE_PRIORITY_B_FORCE_MASK 0xFEFFFFFF
+
#endif
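These register headers follow the driver's S_/G_/C_ macro convention: S_* shifts a field value into position, G_* extracts it, and C_* is the AND-mask that clears it. A minimal read-modify-write sketch:

    u32 cnt = RREG32(R_006548_D1MODE_PRIORITY_A_CNT);

    cnt &= C_006548_D1MODE_PRIORITY_A_ALWAYS_ON;    /* clear the field */
    cnt |= S_006548_D1MODE_PRIORITY_A_ALWAYS_ON(1); /* set it */
    WREG32(R_006548_D1MODE_PRIORITY_A_CNT, cnt);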
diff --git a/drivers/gpu/drm/radeon/rs690.c b/drivers/gpu/drm/radeon/rs690.c
index 83b9174f76f..bbf3da790fd 100644
--- a/drivers/gpu/drm/radeon/rs690.c
+++ b/drivers/gpu/drm/radeon/rs690.c
@@ -27,6 +27,7 @@
*/
#include "drmP.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "atom.h"
#include "rs690d.h"
@@ -57,42 +58,57 @@ static void rs690_gpu_init(struct radeon_device *rdev)
}
}
+union igp_info {
+ struct _ATOM_INTEGRATED_SYSTEM_INFO info;
+ struct _ATOM_INTEGRATED_SYSTEM_INFO_V2 info_v2;
+};
+
void rs690_pm_info(struct radeon_device *rdev)
{
int index = GetIndexIntoMasterTable(DATA, IntegratedSystemInfo);
- struct _ATOM_INTEGRATED_SYSTEM_INFO *info;
- struct _ATOM_INTEGRATED_SYSTEM_INFO_V2 *info_v2;
- void *ptr;
+ union igp_info *info;
uint16_t data_offset;
uint8_t frev, crev;
fixed20_12 tmp;
- atom_parse_data_header(rdev->mode_info.atom_context, index, NULL,
- &frev, &crev, &data_offset);
- ptr = rdev->mode_info.atom_context->bios + data_offset;
- info = (struct _ATOM_INTEGRATED_SYSTEM_INFO *)ptr;
- info_v2 = (struct _ATOM_INTEGRATED_SYSTEM_INFO_V2 *)ptr;
- /* Get various system informations from bios */
- switch (crev) {
- case 1:
- tmp.full = rfixed_const(100);
- rdev->pm.igp_sideport_mclk.full = rfixed_const(info->ulBootUpMemoryClock);
- rdev->pm.igp_sideport_mclk.full = rfixed_div(rdev->pm.igp_sideport_mclk, tmp);
- rdev->pm.igp_system_mclk.full = rfixed_const(le16_to_cpu(info->usK8MemoryClock));
- rdev->pm.igp_ht_link_clk.full = rfixed_const(le16_to_cpu(info->usFSBClock));
- rdev->pm.igp_ht_link_width.full = rfixed_const(info->ucHTLinkWidth);
- break;
- case 2:
- tmp.full = rfixed_const(100);
- rdev->pm.igp_sideport_mclk.full = rfixed_const(info_v2->ulBootUpSidePortClock);
- rdev->pm.igp_sideport_mclk.full = rfixed_div(rdev->pm.igp_sideport_mclk, tmp);
- rdev->pm.igp_system_mclk.full = rfixed_const(info_v2->ulBootUpUMAClock);
- rdev->pm.igp_system_mclk.full = rfixed_div(rdev->pm.igp_system_mclk, tmp);
- rdev->pm.igp_ht_link_clk.full = rfixed_const(info_v2->ulHTLinkFreq);
- rdev->pm.igp_ht_link_clk.full = rfixed_div(rdev->pm.igp_ht_link_clk, tmp);
- rdev->pm.igp_ht_link_width.full = rfixed_const(le16_to_cpu(info_v2->usMinHTLinkWidth));
- break;
- default:
+ if (atom_parse_data_header(rdev->mode_info.atom_context, index, NULL,
+ &frev, &crev, &data_offset)) {
+ info = (union igp_info *)(rdev->mode_info.atom_context->bios + data_offset);
+
+ /* Get various system information from the BIOS */
+ switch (crev) {
+ case 1:
+ tmp.full = rfixed_const(100);
+ rdev->pm.igp_sideport_mclk.full = rfixed_const(info->info.ulBootUpMemoryClock);
+ rdev->pm.igp_sideport_mclk.full = rfixed_div(rdev->pm.igp_sideport_mclk, tmp);
+ rdev->pm.igp_system_mclk.full = rfixed_const(le16_to_cpu(info->info.usK8MemoryClock));
+ rdev->pm.igp_ht_link_clk.full = rfixed_const(le16_to_cpu(info->info.usFSBClock));
+ rdev->pm.igp_ht_link_width.full = rfixed_const(info->info.ucHTLinkWidth);
+ break;
+ case 2:
+ tmp.full = rfixed_const(100);
+ rdev->pm.igp_sideport_mclk.full = rfixed_const(info->info_v2.ulBootUpSidePortClock);
+ rdev->pm.igp_sideport_mclk.full = rfixed_div(rdev->pm.igp_sideport_mclk, tmp);
+ rdev->pm.igp_system_mclk.full = rfixed_const(info->info_v2.ulBootUpUMAClock);
+ rdev->pm.igp_system_mclk.full = rfixed_div(rdev->pm.igp_system_mclk, tmp);
+ rdev->pm.igp_ht_link_clk.full = rfixed_const(info->info_v2.ulHTLinkFreq);
+ rdev->pm.igp_ht_link_clk.full = rfixed_div(rdev->pm.igp_ht_link_clk, tmp);
+ rdev->pm.igp_ht_link_width.full = rfixed_const(le16_to_cpu(info->info_v2.usMinHTLinkWidth));
+ break;
+ default:
+ tmp.full = rfixed_const(100);
+ /* We assume the slowest possible clock, i.e. worst case */
+ /* DDR 333 MHz */
+ rdev->pm.igp_sideport_mclk.full = rfixed_const(333);
+ /* FIXME: system clock ? */
+ rdev->pm.igp_system_mclk.full = rfixed_const(100);
+ rdev->pm.igp_system_mclk.full = rfixed_div(rdev->pm.igp_system_mclk, tmp);
+ rdev->pm.igp_ht_link_clk.full = rfixed_const(200);
+ rdev->pm.igp_ht_link_width.full = rfixed_const(8);
+ DRM_ERROR("No integrated system info for your GPU, using safe default\n");
+ break;
+ }
+ } else {
tmp.full = rfixed_const(100);
/* We assume the slowest possible clock, i.e. worst case */
/* DDR 333 MHz */
@@ -103,7 +119,6 @@ void rs690_pm_info(struct radeon_device *rdev)
rdev->pm.igp_ht_link_clk.full = rfixed_const(200);
rdev->pm.igp_ht_link_width.full = rfixed_const(8);
DRM_ERROR("No integrated system info for your GPU, using safe default\n");
- break;
}
/* Compute various bandwidth */
/* k8_bandwidth = (memory_clk / 2) * 2 * 8 * 0.5 = memory_clk * 4 */
@@ -131,7 +146,6 @@ void rs690_pm_info(struct radeon_device *rdev)
void rs690_mc_init(struct radeon_device *rdev)
{
- fixed20_12 a;
u64 base;
rs400_gart_adjust_size(rdev);
@@ -145,18 +159,10 @@ void rs690_mc_init(struct radeon_device *rdev)
base = RREG32_MC(R_000100_MCCFG_FB_LOCATION);
base = G_000100_MC_FB_START(base) << 16;
rs690_pm_info(rdev);
- /* FIXME: we should enforce default clock in case GPU is not in
- * default setup
- */
- a.full = rfixed_const(100);
- rdev->pm.sclk.full = rfixed_const(rdev->clock.default_sclk);
- rdev->pm.sclk.full = rfixed_div(rdev->pm.sclk, a);
- a.full = rfixed_const(16);
- /* core_bandwidth = sclk(Mhz) * 16 */
- rdev->pm.core_bandwidth.full = rfixed_div(rdev->pm.sclk, a);
rdev->mc.igp_sideport_enabled = radeon_atombios_sideport_present(rdev);
radeon_vram_location(rdev, &rdev->mc, base);
radeon_gtt_location(rdev, &rdev->mc);
+ radeon_update_bandwidth_info(rdev);
}
void rs690_line_buffer_adjust(struct radeon_device *rdev,
@@ -394,10 +400,12 @@ void rs690_bandwidth_update(struct radeon_device *rdev)
struct drm_display_mode *mode1 = NULL;
struct rs690_watermark wm0;
struct rs690_watermark wm1;
- u32 tmp;
+ u32 tmp, d1mode_priority_a_cnt, d2mode_priority_a_cnt;
fixed20_12 priority_mark02, priority_mark12, fill_rate;
fixed20_12 a, b;
+ radeon_update_display_priority(rdev);
+
if (rdev->mode_info.crtcs[0]->base.enabled)
mode0 = &rdev->mode_info.crtcs[0]->base.mode;
if (rdev->mode_info.crtcs[1]->base.enabled)
@@ -407,7 +415,8 @@ void rs690_bandwidth_update(struct radeon_device *rdev)
* modes if the user specifies HIGH for displaypriority
* option.
*/
- if (rdev->disp_priority == 2) {
+ if ((rdev->disp_priority == 2) &&
+ ((rdev->family == CHIP_RS690) || (rdev->family == CHIP_RS740))) {
tmp = RREG32_MC(R_000104_MC_INIT_MISC_LAT_TIMER);
tmp &= C_000104_MC_DISP0R_INIT_LAT;
tmp &= C_000104_MC_DISP1R_INIT_LAT;
@@ -482,10 +491,16 @@ void rs690_bandwidth_update(struct radeon_device *rdev)
priority_mark12.full = 0;
if (wm1.priority_mark_max.full > priority_mark12.full)
priority_mark12.full = wm1.priority_mark_max.full;
- WREG32(R_006548_D1MODE_PRIORITY_A_CNT, rfixed_trunc(priority_mark02));
- WREG32(R_00654C_D1MODE_PRIORITY_B_CNT, rfixed_trunc(priority_mark02));
- WREG32(R_006D48_D2MODE_PRIORITY_A_CNT, rfixed_trunc(priority_mark12));
- WREG32(R_006D4C_D2MODE_PRIORITY_B_CNT, rfixed_trunc(priority_mark12));
+ d1mode_priority_a_cnt = rfixed_trunc(priority_mark02);
+ d2mode_priority_a_cnt = rfixed_trunc(priority_mark12);
+ if (rdev->disp_priority == 2) {
+ d1mode_priority_a_cnt |= S_006548_D1MODE_PRIORITY_A_ALWAYS_ON(1);
+ d2mode_priority_a_cnt |= S_006D48_D2MODE_PRIORITY_A_ALWAYS_ON(1);
+ }
+ WREG32(R_006548_D1MODE_PRIORITY_A_CNT, d1mode_priority_a_cnt);
+ WREG32(R_00654C_D1MODE_PRIORITY_B_CNT, d1mode_priority_a_cnt);
+ WREG32(R_006D48_D2MODE_PRIORITY_A_CNT, d2mode_priority_a_cnt);
+ WREG32(R_006D4C_D2MODE_PRIORITY_B_CNT, d2mode_priority_a_cnt);
} else if (mode0) {
if (rfixed_trunc(wm0.dbpp) > 64)
a.full = rfixed_mul(wm0.dbpp, wm0.num_line_pair);
@@ -512,8 +527,11 @@ void rs690_bandwidth_update(struct radeon_device *rdev)
priority_mark02.full = 0;
if (wm0.priority_mark_max.full > priority_mark02.full)
priority_mark02.full = wm0.priority_mark_max.full;
- WREG32(R_006548_D1MODE_PRIORITY_A_CNT, rfixed_trunc(priority_mark02));
- WREG32(R_00654C_D1MODE_PRIORITY_B_CNT, rfixed_trunc(priority_mark02));
+ d1mode_priority_a_cnt = rfixed_trunc(priority_mark02);
+ if (rdev->disp_priority == 2)
+ d1mode_priority_a_cnt |= S_006548_D1MODE_PRIORITY_A_ALWAYS_ON(1);
+ WREG32(R_006548_D1MODE_PRIORITY_A_CNT, d1mode_priority_a_cnt);
+ WREG32(R_00654C_D1MODE_PRIORITY_B_CNT, d1mode_priority_a_cnt);
WREG32(R_006D48_D2MODE_PRIORITY_A_CNT,
S_006D48_D2MODE_PRIORITY_A_OFF(1));
WREG32(R_006D4C_D2MODE_PRIORITY_B_CNT,
@@ -544,12 +562,15 @@ void rs690_bandwidth_update(struct radeon_device *rdev)
priority_mark12.full = 0;
if (wm1.priority_mark_max.full > priority_mark12.full)
priority_mark12.full = wm1.priority_mark_max.full;
+ d2mode_priority_a_cnt = rfixed_trunc(priority_mark12);
+ if (rdev->disp_priority == 2)
+ d2mode_priority_a_cnt |= S_006D48_D2MODE_PRIORITY_A_ALWAYS_ON(1);
WREG32(R_006548_D1MODE_PRIORITY_A_CNT,
S_006548_D1MODE_PRIORITY_A_OFF(1));
WREG32(R_00654C_D1MODE_PRIORITY_B_CNT,
S_00654C_D1MODE_PRIORITY_B_OFF(1));
- WREG32(R_006D48_D2MODE_PRIORITY_A_CNT, rfixed_trunc(priority_mark12));
- WREG32(R_006D4C_D2MODE_PRIORITY_B_CNT, rfixed_trunc(priority_mark12));
+ WREG32(R_006D48_D2MODE_PRIORITY_A_CNT, d2mode_priority_a_cnt);
+ WREG32(R_006D4C_D2MODE_PRIORITY_B_CNT, d2mode_priority_a_cnt);
}
}
@@ -657,6 +678,7 @@ int rs690_suspend(struct radeon_device *rdev)
void rs690_fini(struct radeon_device *rdev)
{
+ radeon_pm_fini(rdev);
r100_cp_fini(rdev);
r100_wb_fini(rdev);
r100_ib_fini(rdev);
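The rs690_pm_info() rework above replaces two separately-cast pointers into the BIOS image with a tagged union selected by the table revision. The shape of the pattern:

    union igp_info {
            struct _ATOM_INTEGRATED_SYSTEM_INFO info;       /* crev 1 */
            struct _ATOM_INTEGRATED_SYSTEM_INFO_V2 info_v2; /* crev 2 */
    };

    info = (union igp_info *)(rdev->mode_info.atom_context->bios + data_offset);
    /* then switch on crev and read through info->info or info->info_v2 */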
diff --git a/drivers/gpu/drm/radeon/rs690d.h b/drivers/gpu/drm/radeon/rs690d.h
index 62d31e7a897..36e6398a98a 100644
--- a/drivers/gpu/drm/radeon/rs690d.h
+++ b/drivers/gpu/drm/radeon/rs690d.h
@@ -182,6 +182,9 @@
#define S_006548_D1MODE_PRIORITY_A_OFF(x) (((x) & 0x1) << 16)
#define G_006548_D1MODE_PRIORITY_A_OFF(x) (((x) >> 16) & 0x1)
#define C_006548_D1MODE_PRIORITY_A_OFF 0xFFFEFFFF
+#define S_006548_D1MODE_PRIORITY_A_ALWAYS_ON(x) (((x) & 0x1) << 20)
+#define G_006548_D1MODE_PRIORITY_A_ALWAYS_ON(x) (((x) >> 20) & 0x1)
+#define C_006548_D1MODE_PRIORITY_A_ALWAYS_ON 0xFFEFFFFF
#define S_006548_D1MODE_PRIORITY_A_FORCE_MASK(x) (((x) & 0x1) << 24)
#define G_006548_D1MODE_PRIORITY_A_FORCE_MASK(x) (((x) >> 24) & 0x1)
#define C_006548_D1MODE_PRIORITY_A_FORCE_MASK 0xFEFFFFFF
diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c
index bea747da123..1cf233f7e51 100644
--- a/drivers/gpu/drm/radeon/rv515.c
+++ b/drivers/gpu/drm/radeon/rv515.c
@@ -29,6 +29,7 @@
#include "drmP.h"
#include "rv515d.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "atom.h"
#include "rv515_reg_safe.h"
@@ -279,19 +280,13 @@ static void rv515_vram_get_type(struct radeon_device *rdev)
void rv515_mc_init(struct radeon_device *rdev)
{
- fixed20_12 a;
rv515_vram_get_type(rdev);
r100_vram_init_sizes(rdev);
radeon_vram_location(rdev, &rdev->mc, 0);
if (!(rdev->flags & RADEON_IS_AGP))
radeon_gtt_location(rdev, &rdev->mc);
- /* FIXME: we should enforce default clock in case GPU is not in
- * default setup
- */
- a.full = rfixed_const(100);
- rdev->pm.sclk.full = rfixed_const(rdev->clock.default_sclk);
- rdev->pm.sclk.full = rfixed_div(rdev->pm.sclk, a);
+ radeon_update_bandwidth_info(rdev);
}
uint32_t rv515_mc_rreg(struct radeon_device *rdev, uint32_t reg)
@@ -539,6 +534,7 @@ void rv515_set_safe_registers(struct radeon_device *rdev)
void rv515_fini(struct radeon_device *rdev)
{
+ radeon_pm_fini(rdev);
r100_cp_fini(rdev);
r100_wb_fini(rdev);
r100_ib_fini(rdev);
@@ -1020,7 +1016,7 @@ void rv515_bandwidth_avivo_update(struct radeon_device *rdev)
struct drm_display_mode *mode1 = NULL;
struct rv515_watermark wm0;
struct rv515_watermark wm1;
- u32 tmp;
+ u32 tmp, d1mode_priority_a_cnt, d2mode_priority_a_cnt;
fixed20_12 priority_mark02, priority_mark12, fill_rate;
fixed20_12 a, b;
@@ -1088,10 +1084,16 @@ void rv515_bandwidth_avivo_update(struct radeon_device *rdev)
priority_mark12.full = 0;
if (wm1.priority_mark_max.full > priority_mark12.full)
priority_mark12.full = wm1.priority_mark_max.full;
- WREG32(D1MODE_PRIORITY_A_CNT, rfixed_trunc(priority_mark02));
- WREG32(D1MODE_PRIORITY_B_CNT, rfixed_trunc(priority_mark02));
- WREG32(D2MODE_PRIORITY_A_CNT, rfixed_trunc(priority_mark12));
- WREG32(D2MODE_PRIORITY_B_CNT, rfixed_trunc(priority_mark12));
+ d1mode_priority_a_cnt = rfixed_trunc(priority_mark02);
+ d2mode_priority_a_cnt = rfixed_trunc(priority_mark12);
+ if (rdev->disp_priority == 2) {
+ d1mode_priority_a_cnt |= MODE_PRIORITY_ALWAYS_ON;
+ d2mode_priority_a_cnt |= MODE_PRIORITY_ALWAYS_ON;
+ }
+ WREG32(D1MODE_PRIORITY_A_CNT, d1mode_priority_a_cnt);
+ WREG32(D1MODE_PRIORITY_B_CNT, d1mode_priority_a_cnt);
+ WREG32(D2MODE_PRIORITY_A_CNT, d2mode_priority_a_cnt);
+ WREG32(D2MODE_PRIORITY_B_CNT, d2mode_priority_a_cnt);
} else if (mode0) {
if (rfixed_trunc(wm0.dbpp) > 64)
a.full = rfixed_div(wm0.dbpp, wm0.num_line_pair);
@@ -1118,8 +1120,11 @@ void rv515_bandwidth_avivo_update(struct radeon_device *rdev)
priority_mark02.full = 0;
if (wm0.priority_mark_max.full > priority_mark02.full)
priority_mark02.full = wm0.priority_mark_max.full;
- WREG32(D1MODE_PRIORITY_A_CNT, rfixed_trunc(priority_mark02));
- WREG32(D1MODE_PRIORITY_B_CNT, rfixed_trunc(priority_mark02));
+ d1mode_priority_a_cnt = rfixed_trunc(priority_mark02);
+ if (rdev->disp_priority == 2)
+ d1mode_priority_a_cnt |= MODE_PRIORITY_ALWAYS_ON;
+ WREG32(D1MODE_PRIORITY_A_CNT, d1mode_priority_a_cnt);
+ WREG32(D1MODE_PRIORITY_B_CNT, d1mode_priority_a_cnt);
WREG32(D2MODE_PRIORITY_A_CNT, MODE_PRIORITY_OFF);
WREG32(D2MODE_PRIORITY_B_CNT, MODE_PRIORITY_OFF);
} else {
@@ -1148,10 +1153,13 @@ void rv515_bandwidth_avivo_update(struct radeon_device *rdev)
priority_mark12.full = 0;
if (wm1.priority_mark_max.full > priority_mark12.full)
priority_mark12.full = wm1.priority_mark_max.full;
+ d2mode_priority_a_cnt = rfixed_trunc(priority_mark12);
+ if (rdev->disp_priority == 2)
+ d2mode_priority_a_cnt |= MODE_PRIORITY_ALWAYS_ON;
WREG32(D1MODE_PRIORITY_A_CNT, MODE_PRIORITY_OFF);
WREG32(D1MODE_PRIORITY_B_CNT, MODE_PRIORITY_OFF);
- WREG32(D2MODE_PRIORITY_A_CNT, rfixed_trunc(priority_mark12));
- WREG32(D2MODE_PRIORITY_B_CNT, rfixed_trunc(priority_mark12));
+ WREG32(D2MODE_PRIORITY_A_CNT, d2mode_priority_a_cnt);
+ WREG32(D2MODE_PRIORITY_B_CNT, d2mode_priority_a_cnt);
}
}
@@ -1161,6 +1169,8 @@ void rv515_bandwidth_update(struct radeon_device *rdev)
struct drm_display_mode *mode0 = NULL;
struct drm_display_mode *mode1 = NULL;
+ radeon_update_display_priority(rdev);
+
if (rdev->mode_info.crtcs[0]->base.enabled)
mode0 = &rdev->mode_info.crtcs[0]->base.mode;
if (rdev->mode_info.crtcs[1]->base.enabled)
@@ -1170,7 +1180,8 @@ void rv515_bandwidth_update(struct radeon_device *rdev)
* modes if the user specifies HIGH for displaypriority
* option.
*/
- if (rdev->disp_priority == 2) {
+ if ((rdev->disp_priority == 2) &&
+ (rdev->family == CHIP_RV515)) {
tmp = RREG32_MC(MC_MISC_LAT_TIMER);
tmp &= ~MC_DISP1R_INIT_LAT_MASK;
tmp &= ~MC_DISP0R_INIT_LAT_MASK;
diff --git a/drivers/gpu/drm/radeon/rv770.c b/drivers/gpu/drm/radeon/rv770.c
index 37887dee12a..9f37d2efb0a 100644
--- a/drivers/gpu/drm/radeon/rv770.c
+++ b/drivers/gpu/drm/radeon/rv770.c
@@ -29,6 +29,7 @@
#include <linux/platform_device.h>
#include "drmP.h"
#include "radeon.h"
+#include "radeon_asic.h"
#include "radeon_drm.h"
#include "rv770d.h"
#include "atom.h"
@@ -125,9 +126,9 @@ void rv770_pcie_gart_disable(struct radeon_device *rdev)
void rv770_pcie_gart_fini(struct radeon_device *rdev)
{
+ radeon_gart_fini(rdev);
rv770_pcie_gart_disable(rdev);
radeon_gart_table_vram_free(rdev);
- radeon_gart_fini(rdev);
}
@@ -647,10 +648,13 @@ static void rv770_gpu_init(struct radeon_device *rdev)
WREG32(CC_RB_BACKEND_DISABLE, cc_rb_backend_disable);
WREG32(CC_GC_SHADER_PIPE_CONFIG, cc_gc_shader_pipe_config);
+ WREG32(GC_USER_SHADER_PIPE_CONFIG, cc_gc_shader_pipe_config);
WREG32(CC_SYS_RB_BACKEND_DISABLE, cc_rb_backend_disable);
WREG32(CGTS_SYS_TCC_DISABLE, 0);
WREG32(CGTS_TCC_DISABLE, 0);
+ WREG32(CGTS_USER_SYS_TCC_DISABLE, 0);
+ WREG32(CGTS_USER_TCC_DISABLE, 0);
num_qd_pipes =
R7XX_MAX_PIPES - r600_count_pipe_bits((cc_gc_shader_pipe_config & INACTIVE_QD_PIPES_MASK) >> 8);
@@ -864,7 +868,6 @@ static void rv770_gpu_init(struct radeon_device *rdev)
int rv770_mc_init(struct radeon_device *rdev)
{
- fixed20_12 a;
u32 tmp;
int chansize, numchan;
@@ -908,12 +911,8 @@ int rv770_mc_init(struct radeon_device *rdev)
rdev->mc.real_vram_size = rdev->mc.aper_size;
}
r600_vram_gtt_location(rdev, &rdev->mc);
- /* FIXME: we should enforce default clock in case GPU is not in
- * default setup
- */
- a.full = rfixed_const(100);
- rdev->pm.sclk.full = rfixed_const(rdev->clock.default_sclk);
- rdev->pm.sclk.full = rfixed_div(rdev->pm.sclk, a);
+ radeon_update_bandwidth_info(rdev);
+
return 0;
}
@@ -1013,6 +1012,13 @@ int rv770_resume(struct radeon_device *rdev)
DRM_ERROR("radeon: failled testing IB (%d).\n", r);
return r;
}
+
+ r = r600_audio_init(rdev);
+ if (r) {
+ dev_err(rdev->dev, "radeon: audio init failed\n");
+ return r;
+ }
+
return r;
}
@@ -1021,6 +1027,7 @@ int rv770_suspend(struct radeon_device *rdev)
{
int r;
+ r600_audio_fini(rdev);
/* FIXME: we should wait for ring to be empty */
r700_cp_stop(rdev);
rdev->cp.ready = false;
@@ -1144,11 +1151,19 @@ int rv770_init(struct radeon_device *rdev)
}
}
}
+
+ r = r600_audio_init(rdev);
+ if (r) {
+ dev_err(rdev->dev, "radeon: audio init failed\n");
+ return r;
+ }
+
return 0;
}
void rv770_fini(struct radeon_device *rdev)
{
+ radeon_pm_fini(rdev);
r600_blit_fini(rdev);
r600_cp_fini(rdev);
r600_wb_fini(rdev);
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 89c38c49066..dd47b2a9a79 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1425,8 +1425,8 @@ int ttm_bo_global_init(struct ttm_global_reference *ref)
atomic_set(&glob->bo_count, 0);
- kobject_init(&glob->kobj, &ttm_bo_glob_kobj_type);
- ret = kobject_add(&glob->kobj, ttm_get_kobj(), "buffer_objects");
+ ret = kobject_init_and_add(
+ &glob->kobj, &ttm_bo_glob_kobj_type, ttm_get_kobj(), "buffer_objects");
if (unlikely(ret != 0))
kobject_put(&glob->kobj);
return ret;
diff --git a/drivers/gpu/drm/ttm/ttm_memory.c b/drivers/gpu/drm/ttm/ttm_memory.c
index eb143e04d40..c40e5f48e9a 100644
--- a/drivers/gpu/drm/ttm/ttm_memory.c
+++ b/drivers/gpu/drm/ttm/ttm_memory.c
@@ -260,8 +260,8 @@ static int ttm_mem_init_kernel_zone(struct ttm_mem_global *glob,
zone->used_mem = 0;
zone->glob = glob;
glob->zone_kernel = zone;
- kobject_init(&zone->kobj, &ttm_mem_zone_kobj_type);
- ret = kobject_add(&zone->kobj, &glob->kobj, zone->name);
+ ret = kobject_init_and_add(
+ &zone->kobj, &ttm_mem_zone_kobj_type, &glob->kobj, zone->name);
if (unlikely(ret != 0)) {
kobject_put(&zone->kobj);
return ret;
@@ -296,8 +296,8 @@ static int ttm_mem_init_highmem_zone(struct ttm_mem_global *glob,
zone->used_mem = 0;
zone->glob = glob;
glob->zone_highmem = zone;
- kobject_init(&zone->kobj, &ttm_mem_zone_kobj_type);
- ret = kobject_add(&zone->kobj, &glob->kobj, zone->name);
+ ret = kobject_init_and_add(
+ &zone->kobj, &ttm_mem_zone_kobj_type, &glob->kobj, zone->name);
if (unlikely(ret != 0)) {
kobject_put(&zone->kobj);
return ret;
@@ -343,8 +343,8 @@ static int ttm_mem_init_dma32_zone(struct ttm_mem_global *glob,
zone->used_mem = 0;
zone->glob = glob;
glob->zone_dma32 = zone;
- kobject_init(&zone->kobj, &ttm_mem_zone_kobj_type);
- ret = kobject_add(&zone->kobj, &glob->kobj, zone->name);
+ ret = kobject_init_and_add(
+ &zone->kobj, &ttm_mem_zone_kobj_type, &glob->kobj, zone->name);
if (unlikely(ret != 0)) {
kobject_put(&zone->kobj);
return ret;
@@ -365,10 +365,8 @@ int ttm_mem_global_init(struct ttm_mem_global *glob)
glob->swap_queue = create_singlethread_workqueue("ttm_swap");
INIT_WORK(&glob->work, ttm_shrink_work);
init_waitqueue_head(&glob->queue);
- kobject_init(&glob->kobj, &ttm_mem_glob_kobj_type);
- ret = kobject_add(&glob->kobj,
- ttm_get_kobj(),
- "memory_accounting");
+ ret = kobject_init_and_add(
+ &glob->kobj, &ttm_mem_glob_kobj_type, ttm_get_kobj(), "memory_accounting");
if (unlikely(ret != 0)) {
kobject_put(&glob->kobj);
return ret;
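
The three zone conversions above and the buffer_objects one in ttm_bo.c all follow the same rule: kobject_init_and_add() fuses kobject_init() and kobject_add() into one call, and because the kobject is initialized even when the add step fails, the error path must still drop the reference with kobject_put() rather than kfree(). A minimal sketch of the consuming pattern (the helper name is hypothetical; ttm_mem_zone and its ktype come from the hunks above):

    static int ttm_mem_zone_register(struct ttm_mem_zone *zone,
                                     struct kobject *parent)
    {
            int ret;

            /* Initialize the kobject and register it in sysfs in one call. */
            ret = kobject_init_and_add(&zone->kobj, &ttm_mem_zone_kobj_type,
                                       parent, "%s", zone->name);
            if (unlikely(ret != 0))
                    /* Already initialized, so release via put; this lets the
                     * ktype's release() run instead of leaking the refcount. */
                    kobject_put(&zone->kobj);
            return ret;
    }

Passing "%s" rather than the name directly also sidesteps treating zone->name as a format string, which the open-coded calls above still do.
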
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index a759170763b..bab6cd8d8a1 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -28,13 +28,13 @@
* Authors: Thomas Hellstrom <thellstrom-at-vmware-dot-com>
*/
-#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/swap.h>
#include "drm_cache.h"
+#include "drm_mem_util.h"
#include "ttm/ttm_module.h"
#include "ttm/ttm_bo_driver.h"
#include "ttm/ttm_placement.h"
@@ -43,32 +43,15 @@ static int ttm_tt_swapin(struct ttm_tt *ttm);
/**
* Allocates storage for pointers to the pages that back the ttm.
- *
- * Uses kmalloc if possible. Otherwise falls back to vmalloc.
*/
static void ttm_tt_alloc_page_directory(struct ttm_tt *ttm)
{
- unsigned long size = ttm->num_pages * sizeof(*ttm->pages);
- ttm->pages = NULL;
-
- if (size <= PAGE_SIZE)
- ttm->pages = kzalloc(size, GFP_KERNEL);
-
- if (!ttm->pages) {
- ttm->pages = vmalloc_user(size);
- if (ttm->pages)
- ttm->page_flags |= TTM_PAGE_FLAG_VMALLOC;
- }
+ ttm->pages = drm_calloc_large(ttm->num_pages, sizeof(*ttm->pages));
}
static void ttm_tt_free_page_directory(struct ttm_tt *ttm)
{
- if (ttm->page_flags & TTM_PAGE_FLAG_VMALLOC) {
- vfree(ttm->pages);
- ttm->page_flags &= ~TTM_PAGE_FLAG_VMALLOC;
- } else {
- kfree(ttm->pages);
- }
+ drm_free_large(ttm->pages);
ttm->pages = NULL;
}
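
The page-directory rework delegates the size-dependent allocator choice to drm_calloc_large()/drm_free_large() from drm_mem_util.h, which is why the TTM_PAGE_FLAG_VMALLOC bookkeeping disappears. A sketch of the idea behind the allocator, assuming the drm_mem_util.h helpers of this period:

    static inline void *drm_calloc_large_sketch(size_t nmemb, size_t size)
    {
            /* Guard the multiplication against overflow. */
            if (size != 0 && nmemb > ULONG_MAX / size)
                    return NULL;

            /* Small requests stay in physically contiguous memory. */
            if (size * nmemb <= PAGE_SIZE)
                    return kcalloc(nmemb, size, GFP_KERNEL);

            /* Large ones fall back to zeroed vmalloc space. */
            return __vmalloc(size * nmemb,
                             GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
                             PAGE_KERNEL);
    }

The matching free helper can use is_vmalloc_addr() to pick vfree() or kfree(), so the caller no longer needs to carry a flag recording which allocator was used.
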
diff --git a/drivers/gpu/drm/vmwgfx/Kconfig b/drivers/gpu/drm/vmwgfx/Kconfig
index f20b8bcbef3..30ad13344f7 100644
--- a/drivers/gpu/drm/vmwgfx/Kconfig
+++ b/drivers/gpu/drm/vmwgfx/Kconfig
@@ -1,6 +1,6 @@
config DRM_VMWGFX
tristate "DRM driver for VMware Virtual GPU"
- depends on DRM && PCI
+ depends on DRM && PCI && FB
select FB_DEFERRED_IO
select FB_CFB_FILLRECT
select FB_CFB_COPYAREA
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index e4595e6147b..9be8e1754a0 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -217,8 +217,8 @@ config SENSORS_ASC7621
depends on HWMON && I2C
help
If you say yes here you get support for the aSC7621
- family of SMBus sensors chip found on most Intel X48, X38, 975,
- 965 and 945 desktop boards. Currently supported chips:
+ family of SMBus sensors chip found on most Intel X38, X48, X58,
+ 945, 965 and 975 desktop boards. Currently supported chips:
aSC7621
aSC7621a
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 2d7bceeed0b..e9b7fbc5a44 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -228,7 +228,7 @@ static int __devinit adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *
if (err) {
dev_warn(dev,
"Unable to access MSR 0xEE, for Tjmax, left"
- " at default");
+ " at default\n");
} else if (eax & 0x40000000) {
tjmax = tjmax_ee;
}
@@ -466,7 +466,7 @@ static int __init coretemp_init(void)
family 6 CPU */
if ((c->x86 == 0x6) && (c->x86_model > 0xf))
printk(KERN_WARNING DRVNAME ": Unknown CPU "
- "model %x\n", c->x86_model);
+ "model 0x%x\n", c->x86_model);
continue;
}
diff --git a/drivers/hwmon/w83793.c b/drivers/hwmon/w83793.c
index 9de81a4c15a..612807d9715 100644
--- a/drivers/hwmon/w83793.c
+++ b/drivers/hwmon/w83793.c
@@ -1294,7 +1294,7 @@ static int watchdog_close(struct inode *inode, struct file *filp)
static ssize_t watchdog_write(struct file *filp, const char __user *buf,
size_t count, loff_t *offset)
{
- size_t ret;
+ ssize_t ret;
struct w83793_data *data = filp->private_data;
if (count) {
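
The one-character type change is the whole fix: watchdog_write() stores negative errno values in ret, and size_t is unsigned, so an error would wrap to a huge positive count and the ->write() caller would treat it as success. Illustration only:

    size_t bad = -EFAULT;    /* wraps to a huge positive value, reads as
                                "wrote ~SIZE_MAX bytes" to the VFS */
    ssize_t good = -EFAULT;  /* stays negative, as a ->write() return must */
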
diff --git a/drivers/i2c/busses/i2c-scmi.c b/drivers/i2c/busses/i2c-scmi.c
index 365e0becaf1..388cbdc96db 100644
--- a/drivers/i2c/busses/i2c-scmi.c
+++ b/drivers/i2c/busses/i2c-scmi.c
@@ -33,6 +33,7 @@ struct acpi_smbus_cmi {
u8 cap_info:1;
u8 cap_read:1;
u8 cap_write:1;
+ struct smbus_methods_t *methods;
};
static const struct smbus_methods_t smbus_methods = {
@@ -41,10 +42,19 @@ static const struct smbus_methods_t smbus_methods = {
.mt_sbw = "_SBW",
};
+/* Some IBM BIOSes omit the leading underscore */
+static const struct smbus_methods_t ibm_smbus_methods = {
+ .mt_info = "SBI_",
+ .mt_sbr = "SBR_",
+ .mt_sbw = "SBW_",
+};
+
static const struct acpi_device_id acpi_smbus_cmi_ids[] = {
- {"SMBUS01", 0},
+ {"SMBUS01", (kernel_ulong_t)&smbus_methods},
+ {ACPI_SMBUS_IBM_HID, (kernel_ulong_t)&ibm_smbus_methods},
{"", 0}
};
+MODULE_DEVICE_TABLE(acpi, acpi_smbus_cmi_ids);
#define ACPI_SMBUS_STATUS_OK 0x00
#define ACPI_SMBUS_STATUS_FAIL 0x07
@@ -150,11 +160,11 @@ acpi_smbus_cmi_access(struct i2c_adapter *adap, u16 addr, unsigned short flags,
if (read_write == I2C_SMBUS_READ) {
protocol |= ACPI_SMBUS_PRTCL_READ;
- method = smbus_methods.mt_sbr;
+ method = smbus_cmi->methods->mt_sbr;
input.count = 3;
} else {
protocol |= ACPI_SMBUS_PRTCL_WRITE;
- method = smbus_methods.mt_sbw;
+ method = smbus_cmi->methods->mt_sbw;
input.count = 5;
}
@@ -290,13 +300,13 @@ static int acpi_smbus_cmi_add_cap(struct acpi_smbus_cmi *smbus_cmi,
union acpi_object *obj;
acpi_status status;
- if (!strcmp(name, smbus_methods.mt_info)) {
+ if (!strcmp(name, smbus_cmi->methods->mt_info)) {
status = acpi_evaluate_object(smbus_cmi->handle,
- smbus_methods.mt_info,
+ smbus_cmi->methods->mt_info,
NULL, &buffer);
if (ACPI_FAILURE(status)) {
ACPI_ERROR((AE_INFO, "Evaluating %s: %i",
- smbus_methods.mt_info, status));
+ smbus_cmi->methods->mt_info, status));
return -EIO;
}
@@ -319,9 +329,9 @@ static int acpi_smbus_cmi_add_cap(struct acpi_smbus_cmi *smbus_cmi,
kfree(buffer.pointer);
smbus_cmi->cap_info = 1;
- } else if (!strcmp(name, smbus_methods.mt_sbr))
+ } else if (!strcmp(name, smbus_cmi->methods->mt_sbr))
smbus_cmi->cap_read = 1;
- else if (!strcmp(name, smbus_methods.mt_sbw))
+ else if (!strcmp(name, smbus_cmi->methods->mt_sbw))
smbus_cmi->cap_write = 1;
else
ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Unsupported CMI method: %s\n",
@@ -349,6 +359,7 @@ static acpi_status acpi_smbus_cmi_query_methods(acpi_handle handle, u32 level,
static int acpi_smbus_cmi_add(struct acpi_device *device)
{
struct acpi_smbus_cmi *smbus_cmi;
+ const struct acpi_device_id *id;
smbus_cmi = kzalloc(sizeof(struct acpi_smbus_cmi), GFP_KERNEL);
if (!smbus_cmi)
@@ -362,6 +373,11 @@ static int acpi_smbus_cmi_add(struct acpi_device *device)
smbus_cmi->cap_read = 0;
smbus_cmi->cap_write = 0;
+ for (id = acpi_smbus_cmi_ids; id->id[0]; id++)
+ if (!strcmp(id->id, acpi_device_hid(device)))
+ smbus_cmi->methods =
+ (struct smbus_methods_t *) id->driver_data;
+
acpi_walk_namespace(ACPI_TYPE_METHOD, smbus_cmi->handle, 1,
acpi_smbus_cmi_query_methods, NULL, smbus_cmi, NULL);
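
The matching logic uses the standard per-device quirk pattern: each acpi_device_id carries a pointer-sized driver_data, and probe walks the ID table for the entry whose HID matches the device, then installs that entry's method table. Condensed sketch (the helper is hypothetical; the patch itself simply leaves methods unset when no HID matches, relying on the driver only binding to IDs in the table):

    static int acpi_smbus_cmi_pick_methods(struct acpi_smbus_cmi *smbus_cmi,
                                           struct acpi_device *device)
    {
            const struct acpi_device_id *id;

            for (id = acpi_smbus_cmi_ids; id->id[0]; id++) {
                    if (!strcmp(id->id, acpi_device_hid(device))) {
                            smbus_cmi->methods =
                                (struct smbus_methods_t *)id->driver_data;
                            return 0;
                    }
            }
            return -ENODEV;  /* unknown HID */
    }
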
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index fbedd35feb4..4c3d1bfec0c 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -695,14 +695,8 @@ static int ide_probe_port(ide_hwif_t *hwif)
if (irqd)
disable_irq(hwif->irq);
- rc = ide_port_wait_ready(hwif);
- if (rc == -ENODEV) {
- printk(KERN_INFO "%s: no devices on the port\n", hwif->name);
- goto out;
- } else if (rc == -EBUSY)
- printk(KERN_ERR "%s: not ready before the probe\n", hwif->name);
- else
- rc = -ENODEV;
+ if (ide_port_wait_ready(hwif) == -EBUSY)
+ printk(KERN_DEBUG "%s: Wait for ready failed before probe !\n", hwif->name);
/*
* Second drive should only exist if first drive was found,
@@ -713,7 +707,7 @@ static int ide_probe_port(ide_hwif_t *hwif)
if (drive->dev_flags & IDE_DFLAG_PRESENT)
rc = 0;
}
-out:
+
/*
* Use cached IRQ number. It might be (and is...) changed by probe
* code above
diff --git a/drivers/ide/via82cxxx.c b/drivers/ide/via82cxxx.c
index e65d010b708..48fd4efc90a 100644
--- a/drivers/ide/via82cxxx.c
+++ b/drivers/ide/via82cxxx.c
@@ -110,7 +110,6 @@ struct via82cxxx_dev
{
struct via_isa_bridge *via_config;
unsigned int via_80w;
- u8 cached_device[2];
};
/**
@@ -403,66 +402,10 @@ static const struct ide_port_ops via_port_ops = {
.cable_detect = via82cxxx_cable_detect,
};
-static void via_write_devctl(ide_hwif_t *hwif, u8 ctl)
-{
- struct via82cxxx_dev *vdev = hwif->host->host_priv;
-
- outb(ctl, hwif->io_ports.ctl_addr);
- outb(vdev->cached_device[hwif->channel], hwif->io_ports.device_addr);
-}
-
-static void __via_dev_select(ide_drive_t *drive, u8 select)
-{
- ide_hwif_t *hwif = drive->hwif;
- struct via82cxxx_dev *vdev = hwif->host->host_priv;
-
- outb(select, hwif->io_ports.device_addr);
- vdev->cached_device[hwif->channel] = select;
-}
-
-static void via_dev_select(ide_drive_t *drive)
-{
- __via_dev_select(drive, drive->select | ATA_DEVICE_OBS);
-}
-
-static void via_tf_load(ide_drive_t *drive, struct ide_taskfile *tf, u8 valid)
-{
- ide_hwif_t *hwif = drive->hwif;
- struct ide_io_ports *io_ports = &hwif->io_ports;
-
- if (valid & IDE_VALID_FEATURE)
- outb(tf->feature, io_ports->feature_addr);
- if (valid & IDE_VALID_NSECT)
- outb(tf->nsect, io_ports->nsect_addr);
- if (valid & IDE_VALID_LBAL)
- outb(tf->lbal, io_ports->lbal_addr);
- if (valid & IDE_VALID_LBAM)
- outb(tf->lbam, io_ports->lbam_addr);
- if (valid & IDE_VALID_LBAH)
- outb(tf->lbah, io_ports->lbah_addr);
- if (valid & IDE_VALID_DEVICE)
- __via_dev_select(drive, tf->device);
-}
-
-const struct ide_tp_ops via_tp_ops = {
- .exec_command = ide_exec_command,
- .read_status = ide_read_status,
- .read_altstatus = ide_read_altstatus,
- .write_devctl = via_write_devctl,
-
- .dev_select = via_dev_select,
- .tf_load = via_tf_load,
- .tf_read = ide_tf_read,
-
- .input_data = ide_input_data,
- .output_data = ide_output_data,
-};
-
static const struct ide_port_info via82cxxx_chipset __devinitdata = {
.name = DRV_NAME,
.init_chipset = init_chipset_via82cxxx,
.enablebits = { { 0x40, 0x02, 0x02 }, { 0x40, 0x01, 0x01 } },
- .tp_ops = &via_tp_ops,
.port_ops = &via_port_ops,
.host_flags = IDE_HFLAG_PIO_NO_BLACKLIST |
IDE_HFLAG_POST_SET_MODE |
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 1558bb7fc74..f901957abc8 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -461,6 +461,7 @@ alloc_group_attrs(ssize_t (*show)(struct ib_port *,
element->attr.attr.mode = S_IRUGO;
element->attr.show = show;
element->index = i;
+ sysfs_attr_init(&element->attr.attr);
tab_attr[i] = &element->attr.attr;
}
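
sysfs_attr_init() is required for any attribute that lives in dynamically allocated memory: it hands the attribute a static lockdep class key, without which lockdep warns when the attribute is registered. Minimal sketch with hypothetical names:

    struct dyn_attr {
            struct attribute attr;
    };

    static struct attribute *make_attr(const char *name)
    {
            struct dyn_attr *d = kzalloc(sizeof(*d), GFP_KERNEL);

            if (!d)
                    return NULL;
            sysfs_attr_init(&d->attr);  /* static key for lockdep */
            d->attr.name = name;
            d->attr.mode = S_IRUGO;
            return &d->attr;
    }
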
diff --git a/drivers/isdn/hisax/avma1_cs.c b/drivers/isdn/hisax/avma1_cs.c
index e5deb15cf40..8d1d63a02b3 100644
--- a/drivers/isdn/hisax/avma1_cs.c
+++ b/drivers/isdn/hisax/avma1_cs.c
@@ -50,7 +50,7 @@ module_param(isdnprot, int, 0);
handler.
*/
-static int avma1cs_config(struct pcmcia_device *link);
+static int avma1cs_config(struct pcmcia_device *link) __devinit ;
static void avma1cs_release(struct pcmcia_device *link);
/*
@@ -59,7 +59,7 @@ static void avma1cs_release(struct pcmcia_device *link);
needed to manage one actual PCMCIA card.
*/
-static void avma1cs_detach(struct pcmcia_device *p_dev);
+static void avma1cs_detach(struct pcmcia_device *p_dev) __devexit ;
/*
@@ -99,7 +99,7 @@ typedef struct local_info_t {
======================================================================*/
-static int avma1cs_probe(struct pcmcia_device *p_dev)
+static int __devinit avma1cs_probe(struct pcmcia_device *p_dev)
{
local_info_t *local;
@@ -140,7 +140,7 @@ static int avma1cs_probe(struct pcmcia_device *p_dev)
======================================================================*/
-static void avma1cs_detach(struct pcmcia_device *link)
+static void __devexit avma1cs_detach(struct pcmcia_device *link)
{
dev_dbg(&link->dev, "avma1cs_detach(0x%p)\n", link);
avma1cs_release(link);
@@ -174,7 +174,7 @@ static int avma1cs_configcheck(struct pcmcia_device *p_dev,
}
-static int avma1cs_config(struct pcmcia_device *link)
+static int __devinit avma1cs_config(struct pcmcia_device *link)
{
local_info_t *dev;
int i;
@@ -282,7 +282,7 @@ static struct pcmcia_driver avma1cs_driver = {
.name = "avma1_cs",
},
.probe = avma1cs_probe,
- .remove = avma1cs_detach,
+ .remove = __devexit_p(avma1cs_detach),
.id_table = avma1cs_ids,
};
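
The same annotation recipe repeats in the elsa_cs, sedlbauer_cs and teles_cs diffs below: probe and config paths gain __devinit, detach gains __devexit, and the .remove pointer is wrapped in __devexit_p() so that the reference compiles to NULL when the __devexit section is discarded (kernels built without hotplug support). In outline, with hypothetical names:

    static int __devinit foo_probe(struct pcmcia_device *link);
    static void __devexit foo_detach(struct pcmcia_device *link);

    static struct pcmcia_driver foo_driver = {
            .probe  = foo_probe,
            /* Evaluates to &foo_detach, or to NULL when __devexit
             * code is dropped at build time. */
            .remove = __devexit_p(foo_detach),
    };
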
diff --git a/drivers/isdn/hisax/elsa_cs.c b/drivers/isdn/hisax/elsa_cs.c
index c9a30b1c923..c9f2279e21f 100644
--- a/drivers/isdn/hisax/elsa_cs.c
+++ b/drivers/isdn/hisax/elsa_cs.c
@@ -76,7 +76,7 @@ module_param(protocol, int, 0);
handler.
*/
-static int elsa_cs_config(struct pcmcia_device *link);
+static int elsa_cs_config(struct pcmcia_device *link) __devinit ;
static void elsa_cs_release(struct pcmcia_device *link);
/*
@@ -85,7 +85,7 @@ static void elsa_cs_release(struct pcmcia_device *link);
needed to manage one actual PCMCIA card.
*/
-static void elsa_cs_detach(struct pcmcia_device *p_dev);
+static void elsa_cs_detach(struct pcmcia_device *p_dev) __devexit;
/*
A driver needs to provide a dev_node_t structure for each device
@@ -121,7 +121,7 @@ typedef struct local_info_t {
======================================================================*/
-static int elsa_cs_probe(struct pcmcia_device *link)
+static int __devinit elsa_cs_probe(struct pcmcia_device *link)
{
local_info_t *local;
@@ -166,7 +166,7 @@ static int elsa_cs_probe(struct pcmcia_device *link)
======================================================================*/
-static void elsa_cs_detach(struct pcmcia_device *link)
+static void __devexit elsa_cs_detach(struct pcmcia_device *link)
{
local_info_t *info = link->priv;
@@ -210,7 +210,7 @@ static int elsa_cs_configcheck(struct pcmcia_device *p_dev,
return -ENODEV;
}
-static int elsa_cs_config(struct pcmcia_device *link)
+static int __devinit elsa_cs_config(struct pcmcia_device *link)
{
local_info_t *dev;
int i;
@@ -327,7 +327,7 @@ static struct pcmcia_driver elsa_cs_driver = {
.name = "elsa_cs",
},
.probe = elsa_cs_probe,
- .remove = elsa_cs_detach,
+ .remove = __devexit_p(elsa_cs_detach),
.id_table = elsa_ids,
.suspend = elsa_suspend,
.resume = elsa_resume,
diff --git a/drivers/isdn/hisax/sedlbauer_cs.c b/drivers/isdn/hisax/sedlbauer_cs.c
index 7836ec3c7f8..71b3ddef03b 100644
--- a/drivers/isdn/hisax/sedlbauer_cs.c
+++ b/drivers/isdn/hisax/sedlbauer_cs.c
@@ -76,7 +76,7 @@ module_param(protocol, int, 0);
event handler.
*/
-static int sedlbauer_config(struct pcmcia_device *link);
+static int sedlbauer_config(struct pcmcia_device *link) __devinit ;
static void sedlbauer_release(struct pcmcia_device *link);
/*
@@ -85,7 +85,7 @@ static void sedlbauer_release(struct pcmcia_device *link);
needed to manage one actual PCMCIA card.
*/
-static void sedlbauer_detach(struct pcmcia_device *p_dev);
+static void sedlbauer_detach(struct pcmcia_device *p_dev) __devexit;
/*
You'll also need to prototype all the functions that will actually
@@ -129,7 +129,7 @@ typedef struct local_info_t {
======================================================================*/
-static int sedlbauer_probe(struct pcmcia_device *link)
+static int __devinit sedlbauer_probe(struct pcmcia_device *link)
{
local_info_t *local;
@@ -177,7 +177,7 @@ static int sedlbauer_probe(struct pcmcia_device *link)
======================================================================*/
-static void sedlbauer_detach(struct pcmcia_device *link)
+static void __devexit sedlbauer_detach(struct pcmcia_device *link)
{
dev_dbg(&link->dev, "sedlbauer_detach(0x%p)\n", link);
@@ -283,7 +283,7 @@ static int sedlbauer_config_check(struct pcmcia_device *p_dev,
-static int sedlbauer_config(struct pcmcia_device *link)
+static int __devinit sedlbauer_config(struct pcmcia_device *link)
{
local_info_t *dev = link->priv;
win_req_t *req;
@@ -441,7 +441,7 @@ static struct pcmcia_driver sedlbauer_driver = {
.name = "sedlbauer_cs",
},
.probe = sedlbauer_probe,
- .remove = sedlbauer_detach,
+ .remove = __devexit_p(sedlbauer_detach),
.id_table = sedlbauer_ids,
.suspend = sedlbauer_suspend,
.resume = sedlbauer_resume,
diff --git a/drivers/isdn/hisax/teles_cs.c b/drivers/isdn/hisax/teles_cs.c
index b0c5976cbdb..d010a0da8e1 100644
--- a/drivers/isdn/hisax/teles_cs.c
+++ b/drivers/isdn/hisax/teles_cs.c
@@ -57,7 +57,7 @@ module_param(protocol, int, 0);
handler.
*/
-static int teles_cs_config(struct pcmcia_device *link);
+static int teles_cs_config(struct pcmcia_device *link) __devinit ;
static void teles_cs_release(struct pcmcia_device *link);
/*
@@ -66,7 +66,7 @@ static void teles_cs_release(struct pcmcia_device *link);
needed to manage one actual PCMCIA card.
*/
-static void teles_detach(struct pcmcia_device *p_dev);
+static void teles_detach(struct pcmcia_device *p_dev) __devexit ;
/*
A linked list of "instances" of the teles_cs device. Each actual
@@ -112,7 +112,7 @@ typedef struct local_info_t {
======================================================================*/
-static int teles_probe(struct pcmcia_device *link)
+static int __devinit teles_probe(struct pcmcia_device *link)
{
local_info_t *local;
@@ -156,7 +156,7 @@ static int teles_probe(struct pcmcia_device *link)
======================================================================*/
-static void teles_detach(struct pcmcia_device *link)
+static void __devexit teles_detach(struct pcmcia_device *link)
{
local_info_t *info = link->priv;
@@ -200,7 +200,7 @@ static int teles_cs_configcheck(struct pcmcia_device *p_dev,
return -ENODEV;
}
-static int teles_cs_config(struct pcmcia_device *link)
+static int __devinit teles_cs_config(struct pcmcia_device *link)
{
local_info_t *dev;
int i;
@@ -319,7 +319,7 @@ static struct pcmcia_driver teles_cs_driver = {
.name = "teles_cs",
},
.probe = teles_probe,
- .remove = teles_detach,
+ .remove = __devexit_p(teles_detach),
.id_table = teles_ids,
.suspend = teles_suspend,
.resume = teles_resume,
diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c
index b5346b4db91..b7a85f46a6c 100644
--- a/drivers/misc/c2port/core.c
+++ b/drivers/misc/c2port/core.c
@@ -912,8 +912,8 @@ struct c2port_device *c2port_device_register(char *name,
c2dev->dev = device_create(c2port_class, NULL, 0, c2dev,
"c2port%d", id);
- if (unlikely(!c2dev->dev)) {
- ret = -ENOMEM;
+ if (unlikely(IS_ERR(c2dev->dev))) {
+ ret = PTR_ERR(c2dev->dev);
goto error_device_create;
}
dev_set_drvdata(c2dev->dev, c2dev);
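
The fix matches device_create()'s actual contract: on failure it returns an ERR_PTR()-encoded pointer, never NULL, so the old !c2dev->dev test could not fire and the hard-coded -ENOMEM hid the real errno. The canonical check, with placeholder arguments:

    dev = device_create(class, NULL, devt, drvdata, "name%d", id);
    if (IS_ERR(dev))
            return PTR_ERR(dev);  /* e.g. -ENOMEM or -EEXIST */
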
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 0eac6c81490..e041c003db2 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -225,7 +225,7 @@ static int mmc_read_ext_csd(struct mmc_card *card)
mmc_card_set_blockaddr(card);
}
- switch (ext_csd[EXT_CSD_CARD_TYPE]) {
+ switch (ext_csd[EXT_CSD_CARD_TYPE] & EXT_CSD_CARD_TYPE_MASK) {
case EXT_CSD_CARD_TYPE_52 | EXT_CSD_CARD_TYPE_26:
card->ext_csd.hs_max_dtr = 52000000;
break;
@@ -237,7 +237,6 @@ static int mmc_read_ext_csd(struct mmc_card *card)
printk(KERN_WARNING "%s: card is mmc v4 but doesn't "
"support any high-speed modes.\n",
mmc_hostname(card->host));
- goto out;
}
if (card->ext_csd.rev >= 3) {
diff --git a/drivers/mtd/maps/omap_nor.c b/drivers/mtd/maps/omap_nor.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/drivers/mtd/maps/omap_nor.c
+++ /dev/null
diff --git a/drivers/net/arm/ks8695net.c b/drivers/net/arm/ks8695net.c
index a1d4188c430..e7810b74f39 100644
--- a/drivers/net/arm/ks8695net.c
+++ b/drivers/net/arm/ks8695net.c
@@ -449,11 +449,10 @@ ks8695_rx_irq(int irq, void *dev_id)
}
/**
- * ks8695_rx - Receive packets called by NAPI poll method
+ * ks8695_rx - Receive packets called by NAPI poll method
* @ksp: Private data for the KS8695 Ethernet
- * @budget: The max packets would be receive
+ * @budget: Number of packets allowed to process
*/
-
static int ks8695_rx(struct ks8695_priv *ksp, int budget)
{
struct net_device *ndev = ksp->ndev;
@@ -461,7 +460,6 @@ static int ks8695_rx(struct ks8695_priv *ksp, int budget)
int buff_n;
u32 flags;
int pktlen;
- int last_rx_processed = -1;
int received = 0;
buff_n = ksp->next_rx_desc_read;
@@ -471,6 +469,7 @@ static int ks8695_rx(struct ks8695_priv *ksp, int budget)
cpu_to_le32(RDES_OWN)))) {
rmb();
flags = le32_to_cpu(ksp->rx_ring[buff_n].status);
+
/* Found an SKB which we own, this means we
* received a packet
*/
@@ -533,23 +532,18 @@ rx_failure:
ksp->rx_ring[buff_n].status = cpu_to_le32(RDES_OWN);
rx_finished:
received++;
- /* And note this as processed so we can start
- * from here next time
- */
- last_rx_processed = buff_n;
buff_n = (buff_n + 1) & MAX_RX_DESC_MASK;
- /*And note which RX descriptor we last did */
- if (likely(last_rx_processed != -1))
- ksp->next_rx_desc_read =
- (last_rx_processed + 1) &
- MAX_RX_DESC_MASK;
}
+
+ /* And note which RX descriptor we last did */
+ ksp->next_rx_desc_read = buff_n;
+
/* And refill the buffers */
ks8695_refill_rxbuffers(ksp);
- /* Kick the RX DMA engine, in case it became
- * suspended */
+ /* Kick the RX DMA engine, in case it became suspended */
ks8695_writereg(ksp, KS8695_DRSC, 0);
+
return received;
}
diff --git a/drivers/net/atlx/atl1.c b/drivers/net/atlx/atl1.c
index 9ba547069db..0ebd8208f60 100644
--- a/drivers/net/atlx/atl1.c
+++ b/drivers/net/atlx/atl1.c
@@ -84,7 +84,7 @@
#define ATLX_DRIVER_VERSION "2.1.3"
MODULE_AUTHOR("Xiong Huang <xiong.huang@atheros.com>, \
- Chris Snook <csnook@redhat.com>, Jay Cliburn <jcliburn@gmail.com>");
+Chris Snook <csnook@redhat.com>, Jay Cliburn <jcliburn@gmail.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION(ATLX_DRIVER_VERSION);
diff --git a/drivers/net/benet/be_ethtool.c b/drivers/net/benet/be_ethtool.c
index 9560d48944a..51e1065e789 100644
--- a/drivers/net/benet/be_ethtool.c
+++ b/drivers/net/benet/be_ethtool.c
@@ -490,7 +490,7 @@ be_test_ddr_dma(struct be_adapter *adapter)
{
int ret, i;
struct be_dma_mem ddrdma_cmd;
- u64 pattern[2] = {0x5a5a5a5a5a5a5a5a, 0xa5a5a5a5a5a5a5a5};
+ u64 pattern[2] = {0x5a5a5a5a5a5a5a5aULL, 0xa5a5a5a5a5a5a5a5ULL};
ddrdma_cmd.size = sizeof(struct be_cmd_req_ddrdma_test);
ddrdma_cmd.va = pci_alloc_consistent(adapter->pdev, ddrdma_cmd.size,
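
The ULL suffixes address a 32-bit build issue: under the kernel's gnu89 dialect an unsuffixed constant wider than unsigned long draws a "constant is too large for 'long'" warning, while the explicit suffix pins the type. Illustration:

    u64 ok = 0x5a5a5a5a5a5a5a5aULL;  /* explicitly unsigned long long */
    u64 iffy = 0x5a5a5a5a5a5a5a5a;   /* warns on 32-bit builds */
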
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 381887ba677..a257babd1bb 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -246,6 +246,8 @@ static const struct flash_spec flash_5709 = {
MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl);
+static void bnx2_init_napi(struct bnx2 *bp);
+
static inline u32 bnx2_tx_avail(struct bnx2 *bp, struct bnx2_tx_ring_info *txr)
{
u32 diff;
@@ -6197,6 +6199,7 @@ bnx2_open(struct net_device *dev)
bnx2_disable_int(bp);
bnx2_setup_int_mode(bp, disable_msi);
+ bnx2_init_napi(bp);
bnx2_napi_enable(bp);
rc = bnx2_alloc_mem(bp);
if (rc)
@@ -7643,9 +7646,11 @@ poll_bnx2(struct net_device *dev)
int i;
for (i = 0; i < bp->irq_nvecs; i++) {
- disable_irq(bp->irq_tbl[i].vector);
- bnx2_interrupt(bp->irq_tbl[i].vector, &bp->bnx2_napi[i]);
- enable_irq(bp->irq_tbl[i].vector);
+ struct bnx2_irq *irq = &bp->irq_tbl[i];
+
+ disable_irq(irq->vector);
+ irq->handler(irq->vector, &bp->bnx2_napi[i]);
+ enable_irq(irq->vector);
}
}
#endif
@@ -8207,7 +8212,7 @@ bnx2_init_napi(struct bnx2 *bp)
{
int i;
- for (i = 0; i < BNX2_MAX_MSIX_VEC; i++) {
+ for (i = 0; i < bp->irq_nvecs; i++) {
struct bnx2_napi *bnapi = &bp->bnx2_napi[i];
int (*poll)(struct napi_struct *, int);
@@ -8276,7 +8281,6 @@ bnx2_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
dev->ethtool_ops = &bnx2_ethtool_ops;
bp = netdev_priv(dev);
- bnx2_init_napi(bp);
pci_set_drvdata(pdev, dev);
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 430c02267d7..5b92fbff431 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1235,6 +1235,11 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
write_lock_bh(&bond->curr_slave_lock);
}
}
+
+ /* resend IGMP joins since all were sent on curr_active_slave */
+ if (bond->params.mode == BOND_MODE_ROUNDROBIN) {
+ bond_resend_igmp_join_requests(bond);
+ }
}
/**
@@ -4138,22 +4143,41 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev
struct bonding *bond = netdev_priv(bond_dev);
struct slave *slave, *start_at;
int i, slave_no, res = 1;
+ struct iphdr *iph = ip_hdr(skb);
read_lock(&bond->lock);
if (!BOND_IS_OK(bond))
goto out;
-
/*
- * Concurrent TX may collide on rr_tx_counter; we accept that
- * as being rare enough not to justify using an atomic op here
+ * Start with the curr_active_slave that joined the bond as the
+ * default for sending IGMP traffic. For failover purposes one
+ * needs to maintain some consistency for the interface that will
+ * send the join/membership reports. The curr_active_slave found
+ * will send all of this type of traffic.
*/
- slave_no = bond->rr_tx_counter++ % bond->slave_cnt;
+ if ((skb->protocol == htons(ETH_P_IP)) &&
+ (iph->protocol == IPPROTO_IGMP)) {
- bond_for_each_slave(bond, slave, i) {
- slave_no--;
- if (slave_no < 0)
- break;
+ read_lock(&bond->curr_slave_lock);
+ slave = bond->curr_active_slave;
+ read_unlock(&bond->curr_slave_lock);
+
+ if (!slave)
+ goto out;
+ } else {
+ /*
+ * Concurrent TX may collide on rr_tx_counter; we accept
+ * that as being rare enough not to justify using an
+ * atomic op here.
+ */
+ slave_no = bond->rr_tx_counter++ % bond->slave_cnt;
+
+ bond_for_each_slave(bond, slave, i) {
+ slave_no--;
+ if (slave_no < 0)
+ break;
+ }
}
start_at = slave;
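
Note the field widths in the classification test above: skb->protocol is a big-endian __be16, so it is compared against htons(ETH_P_IP), while iph->protocol is a single byte and is compared directly against IPPROTO_IGMP with no byte-swapping. As a hypothetical helper (not part of the patch):

    static bool bond_skb_is_igmp(const struct sk_buff *skb)
    {
            const struct iphdr *iph = ip_hdr(skb);

            return skb->protocol == htons(ETH_P_IP) &&
                   iph->protocol == IPPROTO_IGMP;
    }
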
diff --git a/drivers/net/can/bfin_can.c b/drivers/net/can/bfin_can.c
index 866905fa411..03489864376 100644
--- a/drivers/net/can/bfin_can.c
+++ b/drivers/net/can/bfin_can.c
@@ -22,6 +22,7 @@
#include <linux/can/dev.h>
#include <linux/can/error.h>
+#include <asm/bfin_can.h>
#include <asm/portmux.h>
#define DRV_NAME "bfin_can"
@@ -29,90 +30,6 @@
#define TX_ECHO_SKB_MAX 1
/*
- * transmit and receive channels
- */
-#define TRANSMIT_CHL 24
-#define RECEIVE_STD_CHL 0
-#define RECEIVE_EXT_CHL 4
-#define RECEIVE_RTR_CHL 8
-#define RECEIVE_EXT_RTR_CHL 12
-#define MAX_CHL_NUMBER 32
-
-/*
- * bfin can registers layout
- */
-struct bfin_can_mask_regs {
- u16 aml;
- u16 dummy1;
- u16 amh;
- u16 dummy2;
-};
-
-struct bfin_can_channel_regs {
- u16 data[8];
- u16 dlc;
- u16 dummy1;
- u16 tsv;
- u16 dummy2;
- u16 id0;
- u16 dummy3;
- u16 id1;
- u16 dummy4;
-};
-
-struct bfin_can_regs {
- /*
- * global control and status registers
- */
- u16 mc1; /* offset 0 */
- u16 dummy1;
- u16 md1; /* offset 4 */
- u16 rsv1[13];
- u16 mbtif1; /* offset 0x20 */
- u16 dummy2;
- u16 mbrif1; /* offset 0x24 */
- u16 dummy3;
- u16 mbim1; /* offset 0x28 */
- u16 rsv2[11];
- u16 mc2; /* offset 0x40 */
- u16 dummy4;
- u16 md2; /* offset 0x44 */
- u16 dummy5;
- u16 trs2; /* offset 0x48 */
- u16 rsv3[11];
- u16 mbtif2; /* offset 0x60 */
- u16 dummy6;
- u16 mbrif2; /* offset 0x64 */
- u16 dummy7;
- u16 mbim2; /* offset 0x68 */
- u16 rsv4[11];
- u16 clk; /* offset 0x80 */
- u16 dummy8;
- u16 timing; /* offset 0x84 */
- u16 rsv5[3];
- u16 status; /* offset 0x8c */
- u16 dummy9;
- u16 cec; /* offset 0x90 */
- u16 dummy10;
- u16 gis; /* offset 0x94 */
- u16 dummy11;
- u16 gim; /* offset 0x98 */
- u16 rsv6[3];
- u16 ctrl; /* offset 0xa0 */
- u16 dummy12;
- u16 intr; /* offset 0xa4 */
- u16 rsv7[7];
- u16 esr; /* offset 0xb4 */
- u16 rsv8[37];
-
- /*
- * channel(mailbox) mask and message registers
- */
- struct bfin_can_mask_regs msk[MAX_CHL_NUMBER]; /* offset 0x100 */
- struct bfin_can_channel_regs chl[MAX_CHL_NUMBER]; /* offset 0x200 */
-};
-
-/*
* bfin can private data
*/
struct bfin_can_priv {
@@ -163,7 +80,7 @@ static int bfin_can_set_bittiming(struct net_device *dev)
if (priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES)
timing |= SAM;
- bfin_write16(&reg->clk, clk);
+ bfin_write16(&reg->clock, clk);
bfin_write16(&reg->timing, timing);
dev_info(dev->dev.parent, "setting CLOCK=0x%04x TIMING=0x%04x\n",
@@ -185,11 +102,11 @@ static void bfin_can_set_reset_mode(struct net_device *dev)
bfin_write16(&reg->gim, 0);
/* reset can and enter configuration mode */
- bfin_write16(&reg->ctrl, SRS | CCR);
+ bfin_write16(&reg->control, SRS | CCR);
SSYNC();
- bfin_write16(&reg->ctrl, CCR);
+ bfin_write16(&reg->control, CCR);
SSYNC();
- while (!(bfin_read16(&reg->ctrl) & CCA)) {
+ while (!(bfin_read16(&reg->control) & CCA)) {
udelay(10);
if (--timeout == 0) {
dev_err(dev->dev.parent,
@@ -244,7 +161,7 @@ static void bfin_can_set_normal_mode(struct net_device *dev)
/*
* leave configuration mode
*/
- bfin_write16(&reg->ctrl, bfin_read16(&reg->ctrl) & ~CCR);
+ bfin_write16(&reg->control, bfin_read16(&reg->control) & ~CCR);
while (bfin_read16(&reg->status) & CCA) {
udelay(10);
@@ -726,7 +643,7 @@ static int bfin_can_suspend(struct platform_device *pdev, pm_message_t mesg)
if (netif_running(dev)) {
/* enter sleep mode */
- bfin_write16(&reg->ctrl, bfin_read16(&reg->ctrl) | SMR);
+ bfin_write16(&reg->control, bfin_read16(&reg->control) | SMR);
SSYNC();
while (!(bfin_read16(&reg->intr) & SMACK)) {
udelay(10);
diff --git a/drivers/net/e1000/e1000.h b/drivers/net/e1000/e1000.h
index 9902b33b716..2f29c213185 100644
--- a/drivers/net/e1000/e1000.h
+++ b/drivers/net/e1000/e1000.h
@@ -261,7 +261,6 @@ struct e1000_adapter {
/* TX */
struct e1000_tx_ring *tx_ring; /* One per active queue */
unsigned int restart_queue;
- unsigned long tx_queue_len;
u32 txd_cmd;
u32 tx_int_delay;
u32 tx_abs_int_delay;
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 8be6faee43e..b15ece26ed8 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -383,8 +383,6 @@ static void e1000_configure(struct e1000_adapter *adapter)
adapter->alloc_rx_buf(adapter, ring,
E1000_DESC_UNUSED(ring));
}
-
- adapter->tx_queue_len = netdev->tx_queue_len;
}
int e1000_up(struct e1000_adapter *adapter)
@@ -503,7 +501,6 @@ void e1000_down(struct e1000_adapter *adapter)
del_timer_sync(&adapter->watchdog_timer);
del_timer_sync(&adapter->phy_info_timer);
- netdev->tx_queue_len = adapter->tx_queue_len;
adapter->link_speed = 0;
adapter->link_duplex = 0;
netif_carrier_off(netdev);
@@ -2316,19 +2313,15 @@ static void e1000_watchdog(unsigned long data)
E1000_CTRL_RFCE) ? "RX" : ((ctrl &
E1000_CTRL_TFCE) ? "TX" : "None" )));
- /* tweak tx_queue_len according to speed/duplex
- * and adjust the timeout factor */
- netdev->tx_queue_len = adapter->tx_queue_len;
+ /* adjust timeout factor according to speed/duplex */
adapter->tx_timeout_factor = 1;
switch (adapter->link_speed) {
case SPEED_10:
txb2b = false;
- netdev->tx_queue_len = 10;
adapter->tx_timeout_factor = 16;
break;
case SPEED_100:
txb2b = false;
- netdev->tx_queue_len = 100;
/* maybe add some timeout factor ? */
break;
}
diff --git a/drivers/net/e1000e/e1000.h b/drivers/net/e1000e/e1000.h
index c2ec095d216..118bdf48359 100644
--- a/drivers/net/e1000e/e1000.h
+++ b/drivers/net/e1000e/e1000.h
@@ -279,7 +279,6 @@ struct e1000_adapter {
struct napi_struct napi;
- unsigned long tx_queue_len;
unsigned int restart_queue;
u32 txd_cmd;
diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
index 88d54d3efce..e1cceb60657 100644
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@ -2289,8 +2289,6 @@ static void e1000_configure_tx(struct e1000_adapter *adapter)
ew32(TCTL, tctl);
e1000e_config_collision_dist(hw);
-
- adapter->tx_queue_len = adapter->netdev->tx_queue_len;
}
/**
@@ -2877,7 +2875,6 @@ void e1000e_down(struct e1000_adapter *adapter)
del_timer_sync(&adapter->watchdog_timer);
del_timer_sync(&adapter->phy_info_timer);
- netdev->tx_queue_len = adapter->tx_queue_len;
netif_carrier_off(netdev);
adapter->link_speed = 0;
adapter->link_duplex = 0;
@@ -3588,21 +3585,15 @@ static void e1000_watchdog_task(struct work_struct *work)
"link gets many collisions.\n");
}
- /*
- * tweak tx_queue_len according to speed/duplex
- * and adjust the timeout factor
- */
- netdev->tx_queue_len = adapter->tx_queue_len;
+ /* adjust timeout factor according to speed/duplex */
adapter->tx_timeout_factor = 1;
switch (adapter->link_speed) {
case SPEED_10:
txb2b = 0;
- netdev->tx_queue_len = 10;
adapter->tx_timeout_factor = 16;
break;
case SPEED_100:
txb2b = 0;
- netdev->tx_queue_len = 100;
adapter->tx_timeout_factor = 10;
break;
}
diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c
index b6715553cf1..669de028d44 100644
--- a/drivers/net/gianfar.c
+++ b/drivers/net/gianfar.c
@@ -2393,6 +2393,7 @@ struct sk_buff * gfar_new_skb(struct net_device *dev)
* as many bytes as needed to align the data properly
*/
skb_reserve(skb, alignamount);
+ GFAR_CB(skb)->alignamount = alignamount;
return skb;
}
@@ -2533,13 +2534,13 @@ int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue, int rx_work_limit)
newskb = skb;
else if (skb) {
/*
- * We need to reset ->data to what it
+ * We need to un-reserve() the skb to what it
* was before gfar_new_skb() re-aligned
* it to an RXBUF_ALIGNMENT boundary
* before we put the skb back on the
* recycle list.
*/
- skb->data = skb->head + NET_SKB_PAD;
+ skb_reserve(skb, -GFAR_CB(skb)->alignamount);
__skb_queue_head(&priv->rx_recycle, skb);
}
} else {
diff --git a/drivers/net/gianfar.h b/drivers/net/gianfar.h
index 3d72dc43dca..17d25e71423 100644
--- a/drivers/net/gianfar.h
+++ b/drivers/net/gianfar.h
@@ -566,6 +566,12 @@ struct rxfcb {
u16 vlctl; /* VLAN control word */
};
+struct gianfar_skb_cb {
+ int alignamount;
+};
+
+#define GFAR_CB(skb) ((struct gianfar_skb_cb *)((skb)->cb))
+
struct rmon_mib
{
u32 tr64; /* 0x.680 - Transmit and Receive 64-byte Frame Counter */
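
GFAR_CB() is the usual skb control-buffer idiom: skb->cb gives each layer 48 bytes of scratch space while it owns the skb, and the driver overlays a small struct on it to remember how far ->data was advanced. Sketch of the save/undo pair (helper names and the BUILD_BUG_ON guard are mine, not part of the patch):

    static void gfar_align_save(struct sk_buff *skb, int alignamount)
    {
            BUILD_BUG_ON(sizeof(struct gianfar_skb_cb) > sizeof(skb->cb));
            skb_reserve(skb, alignamount);            /* advance ->data */
            GFAR_CB(skb)->alignamount = alignamount;
    }

    static void gfar_align_undo(struct sk_buff *skb)
    {
            /* A negative reserve pulls ->data back before recycling. */
            skb_reserve(skb, -GFAR_CB(skb)->alignamount);
    }
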
diff --git a/drivers/net/igb/e1000_82575.c b/drivers/net/igb/e1000_82575.c
index 9d7fa2fb85e..0bc990ec4a8 100644
--- a/drivers/net/igb/e1000_82575.c
+++ b/drivers/net/igb/e1000_82575.c
@@ -94,6 +94,7 @@ static s32 igb_get_invariants_82575(struct e1000_hw *hw)
case E1000_DEV_ID_82576_FIBER:
case E1000_DEV_ID_82576_SERDES:
case E1000_DEV_ID_82576_QUAD_COPPER:
+ case E1000_DEV_ID_82576_QUAD_COPPER_ET2:
case E1000_DEV_ID_82576_SERDES_QUAD:
mac->type = e1000_82576;
break;
diff --git a/drivers/net/igb/e1000_hw.h b/drivers/net/igb/e1000_hw.h
index 448005276b2..82a533f5192 100644
--- a/drivers/net/igb/e1000_hw.h
+++ b/drivers/net/igb/e1000_hw.h
@@ -41,6 +41,7 @@ struct e1000_hw;
#define E1000_DEV_ID_82576_FIBER 0x10E6
#define E1000_DEV_ID_82576_SERDES 0x10E7
#define E1000_DEV_ID_82576_QUAD_COPPER 0x10E8
+#define E1000_DEV_ID_82576_QUAD_COPPER_ET2 0x1526
#define E1000_DEV_ID_82576_NS 0x150A
#define E1000_DEV_ID_82576_NS_SERDES 0x1518
#define E1000_DEV_ID_82576_SERDES_QUAD 0x150D
diff --git a/drivers/net/igb/e1000_mac.c b/drivers/net/igb/e1000_mac.c
index 2a8a886b37e..be8d010e402 100644
--- a/drivers/net/igb/e1000_mac.c
+++ b/drivers/net/igb/e1000_mac.c
@@ -1367,7 +1367,8 @@ out:
* igb_enable_mng_pass_thru - Enable processing of ARP's
* @hw: pointer to the HW structure
*
- * Verifies the hardware needs to allow ARPs to be processed by the host.
+ * Verifies the hardware needs to leave interface enabled so that frames can
+ * be directed to and from the management interface.
**/
bool igb_enable_mng_pass_thru(struct e1000_hw *hw)
{
@@ -1380,8 +1381,7 @@ bool igb_enable_mng_pass_thru(struct e1000_hw *hw)
manc = rd32(E1000_MANC);
- if (!(manc & E1000_MANC_RCV_TCO_EN) ||
- !(manc & E1000_MANC_EN_MAC_ADDR_FILTER))
+ if (!(manc & E1000_MANC_RCV_TCO_EN))
goto out;
if (hw->mac.arc_subsystem_valid) {
diff --git a/drivers/net/igb/igb.h b/drivers/net/igb/igb.h
index a1775705b24..3b772b822a5 100644
--- a/drivers/net/igb/igb.h
+++ b/drivers/net/igb/igb.h
@@ -267,7 +267,6 @@ struct igb_adapter {
/* TX */
struct igb_ring *tx_ring[16];
- unsigned long tx_queue_len;
u32 tx_timeout_count;
/* RX */
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 0ed25f059a0..01c65c7447e 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -72,6 +72,7 @@ static DEFINE_PCI_DEVICE_TABLE(igb_pci_tbl) = {
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_FIBER), board_82575 },
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_SERDES), board_82575 },
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_SERDES_QUAD), board_82575 },
+ { PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_QUAD_COPPER_ET2), board_82575 },
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_QUAD_COPPER), board_82575 },
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82575EB_COPPER), board_82575 },
{ PCI_VDEVICE(INTEL, E1000_DEV_ID_82575EB_FIBER_SERDES), board_82575 },
@@ -1104,9 +1105,6 @@ static void igb_configure(struct igb_adapter *adapter)
struct igb_ring *ring = adapter->rx_ring[i];
igb_alloc_rx_buffers_adv(ring, igb_desc_unused(ring));
}
-
-
- adapter->tx_queue_len = netdev->tx_queue_len;
}
/**
@@ -1212,7 +1210,6 @@ void igb_down(struct igb_adapter *adapter)
del_timer_sync(&adapter->watchdog_timer);
del_timer_sync(&adapter->phy_info_timer);
- netdev->tx_queue_len = adapter->tx_queue_len;
netif_carrier_off(netdev);
/* record the stats before reset*/
@@ -3105,17 +3102,13 @@ static void igb_watchdog_task(struct work_struct *work)
((ctrl & E1000_CTRL_RFCE) ? "RX" :
((ctrl & E1000_CTRL_TFCE) ? "TX" : "None")));
- /* tweak tx_queue_len according to speed/duplex and
- * adjust the timeout factor */
- netdev->tx_queue_len = adapter->tx_queue_len;
+ /* adjust timeout factor according to speed/duplex */
adapter->tx_timeout_factor = 1;
switch (adapter->link_speed) {
case SPEED_10:
- netdev->tx_queue_len = 10;
adapter->tx_timeout_factor = 14;
break;
case SPEED_100:
- netdev->tx_queue_len = 100;
/* maybe add some timeout factor ? */
break;
}
@@ -3962,7 +3955,7 @@ void igb_update_stats(struct igb_adapter *adapter)
struct net_device_stats *net_stats = igb_get_stats(adapter->netdev);
struct e1000_hw *hw = &adapter->hw;
struct pci_dev *pdev = adapter->pdev;
- u32 rnbc, reg;
+ u32 reg, mpc;
u16 phy_tmp;
int i;
u64 bytes, packets;
@@ -4020,7 +4013,9 @@ void igb_update_stats(struct igb_adapter *adapter)
adapter->stats.symerrs += rd32(E1000_SYMERRS);
adapter->stats.sec += rd32(E1000_SEC);
- adapter->stats.mpc += rd32(E1000_MPC);
+ mpc = rd32(E1000_MPC);
+ adapter->stats.mpc += mpc;
+ net_stats->rx_fifo_errors += mpc;
adapter->stats.scc += rd32(E1000_SCC);
adapter->stats.ecol += rd32(E1000_ECOL);
adapter->stats.mcc += rd32(E1000_MCC);
@@ -4035,9 +4030,7 @@ void igb_update_stats(struct igb_adapter *adapter)
adapter->stats.gptc += rd32(E1000_GPTC);
adapter->stats.gotc += rd32(E1000_GOTCL);
rd32(E1000_GOTCH); /* clear GOTCL */
- rnbc = rd32(E1000_RNBC);
- adapter->stats.rnbc += rnbc;
- net_stats->rx_fifo_errors += rnbc;
+ adapter->stats.rnbc += rd32(E1000_RNBC);
adapter->stats.ruc += rd32(E1000_RUC);
adapter->stats.rfc += rd32(E1000_RFC);
adapter->stats.rjc += rd32(E1000_RJC);
@@ -5109,7 +5102,7 @@ static void igb_receive_skb(struct igb_q_vector *q_vector,
{
struct igb_adapter *adapter = q_vector->adapter;
- if (vlan_tag)
+ if (vlan_tag && adapter->vlgrp)
vlan_gro_receive(&q_vector->napi, adapter->vlgrp,
vlan_tag, skb);
else
diff --git a/drivers/net/igbvf/igbvf.h b/drivers/net/igbvf/igbvf.h
index a1774b29d22..debeee2dc71 100644
--- a/drivers/net/igbvf/igbvf.h
+++ b/drivers/net/igbvf/igbvf.h
@@ -198,7 +198,6 @@ struct igbvf_adapter {
struct igbvf_ring *tx_ring /* One per active queue */
____cacheline_aligned_in_smp;
- unsigned long tx_queue_len;
unsigned int restart_queue;
u32 txd_cmd;
diff --git a/drivers/net/igbvf/netdev.c b/drivers/net/igbvf/netdev.c
index a77afd8a14b..b41037ed808 100644
--- a/drivers/net/igbvf/netdev.c
+++ b/drivers/net/igbvf/netdev.c
@@ -1304,8 +1304,6 @@ static void igbvf_configure_tx(struct igbvf_adapter *adapter)
/* enable Report Status bit */
adapter->txd_cmd |= E1000_ADVTXD_DCMD_RS;
-
- adapter->tx_queue_len = adapter->netdev->tx_queue_len;
}
/**
@@ -1524,7 +1522,6 @@ void igbvf_down(struct igbvf_adapter *adapter)
del_timer_sync(&adapter->watchdog_timer);
- netdev->tx_queue_len = adapter->tx_queue_len;
netif_carrier_off(netdev);
/* record the stats before reset*/
@@ -1857,21 +1854,15 @@ static void igbvf_watchdog_task(struct work_struct *work)
&adapter->link_duplex);
igbvf_print_link_info(adapter);
- /*
- * tweak tx_queue_len according to speed/duplex
- * and adjust the timeout factor
- */
- netdev->tx_queue_len = adapter->tx_queue_len;
+ /* adjust timeout factor according to speed/duplex */
adapter->tx_timeout_factor = 1;
switch (adapter->link_speed) {
case SPEED_10:
txb2b = 0;
- netdev->tx_queue_len = 10;
adapter->tx_timeout_factor = 16;
break;
case SPEED_100:
txb2b = 0;
- netdev->tx_queue_len = 100;
/* maybe add some timeout factor ? */
break;
}
diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 19e94ee155a..79c35ae3718 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -204,14 +204,17 @@ enum ixgbe_ring_f_enum {
#define IXGBE_MAX_FDIR_INDICES 64
#ifdef IXGBE_FCOE
#define IXGBE_MAX_FCOE_INDICES 8
+#define MAX_RX_QUEUES (IXGBE_MAX_FDIR_INDICES + IXGBE_MAX_FCOE_INDICES)
+#define MAX_TX_QUEUES (IXGBE_MAX_FDIR_INDICES + IXGBE_MAX_FCOE_INDICES)
+#else
+#define MAX_RX_QUEUES IXGBE_MAX_FDIR_INDICES
+#define MAX_TX_QUEUES IXGBE_MAX_FDIR_INDICES
#endif /* IXGBE_FCOE */
struct ixgbe_ring_feature {
int indices;
int mask;
} ____cacheline_internodealigned_in_smp;
-#define MAX_RX_QUEUES 128
-#define MAX_TX_QUEUES 128
#define MAX_RX_PACKET_BUFFERS ((adapter->flags & IXGBE_FLAG_DCB_ENABLED) \
? 8 : 1)
diff --git a/drivers/net/ixgbe/ixgbe_82599.c b/drivers/net/ixgbe/ixgbe_82599.c
index 1f30e163bd9..b405a00817c 100644
--- a/drivers/net/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ixgbe/ixgbe_82599.c
@@ -39,6 +39,7 @@
#define IXGBE_82599_MC_TBL_SIZE 128
#define IXGBE_82599_VFT_TBL_SIZE 128
+void ixgbe_flap_tx_laser_multispeed_fiber(struct ixgbe_hw *hw);
s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
ixgbe_link_speed speed,
bool autoneg,
@@ -68,7 +69,9 @@ static void ixgbe_init_mac_link_ops_82599(struct ixgbe_hw *hw)
if (hw->phy.multispeed_fiber) {
/* Set up dual speed SFP+ support */
mac->ops.setup_link = &ixgbe_setup_mac_link_multispeed_fiber;
+ mac->ops.flap_tx_laser = &ixgbe_flap_tx_laser_multispeed_fiber;
} else {
+ mac->ops.flap_tx_laser = NULL;
if ((mac->ops.get_media_type(hw) ==
ixgbe_media_type_backplane) &&
(hw->phy.smart_speed == ixgbe_smart_speed_auto ||
@@ -413,6 +416,41 @@ s32 ixgbe_start_mac_link_82599(struct ixgbe_hw *hw,
}
/**
+ * ixgbe_flap_tx_laser_multispeed_fiber - Flap Tx laser
+ * @hw: pointer to hardware structure
+ *
+ * When the driver changes the link speeds that it can support,
+ * it sets autotry_restart to true to indicate that we need to
+ * initiate a new autotry session with the link partner. To do
+ * so, we set the speed then disable and re-enable the tx laser, to
+ * alert the link partner that it also needs to restart autotry on its
+ * end. This is consistent with true clause 37 autoneg, which also
+ * involves a loss of signal.
+ **/
+void ixgbe_flap_tx_laser_multispeed_fiber(struct ixgbe_hw *hw)
+{
+ u32 esdp_reg = IXGBE_READ_REG(hw, IXGBE_ESDP);
+
+ hw_dbg(hw, "ixgbe_flap_tx_laser_multispeed_fiber\n");
+
+ if (hw->mac.autotry_restart) {
+ /* Disable tx laser; allow 100us to go dark per spec */
+ esdp_reg |= IXGBE_ESDP_SDP3;
+ IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
+ IXGBE_WRITE_FLUSH(hw);
+ udelay(100);
+
+ /* Enable tx laser; allow 100ms to light up */
+ esdp_reg &= ~IXGBE_ESDP_SDP3;
+ IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
+ IXGBE_WRITE_FLUSH(hw);
+ msleep(100);
+
+ hw->mac.autotry_restart = false;
+ }
+}
+
+/**
* ixgbe_setup_mac_link_multispeed_fiber - Set MAC link speed
* @hw: pointer to hardware structure
* @speed: new link speed
@@ -440,16 +478,6 @@ s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
speed &= phy_link_speed;
/*
- * When the driver changes the link speeds that it can support,
- * it sets autotry_restart to true to indicate that we need to
- * initiate a new autotry session with the link partner. To do
- * so, we set the speed then disable and re-enable the tx laser, to
- * alert the link partner that it also needs to restart autotry on its
- * end. This is consistent with true clause 37 autoneg, which also
- * involves a loss of signal.
- */
-
- /*
* Try each speed one by one, highest priority first. We do this in
* software because 10gb fiber doesn't support speed autonegotiation.
*/
@@ -466,6 +494,7 @@ s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
/* Set the module link speed */
esdp_reg |= (IXGBE_ESDP_SDP5_DIR | IXGBE_ESDP_SDP5);
IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
+ IXGBE_WRITE_FLUSH(hw);
/* Allow module to change analog characteristics (1G->10G) */
msleep(40);
@@ -478,19 +507,7 @@ s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
return status;
/* Flap the tx laser if it has not already been done */
- if (hw->mac.autotry_restart) {
- /* Disable tx laser; allow 100us to go dark per spec */
- esdp_reg |= IXGBE_ESDP_SDP3;
- IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
- udelay(100);
-
- /* Enable tx laser; allow 2ms to light up per spec */
- esdp_reg &= ~IXGBE_ESDP_SDP3;
- IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
- msleep(2);
-
- hw->mac.autotry_restart = false;
- }
+ hw->mac.ops.flap_tx_laser(hw);
/*
* Wait for the controller to acquire link. Per IEEE 802.3ap,
@@ -525,6 +542,7 @@ s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
esdp_reg &= ~IXGBE_ESDP_SDP5;
esdp_reg |= IXGBE_ESDP_SDP5_DIR;
IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
+ IXGBE_WRITE_FLUSH(hw);
/* Allow module to change analog characteristics (10G->1G) */
msleep(40);
@@ -537,19 +555,7 @@ s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
return status;
/* Flap the tx laser if it has not already been done */
- if (hw->mac.autotry_restart) {
- /* Disable tx laser; allow 100us to go dark per spec */
- esdp_reg |= IXGBE_ESDP_SDP3;
- IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
- udelay(100);
-
- /* Enable tx laser; allow 2ms to light up per spec */
- esdp_reg &= ~IXGBE_ESDP_SDP3;
- IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
- msleep(2);
-
- hw->mac.autotry_restart = false;
- }
+ hw->mac.ops.flap_tx_laser(hw);
/* Wait for the link partner to also set speed */
msleep(100);
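
With the laser-flap sequence factored out, both speed-selection paths above shrink to one call through the new op, and the op itself checks mac.autotry_restart, so invoking it when no restart is pending is a no-op. A guarded call site (the NULL check is my assumption for parts where init leaves flap_tx_laser NULL; the multispeed-fiber path above can call it unconditionally):

    if (hw->mac.ops.flap_tx_laser)
            hw->mac.ops.flap_tx_laser(hw);
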
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index 7949a446e4c..1959ef76c96 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -1853,6 +1853,26 @@ static void ixgbe_diag_test(struct net_device *netdev,
if (ixgbe_link_test(adapter, &data[4]))
eth_test->flags |= ETH_TEST_FL_FAILED;
+ if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) {
+ int i;
+ for (i = 0; i < adapter->num_vfs; i++) {
+ if (adapter->vfinfo[i].clear_to_send) {
+ netdev_warn(netdev, "%s",
+ "offline diagnostic is not "
+ "supported when VFs are "
+ "present\n");
+ data[0] = 1;
+ data[1] = 1;
+ data[2] = 1;
+ data[3] = 1;
+ eth_test->flags |= ETH_TEST_FL_FAILED;
+ clear_bit(__IXGBE_TESTING,
+ &adapter->state);
+ goto skip_ol_tests;
+ }
+ }
+ }
+
if (if_running)
/* indicate we're in test mode */
dev_close(netdev);
@@ -1908,6 +1928,7 @@ skip_loopback:
clear_bit(__IXGBE_TESTING, &adapter->state);
}
+skip_ol_tests:
msleep_interruptible(4 * 1000);
}
diff --git a/drivers/net/ixgbe/ixgbe_fcoe.c b/drivers/net/ixgbe/ixgbe_fcoe.c
index 4123dec0dfb..9276d5965b0 100644
--- a/drivers/net/ixgbe/ixgbe_fcoe.c
+++ b/drivers/net/ixgbe/ixgbe_fcoe.c
@@ -202,6 +202,15 @@ int ixgbe_fcoe_ddp_get(struct net_device *netdev, u16 xid,
addr = sg_dma_address(sg);
len = sg_dma_len(sg);
while (len) {
+ /* max number of buffers allowed in one DDP context */
+ if (j >= IXGBE_BUFFCNT_MAX) {
+ netif_err(adapter, drv, adapter->netdev,
+ "xid=%x:%d,%d,%d:addr=%llx "
+ "not enough descriptors\n",
+ xid, i, j, dmacount, (u64)addr);
+ goto out_noddp_free;
+ }
+
/* get the offset of length of current buffer */
thisoff = addr & ((dma_addr_t)bufflen - 1);
thislen = min((bufflen - thisoff), len);
@@ -227,20 +236,13 @@ int ixgbe_fcoe_ddp_get(struct net_device *netdev, u16 xid,
len -= thislen;
addr += thislen;
j++;
- /* max number of buffers allowed in one DDP context */
- if (j > IXGBE_BUFFCNT_MAX) {
- DPRINTK(DRV, ERR, "xid=%x:%d,%d,%d:addr=%llx "
- "not enough descriptors\n",
- xid, i, j, dmacount, (u64)addr);
- goto out_noddp_free;
- }
}
}
/* only the last buffer may have non-full bufflen */
lastsize = thisoff + thislen;
fcbuff = (IXGBE_FCBUFF_4KB << IXGBE_FCBUFF_BUFFSIZE_SHIFT);
- fcbuff |= (j << IXGBE_FCBUFF_BUFFCNT_SHIFT);
+ fcbuff |= ((j & 0xff) << IXGBE_FCBUFF_BUFFCNT_SHIFT);
fcbuff |= (firstoff << IXGBE_FCBUFF_OFFSET_SHIFT);
fcbuff |= (IXGBE_FCBUFF_VALID);
@@ -520,6 +522,9 @@ void ixgbe_configure_fcoe(struct ixgbe_adapter *adapter)
/* Enable L2 eth type filter for FCoE */
IXGBE_WRITE_REG(hw, IXGBE_ETQF(IXGBE_ETQF_FILTER_FCOE),
(ETH_P_FCOE | IXGBE_ETQF_FCOE | IXGBE_ETQF_FILTER_EN));
+ /* Enable L2 eth type filter for FIP */
+ IXGBE_WRITE_REG(hw, IXGBE_ETQF(IXGBE_ETQF_FILTER_FIP),
+ (ETH_P_FIP | IXGBE_ETQF_FILTER_EN));
if (adapter->ring_feature[RING_F_FCOE].indices) {
/* Use multiple rx queues for FCoE by redirection table */
for (i = 0; i < IXGBE_FCRETA_SIZE; i++) {
@@ -530,6 +535,12 @@ void ixgbe_configure_fcoe(struct ixgbe_adapter *adapter)
}
IXGBE_WRITE_REG(hw, IXGBE_FCRECTL, IXGBE_FCRECTL_ENA);
IXGBE_WRITE_REG(hw, IXGBE_ETQS(IXGBE_ETQF_FILTER_FCOE), 0);
+ fcoe_i = f->mask;
+ fcoe_i &= IXGBE_FCRETA_ENTRY_MASK;
+ fcoe_q = adapter->rx_ring[fcoe_i]->reg_idx;
+ IXGBE_WRITE_REG(hw, IXGBE_ETQS(IXGBE_ETQF_FILTER_FIP),
+ IXGBE_ETQS_QUEUE_EN |
+ (fcoe_q << IXGBE_ETQS_RX_QUEUE_SHIFT));
} else {
/* Use single rx queue for FCoE */
fcoe_i = f->mask;
@@ -539,6 +550,12 @@ void ixgbe_configure_fcoe(struct ixgbe_adapter *adapter)
IXGBE_ETQS_QUEUE_EN |
(fcoe_q << IXGBE_ETQS_RX_QUEUE_SHIFT));
}
+ /* send FIP frames to the first FCoE queue */
+ fcoe_i = f->mask;
+ fcoe_q = adapter->rx_ring[fcoe_i]->reg_idx;
+ IXGBE_WRITE_REG(hw, IXGBE_ETQS(IXGBE_ETQF_FILTER_FIP),
+ IXGBE_ETQS_QUEUE_EN |
+ (fcoe_q << IXGBE_ETQS_RX_QUEUE_SHIFT));
IXGBE_WRITE_REG(hw, IXGBE_FCRXCTRL,
IXGBE_FCRXCTRL_FCOELLI |
@@ -614,9 +631,9 @@ int ixgbe_fcoe_enable(struct net_device *netdev)
netdev->vlan_features |= NETIF_F_FSO;
netdev->vlan_features |= NETIF_F_FCOE_MTU;
netdev->fcoe_ddp_xid = IXGBE_FCOE_DDP_MAX - 1;
- netdev_features_change(netdev);
ixgbe_init_interrupt_scheme(adapter);
+ netdev_features_change(netdev);
if (netif_running(netdev))
netdev->netdev_ops->ndo_open(netdev);
@@ -660,11 +677,11 @@ int ixgbe_fcoe_disable(struct net_device *netdev)
netdev->vlan_features &= ~NETIF_F_FSO;
netdev->vlan_features &= ~NETIF_F_FCOE_MTU;
netdev->fcoe_ddp_xid = 0;
- netdev_features_change(netdev);
ixgbe_cleanup_fcoe(adapter);
-
ixgbe_init_interrupt_scheme(adapter);
+ netdev_features_change(netdev);
+
if (netif_running(netdev))
netdev->netdev_ops->ndo_open(netdev);
rc = 0;
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 684af371462..0c553f6cb53 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -935,10 +935,12 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
if (skb->prev)
skb = ixgbe_transform_rsc_queue(skb, &(rx_ring->rsc_count));
if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) {
- if (IXGBE_RSC_CB(skb)->dma)
+ if (IXGBE_RSC_CB(skb)->dma) {
pci_unmap_single(pdev, IXGBE_RSC_CB(skb)->dma,
rx_ring->rx_buf_len,
PCI_DMA_FROMDEVICE);
+ IXGBE_RSC_CB(skb)->dma = 0;
+ }
if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED)
rx_ring->rsc_count += skb_shinfo(skb)->nr_frags;
else
@@ -3054,6 +3056,14 @@ void ixgbe_reinit_locked(struct ixgbe_adapter *adapter)
while (test_and_set_bit(__IXGBE_RESETTING, &adapter->state))
msleep(1);
ixgbe_down(adapter);
+ /*
+ * If SR-IOV enabled then wait a bit before bringing the adapter
+ * back up to give the VFs time to respond to the reset. The
+ * two second wait is based upon the watchdog timer cycle in
+ * the VF driver.
+ */
+ if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED)
+ msleep(2000);
ixgbe_up(adapter);
clear_bit(__IXGBE_RESETTING, &adapter->state);
}
@@ -3126,10 +3136,12 @@ static void ixgbe_clean_rx_ring(struct ixgbe_adapter *adapter,
rx_buffer_info->skb = NULL;
do {
struct sk_buff *this = skb;
- if (IXGBE_RSC_CB(this)->dma)
+ if (IXGBE_RSC_CB(this)->dma) {
pci_unmap_single(pdev, IXGBE_RSC_CB(this)->dma,
rx_ring->rx_buf_len,
PCI_DMA_FROMDEVICE);
+ IXGBE_RSC_CB(this)->dma = 0;
+ }
skb = skb->prev;
dev_kfree_skb(this);
} while (skb);
@@ -3232,13 +3244,15 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
/* disable receive for all VFs and wait one second */
if (adapter->num_vfs) {
- for (i = 0 ; i < adapter->num_vfs; i++)
- adapter->vfinfo[i].clear_to_send = 0;
-
/* ping all the active vfs to let them know we are going down */
ixgbe_ping_all_vfs(adapter);
+
/* Disable all VFTE/VFRE TX/RX */
ixgbe_disable_tx_rx(adapter);
+
+ /* Mark all the VFs as inactive */
+ for (i = 0 ; i < adapter->num_vfs; i++)
+ adapter->vfinfo[i].clear_to_send = 0;
}
/* disable receives */
@@ -5018,6 +5032,7 @@ static void ixgbe_multispeed_fiber_task(struct work_struct *work)
autoneg = hw->phy.autoneg_advertised;
if ((!autoneg) && (hw->mac.ops.get_link_capabilities))
hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiation);
+ hw->mac.autotry_restart = false;
if (hw->mac.ops.setup_link)
hw->mac.ops.setup_link(hw, autoneg, negotiation, true);
adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
@@ -5633,7 +5648,8 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb)
#ifdef IXGBE_FCOE
if ((adapter->flags & IXGBE_FLAG_FCOE_ENABLED) &&
- (skb->protocol == htons(ETH_P_FCOE))) {
+ ((skb->protocol == htons(ETH_P_FCOE)) ||
+ (skb->protocol == htons(ETH_P_FIP)))) {
txq &= (adapter->ring_feature[RING_F_FCOE].indices - 1);
txq += adapter->ring_feature[RING_F_FCOE].mask;
return txq;
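The selection above folds the flow's queue index into the FCoE region: mask by the region size (a power of two), then add the region's base offset. A standalone sketch of that arithmetic, with the region geometry assumed for illustration:

#include <stdio.h>

/* Assumed ring-feature geometry: 8 FCoE queues starting at index 64. */
#define FCOE_INDICES    8u      /* region size, must be a power of two */
#define FCOE_BASE       64u     /* region offset (the "mask" field) */

static unsigned int fcoe_select_queue(unsigned int txq)
{
        txq &= (FCOE_INDICES - 1);      /* fold into the region size */
        txq += FCOE_BASE;               /* shift into the FCoE region */
        return txq;
}

int main(void)
{
        unsigned int q;

        for (q = 0; q < 4; q++)
                printf("queue_mapping %2u -> tx queue %u\n",
                       q * 5, fcoe_select_queue(q * 5));
        return 0;
}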
@@ -5680,18 +5696,25 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
tx_ring = adapter->tx_ring[skb->queue_mapping];
- if ((adapter->flags & IXGBE_FLAG_FCOE_ENABLED) &&
- (skb->protocol == htons(ETH_P_FCOE))) {
- tx_flags |= IXGBE_TX_FLAGS_FCOE;
#ifdef IXGBE_FCOE
+ if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) {
#ifdef CONFIG_IXGBE_DCB
- tx_flags &= ~(IXGBE_TX_FLAGS_VLAN_PRIO_MASK
- << IXGBE_TX_FLAGS_VLAN_SHIFT);
- tx_flags |= ((adapter->fcoe.up << 13)
- << IXGBE_TX_FLAGS_VLAN_SHIFT);
-#endif
+ /* for FCoE with DCB, we force the priority to what
+ * was specified by the switch */
+ if ((skb->protocol == htons(ETH_P_FCOE)) ||
+ (skb->protocol == htons(ETH_P_FIP))) {
+ tx_flags &= ~(IXGBE_TX_FLAGS_VLAN_PRIO_MASK
+ << IXGBE_TX_FLAGS_VLAN_SHIFT);
+ tx_flags |= ((adapter->fcoe.up << 13)
+ << IXGBE_TX_FLAGS_VLAN_SHIFT);
+ }
#endif
+ /* flag for FCoE offloads */
+ if (skb->protocol == htons(ETH_P_FCOE))
+ tx_flags |= IXGBE_TX_FLAGS_FCOE;
}
+#endif
+
/* four things can cause us to need a context descriptor */
if (skb_is_gso(skb) ||
(skb->ip_summed == CHECKSUM_PARTIAL) ||
@@ -6046,7 +6069,6 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
indices += min_t(unsigned int, num_possible_cpus(),
IXGBE_MAX_FCOE_INDICES);
#endif
- indices = min_t(unsigned int, indices, MAX_TX_QUEUES);
netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), indices);
if (!netdev) {
err = -ENOMEM;
@@ -6245,9 +6267,6 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
case IXGBE_DEV_ID_82599_KX4:
adapter->wol = (IXGBE_WUFC_MAG | IXGBE_WUFC_EX |
IXGBE_WUFC_MC | IXGBE_WUFC_BC);
- /* Enable ACPI wakeup in GRC */
- IXGBE_WRITE_REG(hw, IXGBE_GRC,
- (IXGBE_READ_REG(hw, IXGBE_GRC) & ~IXGBE_GRC_APME));
break;
default:
adapter->wol = 0;
@@ -6380,6 +6399,16 @@ static void __devexit ixgbe_remove(struct pci_dev *pdev)
del_timer_sync(&adapter->sfp_timer);
cancel_work_sync(&adapter->watchdog_task);
cancel_work_sync(&adapter->sfp_task);
+ if (adapter->hw.phy.multispeed_fiber) {
+ struct ixgbe_hw *hw = &adapter->hw;
+ /*
+ * Restart clause 37 autoneg, disable and re-enable
+ * the tx laser, to clear & alert the link partner
+ * that it needs to restart autotry
+ */
+ hw->mac.autotry_restart = true;
+ hw->mac.ops.flap_tx_laser(hw);
+ }
cancel_work_sync(&adapter->multispeed_fiber_task);
cancel_work_sync(&adapter->sfp_config_module_task);
if (adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE ||
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index 2be90746659..4ec6dc1a5b7 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -1298,6 +1298,7 @@
#define IXGBE_ETQF_FILTER_BCN 1
#define IXGBE_ETQF_FILTER_FCOE 2
#define IXGBE_ETQF_FILTER_1588 3
+#define IXGBE_ETQF_FILTER_FIP 4
/* VLAN Control Bit Masks */
#define IXGBE_VLNCTRL_VET 0x0000FFFF /* bits 0-15 */
#define IXGBE_VLNCTRL_CFI 0x10000000 /* bit 28 */
@@ -2397,6 +2398,7 @@ struct ixgbe_mac_operations {
s32 (*enable_rx_dma)(struct ixgbe_hw *, u32);
/* Link */
+ void (*flap_tx_laser)(struct ixgbe_hw *);
s32 (*setup_link)(struct ixgbe_hw *, ixgbe_link_speed, bool, bool);
s32 (*check_link)(struct ixgbe_hw *, ixgbe_link_speed *, bool *, bool);
s32 (*get_link_capabilities)(struct ixgbe_hw *, ixgbe_link_speed *,
diff --git a/drivers/net/ixgbevf/ethtool.c b/drivers/net/ixgbevf/ethtool.c
index 399be0c34c3..6fdd651abcd 100644
--- a/drivers/net/ixgbevf/ethtool.c
+++ b/drivers/net/ixgbevf/ethtool.c
@@ -46,22 +46,32 @@ struct ixgbe_stats {
int sizeof_stat;
int stat_offset;
int base_stat_offset;
+ int saved_reset_offset;
};
-#define IXGBEVF_STAT(m, b) sizeof(((struct ixgbevf_adapter *)0)->m), \
- offsetof(struct ixgbevf_adapter, m), \
- offsetof(struct ixgbevf_adapter, b)
+#define IXGBEVF_STAT(m, b, r) sizeof(((struct ixgbevf_adapter *)0)->m), \
+ offsetof(struct ixgbevf_adapter, m), \
+ offsetof(struct ixgbevf_adapter, b), \
+ offsetof(struct ixgbevf_adapter, r)
static struct ixgbe_stats ixgbe_gstrings_stats[] = {
- {"rx_packets", IXGBEVF_STAT(stats.vfgprc, stats.base_vfgprc)},
- {"tx_packets", IXGBEVF_STAT(stats.vfgptc, stats.base_vfgptc)},
- {"rx_bytes", IXGBEVF_STAT(stats.vfgorc, stats.base_vfgorc)},
- {"tx_bytes", IXGBEVF_STAT(stats.vfgotc, stats.base_vfgotc)},
- {"tx_busy", IXGBEVF_STAT(tx_busy, zero_base)},
- {"multicast", IXGBEVF_STAT(stats.vfmprc, stats.base_vfmprc)},
- {"rx_csum_offload_good", IXGBEVF_STAT(hw_csum_rx_good, zero_base)},
- {"rx_csum_offload_errors", IXGBEVF_STAT(hw_csum_rx_error, zero_base)},
- {"tx_csum_offload_ctxt", IXGBEVF_STAT(hw_csum_tx_good, zero_base)},
- {"rx_header_split", IXGBEVF_STAT(rx_hdr_split, zero_base)},
+ {"rx_packets", IXGBEVF_STAT(stats.vfgprc, stats.base_vfgprc,
+ stats.saved_reset_vfgprc)},
+ {"tx_packets", IXGBEVF_STAT(stats.vfgptc, stats.base_vfgptc,
+ stats.saved_reset_vfgptc)},
+ {"rx_bytes", IXGBEVF_STAT(stats.vfgorc, stats.base_vfgorc,
+ stats.saved_reset_vfgorc)},
+ {"tx_bytes", IXGBEVF_STAT(stats.vfgotc, stats.base_vfgotc,
+ stats.saved_reset_vfgotc)},
+ {"tx_busy", IXGBEVF_STAT(tx_busy, zero_base, zero_base)},
+ {"multicast", IXGBEVF_STAT(stats.vfmprc, stats.base_vfmprc,
+ stats.saved_reset_vfmprc)},
+ {"rx_csum_offload_good", IXGBEVF_STAT(hw_csum_rx_good, zero_base,
+ zero_base)},
+ {"rx_csum_offload_errors", IXGBEVF_STAT(hw_csum_rx_error, zero_base,
+ zero_base)},
+ {"tx_csum_offload_ctxt", IXGBEVF_STAT(hw_csum_tx_good, zero_base,
+ zero_base)},
+ {"rx_header_split", IXGBEVF_STAT(rx_hdr_split, zero_base, zero_base)},
};
#define IXGBE_QUEUE_STATS_LEN 0
@@ -455,10 +465,14 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev,
ixgbe_gstrings_stats[i].stat_offset;
char *b = (char *)adapter +
ixgbe_gstrings_stats[i].base_stat_offset;
+ char *r = (char *)adapter +
+ ixgbe_gstrings_stats[i].saved_reset_offset;
data[i] = ((ixgbe_gstrings_stats[i].sizeof_stat ==
sizeof(u64)) ? *(u64 *)p : *(u32 *)p) -
((ixgbe_gstrings_stats[i].sizeof_stat ==
- sizeof(u64)) ? *(u64 *)b : *(u32 *)b);
+ sizeof(u64)) ? *(u64 *)b : *(u32 *)b) +
+ ((ixgbe_gstrings_stats[i].sizeof_stat ==
+ sizeof(u64)) ? *(u64 *)r : *(u32 *)r);
}
}
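With the third offset wired in, each reported counter becomes (current - base) + saved_reset, so traffic counted before a VF reset is no longer dropped from the ethtool view. A compilable sketch of that accounting (field names are illustrative):

#include <stdint.h>
#include <stdio.h>

struct vf_stat {
        uint64_t cur;           /* running hardware counter */
        uint64_t base;          /* value latched at the last reset */
        uint64_t saved_reset;   /* total accumulated across resets */
};

static uint64_t vf_stat_read(const struct vf_stat *s)
{
        return (s->cur - s->base) + s->saved_reset;
}

int main(void)
{
        struct vf_stat rx = { .cur = 500, .base = 0, .saved_reset = 0 };

        /* A reset occurs: bank the delta, then re-base the counter. */
        rx.saved_reset += rx.cur - rx.base;
        rx.cur = 0;             /* hardware counter restarts at zero */
        rx.base = 0;

        rx.cur = 120;           /* traffic seen after the reset */
        printf("reported rx_packets: %llu\n",   /* prints 620 */
               (unsigned long long)vf_stat_read(&rx));
        return 0;
}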
diff --git a/drivers/net/ixgbevf/ixgbevf_main.c b/drivers/net/ixgbevf/ixgbevf_main.c
index ca653c49b76..1bbbef3ee3f 100644
--- a/drivers/net/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ixgbevf/ixgbevf_main.c
@@ -965,7 +965,7 @@ static irqreturn_t ixgbevf_msix_mbx(int irq, void *data)
if ((msg & IXGBE_MBVFICR_VFREQ_MASK) == IXGBE_PF_CONTROL_MSG)
mod_timer(&adapter->watchdog_timer,
- round_jiffies(jiffies + 10));
+ round_jiffies(jiffies + 1));
return IRQ_HANDLED;
}
@@ -1610,6 +1610,44 @@ static inline void ixgbevf_rx_desc_queue_enable(struct ixgbevf_adapter *adapter,
(adapter->rx_ring[rxr].count - 1));
}
+static void ixgbevf_save_reset_stats(struct ixgbevf_adapter *adapter)
+{
+ /* Only save pre-reset stats if there are some */
+ if (adapter->stats.vfgprc || adapter->stats.vfgptc) {
+ adapter->stats.saved_reset_vfgprc += adapter->stats.vfgprc -
+ adapter->stats.base_vfgprc;
+ adapter->stats.saved_reset_vfgptc += adapter->stats.vfgptc -
+ adapter->stats.base_vfgptc;
+ adapter->stats.saved_reset_vfgorc += adapter->stats.vfgorc -
+ adapter->stats.base_vfgorc;
+ adapter->stats.saved_reset_vfgotc += adapter->stats.vfgotc -
+ adapter->stats.base_vfgotc;
+ adapter->stats.saved_reset_vfmprc += adapter->stats.vfmprc -
+ adapter->stats.base_vfmprc;
+ }
+}
+
+static void ixgbevf_init_last_counter_stats(struct ixgbevf_adapter *adapter)
+{
+ struct ixgbe_hw *hw = &adapter->hw;
+
+ adapter->stats.last_vfgprc = IXGBE_READ_REG(hw, IXGBE_VFGPRC);
+ adapter->stats.last_vfgorc = IXGBE_READ_REG(hw, IXGBE_VFGORC_LSB);
+ adapter->stats.last_vfgorc |=
+ (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGORC_MSB))) << 32);
+ adapter->stats.last_vfgptc = IXGBE_READ_REG(hw, IXGBE_VFGPTC);
+ adapter->stats.last_vfgotc = IXGBE_READ_REG(hw, IXGBE_VFGOTC_LSB);
+ adapter->stats.last_vfgotc |=
+ (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGOTC_MSB))) << 32);
+ adapter->stats.last_vfmprc = IXGBE_READ_REG(hw, IXGBE_VFMPRC);
+
+ adapter->stats.base_vfgprc = adapter->stats.last_vfgprc;
+ adapter->stats.base_vfgorc = adapter->stats.last_vfgorc;
+ adapter->stats.base_vfgptc = adapter->stats.last_vfgptc;
+ adapter->stats.base_vfgotc = adapter->stats.last_vfgotc;
+ adapter->stats.base_vfmprc = adapter->stats.last_vfmprc;
+}
+
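ixgbevf_init_last_counter_stats() above composes the octet counters from two 32-bit register reads. A minimal sketch of the LSB/MSB composition, with register values assumed:

#include <stdint.h>
#include <stdio.h>

/* Compose one wide counter from its 32-bit LSB and MSB halves. */
static uint64_t read_wide_counter(uint32_t lsb, uint32_t msb)
{
        return (uint64_t)lsb | ((uint64_t)msb << 32);
}

int main(void)
{
        /* Hypothetical register snapshot. */
        uint32_t vfgorc_lsb = 0xdeadbeef, vfgorc_msb = 0x2;

        printf("vfgorc = %#llx\n",      /* prints 0x2deadbeef */
               (unsigned long long)read_wide_counter(vfgorc_lsb, vfgorc_msb));
        return 0;
}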
static int ixgbevf_up_complete(struct ixgbevf_adapter *adapter)
{
struct net_device *netdev = adapter->netdev;
@@ -1656,6 +1694,9 @@ static int ixgbevf_up_complete(struct ixgbevf_adapter *adapter)
/* enable transmits */
netif_tx_start_all_queues(netdev);
+ ixgbevf_save_reset_stats(adapter);
+ ixgbevf_init_last_counter_stats(adapter);
+
/* bring the link up in the watchdog, this could race with our first
* link up interrupt but shouldn't be a problem */
adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
@@ -2228,27 +2269,6 @@ out:
return err;
}
-static void ixgbevf_init_last_counter_stats(struct ixgbevf_adapter *adapter)
-{
- struct ixgbe_hw *hw = &adapter->hw;
-
- adapter->stats.last_vfgprc = IXGBE_READ_REG(hw, IXGBE_VFGPRC);
- adapter->stats.last_vfgorc = IXGBE_READ_REG(hw, IXGBE_VFGORC_LSB);
- adapter->stats.last_vfgorc |=
- (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGORC_MSB))) << 32);
- adapter->stats.last_vfgptc = IXGBE_READ_REG(hw, IXGBE_VFGPTC);
- adapter->stats.last_vfgotc = IXGBE_READ_REG(hw, IXGBE_VFGOTC_LSB);
- adapter->stats.last_vfgotc |=
- (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGOTC_MSB))) << 32);
- adapter->stats.last_vfmprc = IXGBE_READ_REG(hw, IXGBE_VFMPRC);
-
- adapter->stats.base_vfgprc = adapter->stats.last_vfgprc;
- adapter->stats.base_vfgorc = adapter->stats.last_vfgorc;
- adapter->stats.base_vfgptc = adapter->stats.last_vfgptc;
- adapter->stats.base_vfgotc = adapter->stats.last_vfgotc;
- adapter->stats.base_vfmprc = adapter->stats.last_vfmprc;
-}
-
#define UPDATE_VF_COUNTER_32bit(reg, last_counter, counter) \
{ \
u32 current_counter = IXGBE_READ_REG(hw, reg); \
@@ -2399,7 +2419,7 @@ static void ixgbevf_watchdog_task(struct work_struct *work)
if (!netif_carrier_ok(netdev)) {
hw_dbg(&adapter->hw, "NIC Link is Up %s, ",
((link_speed == IXGBE_LINK_SPEED_10GB_FULL) ?
- "10 Gbps" : "1 Gbps"));
+ "10 Gbps\n" : "1 Gbps\n"));
netif_carrier_on(netdev);
netif_tx_wake_all_queues(netdev);
} else {
@@ -2416,9 +2436,9 @@ static void ixgbevf_watchdog_task(struct work_struct *work)
}
}
-pf_has_reset:
ixgbevf_update_stats(adapter);
+pf_has_reset:
/* Force detection of hung controller every watchdog period */
adapter->detect_tx_hung = true;
@@ -2675,7 +2695,7 @@ static int ixgbevf_open(struct net_device *netdev)
if (hw->adapter_stopped) {
err = IXGBE_ERR_MBX;
printk(KERN_ERR "Unable to start - perhaps the PF"
- "Driver isn't up yet\n");
+ " Driver isn't up yet\n");
goto err_setup_reset;
}
}
@@ -2923,9 +2943,10 @@ static int ixgbevf_tx_map(struct ixgbevf_adapter *adapter,
struct ixgbevf_tx_buffer *tx_buffer_info;
unsigned int len;
unsigned int total = skb->len;
- unsigned int offset = 0, size, count = 0, i;
+ unsigned int offset = 0, size, count = 0;
unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
unsigned int f;
+ int i;
i = tx_ring->next_to_use;
@@ -3390,8 +3411,6 @@ static int __devinit ixgbevf_probe(struct pci_dev *pdev,
/* setup the private structure */
err = ixgbevf_sw_init(adapter);
- ixgbevf_init_last_counter_stats(adapter);
-
#ifdef MAX_SKB_FRAGS
netdev->features = NETIF_F_SG |
NETIF_F_IP_CSUM |
@@ -3449,6 +3468,8 @@ static int __devinit ixgbevf_probe(struct pci_dev *pdev,
adapter->netdev_registered = true;
+ ixgbevf_init_last_counter_stats(adapter);
+
/* print the MAC address */
hw_dbg(hw, "%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n",
netdev->dev_addr[0],
diff --git a/drivers/net/ixgbevf/vf.h b/drivers/net/ixgbevf/vf.h
index 799600e9270..1f31b052d4b 100644
--- a/drivers/net/ixgbevf/vf.h
+++ b/drivers/net/ixgbevf/vf.h
@@ -157,6 +157,12 @@ struct ixgbevf_hw_stats {
u64 vfgorc;
u64 vfgotc;
u64 vfmprc;
+
+ u64 saved_reset_vfgprc;
+ u64 saved_reset_vfgptc;
+ u64 saved_reset_vfgorc;
+ u64 saved_reset_vfgotc;
+ u64 saved_reset_vfmprc;
};
struct ixgbevf_info {
diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index 0f31497833d..c0b59a55538 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -946,6 +946,8 @@ jme_alloc_and_feed_skb(struct jme_adapter *jme, int idx)
jme->jme_vlan_rx(skb, jme->vlgrp,
le16_to_cpu(rxdesc->descwb.vlan));
NET_STAT(jme).rx_bytes += 4;
+ } else {
+ dev_kfree_skb(skb);
}
} else {
jme->jme_rx(skb);
@@ -2081,12 +2083,45 @@ jme_tx_timeout(struct net_device *netdev)
jme_reset_link(jme);
}
+static inline void jme_pause_rx(struct jme_adapter *jme)
+{
+ atomic_dec(&jme->link_changing);
+
+ jme_set_rx_pcc(jme, PCC_OFF);
+ if (test_bit(JME_FLAG_POLL, &jme->flags)) {
+ JME_NAPI_DISABLE(jme);
+ } else {
+ tasklet_disable(&jme->rxclean_task);
+ tasklet_disable(&jme->rxempty_task);
+ }
+}
+
+static inline void jme_resume_rx(struct jme_adapter *jme)
+{
+ struct dynpcc_info *dpi = &(jme->dpi);
+
+ if (test_bit(JME_FLAG_POLL, &jme->flags)) {
+ JME_NAPI_ENABLE(jme);
+ } else {
+ tasklet_hi_enable(&jme->rxclean_task);
+ tasklet_hi_enable(&jme->rxempty_task);
+ }
+ dpi->cur = PCC_P1;
+ dpi->attempt = PCC_P1;
+ dpi->cnt = 0;
+ jme_set_rx_pcc(jme, PCC_P1);
+
+ atomic_inc(&jme->link_changing);
+}
+
static void
jme_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
{
struct jme_adapter *jme = netdev_priv(netdev);
+ jme_pause_rx(jme);
jme->vlgrp = grp;
+ jme_resume_rx(jme);
}
static void
diff --git a/drivers/net/jme.h b/drivers/net/jme.h
index c19db9146a2..07ad3a45718 100644
--- a/drivers/net/jme.h
+++ b/drivers/net/jme.h
@@ -25,7 +25,7 @@
#define __JME_H_INCLUDED__
#define DRV_NAME "jme"
-#define DRV_VERSION "1.0.5"
+#define DRV_VERSION "1.0.6"
#define PFX DRV_NAME ": "
#define PCI_DEVICE_ID_JMICRON_JMC250 0x0250
diff --git a/drivers/net/ks8851.c b/drivers/net/ks8851.c
index 0573e0bb444..13cc1ca261d 100644
--- a/drivers/net/ks8851.c
+++ b/drivers/net/ks8851.c
@@ -976,7 +976,6 @@ static void ks8851_set_rx_mode(struct net_device *dev)
crc >>= (32 - 6); /* get top six bits */
rxctrl.mchash[crc >> 4] |= (1 << (crc & 0xf));
- mcptr = mcptr->next;
}
rxctrl.rxcr1 = RXCR1_RXME | RXCR1_RXPAFMA;
diff --git a/drivers/net/ksz884x.c b/drivers/net/ksz884x.c
index 0f59099ee72..6c5327af1bf 100644
--- a/drivers/net/ksz884x.c
+++ b/drivers/net/ksz884x.c
@@ -6322,7 +6322,7 @@ static int netdev_set_eeprom(struct net_device *dev,
int len;
if (eeprom->magic != EEPROM_MAGIC)
- return 1;
+ return -EINVAL;
len = (eeprom->offset + eeprom->len + 1) / 2;
for (i = eeprom->offset / 2; i < len; i++)
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 8f6e816a739..b402a95c87c 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -1023,6 +1023,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
info->port_attr.attr.mode = S_IRUGO | S_IWUSR;
info->port_attr.show = show_port_type;
info->port_attr.store = set_port_type;
+ sysfs_attr_init(&info->port_attr.attr);
err = device_create_file(&dev->pdev->dev, &info->port_attr);
if (err) {
diff --git a/drivers/net/netxen/netxen_nic.h b/drivers/net/netxen/netxen_nic.h
index 144d2e88042..0f703838e21 100644
--- a/drivers/net/netxen/netxen_nic.h
+++ b/drivers/net/netxen/netxen_nic.h
@@ -53,8 +53,8 @@
#define _NETXEN_NIC_LINUX_MAJOR 4
#define _NETXEN_NIC_LINUX_MINOR 0
-#define _NETXEN_NIC_LINUX_SUBVERSION 72
-#define NETXEN_NIC_LINUX_VERSIONID "4.0.72"
+#define _NETXEN_NIC_LINUX_SUBVERSION 73
+#define NETXEN_NIC_LINUX_VERSIONID "4.0.73"
#define NETXEN_VERSION_CODE(a, b, c) (((a) << 24) + ((b) << 16) + (c))
#define _major(v) (((v) >> 24) & 0xff)
diff --git a/drivers/net/netxen/netxen_nic_ctx.c b/drivers/net/netxen/netxen_nic_ctx.c
index 2a8ef5fc966..f26e54716c8 100644
--- a/drivers/net/netxen/netxen_nic_ctx.c
+++ b/drivers/net/netxen/netxen_nic_ctx.c
@@ -669,13 +669,15 @@ int netxen_alloc_hw_resources(struct netxen_adapter *adapter)
}
sds_ring->desc_head = (struct status_desc *)addr;
- sds_ring->crb_sts_consumer =
- netxen_get_ioaddr(adapter,
- recv_crb_registers[port].crb_sts_consumer[ring]);
+ if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) {
+ sds_ring->crb_sts_consumer =
+ netxen_get_ioaddr(adapter,
+ recv_crb_registers[port].crb_sts_consumer[ring]);
- sds_ring->crb_intr_mask =
- netxen_get_ioaddr(adapter,
- recv_crb_registers[port].sw_int_mask[ring]);
+ sds_ring->crb_intr_mask =
+ netxen_get_ioaddr(adapter,
+ recv_crb_registers[port].sw_int_mask[ring]);
+ }
}
diff --git a/drivers/net/netxen/netxen_nic_init.c b/drivers/net/netxen/netxen_nic_init.c
index 1c63610ead4..7eb925a9f36 100644
--- a/drivers/net/netxen/netxen_nic_init.c
+++ b/drivers/net/netxen/netxen_nic_init.c
@@ -761,7 +761,7 @@ nx_get_bios_version(struct netxen_adapter *adapter)
if (adapter->fw_type == NX_UNIFIED_ROMIMAGE) {
bios_ver = cpu_to_le32(*((u32 *) (&fw->data[prd_off])
+ NX_UNI_BIOS_VERSION_OFF));
- return (bios_ver << 24) + ((bios_ver >> 8) & 0xff00) +
+ return (bios_ver << 16) + ((bios_ver >> 8) & 0xff00) +
(bios_ver >> 24);
} else
return cpu_to_le32(*(u32 *)&fw->data[NX_BIOS_VERSION_OFFSET]);
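The one-character change above (shift 24 to 16) keeps all four bytes of the version word in the rearranged result. What the corrected expression does to a 32-bit value, as a runnable sketch with a sample input:

#include <stdint.h>
#include <stdio.h>

/* Corrected rearrangement: 0xAABBCCDD -> 0xCCDDBBAA. */
static uint32_t bios_ver_fixup(uint32_t v)
{
        return (v << 16) + ((v >> 8) & 0xff00) + (v >> 24);
}

int main(void)
{
        uint32_t raw = 0xAABBCCDDu;

        /* The old "<< 24" variant dropped the 0xCC byte entirely. */
        printf("%#x -> %#x\n", raw, bios_ver_fixup(raw));
        return 0;
}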
diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c
index 08780ef1c1f..01808b28d1b 100644
--- a/drivers/net/netxen/netxen_nic_main.c
+++ b/drivers/net/netxen/netxen_nic_main.c
@@ -604,16 +604,14 @@ netxen_cleanup_pci_map(struct netxen_adapter *adapter)
static int
netxen_setup_pci_map(struct netxen_adapter *adapter)
{
- void __iomem *mem_ptr0 = NULL;
- void __iomem *mem_ptr1 = NULL;
- void __iomem *mem_ptr2 = NULL;
void __iomem *db_ptr = NULL;
resource_size_t mem_base, db_base;
- unsigned long mem_len, db_len = 0, pci_len0 = 0;
+ unsigned long mem_len, db_len = 0;
struct pci_dev *pdev = adapter->pdev;
int pci_func = adapter->ahw.pci_func;
+ struct netxen_hardware_context *ahw = &adapter->ahw;
int err = 0;
@@ -630,24 +628,40 @@ netxen_setup_pci_map(struct netxen_adapter *adapter)
/* 128 Meg of memory */
if (mem_len == NETXEN_PCI_128MB_SIZE) {
- mem_ptr0 = ioremap(mem_base, FIRST_PAGE_GROUP_SIZE);
- mem_ptr1 = ioremap(mem_base + SECOND_PAGE_GROUP_START,
+
+ ahw->pci_base0 = ioremap(mem_base, FIRST_PAGE_GROUP_SIZE);
+ ahw->pci_base1 = ioremap(mem_base + SECOND_PAGE_GROUP_START,
SECOND_PAGE_GROUP_SIZE);
- mem_ptr2 = ioremap(mem_base + THIRD_PAGE_GROUP_START,
+ ahw->pci_base2 = ioremap(mem_base + THIRD_PAGE_GROUP_START,
THIRD_PAGE_GROUP_SIZE);
- pci_len0 = FIRST_PAGE_GROUP_SIZE;
+ if (ahw->pci_base0 == NULL || ahw->pci_base1 == NULL ||
+ ahw->pci_base2 == NULL) {
+ dev_err(&pdev->dev, "failed to map PCI bar 0\n");
+ err = -EIO;
+ goto err_out;
+ }
+
+ ahw->pci_len0 = FIRST_PAGE_GROUP_SIZE;
+
} else if (mem_len == NETXEN_PCI_32MB_SIZE) {
- mem_ptr1 = ioremap(mem_base, SECOND_PAGE_GROUP_SIZE);
- mem_ptr2 = ioremap(mem_base + THIRD_PAGE_GROUP_START -
+
+ ahw->pci_base1 = ioremap(mem_base, SECOND_PAGE_GROUP_SIZE);
+ ahw->pci_base2 = ioremap(mem_base + THIRD_PAGE_GROUP_START -
SECOND_PAGE_GROUP_START, THIRD_PAGE_GROUP_SIZE);
+ if (ahw->pci_base1 == NULL || ahw->pci_base2 == NULL) {
+ dev_err(&pdev->dev, "failed to map PCI bar 0\n");
+ err = -EIO;
+ goto err_out;
+ }
+
} else if (mem_len == NETXEN_PCI_2MB_SIZE) {
- mem_ptr0 = pci_ioremap_bar(pdev, 0);
- if (mem_ptr0 == NULL) {
+ ahw->pci_base0 = pci_ioremap_bar(pdev, 0);
+ if (ahw->pci_base0 == NULL) {
dev_err(&pdev->dev, "failed to map PCI bar 0\n");
return -EIO;
}
- pci_len0 = mem_len;
+ ahw->pci_len0 = mem_len;
} else {
return -EIO;
}
@@ -656,11 +670,6 @@ netxen_setup_pci_map(struct netxen_adapter *adapter)
dev_info(&pdev->dev, "%dMB memory map\n", (int)(mem_len>>20));
- adapter->ahw.pci_base0 = mem_ptr0;
- adapter->ahw.pci_len0 = pci_len0;
- adapter->ahw.pci_base1 = mem_ptr1;
- adapter->ahw.pci_base2 = mem_ptr2;
-
if (NX_IS_REVISION_P3P(adapter->ahw.revision_id)) {
adapter->ahw.ocm_win_crb = netxen_get_ioaddr(adapter,
NETXEN_PCIX_PS_REG(PCIX_OCM_WINDOW_REG(pci_func)));
@@ -1246,8 +1255,8 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
int pci_func_id = PCI_FUNC(pdev->devfn);
uint8_t revision_id;
- if (pdev->revision >= NX_P3_A0 && pdev->revision < NX_P3_B1) {
- pr_warning("%s: chip revisions between 0x%x-0x%x"
+ if (pdev->revision >= NX_P3_A0 && pdev->revision <= NX_P3_B1) {
+ pr_warning("%s: chip revisions between 0x%x-0x%x "
"will not be enabled.\n",
module_name(THIS_MODULE), NX_P3_A0, NX_P3_B1);
return -ENODEV;
diff --git a/drivers/net/pcmcia/pcnet_cs.c b/drivers/net/pcmcia/pcnet_cs.c
index 776cad2f571..1028fcb91a2 100644
--- a/drivers/net/pcmcia/pcnet_cs.c
+++ b/drivers/net/pcmcia/pcnet_cs.c
@@ -1549,6 +1549,7 @@ static struct pcmcia_device_id pcnet_ids[] = {
PCMCIA_PFC_DEVICE_MANF_CARD(0, 0x021b, 0x0101),
PCMCIA_PFC_DEVICE_MANF_CARD(0, 0x08a1, 0xc0ab),
PCMCIA_PFC_DEVICE_PROD_ID12(0, "AnyCom", "Fast Ethernet + 56K COMBO", 0x578ba6e7, 0xb0ac62c4),
+ PCMCIA_PFC_DEVICE_PROD_ID12(0, "ATKK", "LM33-PCM-T", 0xba9eb7e2, 0x077c174e),
PCMCIA_PFC_DEVICE_PROD_ID12(0, "D-Link", "DME336T", 0x1a424a1c, 0xb23897ff),
PCMCIA_PFC_DEVICE_PROD_ID12(0, "Grey Cell", "GCS3000", 0x2a151fac, 0x48b932ae),
PCMCIA_PFC_DEVICE_PROD_ID12(0, "Linksys", "EtherFast 10&100 + 56K PC Card (PCMLM56)", 0x0733cc81, 0xb3765033),
@@ -1740,7 +1741,7 @@ static struct pcmcia_device_id pcnet_ids[] = {
PCMCIA_MFC_DEVICE_CIS_PROD_ID12(0, "DAYNA COMMUNICATIONS", "LAN AND MODEM MULTIFUNCTION", 0x8fdf8f89, 0xdd5ed9e8, "cis/DP83903.cis"),
PCMCIA_MFC_DEVICE_CIS_PROD_ID4(0, "NSC MF LAN/Modem", 0x58fc6056, "cis/DP83903.cis"),
PCMCIA_MFC_DEVICE_CIS_MANF_CARD(0, 0x0175, 0x0000, "cis/DP83903.cis"),
- PCMCIA_DEVICE_CIS_MANF_CARD(0xc00f, 0x0002, "cis/LA-PCM.cis"),
+ PCMCIA_DEVICE_CIS_PROD_ID12("Allied Telesis,K.K", "Ethernet LAN Card", 0x2ad62f3c, 0x9fd2f0a2, "cis/LA-PCM.cis"),
PCMCIA_DEVICE_CIS_PROD_ID12("KTI", "PE520 PLUS", 0xad180345, 0x9d58d392, "cis/PE520.cis"),
PCMCIA_DEVICE_CIS_PROD_ID12("NDC", "Ethernet", 0x01c43ae1, 0x00b2e941, "cis/NE2K.cis"),
PCMCIA_DEVICE_CIS_PROD_ID12("PMX ", "PE-200", 0x34f3f1c8, 0x10b59f8c, "cis/PE-200.cis"),
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 9d3ebf3e975..96740051cdc 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -186,8 +186,13 @@ static DEFINE_PCI_DEVICE_TABLE(rtl8169_pci_tbl) = {
MODULE_DEVICE_TABLE(pci, rtl8169_pci_tbl);
-static int rx_copybreak = 200;
-static int use_dac = -1;
+/*
+ * We set our copybreak very high so that we don't have
+ * to allocate 16k frames all the time (see the note in
+ * rtl8169_open()).
+ */
+static int rx_copybreak = 16383;
+static int use_dac;
static struct {
u32 msg_enable;
} debug = { -1 };
@@ -511,8 +516,7 @@ MODULE_DESCRIPTION("RealTek RTL-8169 Gigabit Ethernet driver");
module_param(rx_copybreak, int, 0);
MODULE_PARM_DESC(rx_copybreak, "Copy breakpoint for copy-only-tiny-frames");
module_param(use_dac, int, 0);
-MODULE_PARM_DESC(use_dac, "Enable PCI DAC. -1 defaults on for PCI Express only."
-" Unsafe on 32 bit PCI slot.");
+MODULE_PARM_DESC(use_dac, "Enable PCI DAC. Unsafe on 32 bit PCI slot.");
module_param_named(debug, debug.msg_enable, int, 0);
MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., 16=all)");
MODULE_LICENSE("GPL");
@@ -2821,8 +2825,8 @@ static void rtl_rar_set(struct rtl8169_private *tp, u8 *addr)
spin_lock_irq(&tp->lock);
RTL_W8(Cfg9346, Cfg9346_Unlock);
- RTL_W32(MAC0, low);
RTL_W32(MAC4, high);
+ RTL_W32(MAC0, low);
RTL_W8(Cfg9346, Cfg9346_Lock);
spin_unlock_irq(&tp->lock);
@@ -2974,7 +2978,6 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
void __iomem *ioaddr;
unsigned int i;
int rc;
- int this_use_dac = use_dac;
if (netif_msg_drv(&debug)) {
printk(KERN_INFO "%s Gigabit Ethernet driver %s loaded\n",
@@ -3040,17 +3043,8 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
tp->cp_cmd = PCIMulRW | RxChkSum;
- tp->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
- if (!tp->pcie_cap)
- netif_info(tp, probe, dev, "no PCI Express capability\n");
-
- if (this_use_dac < 0)
- this_use_dac = tp->pcie_cap != 0;
-
if ((sizeof(dma_addr_t) > 4) &&
- this_use_dac &&
- !pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
- netif_info(tp, probe, dev, "using 64-bit DMA\n");
+ !pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) && use_dac) {
tp->cp_cmd |= PCIDAC;
dev->features |= NETIF_F_HIGHDMA;
} else {
@@ -3069,6 +3063,10 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
goto err_out_free_res_4;
}
+ tp->pcie_cap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+ if (!tp->pcie_cap)
+ netif_info(tp, probe, dev, "no PCI Express capability\n");
+
RTL_W16(IntrMask, 0x0000);
/* Soft reset the chip. */
@@ -3224,9 +3222,13 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev)
}
static void rtl8169_set_rxbufsize(struct rtl8169_private *tp,
- struct net_device *dev)
+ unsigned int mtu)
{
- unsigned int max_frame = dev->mtu + VLAN_ETH_HLEN + ETH_FCS_LEN;
+ unsigned int max_frame = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN;
+
+ if (max_frame != 16383)
+ printk(KERN_WARNING "WARNING! Changing of MTU on this NIC "
+ "may lead to frame reception errors!\n");
tp->rx_buf_sz = (max_frame > RX_BUF_SIZE) ? max_frame : RX_BUF_SIZE;
}
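rtl8169_set_rxbufsize() now takes the MTU explicitly. The sizing arithmetic, using the standard header and FCS overheads (VLAN_ETH_HLEN = 18, ETH_FCS_LEN = 4) and an assumed RX_BUF_SIZE, in compilable form:

#include <stdio.h>

#define VLAN_ETH_HLEN   18      /* Ethernet header plus VLAN tag */
#define ETH_FCS_LEN     4       /* frame check sequence */
#define RX_BUF_SIZE     1536    /* assumed default buffer size */

static unsigned int rx_buf_size_for_mtu(unsigned int mtu)
{
        unsigned int max_frame = mtu + VLAN_ETH_HLEN + ETH_FCS_LEN;

        return (max_frame > RX_BUF_SIZE) ? max_frame : RX_BUF_SIZE;
}

int main(void)
{
        /* 16383 - 18 - 4 is the value passed from rtl8169_open() below. */
        printf("open(): %u\n",
               rx_buf_size_for_mtu(16383 - VLAN_ETH_HLEN - ETH_FCS_LEN));
        printf("mtu 1500: %u\n", rx_buf_size_for_mtu(1500));
        return 0;
}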
@@ -3238,7 +3240,17 @@ static int rtl8169_open(struct net_device *dev)
int retval = -ENOMEM;
- rtl8169_set_rxbufsize(tp, dev);
+ /*
+ * Note that we use a magic value here; it's weird, I know.
+ * It's done because some subset of rtl8169 hardware suffers from
+ * a problem in which received frames that are longer than
+ * the size set in the RxMaxSize register return garbage sizes
+ * when received. To avoid this we need to turn off filtering,
+ * which is done by setting a value of 16383 in the RxMaxSize register
+ * and allocating 16k frames to handle the largest possible rx value;
+ * that's what the magic math below does.
+ */
+ rtl8169_set_rxbufsize(tp, 16383 - VLAN_ETH_HLEN - ETH_FCS_LEN);
/*
* Rx and Tx descriptors need 256-byte alignment.
@@ -3891,7 +3903,7 @@ static int rtl8169_change_mtu(struct net_device *dev, int new_mtu)
rtl8169_down(dev);
- rtl8169_set_rxbufsize(tp, dev);
+ rtl8169_set_rxbufsize(tp, dev->mtu);
ret = rtl8169_init_ring(dev);
if (ret < 0)
@@ -4754,8 +4766,8 @@ static void rtl_set_rx_mode(struct net_device *dev)
mc_filter[1] = swab32(data);
}
- RTL_W32(MAR0 + 0, mc_filter[0]);
RTL_W32(MAR0 + 4, mc_filter[1]);
+ RTL_W32(MAR0 + 0, mc_filter[0]);
RTL_W32(RxConfig, tmp);
diff --git a/drivers/net/tulip/uli526x.c b/drivers/net/tulip/uli526x.c
index 0ab05af237e..a4f09d49053 100644
--- a/drivers/net/tulip/uli526x.c
+++ b/drivers/net/tulip/uli526x.c
@@ -851,13 +851,15 @@ static void uli526x_rx_packet(struct net_device *dev, struct uli526x_board_info
if ( !(rdes0 & 0x8000) ||
((db->cr6_data & CR6_PM) && (rxlen>6)) ) {
+ struct sk_buff *new_skb = NULL;
+
skb = rxptr->rx_skb_ptr;
/* Good packet, send to upper layer */
/* Short packet uses a new SKB */
- if ( (rxlen < RX_COPY_SIZE) &&
- ( (skb = dev_alloc_skb(rxlen + 2) )
- != NULL) ) {
+ if ((rxlen < RX_COPY_SIZE) &&
+ (((new_skb = dev_alloc_skb(rxlen + 2)) != NULL))) {
+ skb = new_skb;
/* size less than COPY_SIZE, allocate a rxlen SKB */
skb_reserve(skb, 2); /* 16byte align */
memcpy(skb_put(skb, rxlen),
diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
index d222d7e2527..73f9a31cf94 100644
--- a/drivers/net/usb/smsc95xx.c
+++ b/drivers/net/usb/smsc95xx.c
@@ -1189,9 +1189,21 @@ static struct sk_buff *smsc95xx_tx_fixup(struct usbnet *dev,
}
if (csum) {
- u32 csum_preamble = smsc95xx_calc_csum_preamble(skb);
- skb_push(skb, 4);
- memcpy(skb->data, &csum_preamble, 4);
+ if (skb->len <= 45) {
+ /* workaround - hardware tx checksum does not work
+ * properly with extremely small packets */
+ long csstart = skb->csum_start - skb_headroom(skb);
+ __wsum calc = csum_partial(skb->data + csstart,
+ skb->len - csstart, 0);
+ *((__sum16 *)(skb->data + csstart
+ + skb->csum_offset)) = csum_fold(calc);
+
+ csum = false;
+ } else {
+ u32 csum_preamble = smsc95xx_calc_csum_preamble(skb);
+ skb_push(skb, 4);
+ memcpy(skb->data, &csum_preamble, 4);
+ }
}
skb_push(skb, 4);
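For frames of 45 bytes or less, the patch falls back to checksumming in software: a 16-bit ones'-complement sum over the payload, folded into the checksum field. A self-contained sketch of the sum-and-fold step, simplified from the kernel's csum_partial()/csum_fold():

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit partial sum to 16 bits with end-around carry, then invert. */
static uint16_t csum_fold(uint32_t sum)
{
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

/* 16-bit ones'-complement sum over a byte buffer. */
static uint32_t csum_partial(const uint8_t *data, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
                sum += (uint32_t)((data[i] << 8) | data[i + 1]);
        if (len & 1)
                sum += (uint32_t)(data[len - 1] << 8);
        return sum;
}

int main(void)
{
        uint8_t payload[] = { 0x45, 0x00, 0x00, 0x1c, 0xba, 0xbe };

        printf("checksum: %#06x\n",
               csum_fold(csum_partial(payload, sizeof(payload))));
        return 0;
}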
diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c
index 3a486f3bad3..bc278d4ee89 100644
--- a/drivers/net/via-velocity.c
+++ b/drivers/net/via-velocity.c
@@ -812,7 +812,7 @@ static void set_mii_flow_control(struct velocity_info *vptr)
case FLOW_CNTL_TX_RX:
MII_REG_BITS_ON(ANAR_PAUSE, MII_REG_ANAR, vptr->mac_regs);
- MII_REG_BITS_ON(ANAR_ASMDIR, MII_REG_ANAR, vptr->mac_regs);
+ MII_REG_BITS_OFF(ANAR_ASMDIR, MII_REG_ANAR, vptr->mac_regs);
break;
case FLOW_CNTL_DISABLE:
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index b2c8207f7bc..294b486bc3e 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -1353,25 +1353,6 @@ static enum ath9k_pkt_type get_hw_packet_type(struct sk_buff *skb)
return htype;
}
-static bool is_pae(struct sk_buff *skb)
-{
- struct ieee80211_hdr *hdr;
- __le16 fc;
-
- hdr = (struct ieee80211_hdr *)skb->data;
- fc = hdr->frame_control;
-
- if (ieee80211_is_data(fc)) {
- if (ieee80211_is_nullfunc(fc) ||
- /* Port Access Entity (IEEE 802.1X) */
- (skb->protocol == cpu_to_be16(ETH_P_PAE))) {
- return true;
- }
- }
-
- return false;
-}
-
static int get_hw_crypto_keytype(struct sk_buff *skb)
{
struct ieee80211_tx_info *tx_info = IEEE80211_SKB_CB(skb);
@@ -1696,7 +1677,7 @@ static void ath_tx_start_dma(struct ath_softc *sc, struct ath_buf *bf,
goto tx_done;
}
- if ((tx_info->flags & IEEE80211_TX_CTL_AMPDU) && !is_pae(skb)) {
+ if (tx_info->flags & IEEE80211_TX_CTL_AMPDU) {
/*
* Try aggregation if it's a unicast data frame
* and the destination is HT capable.
diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c
index 1ed5206721e..8c12311dbb0 100644
--- a/drivers/net/wireless/iwlwifi/iwl-tx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-tx.c
@@ -124,7 +124,7 @@ void iwl_free_tfds_in_queue(struct iwl_priv *priv,
if (priv->stations[sta_id].tid[tid].tfds_in_queue >= freed)
priv->stations[sta_id].tid[tid].tfds_in_queue -= freed;
else {
- IWL_ERR(priv, "free more than tfds_in_queue (%u:%d)\n",
+ IWL_DEBUG_TX(priv, "free more than tfds_in_queue (%u:%d)\n",
priv->stations[sta_id].tid[tid].tfds_in_queue,
freed);
priv->stations[sta_id].tid[tid].tfds_in_queue = 0;
diff --git a/drivers/net/wireless/wl12xx/wl1251_debugfs.c b/drivers/net/wireless/wl12xx/wl1251_debugfs.c
index 0ccba57fb9f..05e4d68eb4c 100644
--- a/drivers/net/wireless/wl12xx/wl1251_debugfs.c
+++ b/drivers/net/wireless/wl12xx/wl1251_debugfs.c
@@ -466,7 +466,8 @@ out:
void wl1251_debugfs_reset(struct wl1251 *wl)
{
- memset(wl->stats.fw_stats, 0, sizeof(*wl->stats.fw_stats));
+ if (wl->stats.fw_stats != NULL)
+ memset(wl->stats.fw_stats, 0, sizeof(*wl->stats.fw_stats));
wl->stats.retry_count = 0;
wl->stats.excessive_retries = 0;
}
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 40b48f569b1..9665d6b17a2 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -832,9 +832,8 @@ static inline void dbg_ctrl(struct controller *ctrl)
for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
if (!pci_resource_len(pdev, i))
continue;
- ctrl_info(ctrl, " PCI resource [%d] : 0x%llx@0x%llx\n",
- i, (unsigned long long)pci_resource_len(pdev, i),
- (unsigned long long)pci_resource_start(pdev, i));
+ ctrl_info(ctrl, " PCI resource [%d] : %pR\n",
+ i, &pdev->resource[i]);
}
ctrl_info(ctrl, "Slot Capabilities : 0x%08x\n", ctrl->slot_cap);
ctrl_info(ctrl, " Physical Slot Number : %d\n", PSN(ctrl));
diff --git a/drivers/pci/ioapic.c b/drivers/pci/ioapic.c
index 3e0d7b5dd1b..fb9fdf4a42b 100644
--- a/drivers/pci/ioapic.c
+++ b/drivers/pci/ioapic.c
@@ -31,9 +31,9 @@ static int ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent)
acpi_status status;
unsigned long long gsb;
struct ioapic *ioapic;
- u64 addr;
int ret;
char *type;
+ struct resource *res;
handle = DEVICE_ACPI_HANDLE(&dev->dev);
if (!handle)
@@ -69,13 +69,12 @@ static int ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent)
if (pci_request_region(dev, 0, type))
goto exit_disable;
- addr = pci_resource_start(dev, 0);
- if (acpi_register_ioapic(ioapic->handle, addr, ioapic->gsi_base))
+ res = &dev->resource[0];
+ if (acpi_register_ioapic(ioapic->handle, res->start, ioapic->gsi_base))
goto exit_release;
pci_set_drvdata(dev, ioapic);
- dev_info(&dev->dev, "%s at %#llx, GSI %u\n", type, addr,
- ioapic->gsi_base);
+ dev_info(&dev->dev, "%s at %pR, GSI %u\n", type, res, ioapic->gsi_base);
return 0;
exit_release:
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index de296452c95..997668558e7 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -655,8 +655,8 @@ void pci_create_legacy_files(struct pci_bus *b)
goto legacy_io_err;
/* Allocated above after the legacy_io struct */
- sysfs_bin_attr_init(b->legacy_mem);
b->legacy_mem = b->legacy_io + 1;
+ sysfs_bin_attr_init(b->legacy_mem);
b->legacy_mem->attr.name = "legacy_mem";
b->legacy_mem->size = 1024*1024;
b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index cb1dd5f4988..1531f3a4987 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2576,18 +2576,17 @@ EXPORT_SYMBOL_GPL(pci_reset_function);
*/
int pcix_get_max_mmrbc(struct pci_dev *dev)
{
- int err, cap;
+ int cap;
u32 stat;
cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
if (!cap)
return -EINVAL;
- err = pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat);
- if (err)
+ if (pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat))
return -EINVAL;
- return (stat & PCI_X_STATUS_MAX_READ) >> 12;
+ return 512 << ((stat & PCI_X_STATUS_MAX_READ) >> 21);
}
EXPORT_SYMBOL(pcix_get_max_mmrbc);
@@ -2600,18 +2599,17 @@ EXPORT_SYMBOL(pcix_get_max_mmrbc);
*/
int pcix_get_mmrbc(struct pci_dev *dev)
{
- int ret, cap;
- u32 cmd;
+ int cap;
+ u16 cmd;
cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
if (!cap)
return -EINVAL;
- ret = pci_read_config_dword(dev, cap + PCI_X_CMD, &cmd);
- if (!ret)
- ret = 512 << ((cmd & PCI_X_CMD_MAX_READ) >> 2);
+ if (pci_read_config_word(dev, cap + PCI_X_CMD, &cmd))
+ return -EINVAL;
- return ret;
+ return 512 << ((cmd & PCI_X_CMD_MAX_READ) >> 2);
}
EXPORT_SYMBOL(pcix_get_mmrbc);
@@ -2626,28 +2624,27 @@ EXPORT_SYMBOL(pcix_get_mmrbc);
*/
int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc)
{
- int cap, err = -EINVAL;
- u32 stat, cmd, v, o;
+ int cap;
+ u32 stat, v, o;
+ u16 cmd;
if (mmrbc < 512 || mmrbc > 4096 || !is_power_of_2(mmrbc))
- goto out;
+ return -EINVAL;
v = ffs(mmrbc) - 10;
cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
if (!cap)
- goto out;
+ return -EINVAL;
- err = pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat);
- if (err)
- goto out;
+ if (pci_read_config_dword(dev, cap + PCI_X_STATUS, &stat))
+ return -EINVAL;
if (v > (stat & PCI_X_STATUS_MAX_READ) >> 21)
return -E2BIG;
- err = pci_read_config_dword(dev, cap + PCI_X_CMD, &cmd);
- if (err)
- goto out;
+ if (pci_read_config_word(dev, cap + PCI_X_CMD, &cmd))
+ return -EINVAL;
o = (cmd & PCI_X_CMD_MAX_READ) >> 2;
if (o != v) {
@@ -2657,10 +2654,10 @@ int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc)
cmd &= ~PCI_X_CMD_MAX_READ;
cmd |= v << 2;
- err = pci_write_config_dword(dev, cap + PCI_X_CMD, cmd);
+ if (pci_write_config_word(dev, cap + PCI_X_CMD, cmd))
+ return -EIO;
}
-out:
- return err;
+ return 0;
}
EXPORT_SYMBOL(pcix_set_mmrbc);
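The reworked helpers treat PCI_X_CMD as the 16-bit register it is, and map between byte counts and the 2-bit field with 512 << field in one direction and ffs(mmrbc) - 10 in the other. The round trip, runnable:

#include <stdio.h>
#include <strings.h>    /* ffs() */

/* Decode the 2-bit maximum read field (0..3) to a byte count. */
static int mmrbc_decode(int field)
{
        return 512 << field;
}

/* Encode 512/1024/2048/4096 back to the 2-bit field. */
static int mmrbc_encode(int mmrbc)
{
        return ffs(mmrbc) - 10; /* 512 -> 0, 1024 -> 1, ... */
}

int main(void)
{
        int field;

        for (field = 0; field < 4; field++)
                printf("field %d <-> %d bytes (re-encoded: %d)\n",
                       field, mmrbc_decode(field),
                       mmrbc_encode(mmrbc_decode(field)));
        return 0;
}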
@@ -3023,7 +3020,6 @@ EXPORT_SYMBOL(pcim_pin_device);
EXPORT_SYMBOL(pci_disable_device);
EXPORT_SYMBOL(pci_find_capability);
EXPORT_SYMBOL(pci_bus_find_capability);
-EXPORT_SYMBOL(pci_register_set_vga_state);
EXPORT_SYMBOL(pci_release_regions);
EXPORT_SYMBOL(pci_request_regions);
EXPORT_SYMBOL(pci_request_regions_exclusive);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 2a943090a3b..882bd8d29fe 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -174,14 +174,19 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
pci_read_config_dword(dev, pos, &sz);
pci_write_config_dword(dev, pos, l);
+ if (!sz)
+ goto fail; /* BAR not implemented */
+
/*
* All bits set in sz means the device isn't working properly.
- * If the BAR isn't implemented, all bits must be 0. If it's a
- * memory BAR or a ROM, bit 0 must be clear; if it's an io BAR, bit
- * 1 must be clear.
+ * If it's a memory BAR or a ROM, bit 0 must be clear; if it's
+ * an io BAR, bit 1 must be clear.
*/
- if (!sz || sz == 0xffffffff)
+ if (sz == 0xffffffff) {
+ dev_err(&dev->dev, "reg %x: invalid size %#x; broken device?\n",
+ pos, sz);
goto fail;
+ }
/*
* I don't know how l can have all bits set. Copied from old code.
@@ -244,13 +249,17 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
pos, res);
}
} else {
- sz = pci_size(l, sz, mask);
+ u32 size = pci_size(l, sz, mask);
- if (!sz)
+ if (!size) {
+ dev_err(&dev->dev, "reg %x: invalid size "
+ "(l %#x sz %#x mask %#x); broken device?",
+ pos, l, sz, mask);
goto fail;
+ }
res->start = l;
- res->end = l + sz;
+ res->end = l + size;
dev_printk(KERN_DEBUG, &dev->dev, "reg %x: %pR\n", pos, res);
}
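__pci_read_base() sizes a BAR by writing all-ones, reading the value back, and deriving the decode size from the lowest writable address bit. A simplified, compilable model of that computation (the kernel's pci_size() returns the extent, i.e. size - 1; this sketch returns the size itself):

#include <stdint.h>
#include <stdio.h>

/*
 * Simplified BAR sizing: "maxbase" is the value read back after writing
 * all-ones to the BAR; "mask" strips the type bits (~0xf for memory BARs).
 */
static uint32_t bar_size(uint32_t maxbase, uint32_t mask)
{
        uint32_t size = maxbase & mask; /* keep the significant bits */

        if (!size)
                return 0;               /* BAR not implemented */
        return size & ~(size - 1);      /* lowest set bit = decode size */
}

int main(void)
{
        /* A memory BAR reading back 0xfff00000 decodes a 1 MiB region. */
        printf("size = %#x bytes\n", bar_size(0xfff00000u, ~0xfu));
        return 0;
}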
@@ -312,7 +321,7 @@ static void __devinit pci_read_bridge_io(struct pci_bus *child)
dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res);
} else {
dev_printk(KERN_DEBUG, &dev->dev,
- " bridge window [io %04lx - %04lx] reg reading\n",
+ " bridge window [io %#06lx-%#06lx] (disabled)\n",
base, limit);
}
}
@@ -336,7 +345,7 @@ static void __devinit pci_read_bridge_mmio(struct pci_bus *child)
dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res);
} else {
dev_printk(KERN_DEBUG, &dev->dev,
- " bridge window [mem 0x%08lx - 0x%08lx] reg reading\n",
+ " bridge window [mem %#010lx-%#010lx] (disabled)\n",
base, limit + 0xfffff);
}
}
@@ -387,7 +396,7 @@ static void __devinit pci_read_bridge_mmio_pref(struct pci_bus *child)
dev_printk(KERN_DEBUG, &dev->dev, " bridge window %pR\n", res);
} else {
dev_printk(KERN_DEBUG, &dev->dev,
- " bridge window [mem 0x%08lx - %08lx pref] reg reading\n",
+ " bridge window [mem %#010lx-%#010lx pref] (disabled)\n",
base, limit + 0xfffff);
}
}
@@ -673,16 +682,20 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
int is_cardbus = (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS);
u32 buses, i, j = 0;
u16 bctl;
+ u8 primary, secondary, subordinate;
int broken = 0;
pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses);
+ primary = buses & 0xFF;
+ secondary = (buses >> 8) & 0xFF;
+ subordinate = (buses >> 16) & 0xFF;
- dev_dbg(&dev->dev, "scanning behind bridge, config %06x, pass %d\n",
- buses & 0xffffff, pass);
+ dev_dbg(&dev->dev, "scanning [bus %02x-%02x] behind bridge, pass %d\n",
+ secondary, subordinate, pass);
/* Check if setup is sensible at all */
if (!pass &&
- ((buses & 0xff) != bus->number || ((buses >> 8) & 0xff) <= bus->number)) {
+ (primary != bus->number || secondary <= bus->number)) {
dev_dbg(&dev->dev, "bus configuration invalid, reconfiguring\n");
broken = 1;
}
@@ -693,15 +706,15 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
- if ((buses & 0xffff00) && !pcibios_assign_all_busses() && !is_cardbus && !broken) {
- unsigned int cmax, busnr;
+ if ((secondary || subordinate) && !pcibios_assign_all_busses() &&
+ !is_cardbus && !broken) {
+ unsigned int cmax;
/*
* Bus already configured by firmware, process it in the first
* pass and just note the configuration.
*/
if (pass)
goto out;
- busnr = (buses >> 8) & 0xFF;
/*
* If we already got to this bus through a different bridge,
@@ -710,13 +723,13 @@ int __devinit pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max,
* However, we continue to descend down the hierarchy and
* scan remaining child buses.
*/
- child = pci_find_bus(pci_domain_nr(bus), busnr);
+ child = pci_find_bus(pci_domain_nr(bus), secondary);
if (!child) {
- child = pci_add_new_bus(bus, dev, busnr);
+ child = pci_add_new_bus(bus, dev, secondary);
if (!child)
goto out;
- child->primary = buses & 0xFF;
- child->subordinate = (buses >> 16) & 0xFF;
+ child->primary = primary;
+ child->subordinate = subordinate;
child->bridge_ctl = bctl;
}
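PCI_PRIMARY_BUS packs three bus numbers into the low bytes of one config dword; naming them up front makes every later use readable. The decode, standalone:

#include <stdint.h>
#include <stdio.h>

struct bridge_buses {
        uint8_t primary;        /* bus the bridge itself sits on */
        uint8_t secondary;      /* bus directly behind the bridge */
        uint8_t subordinate;    /* highest bus reachable behind it */
};

static struct bridge_buses decode_buses(uint32_t buses)
{
        struct bridge_buses b = {
                .primary     = buses & 0xFF,
                .secondary   = (buses >> 8) & 0xFF,
                .subordinate = (buses >> 16) & 0xFF,
        };
        return b;
}

int main(void)
{
        struct bridge_buses b = decode_buses(0x00050400u);

        printf("primary %02x, secondary %02x, subordinate %02x\n",
               b.primary, b.secondary, b.subordinate);
        return 0;
}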
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 81d19d5683a..3ea0b29c010 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -368,8 +368,9 @@ static void __devinit quirk_io_region(struct pci_dev *dev, unsigned region,
bus_region.end = res->end;
pcibios_bus_to_resource(dev, res, &bus_region);
- pci_claim_resource(dev, nr);
- dev_info(&dev->dev, "quirk: %pR claimed by %s\n", res, name);
+ if (pci_claim_resource(dev, nr) == 0)
+ dev_info(&dev->dev, "quirk: %pR claimed by %s\n",
+ res, name);
}
}
@@ -1977,11 +1978,25 @@ static void __devinit quirk_via_cx700_pci_parking_caching(struct pci_dev *dev)
/*
* Disable PCI Bus Parking and PCI Master read caching on CX700
* which causes unspecified timing errors with a VT6212L on the PCI
- * bus leading to USB2.0 packet loss. The defaults are that these
- * features are turned off but some BIOSes turn them on.
+ * bus leading to USB2.0 packet loss.
+ *
+ * This quirk is only enabled if a second (on the external PCI bus)
+ * VT6212L is found -- the CX700 core itself also contains a USB
+ * host controller with the same PCI ID as the VT6212L.
*/
+ /* Count VT6212L instances */
+ struct pci_dev *p = pci_get_device(PCI_VENDOR_ID_VIA,
+ PCI_DEVICE_ID_VIA_8235_USB_2, NULL);
uint8_t b;
+
+ /* p should contain the first (internal) VT6212L -- see if we have
+ an external one by searching again */
+ p = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235_USB_2, p);
+ if (!p)
+ return;
+ pci_dev_put(p);
+
if (pci_read_config_byte(dev, 0x76, &b) == 0) {
if (b & 0x40) {
/* Turn off PCI Bus Parking */
@@ -2008,7 +2023,7 @@ static void __devinit quirk_via_cx700_pci_parking_caching(struct pci_dev *dev)
}
}
}
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_VIA, 0x324e, quirk_via_cx700_pci_parking_caching);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, 0x324e, quirk_via_cx700_pci_parking_caching);
/*
* For Broadcom 5706, 5708, 5709 rev. A nics, any read beyond the
@@ -2108,6 +2123,7 @@ static void __devinit quirk_disable_msi(struct pci_dev *dev)
}
}
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_8131_BRIDGE, quirk_disable_msi);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, 0xa238, quirk_disable_msi);
/* Go through the list of Hypertransport capabilities and
* return 1 if a HT MSI capability is found and enabled */
@@ -2479,6 +2495,39 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x4374,
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x4375,
quirk_msi_intx_disable_bug);
+/*
+ * MSI does not work with the AMD RS780/RS880 internal graphics and HDMI audio
+ * devices unless the BIOS has initialized the nb_cntl.strap_msi_enable bit.
+ */
+static void __init rs780_int_gfx_disable_msi(struct pci_dev *int_gfx_bridge)
+{
+ u32 nb_cntl;
+
+ if (!int_gfx_bridge->subordinate)
+ return;
+
+ pci_bus_write_config_dword(int_gfx_bridge->bus, PCI_DEVFN(0, 0),
+ 0x60, 0);
+ pci_bus_read_config_dword(int_gfx_bridge->bus, PCI_DEVFN(0, 0),
+ 0x64, &nb_cntl);
+
+ if (!(nb_cntl & BIT(10))) {
+ dev_warn(&int_gfx_bridge->dev,
+ FW_WARN "RS780: MSI for internal graphics disabled\n");
+ int_gfx_bridge->subordinate->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
+ }
+}
+
+#define PCI_DEVICE_ID_AMD_RS780_P2P_INT_GFX 0x9602
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD,
+ PCI_DEVICE_ID_AMD_RS780_P2P_INT_GFX,
+ rs780_int_gfx_disable_msi);
+/* wrong vendor ID on M4A785TD motherboard: */
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ASUSTEK,
+ PCI_DEVICE_ID_AMD_RS780_P2P_INT_GFX,
+ rs780_int_gfx_disable_msi);
+
#endif /* CONFIG_PCI_MSI */
#ifdef CONFIG_PCI_IOV
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 7d678bb15ff..17bed18d24a 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -93,8 +93,7 @@ void pci_update_resource(struct pci_dev *dev, int resno)
int pci_claim_resource(struct pci_dev *dev, int resource)
{
struct resource *res = &dev->resource[resource];
- struct resource *root;
- int err;
+ struct resource *root, *conflict;
root = pci_find_parent_resource(dev, res);
if (!root) {
@@ -103,12 +102,15 @@ int pci_claim_resource(struct pci_dev *dev, int resource)
return -EINVAL;
}
- err = request_resource(root, res);
- if (err)
+ conflict = request_resource_conflict(root, res);
+ if (conflict) {
dev_err(&dev->dev,
- "address space collision: %pR already in use\n", res);
+ "address space collision: %pR conflicts with %s %pR\n",
+ res, conflict->name, conflict);
+ return -EBUSY;
+ }
- return err;
+ return 0;
}
EXPORT_SYMBOL(pci_claim_resource);
diff --git a/drivers/pcmcia/at91_cf.c b/drivers/pcmcia/at91_cf.c
index 5d228071ec6..fb904f444d9 100644
--- a/drivers/pcmcia/at91_cf.c
+++ b/drivers/pcmcia/at91_cf.c
@@ -361,7 +361,6 @@ static int at91_cf_suspend(struct platform_device *pdev, pm_message_t mesg)
struct at91_cf_socket *cf = platform_get_drvdata(pdev);
struct at91_cf_data *board = cf->board;
- pcmcia_socket_dev_suspend(&pdev->dev);
if (device_may_wakeup(&pdev->dev)) {
enable_irq_wake(board->det_pin);
if (board->irq_pin)
@@ -381,7 +380,6 @@ static int at91_cf_resume(struct platform_device *pdev)
disable_irq_wake(board->irq_pin);
}
- pcmcia_socket_dev_resume(&pdev->dev);
return 0;
}
diff --git a/drivers/pcmcia/au1000_generic.c b/drivers/pcmcia/au1000_generic.c
index 171c8a65488..ac4d089430f 100644
--- a/drivers/pcmcia/au1000_generic.c
+++ b/drivers/pcmcia/au1000_generic.c
@@ -510,17 +510,6 @@ static int au1x00_drv_pcmcia_probe(struct platform_device *dev)
return ret;
}
-static int au1x00_drv_pcmcia_suspend(struct platform_device *dev,
- pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&dev->dev);
-}
-
-static int au1x00_drv_pcmcia_resume(struct platform_device *dev)
-{
- return pcmcia_socket_dev_resume(&dev->dev);
-}
-
static struct platform_driver au1x00_pcmcia_driver = {
.driver = {
.name = "au1x00-pcmcia",
@@ -528,8 +517,6 @@ static struct platform_driver au1x00_pcmcia_driver = {
},
.probe = au1x00_drv_pcmcia_probe,
.remove = au1x00_drv_pcmcia_remove,
- .suspend = au1x00_drv_pcmcia_suspend,
- .resume = au1x00_drv_pcmcia_resume,
};
diff --git a/drivers/pcmcia/bfin_cf_pcmcia.c b/drivers/pcmcia/bfin_cf_pcmcia.c
index 2482ce7ac6d..93f9ddeb0c3 100644
--- a/drivers/pcmcia/bfin_cf_pcmcia.c
+++ b/drivers/pcmcia/bfin_cf_pcmcia.c
@@ -300,16 +300,6 @@ static int __devexit bfin_cf_remove(struct platform_device *pdev)
return 0;
}
-static int bfin_cf_suspend(struct platform_device *pdev, pm_message_t mesg)
-{
- return pcmcia_socket_dev_suspend(&pdev->dev);
-}
-
-static int bfin_cf_resume(struct platform_device *pdev)
-{
- return pcmcia_socket_dev_resume(&pdev->dev);
-}
-
static struct platform_driver bfin_cf_driver = {
.driver = {
.name = (char *)driver_name,
@@ -317,8 +307,6 @@ static struct platform_driver bfin_cf_driver = {
},
.probe = bfin_cf_probe,
.remove = __devexit_p(bfin_cf_remove),
- .suspend = bfin_cf_suspend,
- .resume = bfin_cf_resume,
};
static int __init bfin_cf_init(void)
diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c
index e679e708db6..75ed866e695 100644
--- a/drivers/pcmcia/cs.c
+++ b/drivers/pcmcia/cs.c
@@ -76,65 +76,6 @@ DECLARE_RWSEM(pcmcia_socket_list_rwsem);
EXPORT_SYMBOL(pcmcia_socket_list_rwsem);
-/*
- * Low-level PCMCIA socket drivers need to register with the PCCard
- * core using pcmcia_register_socket.
- *
- * socket drivers are expected to use the following callbacks in their
- * .drv struct:
- * - pcmcia_socket_dev_suspend
- * - pcmcia_socket_dev_resume
- * These functions check for the appropriate struct pcmcia_soket arrays,
- * and pass them to the low-level functions pcmcia_{suspend,resume}_socket
- */
-static int socket_early_resume(struct pcmcia_socket *skt);
-static int socket_late_resume(struct pcmcia_socket *skt);
-static int socket_resume(struct pcmcia_socket *skt);
-static int socket_suspend(struct pcmcia_socket *skt);
-
-static void pcmcia_socket_dev_run(struct device *dev,
- int (*cb)(struct pcmcia_socket *))
-{
- struct pcmcia_socket *socket;
-
- down_read(&pcmcia_socket_list_rwsem);
- list_for_each_entry(socket, &pcmcia_socket_list, socket_list) {
- if (socket->dev.parent != dev)
- continue;
- mutex_lock(&socket->skt_mutex);
- cb(socket);
- mutex_unlock(&socket->skt_mutex);
- }
- up_read(&pcmcia_socket_list_rwsem);
-}
-
-int pcmcia_socket_dev_suspend(struct device *dev)
-{
- pcmcia_socket_dev_run(dev, socket_suspend);
- return 0;
-}
-EXPORT_SYMBOL(pcmcia_socket_dev_suspend);
-
-void pcmcia_socket_dev_early_resume(struct device *dev)
-{
- pcmcia_socket_dev_run(dev, socket_early_resume);
-}
-EXPORT_SYMBOL(pcmcia_socket_dev_early_resume);
-
-void pcmcia_socket_dev_late_resume(struct device *dev)
-{
- pcmcia_socket_dev_run(dev, socket_late_resume);
-}
-EXPORT_SYMBOL(pcmcia_socket_dev_late_resume);
-
-int pcmcia_socket_dev_resume(struct device *dev)
-{
- pcmcia_socket_dev_run(dev, socket_resume);
- return 0;
-}
-EXPORT_SYMBOL(pcmcia_socket_dev_resume);
-
-
struct pcmcia_socket *pcmcia_get_socket(struct pcmcia_socket *skt)
{
struct device *dev = get_device(&skt->dev);
@@ -578,12 +519,18 @@ static int socket_early_resume(struct pcmcia_socket *skt)
static int socket_late_resume(struct pcmcia_socket *skt)
{
+ int ret;
+
mutex_lock(&skt->ops_mutex);
skt->state &= ~SOCKET_SUSPEND;
mutex_unlock(&skt->ops_mutex);
- if (!(skt->state & SOCKET_PRESENT))
- return socket_insert(skt);
+ if (!(skt->state & SOCKET_PRESENT)) {
+ ret = socket_insert(skt);
+ if (ret == -ENODEV)
+ ret = 0;
+ return ret;
+ }
if (skt->resume_status) {
socket_shutdown(skt);
@@ -919,11 +866,66 @@ static void pcmcia_release_socket_class(struct class *data)
}
+#ifdef CONFIG_PM
+
+static int __pcmcia_pm_op(struct device *dev,
+ int (*callback) (struct pcmcia_socket *skt))
+{
+ struct pcmcia_socket *s = container_of(dev, struct pcmcia_socket, dev);
+ int ret;
+
+ mutex_lock(&s->skt_mutex);
+ ret = callback(s);
+ mutex_unlock(&s->skt_mutex);
+
+ return ret;
+}
+
+static int pcmcia_socket_dev_suspend_noirq(struct device *dev)
+{
+ return __pcmcia_pm_op(dev, socket_suspend);
+}
+
+static int pcmcia_socket_dev_resume_noirq(struct device *dev)
+{
+ return __pcmcia_pm_op(dev, socket_early_resume);
+}
+
+static int pcmcia_socket_dev_resume(struct device *dev)
+{
+ return __pcmcia_pm_op(dev, socket_late_resume);
+}
+
+static const struct dev_pm_ops pcmcia_socket_pm_ops = {
+ /* dev_resume may be called with IRQs enabled */
+ SET_SYSTEM_SLEEP_PM_OPS(NULL,
+ pcmcia_socket_dev_resume)
+
+ /* late suspend must be called with IRQs disabled */
+ .suspend_noirq = pcmcia_socket_dev_suspend_noirq,
+ .freeze_noirq = pcmcia_socket_dev_suspend_noirq,
+ .poweroff_noirq = pcmcia_socket_dev_suspend_noirq,
+
+ /* early resume must be called with IRQs disabled */
+ .resume_noirq = pcmcia_socket_dev_resume_noirq,
+ .thaw_noirq = pcmcia_socket_dev_resume_noirq,
+ .restore_noirq = pcmcia_socket_dev_resume_noirq,
+};
+
+#define PCMCIA_SOCKET_CLASS_PM_OPS (&pcmcia_socket_pm_ops)
+
+#else /* CONFIG_PM */
+
+#define PCMCIA_SOCKET_CLASS_PM_OPS NULL
+
+#endif /* CONFIG_PM */
+
struct class pcmcia_socket_class = {
.name = "pcmcia_socket",
.dev_uevent = pcmcia_socket_uevent,
.dev_release = pcmcia_release_socket,
.class_release = pcmcia_release_socket_class,
+ .pm = PCMCIA_SOCKET_CLASS_PM_OPS,
};
EXPORT_SYMBOL(pcmcia_socket_class);
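Each of the new PM entry points has the same shape: recover the socket from its struct device, take the socket mutex, run one state callback. A userspace sketch of that lock-wrapped dispatch pattern (types hypothetical), using pthreads:

#include <pthread.h>
#include <stdio.h>

/* Hypothetical stand-in for struct pcmcia_socket. */
struct socket_sim {
        pthread_mutex_t lock;
        const char *name;
};

static int do_suspend(struct socket_sim *s)
{
        printf("%s: suspend\n", s->name);
        return 0;
}

static int do_late_resume(struct socket_sim *s)
{
        printf("%s: late resume\n", s->name);
        return 0;
}

/* One helper, many entry points: serialize every PM callback per socket. */
static int pm_op(struct socket_sim *s, int (*cb)(struct socket_sim *))
{
        int ret;

        pthread_mutex_lock(&s->lock);
        ret = cb(s);
        pthread_mutex_unlock(&s->lock);
        return ret;
}

int main(void)
{
        struct socket_sim s = { PTHREAD_MUTEX_INITIALIZER, "socket0" };

        pm_op(&s, do_suspend);
        pm_op(&s, do_late_resume);
        return 0;
}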
diff --git a/drivers/pcmcia/db1xxx_ss.c b/drivers/pcmcia/db1xxx_ss.c
index 9254ab0b29b..a520193b645 100644
--- a/drivers/pcmcia/db1xxx_ss.c
+++ b/drivers/pcmcia/db1xxx_ss.c
@@ -558,37 +558,10 @@ static int __devexit db1x_pcmcia_socket_remove(struct platform_device *pdev)
return 0;
}
-#ifdef CONFIG_PM
-static int db1x_pcmcia_suspend(struct device *dev)
-{
- return pcmcia_socket_dev_suspend(dev);
-}
-
-static int db1x_pcmcia_resume(struct device *dev)
-{
- return pcmcia_socket_dev_resume(dev);
-}
-
-static struct dev_pm_ops db1x_pcmcia_pmops = {
- .resume = db1x_pcmcia_resume,
- .suspend = db1x_pcmcia_suspend,
- .thaw = db1x_pcmcia_resume,
- .freeze = db1x_pcmcia_suspend,
-};
-
-#define DB1XXX_SS_PMOPS &db1x_pcmcia_pmops
-
-#else
-
-#define DB1XXX_SS_PMOPS NULL
-
-#endif
-
static struct platform_driver db1x_pcmcia_socket_driver = {
.driver = {
.name = "db1xxx_pcmcia",
.owner = THIS_MODULE,
- .pm = DB1XXX_SS_PMOPS
},
.probe = db1x_pcmcia_socket_probe,
.remove = __devexit_p(db1x_pcmcia_socket_remove),
diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index ad93ebd7b2a..52d33b2a5bc 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -509,8 +509,12 @@ struct pcmcia_device *pcmcia_device_add(struct pcmcia_socket *s, unsigned int fu
p_dev->device_no = (s->device_count++);
mutex_unlock(&s->ops_mutex);
- /* max of 2 devices per card */
- if (p_dev->device_no >= 2)
+ /* max of 2 PFC devices */
+ if ((p_dev->device_no >= 2) && (function == 0))
+ goto err_free;
+
+ /* max of 4 devices overall */
+ if (p_dev->device_no >= 4)
goto err_free;
p_dev->socket = s;
diff --git a/drivers/pcmcia/i82092.c b/drivers/pcmcia/i82092.c
index f5da6265331..3003bb3dfcc 100644
--- a/drivers/pcmcia/i82092.c
+++ b/drivers/pcmcia/i82092.c
@@ -39,27 +39,11 @@ static struct pci_device_id i82092aa_pci_ids[] = {
};
MODULE_DEVICE_TABLE(pci, i82092aa_pci_ids);
-#ifdef CONFIG_PM
-static int i82092aa_socket_suspend (struct pci_dev *dev, pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&dev->dev);
-}
-
-static int i82092aa_socket_resume (struct pci_dev *dev)
-{
- return pcmcia_socket_dev_resume(&dev->dev);
-}
-#endif
-
static struct pci_driver i82092aa_pci_driver = {
.name = "i82092aa",
.id_table = i82092aa_pci_ids,
.probe = i82092aa_pci_probe,
.remove = __devexit_p(i82092aa_pci_remove),
-#ifdef CONFIG_PM
- .suspend = i82092aa_socket_suspend,
- .resume = i82092aa_socket_resume,
-#endif
};
diff --git a/drivers/pcmcia/i82365.c b/drivers/pcmcia/i82365.c
index c13fd936051..d53d9b5659c 100644
--- a/drivers/pcmcia/i82365.c
+++ b/drivers/pcmcia/i82365.c
@@ -1223,16 +1223,7 @@ static int pcic_init(struct pcmcia_socket *s)
return 0;
}
-static int i82365_drv_pcmcia_suspend(struct platform_device *dev,
- pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&dev->dev);
-}
-static int i82365_drv_pcmcia_resume(struct platform_device *dev)
-{
- return pcmcia_socket_dev_resume(&dev->dev);
-}
static struct pccard_operations pcic_operations = {
.init = pcic_init,
.get_status = pcic_get_status,
@@ -1248,8 +1239,6 @@ static struct platform_driver i82365_driver = {
.name = "i82365",
.owner = THIS_MODULE,
},
- .suspend = i82365_drv_pcmcia_suspend,
- .resume = i82365_drv_pcmcia_resume,
};
static struct platform_device *i82365_device;
diff --git a/drivers/pcmcia/m32r_cfc.c b/drivers/pcmcia/m32r_cfc.c
index 0ece2cd4a85..ab21264468d 100644
--- a/drivers/pcmcia/m32r_cfc.c
+++ b/drivers/pcmcia/m32r_cfc.c
@@ -685,16 +685,7 @@ static struct pccard_operations pcc_operations = {
.set_mem_map = pcc_set_mem_map,
};
-static int cfc_drv_pcmcia_suspend(struct platform_device *dev,
- pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&dev->dev);
-}
-static int cfc_drv_pcmcia_resume(struct platform_device *dev)
-{
- return pcmcia_socket_dev_resume(&dev->dev);
-}
/*====================================================================*/
static struct platform_driver pcc_driver = {
@@ -702,8 +693,6 @@ static struct platform_driver pcc_driver = {
.name = "cfc",
.owner = THIS_MODULE,
},
- .suspend = cfc_drv_pcmcia_suspend,
- .resume = cfc_drv_pcmcia_resume,
};
static struct platform_device pcc_device = {
diff --git a/drivers/pcmcia/m32r_pcc.c b/drivers/pcmcia/m32r_pcc.c
index 72844c5a6d0..0caf3db7c70 100644
--- a/drivers/pcmcia/m32r_pcc.c
+++ b/drivers/pcmcia/m32r_pcc.c
@@ -663,16 +663,6 @@ static struct pccard_operations pcc_operations = {
.set_mem_map = pcc_set_mem_map,
};
-static int pcc_drv_pcmcia_suspend(struct platform_device *dev,
- pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&dev->dev);
-}
-
-static int pcc_drv_pcmcia_resume(struct platform_device *dev)
-{
- return pcmcia_socket_dev_resume(&dev->dev);
-}
/*====================================================================*/
static struct platform_driver pcc_driver = {
@@ -680,8 +670,6 @@ static struct platform_driver pcc_driver = {
.name = "pcc",
.owner = THIS_MODULE,
},
- .suspend = pcc_drv_pcmcia_suspend,
- .resume = pcc_drv_pcmcia_resume,
};
static struct platform_device pcc_device = {
diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c
index 61c21591812..01ef7de1532 100644
--- a/drivers/pcmcia/m8xx_pcmcia.c
+++ b/drivers/pcmcia/m8xx_pcmcia.c
@@ -1288,21 +1288,6 @@ static int m8xx_remove(struct of_device *ofdev)
return 0;
}
-#ifdef CONFIG_PM
-static int m8xx_suspend(struct platform_device *pdev, pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&pdev->dev);
-}
-
-static int m8xx_resume(struct platform_device *pdev)
-{
- return pcmcia_socket_dev_resume(&pdev->dev);
-}
-#else
-#define m8xx_suspend NULL
-#define m8xx_resume NULL
-#endif
-
static const struct of_device_id m8xx_pcmcia_match[] = {
{
.type = "pcmcia",
@@ -1318,8 +1303,6 @@ static struct of_platform_driver m8xx_pcmcia_driver = {
.match_table = m8xx_pcmcia_match,
.probe = m8xx_probe,
.remove = m8xx_remove,
- .suspend = m8xx_suspend,
- .resume = m8xx_resume,
};
static int __init m8xx_init(void)
diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c
index 3ef99155239..9edc396577b 100644
--- a/drivers/pcmcia/omap_cf.c
+++ b/drivers/pcmcia/omap_cf.c
@@ -330,24 +330,12 @@ static int __exit omap_cf_remove(struct platform_device *pdev)
return 0;
}
-static int omap_cf_suspend(struct platform_device *pdev, pm_message_t mesg)
-{
- return pcmcia_socket_dev_suspend(&pdev->dev);
-}
-
-static int omap_cf_resume(struct platform_device *pdev)
-{
- return pcmcia_socket_dev_resume(&pdev->dev);
-}
-
static struct platform_driver omap_cf_driver = {
.driver = {
.name = (char *) driver_name,
.owner = THIS_MODULE,
},
.remove = __exit_p(omap_cf_remove),
- .suspend = omap_cf_suspend,
- .resume = omap_cf_resume,
};
static int __init omap_cf_init(void)
diff --git a/drivers/pcmcia/pd6729.c b/drivers/pcmcia/pd6729.c
index 7ba57a565cd..4a34268cc51 100644
--- a/drivers/pcmcia/pd6729.c
+++ b/drivers/pcmcia/pd6729.c
@@ -14,13 +14,13 @@
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <linux/device.h>
+#include <linux/io.h>
#include <pcmcia/cs_types.h>
#include <pcmcia/ss.h>
#include <pcmcia/cs.h>
#include <asm/system.h>
-#include <asm/io.h>
#include "pd6729.h"
#include "i82365.h"
@@ -222,9 +222,9 @@ static irqreturn_t pd6729_interrupt(int irq, void *dev)
? SS_READY : 0;
}
- if (events) {
+ if (events)
pcmcia_parse_events(&socket[i].socket, events);
- }
+
active |= events;
}
@@ -256,9 +256,8 @@ static int pd6729_get_status(struct pcmcia_socket *sock, u_int *value)
status = indirect_read(socket, I365_STATUS);
*value = 0;
- if ((status & I365_CS_DETECT) == I365_CS_DETECT) {
+ if ((status & I365_CS_DETECT) == I365_CS_DETECT)
*value |= SS_DETECT;
- }
/*
* IO cards have a different meaning of bits 0,1
@@ -308,7 +307,7 @@ static int pd6729_set_socket(struct pcmcia_socket *sock, socket_state_t *state)
socket->card_irq = state->io_irq;
reg = 0;
- /* The reset bit has "inverse" logic */
+ /* The reset bit has "inverse" logic */
if (!(state->flags & SS_RESET))
reg |= I365_PC_RESET;
if (state->flags & SS_IOCARD)
@@ -380,7 +379,7 @@ static int pd6729_set_socket(struct pcmcia_socket *sock, socket_state_t *state)
indirect_write(socket, I365_POWER, reg);
if (irq_mode == 1) {
- /* all interrupts are to be done as PCI interrupts */
+ /* all interrupts are to be done as PCI interrupts */
data = PD67_EC1_INV_MGMT_IRQ | PD67_EC1_INV_CARD_IRQ;
} else
data = 0;
@@ -391,9 +390,9 @@ static int pd6729_set_socket(struct pcmcia_socket *sock, socket_state_t *state)
/* Enable specific interrupt events */
reg = 0x00;
- if (state->csc_mask & SS_DETECT) {
+ if (state->csc_mask & SS_DETECT)
reg |= I365_CSC_DETECT;
- }
+
if (state->flags & SS_IOCARD) {
if (state->csc_mask & SS_STSCHG)
reg |= I365_CSC_STSCHG;
@@ -450,9 +449,12 @@ static int pd6729_set_io_map(struct pcmcia_socket *sock,
ioctl = indirect_read(socket, I365_IOCTL) & ~I365_IOCTL_MASK(map);
- if (io->flags & MAP_0WS) ioctl |= I365_IOCTL_0WS(map);
- if (io->flags & MAP_16BIT) ioctl |= I365_IOCTL_16BIT(map);
- if (io->flags & MAP_AUTOSZ) ioctl |= I365_IOCTL_IOCS16(map);
+ if (io->flags & MAP_0WS)
+ ioctl |= I365_IOCTL_0WS(map);
+ if (io->flags & MAP_16BIT)
+ ioctl |= I365_IOCTL_16BIT(map);
+ if (io->flags & MAP_AUTOSZ)
+ ioctl |= I365_IOCTL_IOCS16(map);
indirect_write(socket, I365_IOCTL, ioctl);
@@ -497,7 +499,7 @@ static int pd6729_set_mem_map(struct pcmcia_socket *sock,
/* write the stop address */
- i= (mem->res->end >> 12) & 0x0fff;
+ i = (mem->res->end >> 12) & 0x0fff;
switch (to_cycles(mem->speed)) {
case 0:
break;
@@ -563,7 +565,7 @@ static int pd6729_init(struct pcmcia_socket *sock)
/* the pccard structure and its functions */
static struct pccard_operations pd6729_operations = {
- .init = pd6729_init,
+ .init = pd6729_init,
.get_status = pd6729_get_status,
.set_socket = pd6729_set_socket,
.set_io_map = pd6729_set_io_map,
@@ -578,8 +580,13 @@ static irqreturn_t pd6729_test(int irq, void *dev)
static int pd6729_check_irq(int irq)
{
- if (request_irq(irq, pd6729_test, IRQF_PROBE_SHARED, "x", pd6729_test)
- != 0) return -1;
+ int ret;
+
+ ret = request_irq(irq, pd6729_test, IRQF_PROBE_SHARED, "x",
+ pd6729_test);
+ if (ret)
+ return -1;
+
free_irq(irq, pd6729_test);
return 0;
}
@@ -591,7 +598,7 @@ static u_int __devinit pd6729_isa_scan(void)
if (irq_mode == 1) {
printk(KERN_INFO "pd6729: PCI card interrupts, "
- "PCI status changes\n");
+ "PCI status changes\n");
return 0;
}
@@ -607,9 +614,10 @@ static u_int __devinit pd6729_isa_scan(void)
if (mask & (1<<i))
printk("%s%d", ((mask & ((1<<i)-1)) ? "," : ""), i);
- if (mask == 0) printk("none!");
-
- printk(" polling status changes.\n");
+ if (mask == 0)
+ printk("none!");
+ else
+ printk(" polling status changes.\n");
return mask;
}
@@ -624,11 +632,16 @@ static int __devinit pd6729_pci_probe(struct pci_dev *dev,
socket = kzalloc(sizeof(struct pd6729_socket) * MAX_SOCKETS,
GFP_KERNEL);
- if (!socket)
+ if (!socket) {
+ dev_warn(&dev->dev, "failed to kzalloc socket.\n");
return -ENOMEM;
+ }
- if ((ret = pci_enable_device(dev)))
+ ret = pci_enable_device(dev);
+ if (ret) {
+ dev_warn(&dev->dev, "failed to enable pci_device.\n");
goto err_out_free_mem;
+ }
if (!pci_resource_start(dev, 0)) {
dev_warn(&dev->dev, "refusing to load the driver as the "
@@ -639,7 +652,7 @@ static int __devinit pd6729_pci_probe(struct pci_dev *dev,
dev_info(&dev->dev, "Cirrus PD6729 PCI to PCMCIA Bridge at 0x%llx "
"on irq %d\n",
(unsigned long long)pci_resource_start(dev, 0), dev->irq);
- /*
+ /*
* Since we have no memory BARs some firmware may not
* have had PCI_COMMAND_MEMORY enabled, yet the device needs it.
*/
@@ -685,8 +698,9 @@ static int __devinit pd6729_pci_probe(struct pci_dev *dev,
pci_set_drvdata(dev, socket);
if (irq_mode == 1) {
/* Register the interrupt handler */
- if ((ret = request_irq(dev->irq, pd6729_interrupt, IRQF_SHARED,
- "pd6729", socket))) {
+ ret = request_irq(dev->irq, pd6729_interrupt, IRQF_SHARED,
+ "pd6729", socket);
+ if (ret) {
dev_err(&dev->dev, "Failed to register irq %d\n",
dev->irq);
goto err_out_free_res;
@@ -750,18 +764,6 @@ static void __devexit pd6729_pci_remove(struct pci_dev *dev)
kfree(socket);
}
-#ifdef CONFIG_PM
-static int pd6729_socket_suspend(struct pci_dev *dev, pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&dev->dev);
-}
-
-static int pd6729_socket_resume(struct pci_dev *dev)
-{
- return pcmcia_socket_dev_resume(&dev->dev);
-}
-#endif
-
static struct pci_device_id pd6729_pci_ids[] = {
{
.vendor = PCI_VENDOR_ID_CIRRUS,
@@ -778,10 +780,6 @@ static struct pci_driver pd6729_pci_driver = {
.id_table = pd6729_pci_ids,
.probe = pd6729_pci_probe,
.remove = __devexit_p(pd6729_pci_remove),
-#ifdef CONFIG_PM
- .suspend = pd6729_socket_suspend,
- .resume = pd6729_socket_resume,
-#endif
};
static int pd6729_module_init(void)
diff --git a/drivers/pcmcia/pxa2xx_base.c b/drivers/pcmcia/pxa2xx_base.c
index 76e640bccde..0a876fabfe4 100644
--- a/drivers/pcmcia/pxa2xx_base.c
+++ b/drivers/pcmcia/pxa2xx_base.c
@@ -325,19 +325,13 @@ static int pxa2xx_drv_pcmcia_remove(struct platform_device *dev)
return 0;
}
-static int pxa2xx_drv_pcmcia_suspend(struct device *dev)
-{
- return pcmcia_socket_dev_suspend(dev);
-}
-
static int pxa2xx_drv_pcmcia_resume(struct device *dev)
{
pxa2xx_configure_sockets(dev);
- return pcmcia_socket_dev_resume(dev);
+ return 0;
}
static const struct dev_pm_ops pxa2xx_drv_pcmcia_pm_ops = {
- .suspend = pxa2xx_drv_pcmcia_suspend,
.resume = pxa2xx_drv_pcmcia_resume,
};
diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c
index 4663b3fa9f9..2e47991eccf 100644
--- a/drivers/pcmcia/rsrc_nonstatic.c
+++ b/drivers/pcmcia/rsrc_nonstatic.c
@@ -810,6 +810,13 @@ static int adjust_io(struct pcmcia_socket *s, unsigned int action, unsigned long
unsigned long size = end - start + 1;
int ret = 0;
+#if defined(CONFIG_X86)
+ /* on x86, avoid anything < 0x100, as that range is often
+ * used by legacy platform devices */
+ if (start < 0x100)
+ start = 0x100;
+#endif
+
if (end < start)
return -EINVAL;
@@ -867,10 +874,8 @@ static int nonstatic_autoadd_resources(struct pcmcia_socket *s)
if (res == &ioport_resource)
continue;
dev_printk(KERN_INFO, &s->cb_dev->dev,
- "pcmcia: parent PCI bridge I/O "
- "window: 0x%llx - 0x%llx\n",
- (unsigned long long)res->start,
- (unsigned long long)res->end);
+ "pcmcia: parent PCI bridge window: %pR\n",
+ res);
if (!adjust_io(s, ADD_MANAGED_RESOURCE, res->start, res->end))
done |= IORESOURCE_IO;
@@ -880,10 +885,8 @@ static int nonstatic_autoadd_resources(struct pcmcia_socket *s)
if (res == &iomem_resource)
continue;
dev_printk(KERN_INFO, &s->cb_dev->dev,
- "pcmcia: parent PCI bridge Memory "
- "window: 0x%llx - 0x%llx\n",
- (unsigned long long)res->start,
- (unsigned long long)res->end);
+ "pcmcia: parent PCI bridge window: %pR\n",
+ res);
if (!adjust_memory(s, ADD_MANAGED_RESOURCE, res->start, res->end))
done |= IORESOURCE_MEM;
}
diff --git a/drivers/pcmcia/sa1100_generic.c b/drivers/pcmcia/sa1100_generic.c
index 8db86b90c20..51889624142 100644
--- a/drivers/pcmcia/sa1100_generic.c
+++ b/drivers/pcmcia/sa1100_generic.c
@@ -95,17 +95,6 @@ static int sa11x0_drv_pcmcia_remove(struct platform_device *dev)
return 0;
}
-static int sa11x0_drv_pcmcia_suspend(struct platform_device *dev,
- pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&dev->dev);
-}
-
-static int sa11x0_drv_pcmcia_resume(struct platform_device *dev)
-{
- return pcmcia_socket_dev_resume(&dev->dev);
-}
-
static struct platform_driver sa11x0_pcmcia_driver = {
.driver = {
.name = "sa11x0-pcmcia",
@@ -113,8 +102,6 @@ static struct platform_driver sa11x0_pcmcia_driver = {
},
.probe = sa11x0_drv_pcmcia_probe,
.remove = sa11x0_drv_pcmcia_remove,
- .suspend = sa11x0_drv_pcmcia_suspend,
- .resume = sa11x0_drv_pcmcia_resume,
};
/* sa11x0_pcmcia_init()
diff --git a/drivers/pcmcia/sa1111_generic.c b/drivers/pcmcia/sa1111_generic.c
index db79ca61cf9..799e9793e49 100644
--- a/drivers/pcmcia/sa1111_generic.c
+++ b/drivers/pcmcia/sa1111_generic.c
@@ -213,16 +213,6 @@ static int __devexit pcmcia_remove(struct sa1111_dev *dev)
return 0;
}
-static int pcmcia_suspend(struct sa1111_dev *dev, pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&dev->dev);
-}
-
-static int pcmcia_resume(struct sa1111_dev *dev)
-{
- return pcmcia_socket_dev_resume(&dev->dev);
-}
-
static struct sa1111_driver pcmcia_driver = {
.drv = {
.name = "sa1111-pcmcia",
@@ -230,8 +220,6 @@ static struct sa1111_driver pcmcia_driver = {
.devid = SA1111_DEVID_PCMCIA,
.probe = pcmcia_probe,
.remove = __devexit_p(pcmcia_remove),
- .suspend = pcmcia_suspend,
- .resume = pcmcia_resume,
};
static int __init sa1111_drv_pcmcia_init(void)
diff --git a/drivers/pcmcia/tcic.c b/drivers/pcmcia/tcic.c
index 12c49ee135e..bac85f3236b 100644
--- a/drivers/pcmcia/tcic.c
+++ b/drivers/pcmcia/tcic.c
@@ -348,16 +348,6 @@ static int __init get_tcic_id(void)
return id;
}
-static int tcic_drv_pcmcia_suspend(struct platform_device *dev,
- pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&dev->dev);
-}
-
-static int tcic_drv_pcmcia_resume(struct platform_device *dev)
-{
- return pcmcia_socket_dev_resume(&dev->dev);
-}
/*====================================================================*/
static struct platform_driver tcic_driver = {
@@ -365,8 +355,6 @@ static struct platform_driver tcic_driver = {
.name = "tcic-pcmcia",
.owner = THIS_MODULE,
},
- .suspend = tcic_drv_pcmcia_suspend,
- .resume = tcic_drv_pcmcia_resume,
};
static struct platform_device tcic_device = {
diff --git a/drivers/pcmcia/vrc4171_card.c b/drivers/pcmcia/vrc4171_card.c
index aaccdb9f4ba..86e4a1a3c64 100644
--- a/drivers/pcmcia/vrc4171_card.c
+++ b/drivers/pcmcia/vrc4171_card.c
@@ -705,24 +705,11 @@ static int __devinit vrc4171_card_setup(char *options)
__setup("vrc4171_card=", vrc4171_card_setup);
-static int vrc4171_card_suspend(struct platform_device *dev,
- pm_message_t state)
-{
- return pcmcia_socket_dev_suspend(&dev->dev);
-}
-
-static int vrc4171_card_resume(struct platform_device *dev)
-{
- return pcmcia_socket_dev_resume(&dev->dev);
-}
-
static struct platform_driver vrc4171_card_driver = {
.driver = {
.name = vrc4171_card_name,
.owner = THIS_MODULE,
},
- .suspend = vrc4171_card_suspend,
- .resume = vrc4171_card_resume,
};
static int __devinit vrc4171_card_init(void)
diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c
index 418988ab6ed..f19ad02374d 100644
--- a/drivers/pcmcia/yenta_socket.c
+++ b/drivers/pcmcia/yenta_socket.c
@@ -1290,12 +1290,9 @@ static int yenta_dev_suspend_noirq(struct device *dev)
{
struct pci_dev *pdev = to_pci_dev(dev);
struct yenta_socket *socket = pci_get_drvdata(pdev);
- int ret;
-
- ret = pcmcia_socket_dev_suspend(dev);
if (!socket)
- return ret;
+ return 0;
if (socket->type && socket->type->save_state)
socket->type->save_state(socket);
@@ -1312,7 +1309,7 @@ static int yenta_dev_suspend_noirq(struct device *dev)
*/
/* pci_set_power_state(dev, 3); */
- return ret;
+ return 0;
}
static int yenta_dev_resume_noirq(struct device *dev)
@@ -1336,26 +1333,16 @@ static int yenta_dev_resume_noirq(struct device *dev)
if (socket->type && socket->type->restore_state)
socket->type->restore_state(socket);
- pcmcia_socket_dev_early_resume(dev);
- return 0;
-}
-
-static int yenta_dev_resume(struct device *dev)
-{
- pcmcia_socket_dev_late_resume(dev);
return 0;
}
static const struct dev_pm_ops yenta_pm_ops = {
.suspend_noirq = yenta_dev_suspend_noirq,
.resume_noirq = yenta_dev_resume_noirq,
- .resume = yenta_dev_resume,
.freeze_noirq = yenta_dev_suspend_noirq,
.thaw_noirq = yenta_dev_resume_noirq,
- .thaw = yenta_dev_resume,
.poweroff_noirq = yenta_dev_suspend_noirq,
.restore_noirq = yenta_dev_resume_noirq,
- .restore = yenta_dev_resume,
};
#define YENTA_PM_OPS (&yenta_pm_ops)
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index e631dbeafd7..7bec4588c26 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -385,6 +385,16 @@ config EEEPC_LAPTOP
If you have an Eee PC laptop, say Y or M here.
+config EEEPC_WMI
+ tristate "Eee PC WMI Hotkey Driver (EXPERIMENTAL)"
+ depends on ACPI_WMI
+ depends on INPUT
+ depends on EXPERIMENTAL
+ ---help---
+ Say Y here if you want to support WMI-based hotkeys on Eee PC laptops.
+
+ To compile this driver as a module, choose M here: the module will
+ be called eeepc-wmi.
config ACPI_WMI
tristate "WMI"
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 9cd9fa0a27e..a906490e353 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -4,6 +4,7 @@
#
obj-$(CONFIG_ASUS_LAPTOP) += asus-laptop.o
obj-$(CONFIG_EEEPC_LAPTOP) += eeepc-laptop.o
+obj-$(CONFIG_EEEPC_WMI) += eeepc-wmi.o
obj-$(CONFIG_MSI_LAPTOP) += msi-laptop.o
obj-$(CONFIG_ACPI_CMPC) += classmate-laptop.o
obj-$(CONFIG_COMPAL_LAPTOP) += compal-laptop.o
diff --git a/drivers/platform/x86/asus-laptop.c b/drivers/platform/x86/asus-laptop.c
index db5f7db2ba3..475ab50732a 100644
--- a/drivers/platform/x86/asus-laptop.c
+++ b/drivers/platform/x86/asus-laptop.c
@@ -139,7 +139,7 @@ MODULE_PARM_DESC(bluetooth_status, "Set the wireless status on boot "
/* Backlight */
static acpi_handle lcd_switch_handle;
-static const char *lcd_switch_paths[] = {
+static char *lcd_switch_paths[] = {
"\\_SB.PCI0.SBRG.EC0._Q10", /* All new models */
"\\_SB.PCI0.ISA.EC0._Q10", /* A1x */
"\\_SB.PCI0.PX40.ECD0._Q10", /* L3C */
@@ -153,7 +153,7 @@ static const char *lcd_switch_paths[] = {
#define METHOD_SWITCH_DISPLAY "SDSP"
static acpi_handle display_get_handle;
-static const char *display_get_paths[] = {
+static char *display_get_paths[] = {
/* A6B, A6K A6R A7D F3JM L4R M6R A3G M6A M6V VX-1 V6J V6V W3Z */
"\\_SB.PCI0.P0P1.VGA.GETD",
/* A3E A4K, A4D A4L A6J A7J A8J Z71V M9V S5A M5A z33A W1Jc W2V G1 */
diff --git a/drivers/platform/x86/eeepc-wmi.c b/drivers/platform/x86/eeepc-wmi.c
new file mode 100644
index 00000000000..2466b7b7fb0
--- /dev/null
+++ b/drivers/platform/x86/eeepc-wmi.c
@@ -0,0 +1,157 @@
+/*
+ * Eee PC WMI hotkey driver
+ *
+ * Copyright(C) 2010 Intel Corporation.
+ *
+ * Portions based on wistron_btns.c:
+ * Copyright (C) 2005 Miloslav Trmac <mitr@volny.cz>
+ * Copyright (C) 2005 Bernhard Rosenkraenzer <bero@arklinux.org>
+ * Copyright (C) 2005 Dmitry Torokhov <dtor@mail.ru>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/input.h>
+#include <linux/input/sparse-keymap.h>
+#include <acpi/acpi_bus.h>
+#include <acpi/acpi_drivers.h>
+
+MODULE_AUTHOR("Yong Wang <yong.y.wang@intel.com>");
+MODULE_DESCRIPTION("Eee PC WMI Hotkey Driver");
+MODULE_LICENSE("GPL");
+
+#define EEEPC_WMI_EVENT_GUID "ABBC0F72-8EA1-11D1-00A0-C90629100000"
+
+MODULE_ALIAS("wmi:"EEEPC_WMI_EVENT_GUID);
+
+#define NOTIFY_BRNUP_MIN 0x11
+#define NOTIFY_BRNUP_MAX 0x1f
+#define NOTIFY_BRNDOWN_MIN 0x20
+#define NOTIFY_BRNDOWN_MAX 0x2e
+
+static const struct key_entry eeepc_wmi_keymap[] = {
+ /* Sleep already handled via generic ACPI code */
+ { KE_KEY, 0x5d, { KEY_WLAN } },
+ { KE_KEY, 0x32, { KEY_MUTE } },
+ { KE_KEY, 0x31, { KEY_VOLUMEDOWN } },
+ { KE_KEY, 0x30, { KEY_VOLUMEUP } },
+ { KE_IGNORE, NOTIFY_BRNDOWN_MIN, { KEY_BRIGHTNESSDOWN } },
+ { KE_IGNORE, NOTIFY_BRNUP_MIN, { KEY_BRIGHTNESSUP } },
+ { KE_KEY, 0xcc, { KEY_SWITCHVIDEOMODE } },
+ { KE_END, 0},
+};
+
+static struct input_dev *eeepc_wmi_input_dev;
+
+static void eeepc_wmi_notify(u32 value, void *context)
+{
+ struct acpi_buffer response = { ACPI_ALLOCATE_BUFFER, NULL };
+ union acpi_object *obj;
+ acpi_status status;
+ int code;
+
+ status = wmi_get_event_data(value, &response);
+ if (status != AE_OK) {
+ pr_err("EEEPC WMI: bad event status 0x%x\n", status);
+ return;
+ }
+
+ obj = (union acpi_object *)response.pointer;
+
+ if (obj && obj->type == ACPI_TYPE_INTEGER) {
+ code = obj->integer.value;
+
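+ /* fold each brightness notification range onto its first code so it
+ * matches the KE_IGNORE entries in eeepc_wmi_keymap above */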
+ if (code >= NOTIFY_BRNUP_MIN && code <= NOTIFY_BRNUP_MAX)
+ code = NOTIFY_BRNUP_MIN;
+ else if (code >= NOTIFY_BRNDOWN_MIN && code <= NOTIFY_BRNDOWN_MAX)
+ code = NOTIFY_BRNDOWN_MIN;
+
+ if (!sparse_keymap_report_event(eeepc_wmi_input_dev,
+ code, 1, true))
+ pr_info("EEEPC WMI: Unknown key %x pressed\n", code);
+ }
+
+ kfree(obj);
+}
+
+static int eeepc_wmi_input_setup(void)
+{
+ int err;
+
+ eeepc_wmi_input_dev = input_allocate_device();
+ if (!eeepc_wmi_input_dev)
+ return -ENOMEM;
+
+ eeepc_wmi_input_dev->name = "Eee PC WMI hotkeys";
+ eeepc_wmi_input_dev->phys = "wmi/input0";
+ eeepc_wmi_input_dev->id.bustype = BUS_HOST;
+
+ err = sparse_keymap_setup(eeepc_wmi_input_dev, eeepc_wmi_keymap, NULL);
+ if (err)
+ goto err_free_dev;
+
+ err = input_register_device(eeepc_wmi_input_dev);
+ if (err)
+ goto err_free_keymap;
+
+ return 0;
+
+err_free_keymap:
+ sparse_keymap_free(eeepc_wmi_input_dev);
+err_free_dev:
+ input_free_device(eeepc_wmi_input_dev);
+ return err;
+}
+
+static int __init eeepc_wmi_init(void)
+{
+ int err;
+ acpi_status status;
+
+ if (!wmi_has_guid(EEEPC_WMI_EVENT_GUID)) {
+ pr_warning("EEEPC WMI: No known WMI GUID found\n");
+ return -ENODEV;
+ }
+
+ err = eeepc_wmi_input_setup();
+ if (err)
+ return err;
+
+ status = wmi_install_notify_handler(EEEPC_WMI_EVENT_GUID,
+ eeepc_wmi_notify, NULL);
+ if (ACPI_FAILURE(status)) {
+ sparse_keymap_free(eeepc_wmi_input_dev);
+ input_unregister_device(eeepc_wmi_input_dev);
+ pr_err("EEEPC WMI: Unable to register notify handler - %d\n",
+ status);
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static void __exit eeepc_wmi_exit(void)
+{
+ wmi_remove_notify_handler(EEEPC_WMI_EVENT_GUID);
+ sparse_keymap_free(eeepc_wmi_input_dev);
+ input_unregister_device(eeepc_wmi_input_dev);
+}
+
+module_init(eeepc_wmi_init);
+module_exit(eeepc_wmi_exit);
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index c7bbe30010f..5af16c2bb54 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1038,6 +1038,7 @@ static struct regulator *create_regulator(struct regulator_dev *rdev,
goto overflow_err;
regulator->dev = dev;
+ sysfs_attr_init(&regulator->dev_attr.attr);
regulator->dev_attr.attr.name = kstrdup(buf, GFP_KERNEL);
if (regulator->dev_attr.attr.name == NULL)
goto attr_name_err;
diff --git a/drivers/regulator/lp3971.c b/drivers/regulator/lp3971.c
index f5532ed7927..b20b3e1d821 100644
--- a/drivers/regulator/lp3971.c
+++ b/drivers/regulator/lp3971.c
@@ -45,7 +45,7 @@ static int lp3971_set_bits(struct lp3971 *lp3971, u8 reg, u16 mask, u16 val);
LP3971_BUCK2 -> 4
LP3971_BUCK3 -> 6
*/
-#define BUCK_VOL_CHANGE_SHIFT(x) (((1 << x) & ~0x01) << 1)
+#define BUCK_VOL_CHANGE_SHIFT(x) (((!!x) << 2) | (x & ~0x01))
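+/* e.g. x=0 -> 0, x=1 -> (1<<2)|0 = 4, x=2 -> (1<<2)|2 = 6, matching the
+ * BUCK2 -> 4, BUCK3 -> 6 shift table above */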
#define BUCK_VOL_CHANGE_FLAG_GO 0x01
#define BUCK_VOL_CHANGE_FLAG_TARGET 0x02
#define BUCK_VOL_CHANGE_FLAG_MASK 0x03
@@ -187,7 +187,8 @@ static int lp3971_ldo_set_voltage(struct regulator_dev *dev,
return -EINVAL;
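+ /* shift the new value into the same field selected by the mask */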
return lp3971_set_bits(lp3971, LP3971_LDO_VOL_CONTR_REG(ldo),
- LDO_VOL_CONTR_MASK << LDO_VOL_CONTR_SHIFT(ldo), val);
+ LDO_VOL_CONTR_MASK << LDO_VOL_CONTR_SHIFT(ldo),
+ val << LDO_VOL_CONTR_SHIFT(ldo));
}
static struct regulator_ops lp3971_ldo_ops = {
@@ -439,6 +440,10 @@ static int __devinit setup_regulators(struct lp3971 *lp3971,
lp3971->num_regulators = pdata->num_regulators;
lp3971->rdev = kcalloc(pdata->num_regulators,
sizeof(struct regulator_dev *), GFP_KERNEL);
+ if (!lp3971->rdev) {
+ err = -ENOMEM;
+ goto err_nomem;
+ }
/* Instantiate the regulators */
for (i = 0; i < pdata->num_regulators; i++) {
@@ -461,6 +466,7 @@ error:
regulator_unregister(lp3971->rdev[i]);
kfree(lp3971->rdev);
lp3971->rdev = NULL;
+err_nomem:
return err;
}
diff --git a/drivers/regulator/max1586.c b/drivers/regulator/max1586.c
index a49fc952c9a..c0b09e15edb 100644
--- a/drivers/regulator/max1586.c
+++ b/drivers/regulator/max1586.c
@@ -243,8 +243,8 @@ static int __devexit max1586_pmic_remove(struct i2c_client *client)
for (i = 0; i <= MAX1586_V6; i++)
if (rdev[i])
regulator_unregister(rdev[i]);
- kfree(rdev);
i2c_set_clientdata(client, NULL);
+ kfree(rdev);
return 0;
}
diff --git a/drivers/regulator/max8649.c b/drivers/regulator/max8649.c
index 3ebdf698c64..833aaedc7e6 100644
--- a/drivers/regulator/max8649.c
+++ b/drivers/regulator/max8649.c
@@ -356,6 +356,7 @@ static int __devinit max8649_regulator_probe(struct i2c_client *client,
dev_info(info->dev, "Max8649 regulator device is detected.\n");
return 0;
out:
+ i2c_set_clientdata(client, NULL);
kfree(info);
return ret;
}
@@ -367,9 +368,9 @@ static int __devexit max8649_regulator_remove(struct i2c_client *client)
if (info) {
if (info->regulator)
regulator_unregister(info->regulator);
+ i2c_set_clientdata(client, NULL);
kfree(info);
}
- i2c_set_clientdata(client, NULL);
return 0;
}
diff --git a/drivers/regulator/max8660.c b/drivers/regulator/max8660.c
index f12f1bb6213..47f90b2fc29 100644
--- a/drivers/regulator/max8660.c
+++ b/drivers/regulator/max8660.c
@@ -470,8 +470,8 @@ static int __devexit max8660_remove(struct i2c_client *client)
for (i = 0; i < MAX8660_V_END; i++)
if (rdev[i])
regulator_unregister(rdev[i]);
- kfree(rdev);
i2c_set_clientdata(client, NULL);
+ kfree(rdev);
return 0;
}
diff --git a/drivers/regulator/max8925-regulator.c b/drivers/regulator/max8925-regulator.c
index 67873f08ed4..b6218f11c95 100644
--- a/drivers/regulator/max8925-regulator.c
+++ b/drivers/regulator/max8925-regulator.c
@@ -230,7 +230,7 @@ static struct max8925_regulator_info max8925_regulator_info[] = {
MAX8925_LDO(20, 750, 3900, 50),
};
-static inline struct max8925_regulator_info *find_regulator_info(int id)
+static struct max8925_regulator_info * __devinit find_regulator_info(int id)
{
struct max8925_regulator_info *ri;
int i;
@@ -247,7 +247,7 @@ static int __devinit max8925_regulator_probe(struct platform_device *pdev)
{
struct max8925_chip *chip = dev_get_drvdata(pdev->dev.parent);
struct max8925_platform_data *pdata = chip->dev->platform_data;
- struct max8925_regulator_info *ri = NULL;
+ struct max8925_regulator_info *ri;
struct regulator_dev *rdev;
ri = find_regulator_info(pdev->id);
@@ -274,7 +274,9 @@ static int __devexit max8925_regulator_remove(struct platform_device *pdev)
{
struct regulator_dev *rdev = platform_get_drvdata(pdev);
+ platform_set_drvdata(pdev, NULL);
regulator_unregister(rdev);
+
return 0;
}
diff --git a/drivers/rtc/rtc-mc13783.c b/drivers/rtc/rtc-mc13783.c
index d60c81b7b69..1379c7faa44 100644
--- a/drivers/rtc/rtc-mc13783.c
+++ b/drivers/rtc/rtc-mc13783.c
@@ -319,35 +319,38 @@ static int __devinit mc13783_rtc_probe(struct platform_device *pdev)
{
int ret;
struct mc13783_rtc *priv;
+ struct mc13783 *mc13783;
int rtcrst_pending;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (!priv)
return -ENOMEM;
- priv->mc13783 = dev_get_drvdata(pdev->dev.parent);
+ mc13783 = dev_get_drvdata(pdev->dev.parent);
+ priv->mc13783 = mc13783;
+
platform_set_drvdata(pdev, priv);
- mc13783_lock(priv->mc13783);
+ mc13783_lock(mc13783);
- ret = mc13783_irq_request(priv->mc13783, MC13783_IRQ_RTCRST,
+ ret = mc13783_irq_request(mc13783, MC13783_IRQ_RTCRST,
mc13783_rtc_reset_handler, DRIVER_NAME, priv);
if (ret)
goto err_reset_irq_request;
- ret = mc13783_irq_status(priv->mc13783, MC13783_IRQ_RTCRST,
+ ret = mc13783_irq_status(mc13783, MC13783_IRQ_RTCRST,
NULL, &rtcrst_pending);
if (ret)
goto err_reset_irq_status;
priv->valid = !rtcrst_pending;
- ret = mc13783_irq_request_nounmask(priv->mc13783, MC13783_IRQ_1HZ,
+ ret = mc13783_irq_request_nounmask(mc13783, MC13783_IRQ_1HZ,
mc13783_rtc_update_handler, DRIVER_NAME, priv);
if (ret)
goto err_update_irq_request;
- ret = mc13783_irq_request_nounmask(priv->mc13783, MC13783_IRQ_TODA,
+ ret = mc13783_irq_request_nounmask(mc13783, MC13783_IRQ_TODA,
mc13783_rtc_alarm_handler, DRIVER_NAME, priv);
if (ret)
goto err_alarm_irq_request;
@@ -357,22 +360,22 @@ static int __devinit mc13783_rtc_probe(struct platform_device *pdev)
if (IS_ERR(priv->rtc)) {
ret = PTR_ERR(priv->rtc);
- mc13783_irq_free(priv->mc13783, MC13783_IRQ_TODA, priv);
+ mc13783_irq_free(mc13783, MC13783_IRQ_TODA, priv);
err_alarm_irq_request:
- mc13783_irq_free(priv->mc13783, MC13783_IRQ_1HZ, priv);
+ mc13783_irq_free(mc13783, MC13783_IRQ_1HZ, priv);
err_update_irq_request:
err_reset_irq_status:
- mc13783_irq_free(priv->mc13783, MC13783_IRQ_RTCRST, priv);
+ mc13783_irq_free(mc13783, MC13783_IRQ_RTCRST, priv);
err_reset_irq_request:
platform_set_drvdata(pdev, NULL);
kfree(priv);
}
- mc13783_unlock(priv->mc13783);
+ mc13783_unlock(mc13783);
return ret;
}
diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c
index 51224f76b98..b3736b8aad3 100644
--- a/drivers/s390/block/dasd_3990_erp.c
+++ b/drivers/s390/block/dasd_3990_erp.c
@@ -2287,7 +2287,8 @@ static struct dasd_ccw_req *dasd_3990_erp_add_erp(struct dasd_ccw_req *cqr)
if (cqr->cpmode == 1) {
cplength = 0;
- datasize = sizeof(struct tcw) + sizeof(struct tsb);
+ /* the TCW needs to be 64-byte aligned, so leave enough room */
+ datasize = 64 + sizeof(struct tcw) + sizeof(struct tsb);
} else {
cplength = 2;
datasize = 0;
@@ -2316,8 +2317,8 @@ static struct dasd_ccw_req *dasd_3990_erp_add_erp(struct dasd_ccw_req *cqr)
if (cqr->cpmode == 1) {
/* make a shallow copy of the original tcw but set new tsb */
erp->cpmode = 1;
- erp->cpaddr = erp->data;
- tcw = erp->data;
+ erp->cpaddr = PTR_ALIGN(erp->data, 64);
+ tcw = erp->cpaddr;
tsb = (struct tsb *) &tcw[1];
*tcw = *((struct tcw *)cqr->cpaddr);
tcw->tsb = (long)tsb;
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 01f4e7a34aa..0cb23311685 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -3155,11 +3155,11 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device,
tsb = NULL;
sense = NULL;
- if (irb->scsw.tm.tcw)
+ if (irb->scsw.tm.tcw && (irb->scsw.tm.fcxs == 0x01))
tsb = tcw_get_tsb(
(struct tcw *)(unsigned long)irb->scsw.tm.tcw);
- if (tsb && (irb->scsw.tm.fcxs == 0x01)) {
+ if (tsb) {
len += sprintf(page + len, KERN_ERR PRINTK_HEADER
" tsb->length %d\n", tsb->length);
len += sprintf(page + len, KERN_ERR PRINTK_HEADER
diff --git a/drivers/s390/char/sclp_async.c b/drivers/s390/char/sclp_async.c
index 740fe405c39..f449c696e50 100644
--- a/drivers/s390/char/sclp_async.c
+++ b/drivers/s390/char/sclp_async.c
@@ -84,6 +84,7 @@ static int proc_handler_callhome(struct ctl_table *ctl, int write,
rc = copy_from_user(buf, buffer, sizeof(buf));
if (rc != 0)
return -EFAULT;
+ buf[len - 1] = '\0';
if (strict_strtoul(buf, 0, &val) != 0)
return -EINVAL;
if (val != 0 && val != 1)
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index fc7ae05ce48..4b60ede07f0 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -308,6 +308,13 @@ struct assign_storage_sccb {
u16 rn;
} __packed;
+int arch_get_memory_phys_device(unsigned long start_pfn)
+{
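+ /* rzm is the storage increment size; assuming it is always a power
+ * of two, the shift below is equivalent to dividing by rzm */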
+ if (!rzm)
+ return 0;
+ return PFN_PHYS(start_pfn) >> ilog2(rzm);
+}
+
static unsigned long long rn2addr(u16 rn)
{
return (unsigned long long) (rn - 1) * rzm;
@@ -704,13 +711,6 @@ int sclp_chp_deconfigure(struct chp_id chpid)
return do_chp_configure(SCLP_CMDW_DECONFIGURE_CHPATH | chpid.id << 8);
}
-int arch_get_memory_phys_device(unsigned long start_pfn)
-{
- if (!rzm)
- return 0;
- return PFN_PHYS(start_pfn) / rzm;
-}
-
struct chp_info_sccb {
struct sccb_header header;
u8 recognized[SCLP_CHP_INFO_MASK_SIZE];
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index 3438658b66b..3166d85914f 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -141,33 +141,6 @@ static int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count)
return memcpy_hsa(dest, src, count, TO_KERNEL);
}
-static int memcpy_real(void *dest, unsigned long src, size_t count)
-{
- unsigned long flags;
- int rc = -EFAULT;
- register unsigned long _dest asm("2") = (unsigned long) dest;
- register unsigned long _len1 asm("3") = (unsigned long) count;
- register unsigned long _src asm("4") = src;
- register unsigned long _len2 asm("5") = (unsigned long) count;
-
- if (count == 0)
- return 0;
- flags = __raw_local_irq_stnsm(0xf8UL); /* switch to real mode */
- asm volatile (
- "0: mvcle %1,%2,0x0\n"
- "1: jo 0b\n"
- " lhi %0,0x0\n"
- "2:\n"
- EX_TABLE(1b,2b)
- : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1),
- "+d" (_len2), "=m" (*((long*)dest))
- : "m" (*((long*)src))
- : "cc", "memory");
- __raw_local_irq_ssm(flags);
-
- return rc;
-}
-
static int memcpy_real_user(void __user *dest, unsigned long src, size_t count)
{
static char buf[4096];
@@ -175,7 +148,7 @@ static int memcpy_real_user(void __user *dest, unsigned long src, size_t count)
while (offs < count) {
size = min(sizeof(buf), count - offs);
- if (memcpy_real(buf, src + offs, size))
+ if (memcpy_real(buf, (void *) src + offs, size))
return -EFAULT;
if (copy_to_user(dest + offs, buf, size))
return -EFAULT;
@@ -663,7 +636,7 @@ static int __init zcore_reipl_init(void)
if (ipib_info.ipib < ZFCPDUMP_HSA_SIZE)
rc = memcpy_hsa_kernel(ipl_block, ipib_info.ipib, PAGE_SIZE);
else
- rc = memcpy_real(ipl_block, ipib_info.ipib, PAGE_SIZE);
+ rc = memcpy_real(ipl_block, (void *) ipib_info.ipib, PAGE_SIZE);
if (rc) {
free_page((unsigned long) ipl_block);
return rc;
diff --git a/drivers/serial/cpm_uart/cpm_uart_cpm2.c b/drivers/serial/cpm_uart/cpm_uart_cpm2.c
index a9802e76b5f..722eac18f38 100644
--- a/drivers/serial/cpm_uart/cpm_uart_cpm2.c
+++ b/drivers/serial/cpm_uart/cpm_uart_cpm2.c
@@ -61,7 +61,7 @@ void __iomem *cpm_uart_map_pram(struct uart_cpm_port *port,
void __iomem *pram;
unsigned long offset;
struct resource res;
- unsigned long len;
+ resource_size_t len;
/* Don't remap parameter RAM if it has already been initialized
* during console setup.
@@ -74,7 +74,7 @@ void __iomem *cpm_uart_map_pram(struct uart_cpm_port *port,
if (of_address_to_resource(np, 1, &res))
return NULL;
- len = 1 + res.end - res.start;
+ len = resource_size(&res);
pram = ioremap(res.start, len);
if (!pram)
return NULL;
diff --git a/drivers/serial/serial_cs.c b/drivers/serial/serial_cs.c
index e91db4b3801..175d202ab37 100644
--- a/drivers/serial/serial_cs.c
+++ b/drivers/serial/serial_cs.c
@@ -745,6 +745,7 @@ static struct pcmcia_device_id serial_ids[] = {
PCMCIA_PFC_DEVICE_PROD_ID13(1, "Xircom", "REM10", 0x2e3ee845, 0x76df1d29),
PCMCIA_PFC_DEVICE_PROD_ID13(1, "Xircom", "XEM5600", 0x2e3ee845, 0xf1403719),
PCMCIA_PFC_DEVICE_PROD_ID12(1, "AnyCom", "Fast Ethernet + 56K COMBO", 0x578ba6e7, 0xb0ac62c4),
+ PCMCIA_PFC_DEVICE_PROD_ID12(1, "ATKK", "LM33-PCM-T", 0xba9eb7e2, 0x077c174e),
PCMCIA_PFC_DEVICE_PROD_ID12(1, "D-Link", "DME336T", 0x1a424a1c, 0xb23897ff),
PCMCIA_PFC_DEVICE_PROD_ID12(1, "Gateway 2000", "XJEM3336", 0xdd9989be, 0x662c394c),
PCMCIA_PFC_DEVICE_PROD_ID12(1, "Grey Cell", "GCS3000", 0x2a151fac, 0x48b932ae),
diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 980f39449ee..309de6be820 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -50,7 +50,6 @@
#include <linux/list.h>
#include <linux/dmaengine.h>
#include <linux/scatterlist.h>
-#include <linux/timer.h>
#ifdef CONFIG_SUPERH
#include <asm/sh_bios.h>
@@ -780,10 +779,6 @@ static irqreturn_t sci_mpxed_interrupt(int irq, void *ptr)
if ((ssr_status & SCxSR_BRK(port)) && err_enabled)
ret = sci_br_interrupt(irq, ptr);
- WARN_ONCE(ret == IRQ_NONE,
- "%s: %d IRQ %d, status %x, control %x\n", __func__,
- irq, port->line, ssr_status, scr_status);
-
return ret;
}
diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h
index fad67d33b0b..f70c49f915f 100644
--- a/drivers/serial/sh-sci.h
+++ b/drivers/serial/sh-sci.h
@@ -31,7 +31,9 @@
# define SCSCR_INIT(port) (port->mapbase == SCIF2) ? 0xF3 : 0xF0
#elif defined(CONFIG_CPU_SUBTYPE_SH7720) || \
defined(CONFIG_CPU_SUBTYPE_SH7721) || \
- defined(CONFIG_ARCH_SHMOBILE)
+ defined(CONFIG_ARCH_SH7367) || \
+ defined(CONFIG_ARCH_SH7377) || \
+ defined(CONFIG_ARCH_SH7372)
# define SCSCR_INIT(port) 0x0030 /* TIE=0,RIE=0,TE=1,RE=1 */
# define PORT_PTCR 0xA405011EUL
# define PORT_PVCR 0xA4050122UL
@@ -94,7 +96,9 @@
# define SCSCR_INIT(port) 0x0038 /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */
#elif defined(CONFIG_CPU_SUBTYPE_SH7724)
# define SCIF_ORER 0x0001 /* overrun error bit */
-# define SCSCR_INIT(port) 0x0038 /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */
+# define SCSCR_INIT(port) ((port)->type == PORT_SCIFA ? \
+ 0x30 /* TIE=0,RIE=0,TE=1,RE=1 */ : \
+ 0x38 /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */ )
#elif defined(CONFIG_CPU_SUBTYPE_SH4_202)
# define SCSPTR2 0xffe80020 /* 16 bit SCIF */
# define SCIF_ORER 0x0001 /* overrun error bit */
@@ -197,6 +201,8 @@
defined(CONFIG_CPU_SUBTYPE_SH7786) || \
defined(CONFIG_CPU_SUBTYPE_SHX3)
#define SCI_CTRL_FLAGS_REIE 0x08 /* 7750 SCIF */
+#elif defined(CONFIG_CPU_SUBTYPE_SH7724)
+#define SCI_CTRL_FLAGS_REIE ((port)->type == PORT_SCIFA ? 0 : 8)
#else
#define SCI_CTRL_FLAGS_REIE 0
#endif
@@ -230,7 +236,9 @@
#if defined(CONFIG_CPU_SUBTYPE_SH7705) || \
defined(CONFIG_CPU_SUBTYPE_SH7720) || \
defined(CONFIG_CPU_SUBTYPE_SH7721) || \
- defined(CONFIG_ARCH_SHMOBILE)
+ defined(CONFIG_ARCH_SH7367) || \
+ defined(CONFIG_ARCH_SH7377) || \
+ defined(CONFIG_ARCH_SH7372)
# define SCIF_ORER 0x0200
# define SCIF_ERRORS ( SCIF_PER | SCIF_FER | SCIF_ER | SCIF_BRK | SCIF_ORER)
# define SCIF_RFDC_MASK 0x007f
@@ -264,7 +272,9 @@
#if defined(CONFIG_CPU_SUBTYPE_SH7705) || \
defined(CONFIG_CPU_SUBTYPE_SH7720) || \
defined(CONFIG_CPU_SUBTYPE_SH7721) || \
- defined(CONFIG_ARCH_SHMOBILE)
+ defined(CONFIG_ARCH_SH7367) || \
+ defined(CONFIG_ARCH_SH7377) || \
+ defined(CONFIG_ARCH_SH7372)
# define SCxSR_RDxF_CLEAR(port) (sci_in(port, SCxSR) & 0xfffc)
# define SCxSR_ERROR_CLEAR(port) (sci_in(port, SCxSR) & 0xfd73)
# define SCxSR_TDxE_CLEAR(port) (sci_in(port, SCxSR) & 0xffdf)
@@ -359,7 +369,10 @@
SCI_OUT(sci_size, sci_offset, value); \
}
-#if defined(CONFIG_CPU_SH3) || defined(CONFIG_ARCH_SHMOBILE)
+#if defined(CONFIG_CPU_SH3) || \
+ defined(CONFIG_ARCH_SH7367) || \
+ defined(CONFIG_ARCH_SH7377) || \
+ defined(CONFIG_ARCH_SH7372)
#if defined(CONFIG_CPU_SUBTYPE_SH7710) || defined(CONFIG_CPU_SUBTYPE_SH7712)
#define SCIx_FNS(name, sh3_sci_offset, sh3_sci_size, sh4_sci_offset, sh4_sci_size, \
sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size, \
@@ -370,7 +383,9 @@
#elif defined(CONFIG_CPU_SUBTYPE_SH7705) || \
defined(CONFIG_CPU_SUBTYPE_SH7720) || \
defined(CONFIG_CPU_SUBTYPE_SH7721) || \
- defined(CONFIG_ARCH_SHMOBILE)
+ defined(CONFIG_ARCH_SH7367) || \
+ defined(CONFIG_ARCH_SH7377) || \
+ defined(CONFIG_ARCH_SH7372)
#define SCIF_FNS(name, scif_offset, scif_size) \
CPU_SCIF_FNS(name, scif_offset, scif_size)
#else
@@ -406,7 +421,9 @@
#if defined(CONFIG_CPU_SUBTYPE_SH7705) || \
defined(CONFIG_CPU_SUBTYPE_SH7720) || \
defined(CONFIG_CPU_SUBTYPE_SH7721) || \
- defined(CONFIG_ARCH_SHMOBILE)
+ defined(CONFIG_ARCH_SH7367) || \
+ defined(CONFIG_ARCH_SH7377) || \
+ defined(CONFIG_ARCH_SH7372)
SCIF_FNS(SCSMR, 0x00, 16)
SCIF_FNS(SCBRR, 0x04, 8)
@@ -589,7 +606,9 @@ static inline int sci_rxd_in(struct uart_port *port)
#elif defined(CONFIG_CPU_SUBTYPE_SH7705) || \
defined(CONFIG_CPU_SUBTYPE_SH7720) || \
defined(CONFIG_CPU_SUBTYPE_SH7721) || \
- defined(CONFIG_ARCH_SHMOBILE)
+ defined(CONFIG_ARCH_SH7367) || \
+ defined(CONFIG_ARCH_SH7377) || \
+ defined(CONFIG_ARCH_SH7372)
#define SCBRR_VALUE(bps, clk) (((clk*2)+16*bps)/(32*bps)-1)
#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\
defined(CONFIG_CPU_SUBTYPE_SH7724)
diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c
index c2750391fd3..a3d8677af6a 100644
--- a/drivers/sh/intc.c
+++ b/drivers/sh/intc.c
@@ -2,7 +2,7 @@
* Shared interrupt handling code for IPR and INTC2 types of IRQs.
*
* Copyright (C) 2007, 2008 Magnus Damm
- * Copyright (C) 2009 Paul Mundt
+ * Copyright (C) 2009, 2010 Paul Mundt
*
* Based on intc2.c and ipr.c
*
@@ -26,6 +26,7 @@
#include <linux/list.h>
#include <linux/topology.h>
#include <linux/bitmap.h>
+#include <linux/cpumask.h>
#define _INTC_MK(fn, mode, addr_e, addr_d, width, shift) \
((shift) | ((width) << 5) | ((fn) << 9) | ((mode) << 13) | \
@@ -234,6 +235,10 @@ static inline void _intc_enable(unsigned int irq, unsigned long handle)
unsigned int cpu;
for (cpu = 0; cpu < SMP_NR(d, _INTC_ADDR_E(handle)); cpu++) {
+#ifdef CONFIG_SMP
+ if (!cpumask_test_cpu(cpu, irq_to_desc(irq)->affinity))
+ continue;
+#endif
addr = INTC_REG(d, _INTC_ADDR_E(handle), cpu);
intc_enable_fns[_INTC_MODE(handle)](addr, handle, intc_reg_fns\
[_INTC_FN(handle)], irq);
@@ -253,6 +258,10 @@ static void intc_disable(unsigned int irq)
unsigned int cpu;
for (cpu = 0; cpu < SMP_NR(d, _INTC_ADDR_D(handle)); cpu++) {
+#ifdef CONFIG_SMP
+ if (!cpumask_test_cpu(cpu, irq_to_desc(irq)->affinity))
+ continue;
+#endif
addr = INTC_REG(d, _INTC_ADDR_D(handle), cpu);
intc_disable_fns[_INTC_MODE(handle)](addr, handle,intc_reg_fns\
[_INTC_FN(handle)], irq);
@@ -301,6 +310,23 @@ static int intc_set_wake(unsigned int irq, unsigned int on)
return 0; /* allow wakeup, but setup hardware in intc_suspend() */
}
+#ifdef CONFIG_SMP
+/*
+ * This is called with the irq desc lock held, so we don't require any
+ * additional locking here at the intc desc level. The affinity mask is
+ * later tested in the enable/disable paths.
+ */
+static int intc_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+ if (!cpumask_intersects(cpumask, cpu_online_mask))
+ return -1;
+
+ cpumask_copy(irq_to_desc(irq)->affinity, cpumask);
+
+ return 0;
+}
+#endif
+
static void intc_mask_ack(unsigned int irq)
{
struct intc_desc_int *d = get_intc_desc(irq);
@@ -847,6 +873,9 @@ void __init register_intc_controller(struct intc_desc *desc)
d->chip.shutdown = intc_disable;
d->chip.set_type = intc_set_sense;
d->chip.set_wake = intc_set_wake;
+#ifdef CONFIG_SMP
+ d->chip.set_affinity = intc_set_affinity;
+#endif
if (hw->ack_regs) {
for (i = 0; i < hw->nr_ack_regs; i++)
diff --git a/drivers/staging/et131x/et1310_mac.c b/drivers/staging/et131x/et1310_mac.c
index a292b1edc41..737a9f5401d 100644
--- a/drivers/staging/et131x/et1310_mac.c
+++ b/drivers/staging/et131x/et1310_mac.c
@@ -226,7 +226,7 @@ void ConfigMACRegs2(struct et131x_adapter *etdev)
}
/* Enable TXMAC */
- ctl |= 0x05; /* TX mac enable, FC disable */
+ ctl |= 0x09; /* TX mac enable, FC disable */
writel(ctl, &etdev->regs->txmac.ctl);
/* Ready to start the RXDMA/TXDMA engine */
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 975d556b478..be6331e2c27 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -1441,7 +1441,7 @@ static int acm_resume(struct usb_interface *intf)
wb = acm->delayed_wb;
acm->delayed_wb = NULL;
spin_unlock_irq(&acm->write_lock);
- acm_start_wb(acm, acm->delayed_wb);
+ acm_start_wb(acm, wb);
} else {
spin_unlock_irq(&acm->write_lock);
}
diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 18aafcb08fc..189141ca4e0 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -52,7 +52,8 @@ MODULE_DEVICE_TABLE (usb, wdm_ids);
#define WDM_READ 4
#define WDM_INT_STALL 5
#define WDM_POLL_RUNNING 6
-
+#define WDM_RESPONDING 7
+#define WDM_SUSPENDING 8
#define WDM_MAX 16
@@ -87,9 +88,7 @@ struct wdm_device {
int count;
dma_addr_t shandle;
dma_addr_t ihandle;
- struct mutex wlock;
- struct mutex rlock;
- struct mutex plock;
+ struct mutex lock;
wait_queue_head_t wait;
struct work_struct rxwork;
int werr;
@@ -117,21 +116,22 @@ static void wdm_in_callback(struct urb *urb)
int status = urb->status;
spin_lock(&desc->iuspin);
+ clear_bit(WDM_RESPONDING, &desc->flags);
if (status) {
switch (status) {
case -ENOENT:
dev_dbg(&desc->intf->dev,
"nonzero urb status received: -ENOENT");
- break;
+ goto skip_error;
case -ECONNRESET:
dev_dbg(&desc->intf->dev,
"nonzero urb status received: -ECONNRESET");
- break;
+ goto skip_error;
case -ESHUTDOWN:
dev_dbg(&desc->intf->dev,
"nonzero urb status received: -ESHUTDOWN");
- break;
+ goto skip_error;
case -EPIPE:
dev_err(&desc->intf->dev,
"nonzero urb status received: -EPIPE\n");
@@ -147,6 +147,7 @@ static void wdm_in_callback(struct urb *urb)
desc->reslength = urb->actual_length;
memmove(desc->ubuf + desc->length, desc->inbuf, desc->reslength);
desc->length += desc->reslength;
+skip_error:
wake_up(&desc->wait);
set_bit(WDM_READ, &desc->flags);
@@ -229,13 +230,16 @@ static void wdm_int_callback(struct urb *urb)
desc->response->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
spin_lock(&desc->iuspin);
clear_bit(WDM_READ, &desc->flags);
- if (!test_bit(WDM_DISCONNECTING, &desc->flags)) {
+ set_bit(WDM_RESPONDING, &desc->flags);
+ if (!test_bit(WDM_DISCONNECTING, &desc->flags)
+ && !test_bit(WDM_SUSPENDING, &desc->flags)) {
rv = usb_submit_urb(desc->response, GFP_ATOMIC);
dev_dbg(&desc->intf->dev, "%s: usb_submit_urb %d",
__func__, rv);
}
spin_unlock(&desc->iuspin);
if (rv < 0) {
+ clear_bit(WDM_RESPONDING, &desc->flags);
if (rv == -EPERM)
return;
if (rv == -ENOMEM) {
@@ -305,14 +309,38 @@ static ssize_t wdm_write
if (we < 0)
return -EIO;
- r = mutex_lock_interruptible(&desc->wlock); /* concurrent writes */
+ desc->outbuf = buf = kmalloc(count, GFP_KERNEL);
+ if (!buf) {
+ rv = -ENOMEM;
+ goto outnl;
+ }
+
+ r = copy_from_user(buf, buffer, count);
+ if (r > 0) {
+ kfree(buf);
+ rv = -EFAULT;
+ goto outnl;
+ }
+
+ /* concurrent writes and disconnect */
+ r = mutex_lock_interruptible(&desc->lock);
rv = -ERESTARTSYS;
- if (r)
+ if (r) {
+ kfree(buf);
goto outnl;
+ }
+
+ if (test_bit(WDM_DISCONNECTING, &desc->flags)) {
+ kfree(buf);
+ rv = -ENODEV;
+ goto outnp;
+ }
r = usb_autopm_get_interface(desc->intf);
- if (r < 0)
+ if (r < 0) {
+ kfree(buf);
goto outnp;
+ }
if (!file->f_flags && O_NONBLOCK)
r = wait_event_interruptible(desc->wait, !test_bit(WDM_IN_USE,
@@ -320,24 +348,8 @@ static ssize_t wdm_write
else
if (test_bit(WDM_IN_USE, &desc->flags))
r = -EAGAIN;
- if (r < 0)
- goto out;
-
- if (test_bit(WDM_DISCONNECTING, &desc->flags)) {
- rv = -ENODEV;
- goto out;
- }
-
- desc->outbuf = buf = kmalloc(count, GFP_KERNEL);
- if (!buf) {
- rv = -ENOMEM;
- goto out;
- }
-
- r = copy_from_user(buf, buffer, count);
- if (r > 0) {
+ if (r < 0) {
kfree(buf);
- rv = -EFAULT;
goto out;
}
@@ -374,7 +386,7 @@ static ssize_t wdm_write
out:
usb_autopm_put_interface(desc->intf);
outnp:
- mutex_unlock(&desc->wlock);
+ mutex_unlock(&desc->lock);
outnl:
return rv < 0 ? rv : count;
}
@@ -387,7 +399,7 @@ static ssize_t wdm_read
struct wdm_device *desc = file->private_data;
- rv = mutex_lock_interruptible(&desc->rlock); /*concurrent reads */
+ rv = mutex_lock_interruptible(&desc->lock); /*concurrent reads */
if (rv < 0)
return -ERESTARTSYS;
@@ -424,11 +436,8 @@ retry:
spin_lock_irq(&desc->iuspin);
if (desc->rerr) { /* read completed, error happened */
- int t = desc->rerr;
desc->rerr = 0;
spin_unlock_irq(&desc->iuspin);
- dev_err(&desc->intf->dev,
- "reading had resulted in %d\n", t);
rv = -EIO;
goto err;
}
@@ -465,9 +474,7 @@ retry:
rv = cntr;
err:
- mutex_unlock(&desc->rlock);
- if (rv < 0 && rv != -EAGAIN)
- dev_err(&desc->intf->dev, "wdm_read: exit error\n");
+ mutex_unlock(&desc->lock);
return rv;
}
@@ -533,7 +540,7 @@ static int wdm_open(struct inode *inode, struct file *file)
}
intf->needs_remote_wakeup = 1;
- mutex_lock(&desc->plock);
+ mutex_lock(&desc->lock);
if (!desc->count++) {
rv = usb_submit_urb(desc->validity, GFP_KERNEL);
if (rv < 0) {
@@ -544,7 +551,7 @@ static int wdm_open(struct inode *inode, struct file *file)
} else {
rv = 0;
}
- mutex_unlock(&desc->plock);
+ mutex_unlock(&desc->lock);
usb_autopm_put_interface(desc->intf);
out:
mutex_unlock(&wdm_mutex);
@@ -556,9 +563,9 @@ static int wdm_release(struct inode *inode, struct file *file)
struct wdm_device *desc = file->private_data;
mutex_lock(&wdm_mutex);
- mutex_lock(&desc->plock);
+ mutex_lock(&desc->lock);
desc->count--;
- mutex_unlock(&desc->plock);
+ mutex_unlock(&desc->lock);
if (!desc->count) {
dev_dbg(&desc->intf->dev, "wdm_release: cleanup");
@@ -655,9 +662,7 @@ next_desc:
desc = kzalloc(sizeof(struct wdm_device), GFP_KERNEL);
if (!desc)
goto out;
- mutex_init(&desc->wlock);
- mutex_init(&desc->rlock);
- mutex_init(&desc->plock);
+ mutex_init(&desc->lock);
spin_lock_init(&desc->iuspin);
init_waitqueue_head(&desc->wait);
desc->wMaxCommand = maxcom;
@@ -771,14 +776,17 @@ static void wdm_disconnect(struct usb_interface *intf)
/* to terminate pending flushes */
clear_bit(WDM_IN_USE, &desc->flags);
spin_unlock_irqrestore(&desc->iuspin, flags);
- cancel_work_sync(&desc->rxwork);
+ mutex_lock(&desc->lock);
kill_urbs(desc);
+ cancel_work_sync(&desc->rxwork);
+ mutex_unlock(&desc->lock);
wake_up_all(&desc->wait);
if (!desc->count)
cleanup(desc);
mutex_unlock(&wdm_mutex);
}
+#ifdef CONFIG_PM
static int wdm_suspend(struct usb_interface *intf, pm_message_t message)
{
struct wdm_device *desc = usb_get_intfdata(intf);
@@ -786,22 +794,30 @@ static int wdm_suspend(struct usb_interface *intf, pm_message_t message)
dev_dbg(&desc->intf->dev, "wdm%d_suspend\n", intf->minor);
- mutex_lock(&desc->plock);
-#ifdef CONFIG_PM
+ /* if this is an autosuspend the caller does the locking */
+ if (!(message.event & PM_EVENT_AUTO))
+ mutex_lock(&desc->lock);
+ spin_lock_irq(&desc->iuspin);
+
if ((message.event & PM_EVENT_AUTO) &&
- test_bit(WDM_IN_USE, &desc->flags)) {
+ (test_bit(WDM_IN_USE, &desc->flags)
+ || test_bit(WDM_RESPONDING, &desc->flags))) {
+ spin_unlock_irq(&desc->iuspin);
rv = -EBUSY;
} else {
-#endif
- cancel_work_sync(&desc->rxwork);
+
+ set_bit(WDM_SUSPENDING, &desc->flags);
+ spin_unlock_irq(&desc->iuspin);
+ /* callback submits work - order is essential */
kill_urbs(desc);
-#ifdef CONFIG_PM
+ cancel_work_sync(&desc->rxwork);
}
-#endif
- mutex_unlock(&desc->plock);
+ if (!(message.event & PM_EVENT_AUTO))
+ mutex_unlock(&desc->lock);
return rv;
}
+#endif
static int recover_from_urb_loss(struct wdm_device *desc)
{
@@ -815,23 +831,27 @@ static int recover_from_urb_loss(struct wdm_device *desc)
}
return rv;
}
+
+#ifdef CONFIG_PM
static int wdm_resume(struct usb_interface *intf)
{
struct wdm_device *desc = usb_get_intfdata(intf);
int rv;
dev_dbg(&desc->intf->dev, "wdm%d_resume\n", intf->minor);
- mutex_lock(&desc->plock);
+
+ clear_bit(WDM_SUSPENDING, &desc->flags);
rv = recover_from_urb_loss(desc);
- mutex_unlock(&desc->plock);
+
return rv;
}
+#endif
static int wdm_pre_reset(struct usb_interface *intf)
{
struct wdm_device *desc = usb_get_intfdata(intf);
- mutex_lock(&desc->plock);
+ mutex_lock(&desc->lock);
return 0;
}
@@ -841,7 +861,7 @@ static int wdm_post_reset(struct usb_interface *intf)
int rv;
rv = recover_from_urb_loss(desc);
- mutex_unlock(&desc->plock);
+ mutex_unlock(&desc->lock);
return 0;
}
@@ -849,9 +869,11 @@ static struct usb_driver wdm_driver = {
.name = "cdc_wdm",
.probe = wdm_probe,
.disconnect = wdm_disconnect,
+#ifdef CONFIG_PM
.suspend = wdm_suspend,
.resume = wdm_resume,
.reset_resume = wdm_resume,
+#endif
.pre_reset = wdm_pre_reset,
.post_reset = wdm_post_reset,
.id_table = wdm_ids,
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index e909ff7b909..3466fdc5bb1 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -1207,6 +1207,13 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
free_async(as);
return -ENOMEM;
}
+ /* Isochronous input data may end up being discontiguous
+ * if some of the packets are short. Clear the buffer so
+ * that the gaps don't leak kernel data to userspace.
+ */
+ if (is_in && uurb->type == USBDEVFS_URB_TYPE_ISO)
+ memset(as->urb->transfer_buffer, 0,
+ uurb->buffer_length);
}
as->urb->dev = ps->dev;
as->urb->pipe = (uurb->type << 30) |
@@ -1345,10 +1352,14 @@ static int processcompl(struct async *as, void __user * __user *arg)
void __user *addr = as->userurb;
unsigned int i;
- if (as->userbuffer && urb->actual_length)
- if (copy_to_user(as->userbuffer, urb->transfer_buffer,
- urb->actual_length))
+ if (as->userbuffer && urb->actual_length) {
+ if (urb->number_of_packets > 0) /* Isochronous */
+ i = urb->transfer_buffer_length;
+ else /* Non-Isoc */
+ i = urb->actual_length;
+ if (copy_to_user(as->userbuffer, urb->transfer_buffer, i))
goto err_out;
+ }
if (put_user(as->status, &userurb->status))
goto err_out;
if (put_user(urb->actual_length, &userurb->actual_length))
diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c
index 27080561a1c..45a32dadb40 100644
--- a/drivers/usb/core/urb.c
+++ b/drivers/usb/core/urb.c
@@ -453,6 +453,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags)
if (urb->interval > (1 << 15))
return -EINVAL;
max = 1 << 15;
+ break;
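+ /* without this break, highspeed intervals above 16 fell through
+ * into the USB_SPEED_WIRELESS check below and were wrongly rejected */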
case USB_SPEED_WIRELESS:
if (urb->interval > 16)
return -EINVAL;
diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig
index 7460cd797f4..11a3e0fa433 100644
--- a/drivers/usb/gadget/Kconfig
+++ b/drivers/usb/gadget/Kconfig
@@ -747,7 +747,7 @@ config USB_MASS_STORAGE
which may be used with composite framework.
Say "y" to link the driver statically, or "m" to build
- a dynamically linked module called "g_file_storage". If unsure,
+ a dynamically linked module called "g_mass_storage". If unsure,
consider File-backed Storage Gadget.
config USB_G_SERIAL
diff --git a/drivers/usb/gadget/epautoconf.c b/drivers/usb/gadget/epautoconf.c
index 65a5f94cbc0..3568de210f7 100644
--- a/drivers/usb/gadget/epautoconf.c
+++ b/drivers/usb/gadget/epautoconf.c
@@ -266,7 +266,7 @@ struct usb_ep * __init usb_ep_autoconfig (
}
#ifdef CONFIG_BLACKFIN
- } else if (gadget_is_musbhsfc(gadget) || gadget_is_musbhdrc(gadget)) {
+ } else if (gadget_is_musbhdrc(gadget)) {
if ((USB_ENDPOINT_XFER_BULK == type) ||
(USB_ENDPOINT_XFER_ISOC == type)) {
if (USB_DIR_IN & desc->bEndpointAddress)
diff --git a/drivers/usb/gadget/f_mass_storage.c b/drivers/usb/gadget/f_mass_storage.c
index 5a3cdd08f1d..f4911c09022 100644
--- a/drivers/usb/gadget/f_mass_storage.c
+++ b/drivers/usb/gadget/f_mass_storage.c
@@ -2910,7 +2910,7 @@ static void fsg_unbind(struct usb_configuration *c, struct usb_function *f)
}
-static int fsg_bind(struct usb_configuration *c, struct usb_function *f)
+static int __init fsg_bind(struct usb_configuration *c, struct usb_function *f)
{
struct fsg_dev *fsg = fsg_from_func(f);
struct usb_gadget *gadget = c->cdev->gadget;
@@ -2954,7 +2954,6 @@ static int fsg_bind(struct usb_configuration *c, struct usb_function *f)
autoconf_fail:
ERROR(fsg, "unable to autoconfigure all endpoints\n");
rc = -ENOTSUPP;
- fsg_unbind(c, f);
return rc;
}
diff --git a/drivers/usb/gadget/gadget_chips.h b/drivers/usb/gadget/gadget_chips.h
index 1edbc12fff1..e511fec9f26 100644
--- a/drivers/usb/gadget/gadget_chips.h
+++ b/drivers/usb/gadget/gadget_chips.h
@@ -136,6 +136,12 @@
#define gadget_is_r8a66597(g) 0
#endif
+#ifdef CONFIG_USB_S3C_HSOTG
+#define gadget_is_s3c_hsotg(g) (!strcmp("s3c-hsotg", (g)->name))
+#else
+#define gadget_is_s3c_hsotg(g) 0
+#endif
+
/**
* usb_gadget_controller_number - support bcdDevice id convention
@@ -192,6 +198,8 @@ static inline int usb_gadget_controller_number(struct usb_gadget *gadget)
return 0x24;
else if (gadget_is_r8a66597(gadget))
return 0x25;
+ else if (gadget_is_s3c_hsotg(gadget))
+ return 0x26;
return -ENOENT;
}
diff --git a/drivers/usb/gadget/goku_udc.c b/drivers/usb/gadget/goku_udc.c
index e8edc640381..1088d08c7ed 100644
--- a/drivers/usb/gadget/goku_udc.c
+++ b/drivers/usb/gadget/goku_udc.c
@@ -1768,7 +1768,7 @@ static int goku_probe(struct pci_dev *pdev, const struct pci_device_id *id)
* usb_gadget_driver_{register,unregister}() must change.
*/
if (the_controller) {
- WARNING(dev, "ignoring %s\n", pci_name(pdev));
+ pr_warning("ignoring %s\n", pci_name(pdev));
return -EBUSY;
}
if (!pdev->irq) {
diff --git a/drivers/usb/gadget/multi.c b/drivers/usb/gadget/multi.c
index 76496f5d272..a930d7fd7e7 100644
--- a/drivers/usb/gadget/multi.c
+++ b/drivers/usb/gadget/multi.c
@@ -211,8 +211,6 @@ static int __init cdc_do_config(struct usb_configuration *c)
ret = fsg_add(c->cdev, c, fsg_common);
if (ret < 0)
return ret;
- if (ret < 0)
- return ret;
return 0;
}
diff --git a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile
index 4e0c67f1f51..b6315aa47f7 100644
--- a/drivers/usb/host/Makefile
+++ b/drivers/usb/host/Makefile
@@ -12,7 +12,7 @@ fhci-objs := fhci-hcd.o fhci-hub.o fhci-q.o fhci-mem.o \
ifeq ($(CONFIG_FHCI_DEBUG),y)
fhci-objs += fhci-dbg.o
endif
-xhci-objs := xhci-hcd.o xhci-mem.o xhci-pci.o xhci-ring.o xhci-hub.o xhci-dbg.o
+xhci-hcd-objs := xhci.o xhci-mem.o xhci-pci.o xhci-ring.o xhci-hub.o xhci-dbg.o
obj-$(CONFIG_USB_WHCI_HCD) += whci/
@@ -25,7 +25,7 @@ obj-$(CONFIG_USB_ISP1362_HCD) += isp1362-hcd.o
obj-$(CONFIG_USB_OHCI_HCD) += ohci-hcd.o
obj-$(CONFIG_USB_UHCI_HCD) += uhci-hcd.o
obj-$(CONFIG_USB_FHCI_HCD) += fhci.o
-obj-$(CONFIG_USB_XHCI_HCD) += xhci.o
+obj-$(CONFIG_USB_XHCI_HCD) += xhci-hcd.o
obj-$(CONFIG_USB_SL811_HCD) += sl811-hcd.o
obj-$(CONFIG_USB_SL811_CS) += sl811_cs.o
obj-$(CONFIG_USB_U132_HCD) += u132-hcd.o
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index d8d6d3461d3..dc55a62859c 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -995,7 +995,7 @@ rescan:
/* endpoints can be iso streams. for now, we don't
* accelerate iso completions ... so spin a while.
*/
- if (qh->hw->hw_info1 == 0) {
+ if (qh->hw == NULL) {
ehci_vdbg (ehci, "iso delay\n");
goto idle_timeout;
}
diff --git a/drivers/usb/host/ehci-sched.c b/drivers/usb/host/ehci-sched.c
index 39340ae00ac..a0aaaaff256 100644
--- a/drivers/usb/host/ehci-sched.c
+++ b/drivers/usb/host/ehci-sched.c
@@ -1123,8 +1123,8 @@ iso_stream_find (struct ehci_hcd *ehci, struct urb *urb)
urb->interval);
}
- /* if dev->ep [epnum] is a QH, info1.maxpacket is nonzero */
- } else if (unlikely (stream->hw_info1 != 0)) {
+ /* if dev->ep [epnum] is a QH, hw is set */
+ } else if (unlikely (stream->hw != NULL)) {
ehci_dbg (ehci, "dev %s ep%d%s, not iso??\n",
urb->dev->devpath, epnum,
usb_pipein(urb->pipe) ? "in" : "out");
@@ -1565,13 +1565,27 @@ itd_patch(
static inline void
itd_link (struct ehci_hcd *ehci, unsigned frame, struct ehci_itd *itd)
{
- /* always prepend ITD/SITD ... only QH tree is order-sensitive */
- itd->itd_next = ehci->pshadow [frame];
- itd->hw_next = ehci->periodic [frame];
- ehci->pshadow [frame].itd = itd;
+ union ehci_shadow *prev = &ehci->pshadow[frame];
+ __hc32 *hw_p = &ehci->periodic[frame];
+ union ehci_shadow here = *prev;
+ __hc32 type = 0;
+
+ /* skip any iso nodes which might belong to previous microframes */
+ while (here.ptr) {
+ type = Q_NEXT_TYPE(ehci, *hw_p);
+ if (type == cpu_to_hc32(ehci, Q_TYPE_QH))
+ break;
+ prev = periodic_next_shadow(ehci, prev, type);
+ hw_p = shadow_next_periodic(ehci, &here, type);
+ here = *prev;
+ }
+
+ itd->itd_next = here;
+ itd->hw_next = *hw_p;
+ prev->itd = itd;
itd->frame = frame;
wmb ();
- ehci->periodic[frame] = cpu_to_hc32(ehci, itd->itd_dma | Q_TYPE_ITD);
+ *hw_p = cpu_to_hc32(ehci, itd->itd_dma | Q_TYPE_ITD);
}
/* fit urb's itds into the selected schedule slot; activate as needed */
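The itd_link() change above stops blindly prepending ITDs at the head of the periodic slot: it first walks past any iso nodes already queued for earlier microframes, then splices in front of the first QH, since only the QH subtree is order-sensitive. A minimal sketch of that insertion pattern, with simplified types standing in for the kernel's shadow/hardware list pair (struct node, NODE_QH, and insert_before_first_qh are illustrative, not kernel API):

enum node_type { NODE_ITD, NODE_QH };

struct node {
	struct node *next;
	enum node_type type;
};

/* Prepend a new ITD in front of the first QH, keeping earlier ITDs first. */
static void insert_before_first_qh(struct node **head, struct node *itd)
{
	struct node **prev = head;

	/* skip iso nodes that belong to earlier microframes */
	while (*prev && (*prev)->type != NODE_QH)
		prev = &(*prev)->next;

	itd->next = *prev;	/* splice in front of the QH (or list tail) */
	*prev = itd;
}

The real code does this twice in lockstep, once on the software shadow list and once on the hardware-visible list, with a wmb() before publishing the new DMA pointer so the controller never sees a half-linked entry.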
diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index 2d85e21ff28..b1dce96dd62 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h
@@ -394,9 +394,8 @@ struct ehci_iso_sched {
* acts like a qh would, if EHCI had them for ISO.
*/
struct ehci_iso_stream {
- /* first two fields match QH, but info1 == 0 */
- __hc32 hw_next;
- __hc32 hw_info1;
+ /* first field matches ehci_qh, but is NULL */
+ struct ehci_qh_hw *hw;
u32 refcount;
u8 bEndpointAddress;
diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c
index bee558aed42..f71a73a93d0 100644
--- a/drivers/usb/host/r8a66597-hcd.c
+++ b/drivers/usb/host/r8a66597-hcd.c
@@ -418,7 +418,7 @@ static u8 alloc_usb_address(struct r8a66597 *r8a66597, struct urb *urb)
/* this function must be called with interrupt disabled */
static void free_usb_address(struct r8a66597 *r8a66597,
- struct r8a66597_device *dev)
+ struct r8a66597_device *dev, int reset)
{
int port;
@@ -430,7 +430,13 @@ static void free_usb_address(struct r8a66597 *r8a66597,
dev->state = USB_STATE_DEFAULT;
r8a66597->address_map &= ~(1 << dev->address);
dev->address = 0;
- dev_set_drvdata(&dev->udev->dev, NULL);
+ /*
+ * Only when resetting USB is it necessary to erase drvdata. When
+ * a USB device behind a hub is disconnected, "dev->udev" has
+ * already been freed by usb_disconnect(), so we cannot access it.
+ */
+ if (reset)
+ dev_set_drvdata(&dev->udev->dev, NULL);
list_del(&dev->device_list);
kfree(dev);
@@ -1069,7 +1075,7 @@ static void r8a66597_usb_disconnect(struct r8a66597 *r8a66597, int port)
struct r8a66597_device *dev = r8a66597->root_hub[port].dev;
disable_r8a66597_pipe_all(r8a66597, dev);
- free_usb_address(r8a66597, dev);
+ free_usb_address(r8a66597, dev, 0);
start_root_hub_sampling(r8a66597, port, 0);
}
@@ -2085,7 +2091,7 @@ static void update_usb_address_map(struct r8a66597 *r8a66597,
spin_lock_irqsave(&r8a66597->lock, flags);
dev = get_r8a66597_device(r8a66597, addr);
disable_r8a66597_pipe_all(r8a66597, dev);
- free_usb_address(r8a66597, dev);
+ free_usb_address(r8a66597, dev, 0);
put_child_connect_map(r8a66597, addr);
spin_unlock_irqrestore(&r8a66597->lock, flags);
}
@@ -2228,7 +2234,7 @@ static int r8a66597_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
rh->port |= (1 << USB_PORT_FEAT_RESET);
disable_r8a66597_pipe_all(r8a66597, dev);
- free_usb_address(r8a66597, dev);
+ free_usb_address(r8a66597, dev, 1);
r8a66597_mdfy(r8a66597, USBRST, USBRST | UACT,
get_dvstctr_reg(port));
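The new reset argument exists because dev->udev may already have been freed by the USB core on a hub disconnect, so free_usb_address() may only clear drvdata on the port-reset path, where the device is known to still be alive. A hedged sketch of the general pattern (struct owner, struct slot, and release_slot are illustrative):

#include <linux/slab.h>

struct owner { void *priv; };

struct slot {
	struct owner *owner;	/* freed behind our back on hub disconnect */
};

static void release_slot(struct slot *s, int owner_still_valid)
{
	/*
	 * s->owner->priv dereferences the owner; that is a use-after-free
	 * unless the caller (the reset path) knows the owner is alive.
	 */
	if (owner_still_valid)
		s->owner->priv = NULL;

	kfree(s);	/* per-slot teardown never follows s->owner */
}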
diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index 49f7d72f8b1..bba9b19ed1b 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -566,8 +566,13 @@ static inline unsigned int xhci_get_endpoint_interval(struct usb_device *udev,
if (interval < 3)
interval = 3;
if ((1 << interval) != 8*ep->desc.bInterval)
- dev_warn(&udev->dev, "ep %#x - rounding interval to %d microframes\n",
- ep->desc.bEndpointAddress, 1 << interval);
+ dev_warn(&udev->dev,
+ "ep %#x - rounding interval"
+ " to %d microframes, "
+ "ep desc says %d microframes\n",
+ ep->desc.bEndpointAddress,
+ 1 << interval,
+ 8*ep->desc.bInterval);
}
break;
default:
diff --git a/drivers/usb/host/xhci-hcd.c b/drivers/usb/host/xhci.c
index 4cb69e0af83..492a61c2c79 100644
--- a/drivers/usb/host/xhci-hcd.c
+++ b/drivers/usb/host/xhci.c
@@ -1173,6 +1173,7 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci,
cmd_completion = &virt_dev->cmd_completion;
cmd_status = &virt_dev->cmd_status;
}
+ init_completion(cmd_completion);
if (!ctx_change)
ret = xhci_queue_configure_endpoint(xhci, in_ctx->dma,
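init_completion() is added here because the same struct completion is reused for every configure-endpoint command; without re-arming it, a wait after the first command would return immediately on the stale done count. A minimal sketch of the reuse pattern (cmd_ctx and the queue_hw_command step are hypothetical):

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

struct cmd_ctx {
	struct completion done;	/* completed from the command IRQ handler */
	int status;
};

static int issue_cmd_and_wait(struct cmd_ctx *ctx)
{
	/* re-arm before queuing; a previous command may have completed it */
	init_completion(&ctx->done);

	/* queue_hw_command(ctx);  -- hypothetical: the IRQ handler later
	 *                            calls complete(&ctx->done)          */

	if (!wait_for_completion_timeout(&ctx->done,
					 msecs_to_jiffies(5000)))
		return -ETIMEDOUT;
	return ctx->status;
}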
diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c
index b4bbf8f2c23..0e8b8ab1d16 100644
--- a/drivers/usb/musb/musb_core.c
+++ b/drivers/usb/musb/musb_core.c
@@ -379,7 +379,6 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb,
u8 devctl, u8 power)
{
irqreturn_t handled = IRQ_NONE;
- void __iomem *mbase = musb->mregs;
DBG(3, "<== Power=%02x, DevCtl=%02x, int_usb=0x%x\n", power, devctl,
int_usb);
@@ -394,6 +393,8 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb,
if (devctl & MUSB_DEVCTL_HM) {
#ifdef CONFIG_USB_MUSB_HDRC_HCD
+ void __iomem *mbase = musb->mregs;
+
switch (musb->xceiv->state) {
case OTG_STATE_A_SUSPEND:
/* remote wakeup? later, GetPortStatus
@@ -471,6 +472,8 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb,
#ifdef CONFIG_USB_MUSB_HDRC_HCD
/* see manual for the order of the tests */
if (int_usb & MUSB_INTR_SESSREQ) {
+ void __iomem *mbase = musb->mregs;
+
DBG(1, "SESSION_REQUEST (%s)\n", otg_state_string(musb));
/* IRQ arrives from ID pin sense or (later, if VBUS power
@@ -519,6 +522,8 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb,
case OTG_STATE_A_WAIT_BCON:
case OTG_STATE_A_WAIT_VRISE:
if (musb->vbuserr_retry) {
+ void __iomem *mbase = musb->mregs;
+
musb->vbuserr_retry--;
ignore = 1;
devctl |= MUSB_DEVCTL_SESSION;
@@ -622,6 +627,7 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb,
if (int_usb & MUSB_INTR_CONNECT) {
struct usb_hcd *hcd = musb_to_hcd(musb);
+ void __iomem *mbase = musb->mregs;
handled = IRQ_HANDLED;
musb->is_active = 1;
@@ -2007,7 +2013,6 @@ bad_config:
/* host side needs more setup */
if (is_host_enabled(musb)) {
struct usb_hcd *hcd = musb_to_hcd(musb);
- u8 busctl;
otg_set_host(musb->xceiv, &hcd->self);
@@ -2018,9 +2023,9 @@ bad_config:
/* program PHY to use external vBus if required */
if (plat->extvbus) {
- busctl = musb_readb(musb->mregs, MUSB_ULPI_BUSCONTROL);
+ u8 busctl = musb_read_ulpi_buscontrol(musb->mregs);
busctl |= MUSB_ULPI_USE_EXTVBUS;
- musb_writeb(musb->mregs, MUSB_ULPI_BUSCONTROL, busctl);
+ musb_write_ulpi_buscontrol(musb->mregs, busctl);
}
}
diff --git a/drivers/usb/musb/musb_core.h b/drivers/usb/musb/musb_core.h
index d849fb81c13..cd9f4a9a06c 100644
--- a/drivers/usb/musb/musb_core.h
+++ b/drivers/usb/musb/musb_core.h
@@ -469,7 +469,7 @@ struct musb_csr_regs {
struct musb_context_registers {
-#if defined(CONFIG_ARCH_OMAP34XX) || defined(CONFIG_ARCH_OMAP2430)
+#ifdef CONFIG_PM
u32 otg_sysconfig, otg_forcestandby;
#endif
u8 power;
@@ -483,7 +483,7 @@ struct musb_context_registers {
struct musb_csr_regs index_regs[MUSB_C_NUM_EPS];
};
-#if defined(CONFIG_ARCH_OMAP34XX) || defined(CONFIG_ARCH_OMAP2430)
+#ifdef CONFIG_PM
extern void musb_platform_save_context(struct musb *musb,
struct musb_context_registers *musb_context);
extern void musb_platform_restore_context(struct musb *musb,
diff --git a/drivers/usb/musb/musb_host.c b/drivers/usb/musb/musb_host.c
index 3421cf9858b..dec896e888d 100644
--- a/drivers/usb/musb/musb_host.c
+++ b/drivers/usb/musb/musb_host.c
@@ -1689,7 +1689,7 @@ void musb_host_rx(struct musb *musb, u8 epnum)
dma->desired_mode = 1;
if (rx_count < hw_ep->max_packet_sz_rx) {
length = rx_count;
- dma->bDesiredMode = 0;
+ dma->desired_mode = 0;
} else {
length = urb->transfer_buffer_length;
}
diff --git a/drivers/usb/musb/musb_regs.h b/drivers/usb/musb/musb_regs.h
index 8d8062b10e2..fa55aacc385 100644
--- a/drivers/usb/musb/musb_regs.h
+++ b/drivers/usb/musb/musb_regs.h
@@ -326,6 +326,11 @@ static inline void musb_write_rxfifoadd(void __iomem *mbase, u16 c_off)
musb_writew(mbase, MUSB_RXFIFOADD, c_off);
}
+static inline void musb_write_ulpi_buscontrol(void __iomem *mbase, u8 val)
+{
+ musb_writeb(mbase, MUSB_ULPI_BUSCONTROL, val);
+}
+
static inline u8 musb_read_txfifosz(void __iomem *mbase)
{
return musb_readb(mbase, MUSB_TXFIFOSZ);
@@ -346,6 +351,11 @@ static inline u16 musb_read_rxfifoadd(void __iomem *mbase)
return musb_readw(mbase, MUSB_RXFIFOADD);
}
+static inline u8 musb_read_ulpi_buscontrol(void __iomem *mbase)
+{
+ return musb_readb(mbase, MUSB_ULPI_BUSCONTROL);
+}
+
static inline u8 musb_read_configdata(void __iomem *mbase)
{
musb_writeb(mbase, MUSB_INDEX, 0);
@@ -510,20 +520,33 @@ static inline void musb_write_rxfifoadd(void __iomem *mbase, u16 c_off)
{
}
+static inline void musb_write_ulpi_buscontrol(void __iomem *mbase, u8 val)
+{
+}
+
static inline u8 musb_read_txfifosz(void __iomem *mbase)
{
+ return 0;
}
static inline u16 musb_read_txfifoadd(void __iomem *mbase)
{
+ return 0;
}
static inline u8 musb_read_rxfifosz(void __iomem *mbase)
{
+ return 0;
}
static inline u16 musb_read_rxfifoadd(void __iomem *mbase)
{
+ return 0;
+}
+
+static inline u8 musb_read_ulpi_buscontrol(void __iomem *mbase)
+{
+ return 0;
}
static inline u8 musb_read_configdata(void __iomem *mbase)
@@ -577,22 +600,27 @@ static inline void musb_write_txhubport(void __iomem *mbase, u8 epnum,
static inline u8 musb_read_rxfunaddr(void __iomem *mbase, u8 epnum)
{
+ return 0;
}
static inline u8 musb_read_rxhubaddr(void __iomem *mbase, u8 epnum)
{
+ return 0;
}
static inline u8 musb_read_rxhubport(void __iomem *mbase, u8 epnum)
{
+ return 0;
}
static inline u8 musb_read_txfunaddr(void __iomem *mbase, u8 epnum)
{
+ return 0;
}
static inline u8 musb_read_txhubaddr(void __iomem *mbase, u8 epnum)
{
+ return 0;
}
static inline void musb_read_txhubport(void __iomem *mbase, u8 epnum)
diff --git a/drivers/usb/serial/Kconfig b/drivers/usb/serial/Kconfig
index c78b255e3f8..a0ecb42cb33 100644
--- a/drivers/usb/serial/Kconfig
+++ b/drivers/usb/serial/Kconfig
@@ -474,14 +474,14 @@ config USB_SERIAL_OTI6858
config USB_SERIAL_QCAUX
tristate "USB Qualcomm Auxiliary Serial Port Driver"
- ---help---
+ help
Say Y here if you want to use the auxiliary serial ports provided
by many modems based on Qualcomm chipsets. These ports often use
a proprietary protocol called DM and cannot be used for AT- or
PPP-based communication.
To compile this driver as a module, choose M here: the
- module will be called moto_modem. If unsure, choose N.
+ module will be called qcaux. If unsure, choose N.
config USB_SERIAL_QUALCOMM
tristate "USB Qualcomm Serial modem"
diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c
index b22ac325852..f347da2ef00 100644
--- a/drivers/usb/serial/console.c
+++ b/drivers/usb/serial/console.c
@@ -181,6 +181,7 @@ static int usb_console_setup(struct console *co, char *options)
/* The console is special in terms of closing the device so
* indicate this port is now acting as a system console. */
port->console = 1;
+ port->port.console = 1;
mutex_unlock(&serial->disc_mutex);
return retval;
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index 507382b0a9e..ec9b0449ccf 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -313,11 +313,6 @@ static int cp210x_set_config(struct usb_serial_port *port, u8 request,
return -EPROTO;
}
- /* Single data value */
- result = usb_control_msg(serial->dev,
- usb_sndctrlpipe(serial->dev, 0),
- request, REQTYPE_HOST_TO_DEVICE, data[0],
- 0, NULL, 0, 300);
return 0;
}
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 6af0dfa5f5a..1d7c4fac02e 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -91,7 +91,7 @@ struct ftdi_private {
unsigned long tx_outstanding_bytes;
unsigned long tx_outstanding_urbs;
unsigned short max_packet_size;
- struct mutex cfg_lock; /* Avoid mess by parallel calls of config ioctl() */
+ struct mutex cfg_lock; /* Avoid mess by parallel calls of config ioctl() and change_speed() */
};
/* struct ftdi_sio_quirk is used by devices requiring special attention. */
@@ -658,6 +658,7 @@ static struct usb_device_id id_table_combined [] = {
{ USB_DEVICE(EVOLUTION_VID, EVOLUTION_ER1_PID) },
{ USB_DEVICE(EVOLUTION_VID, EVO_HYBRID_PID) },
{ USB_DEVICE(EVOLUTION_VID, EVO_RCM4_PID) },
+ { USB_DEVICE(CONTEC_VID, CONTEC_COM1USBH_PID) },
{ USB_DEVICE(FTDI_VID, FTDI_ARTEMIS_PID) },
{ USB_DEVICE(FTDI_VID, FTDI_ATIK_ATK16_PID) },
{ USB_DEVICE(FTDI_VID, FTDI_ATIK_ATK16C_PID) },
@@ -1272,8 +1273,8 @@ check_and_exit:
(priv->flags & ASYNC_SPD_MASK)) ||
(((priv->flags & ASYNC_SPD_MASK) == ASYNC_SPD_CUST) &&
(old_priv.custom_divisor != priv->custom_divisor))) {
- mutex_unlock(&priv->cfg_lock);
change_speed(tty, port);
+ mutex_unlock(&priv->cfg_lock);
}
else
mutex_unlock(&priv->cfg_lock);
@@ -2264,9 +2265,11 @@ static void ftdi_set_termios(struct tty_struct *tty,
clear_mctrl(port, TIOCM_DTR | TIOCM_RTS);
} else {
/* set the baudrate determined before */
+ mutex_lock(&priv->cfg_lock);
if (change_speed(tty, port))
dev_err(&port->dev, "%s urb failed to set baudrate\n",
__func__);
+ mutex_unlock(&priv->cfg_lock);
/* Ensure RTS and DTR are raised when baudrate changed from 0 */
if (!old_termios || (old_termios->c_cflag & CBAUD) == B0)
set_mctrl(port, TIOCM_DTR | TIOCM_RTS);
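Both ftdi_sio hunks close the same race: change_speed() does a read-modify-write of shared divisor state plus a device write, so it must run under cfg_lock on every path, the TIOCSSERIAL ioctl and set_termios alike. A hedged sketch of the invariant (ftdi_like_priv and the write step are illustrative):

#include <linux/mutex.h>

struct ftdi_like_priv {
	struct mutex cfg_lock;		/* guards divisor + device write */
	unsigned int custom_divisor;
};

/* Every caller that changes speed holds cfg_lock across both steps. */
static int change_speed_locked(struct ftdi_like_priv *priv,
			       unsigned int divisor)
{
	int ret;

	mutex_lock(&priv->cfg_lock);
	priv->custom_divisor = divisor;	/* step 1: update shared state    */
	ret = 0;			/* step 2: device write goes here */
	mutex_unlock(&priv->cfg_lock);
	return ret;
}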
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index 0727e198503..75482cbc399 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -501,6 +501,13 @@
#define CONTEC_COM1USBH_PID 0x8311 /* COM-1(USB)H */
/*
+ * Contec products (http://www.contec.com)
+ * Submitted by Daniel Sangorrin
+ */
+#define CONTEC_VID 0x06CE /* Vendor ID */
+#define CONTEC_COM1USBH_PID 0x8311 /* COM-1(USB)H */
+
+/*
* Definitions for B&B Electronics products.
*/
#define BANDB_VID 0x0856 /* B&B Electronics Vendor ID */
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index 89fac36684c..f804acb138e 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -130,7 +130,7 @@ int usb_serial_generic_open(struct tty_struct *tty, struct usb_serial_port *port
spin_unlock_irqrestore(&port->lock, flags);
/* if we have a bulk endpoint, start reading from it */
- if (serial->num_bulk_in) {
+ if (port->bulk_in_size) {
/* Start reading from the device */
usb_fill_bulk_urb(port->read_urb, serial->dev,
usb_rcvbulkpipe(serial->dev,
@@ -159,10 +159,10 @@ static void generic_cleanup(struct usb_serial_port *port)
dbg("%s - port %d", __func__, port->number);
if (serial->dev) {
- /* shutdown any bulk reads that might be going on */
- if (serial->num_bulk_out)
+ /* shutdown any bulk transfers that might be going on */
+ if (port->bulk_out_size)
usb_kill_urb(port->write_urb);
- if (serial->num_bulk_in)
+ if (port->bulk_in_size)
usb_kill_urb(port->read_urb);
}
}
@@ -333,15 +333,15 @@ int usb_serial_generic_write(struct tty_struct *tty,
dbg("%s - port %d", __func__, port->number);
+ /* only do something if we have a bulk out endpoint */
+ if (!port->bulk_out_size)
+ return -ENODEV;
+
if (count == 0) {
dbg("%s - write request of 0 bytes", __func__);
return 0;
}
- /* only do something if we have a bulk out endpoint */
- if (!serial->num_bulk_out)
- return 0;
-
if (serial->type->max_in_flight_urbs)
return usb_serial_multi_urb_write(tty, port,
buf, count);
@@ -364,14 +364,19 @@ int usb_serial_generic_write_room(struct tty_struct *tty)
int room = 0;
dbg("%s - port %d", __func__, port->number);
+
+ if (!port->bulk_out_size)
+ return 0;
+
spin_lock_irqsave(&port->lock, flags);
if (serial->type->max_in_flight_urbs) {
if (port->urbs_in_flight < serial->type->max_in_flight_urbs)
room = port->bulk_out_size *
(serial->type->max_in_flight_urbs -
port->urbs_in_flight);
- } else if (serial->num_bulk_out)
+ } else {
room = kfifo_avail(&port->write_fifo);
+ }
spin_unlock_irqrestore(&port->lock, flags);
dbg("%s - returns %d", __func__, room);
@@ -382,15 +387,18 @@ int usb_serial_generic_chars_in_buffer(struct tty_struct *tty)
{
struct usb_serial_port *port = tty->driver_data;
struct usb_serial *serial = port->serial;
- int chars = 0;
unsigned long flags;
+ int chars;
dbg("%s - port %d", __func__, port->number);
+ if (!port->bulk_out_size)
+ return 0;
+
spin_lock_irqsave(&port->lock, flags);
if (serial->type->max_in_flight_urbs)
chars = port->tx_bytes_flight;
- else if (serial->num_bulk_out)
+ else
chars = kfifo_len(&port->write_fifo);
spin_unlock_irqrestore(&port->lock, flags);
@@ -415,11 +423,13 @@ void usb_serial_generic_resubmit_read_urb(struct usb_serial_port *port,
((serial->type->read_bulk_callback) ?
serial->type->read_bulk_callback :
usb_serial_generic_read_bulk_callback), port);
+
result = usb_submit_urb(urb, mem_flags);
- if (result)
+ if (result && result != -EPERM) {
dev_err(&port->dev,
"%s - failed resubmitting read urb, error %d\n",
__func__, result);
+ }
}
EXPORT_SYMBOL_GPL(usb_serial_generic_resubmit_read_urb);
@@ -498,23 +508,18 @@ void usb_serial_generic_write_bulk_callback(struct urb *urb)
if (port->urbs_in_flight < 0)
port->urbs_in_flight = 0;
spin_unlock_irqrestore(&port->lock, flags);
-
- if (status) {
- dbg("%s - nonzero multi-urb write bulk status "
- "received: %d", __func__, status);
- return;
- }
} else {
port->write_urb_busy = 0;
- if (status) {
- dbg("%s - nonzero multi-urb write bulk status "
- "received: %d", __func__, status);
+ if (status)
kfifo_reset_out(&port->write_fifo);
- } else
+ else
usb_serial_generic_write_start(port);
}
+ if (status)
+ dbg("%s - non-zero urb status: %d", __func__, status);
+
usb_serial_port_softint(port);
}
EXPORT_SYMBOL_GPL(usb_serial_generic_write_bulk_callback);
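The repeated serial->num_bulk_* to port->bulk_*_size conversions in generic.c all fix the same assumption: endpoint counts are per-device, but a multi-port device may give only some ports a bulk endpoint, so each path must test the endpoint of the port it is actually using. Roughly (simplified struct, illustrative only):

struct port_like {
	unsigned int bulk_in_size;	/* 0 when this port has no bulk-in  */
	unsigned int bulk_out_size;	/* 0 when this port has no bulk-out */
};

static int port_write_ok(const struct port_like *port)
{
	/*
	 * Do not consult a device-wide endpoint count here: another port
	 * on the same device may own the only bulk-out endpoint.
	 */
	return port->bulk_out_size != 0;
}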
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 847b805d63a..950cb311ca9 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -288,7 +288,9 @@ static int option_resume(struct usb_serial *serial);
#define QUALCOMM_VENDOR_ID 0x05C6
-#define MAXON_VENDOR_ID 0x16d8
+#define CMOTECH_VENDOR_ID 0x16d8
+#define CMOTECH_PRODUCT_6008 0x6008
+#define CMOTECH_PRODUCT_6280 0x6280
#define TELIT_VENDOR_ID 0x1bc7
#define TELIT_PRODUCT_UC864E 0x1003
@@ -309,6 +311,7 @@ static int option_resume(struct usb_serial *serial);
#define DLINK_VENDOR_ID 0x1186
#define DLINK_PRODUCT_DWM_652 0x3e04
#define DLINK_PRODUCT_DWM_652_U5 0xce16
+#define DLINK_PRODUCT_DWM_652_U5A 0xce1e
#define QISDA_VENDOR_ID 0x1da5
#define QISDA_PRODUCT_H21_4512 0x4512
@@ -332,6 +335,24 @@ static int option_resume(struct usb_serial *serial);
#define ALCATEL_VENDOR_ID 0x1bbb
#define ALCATEL_PRODUCT_X060S 0x0000
+#define PIRELLI_VENDOR_ID 0x1266
+#define PIRELLI_PRODUCT_C100_1 0x1002
+#define PIRELLI_PRODUCT_C100_2 0x1003
+#define PIRELLI_PRODUCT_1004 0x1004
+#define PIRELLI_PRODUCT_1005 0x1005
+#define PIRELLI_PRODUCT_1006 0x1006
+#define PIRELLI_PRODUCT_1007 0x1007
+#define PIRELLI_PRODUCT_1008 0x1008
+#define PIRELLI_PRODUCT_1009 0x1009
+#define PIRELLI_PRODUCT_100A 0x100a
+#define PIRELLI_PRODUCT_100B 0x100b
+#define PIRELLI_PRODUCT_100C 0x100c
+#define PIRELLI_PRODUCT_100D 0x100d
+#define PIRELLI_PRODUCT_100E 0x100e
+#define PIRELLI_PRODUCT_100F 0x100f
+#define PIRELLI_PRODUCT_1011 0x1011
+#define PIRELLI_PRODUCT_1012 0x1012
+
/* Airplus products */
#define AIRPLUS_VENDOR_ID 0x1011
#define AIRPLUS_PRODUCT_MCD650 0x3198
@@ -547,7 +568,8 @@ static const struct usb_device_id option_ids[] = {
{ USB_DEVICE(KYOCERA_VENDOR_ID, KYOCERA_PRODUCT_KPC680) },
{ USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6000)}, /* ZTE AC8700 */
{ USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6613)}, /* Onda H600/ZTE MF330 */
- { USB_DEVICE(MAXON_VENDOR_ID, 0x6280) }, /* BP3-USB & BP3-EXT HSDPA */
+ { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6280) }, /* BP3-USB & BP3-EXT HSDPA */
+ { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6008) },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_UC864E) },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_UC864G) },
{ USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */
@@ -659,6 +681,7 @@ static const struct usb_device_id option_ids[] = {
{ USB_DEVICE(BENQ_VENDOR_ID, BENQ_PRODUCT_H10) },
{ USB_DEVICE(DLINK_VENDOR_ID, DLINK_PRODUCT_DWM_652) },
{ USB_DEVICE(ALINK_VENDOR_ID, DLINK_PRODUCT_DWM_652_U5) }, /* Yes, ALINK_VENDOR_ID */
+ { USB_DEVICE(ALINK_VENDOR_ID, DLINK_PRODUCT_DWM_652_U5A) },
{ USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H21_4512) },
{ USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H21_4523) },
{ USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H20_4515) },
@@ -666,7 +689,6 @@ static const struct usb_device_id option_ids[] = {
{ USB_DEVICE(TOSHIBA_VENDOR_ID, TOSHIBA_PRODUCT_G450) },
{ USB_DEVICE(TOSHIBA_VENDOR_ID, TOSHIBA_PRODUCT_HSDPA_MINICARD ) }, /* Toshiba 3G HSDPA == Novatel Expedite EU870D MiniCard */
{ USB_DEVICE(ALINK_VENDOR_ID, 0x9000) },
- { USB_DEVICE(ALINK_VENDOR_ID, 0xce16) },
{ USB_DEVICE_AND_INTERFACE_INFO(ALINK_VENDOR_ID, ALINK_PRODUCT_3GU, 0xff, 0xff, 0xff) },
{ USB_DEVICE(ALCATEL_VENDOR_ID, ALCATEL_PRODUCT_X060S) },
{ USB_DEVICE(AIRPLUS_VENDOR_ID, AIRPLUS_PRODUCT_MCD650) },
@@ -675,6 +697,24 @@ static const struct usb_device_id option_ids[] = {
.driver_info = (kernel_ulong_t)&four_g_w14_blacklist
},
{ USB_DEVICE(HAIER_VENDOR_ID, HAIER_PRODUCT_CE100) },
+ /* Pirelli */
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_C100_1)},
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_C100_2)},
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1004)},
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1005)},
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1006)},
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1007)},
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1008)},
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1009)},
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100A)},
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100B) },
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100C) },
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100D) },
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100E) },
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100F) },
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1011)},
+ { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1012)},
+
{ } /* Terminating entry */
};
MODULE_DEVICE_TABLE(usb, option_ids);
@@ -798,12 +838,19 @@ static int option_probe(struct usb_serial *serial,
const struct usb_device_id *id)
{
struct option_intf_private *data;
+
/* D-Link DWM 652 still exposes CD-Rom emulation interface in modem mode */
if (serial->dev->descriptor.idVendor == DLINK_VENDOR_ID &&
serial->dev->descriptor.idProduct == DLINK_PRODUCT_DWM_652 &&
serial->interface->cur_altsetting->desc.bInterfaceClass == 0x8)
return -ENODEV;
+ /* Bandrich and Pirelli modem/AT-command interfaces use class 0xff */
+ if ((serial->dev->descriptor.idVendor == BANDRICH_VENDOR_ID ||
+ serial->dev->descriptor.idVendor == PIRELLI_VENDOR_ID) &&
+ serial->interface->cur_altsetting->desc.bInterfaceClass != 0xff)
+ return -ENODEV;
+
data = serial->private = kzalloc(sizeof(struct option_intf_private), GFP_KERNEL);
if (!data)
return -ENOMEM;
diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c
index 310ff6ec656..53a2d5a935a 100644
--- a/drivers/usb/serial/qcserial.c
+++ b/drivers/usb/serial/qcserial.c
@@ -47,6 +47,35 @@ static const struct usb_device_id id_table[] = {
{USB_DEVICE(0x05c6, 0x9221)}, /* Generic Gobi QDL device */
{USB_DEVICE(0x05c6, 0x9231)}, /* Generic Gobi QDL device */
{USB_DEVICE(0x1f45, 0x0001)}, /* Unknown Gobi QDL device */
+ {USB_DEVICE(0x413c, 0x8185)}, /* Dell Gobi 2000 QDL device (N0218, VU936) */
+ {USB_DEVICE(0x413c, 0x8186)}, /* Dell Gobi 2000 Modem device (N0218, VU936) */
+ {USB_DEVICE(0x05c6, 0x9224)}, /* Sony Gobi 2000 QDL device (N0279, VU730) */
+ {USB_DEVICE(0x05c6, 0x9225)}, /* Sony Gobi 2000 Modem device (N0279, VU730) */
+ {USB_DEVICE(0x05c6, 0x9244)}, /* Samsung Gobi 2000 QDL device (VL176) */
+ {USB_DEVICE(0x05c6, 0x9245)}, /* Samsung Gobi 2000 Modem device (VL176) */
+ {USB_DEVICE(0x03f0, 0x241d)}, /* HP Gobi 2000 QDL device (VP412) */
+ {USB_DEVICE(0x03f0, 0x251d)}, /* HP Gobi 2000 Modem device (VP412) */
+ {USB_DEVICE(0x05c6, 0x9214)}, /* Acer Gobi 2000 QDL device (VP413) */
+ {USB_DEVICE(0x05c6, 0x9215)}, /* Acer Gobi 2000 Modem device (VP413) */
+ {USB_DEVICE(0x05c6, 0x9264)}, /* Asus Gobi 2000 QDL device (VR305) */
+ {USB_DEVICE(0x05c6, 0x9265)}, /* Asus Gobi 2000 Modem device (VR305) */
+ {USB_DEVICE(0x05c6, 0x9234)}, /* Top Global Gobi 2000 QDL device (VR306) */
+ {USB_DEVICE(0x05c6, 0x9235)}, /* Top Global Gobi 2000 Modem device (VR306) */
+ {USB_DEVICE(0x05c6, 0x9274)}, /* iRex Technologies Gobi 2000 QDL device (VR307) */
+ {USB_DEVICE(0x05c6, 0x9275)}, /* iRex Technologies Gobi 2000 Modem device (VR307) */
+ {USB_DEVICE(0x1199, 0x9000)}, /* Sierra Wireless Gobi 2000 QDL device (VT773) */
+ {USB_DEVICE(0x1199, 0x9001)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
+ {USB_DEVICE(0x1199, 0x9002)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
+ {USB_DEVICE(0x1199, 0x9003)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
+ {USB_DEVICE(0x1199, 0x9004)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
+ {USB_DEVICE(0x1199, 0x9005)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
+ {USB_DEVICE(0x1199, 0x9006)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
+ {USB_DEVICE(0x1199, 0x9007)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
+ {USB_DEVICE(0x1199, 0x9008)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
+ {USB_DEVICE(0x1199, 0x9009)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
+ {USB_DEVICE(0x1199, 0x900a)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
+ {USB_DEVICE(0x16d8, 0x8001)}, /* CMDTech Gobi 2000 QDL device (VU922) */
+ {USB_DEVICE(0x16d8, 0x8002)}, /* CMDTech Gobi 2000 Modem device (VU922) */
{ } /* Terminating entry */
};
MODULE_DEVICE_TABLE(usb, id_table);
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index 98b549b1cab..ccf1dbbb87e 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -374,6 +374,15 @@ UNUSUAL_DEV( 0x04ce, 0x0002, 0x0074, 0x0074,
US_SC_DEVICE, US_PR_DEVICE, NULL,
US_FL_FIX_INQUIRY),
+/* Reported by Ondrej Zary <linux@rainbow-software.org>
+ * The device reports one sector more and breaks when that sector is accessed
+ */
+UNUSUAL_DEV( 0x04ce, 0x0002, 0x026c, 0x026c,
+ "ScanLogic",
+ "SL11R-IDE",
+ US_SC_DEVICE, US_PR_DEVICE, NULL,
+ US_FL_FIX_CAPACITY),
+
/* Reported by Kriston Fincher <kriston@airmail.net>
* Patch submitted by Sean Millichamp <sean@bruenor.org>
* This is to support the Panasonic PalmCam PV-SD4090
@@ -1380,20 +1389,6 @@ UNUSUAL_DEV( 0x0f19, 0x0105, 0x0100, 0x0100,
US_SC_DEVICE, US_PR_DEVICE, NULL,
US_FL_IGNORE_RESIDUE ),
-/* Jeremy Katz <katzj@redhat.com>:
- * The Blackberry Pearl can run in two modes; a usb-storage only mode
- * and a mode that allows access via mass storage and to its database.
- * The berry_charge module will set the device to dual mode and thus we
- * should ignore its native mode if that module is built
- */
-#ifdef CONFIG_USB_BERRY_CHARGE
-UNUSUAL_DEV( 0x0fca, 0x0006, 0x0001, 0x0001,
- "RIM",
- "Blackberry Pearl",
- US_SC_DEVICE, US_PR_DEVICE, NULL,
- US_FL_IGNORE_DEVICE ),
-#endif
-
/* Reported by Michael Stattmann <michael@stattmann.com> */
UNUSUAL_DEV( 0x0fce, 0xd008, 0x0000, 0x0000,
"Sony Ericsson",
diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c
index e7eeb63fab2..b409c228f25 100644
--- a/drivers/uwb/hwa-rc.c
+++ b/drivers/uwb/hwa-rc.c
@@ -891,7 +891,7 @@ static int hwarc_post_reset(struct usb_interface *iface)
}
/** USB device ID's that we handle */
-static struct usb_device_id hwarc_id_table[] = {
+static const struct usb_device_id hwarc_id_table[] = {
/* D-Link DUB-1210 */
{ USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x3d02, 0xe0, 0x01, 0x02),
.driver_info = WUSB_QUIRK_WHCI_CMD_EVT },
diff --git a/drivers/uwb/i1480/dfu/usb.c b/drivers/uwb/i1480/dfu/usb.c
index 0bb665a0c02..a99e211a1b8 100644
--- a/drivers/uwb/i1480/dfu/usb.c
+++ b/drivers/uwb/i1480/dfu/usb.c
@@ -120,8 +120,7 @@ int i1480_usb_write(struct i1480 *i1480, u32 memory_address,
result = usb_control_msg(
i1480_usb->usb_dev, usb_sndctrlpipe(i1480_usb->usb_dev, 0),
0xf0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
- cpu_to_le16(memory_address & 0xffff),
- cpu_to_le16((memory_address >> 16) & 0xffff),
+ memory_address, (memory_address >> 16),
i1480->cmd_buf, buffer_size, 100 /* FIXME: arbitrary */);
if (result < 0)
break;
@@ -166,8 +165,7 @@ int i1480_usb_read(struct i1480 *i1480, u32 addr, size_t size)
result = usb_control_msg(
i1480_usb->usb_dev, usb_rcvctrlpipe(i1480_usb->usb_dev, 0),
0xf0, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
- cpu_to_le16(itr_addr & 0xffff),
- cpu_to_le16((itr_addr >> 16) & 0xffff),
+ itr_addr, (itr_addr >> 16),
i1480->cmd_buf + itr, itr_size,
100 /* FIXME: arbitrary */);
if (result < 0) {
@@ -413,6 +411,10 @@ error:
return result;
}
+MODULE_FIRMWARE("i1480-pre-phy-0.0.bin");
+MODULE_FIRMWARE("i1480-usb-0.0.bin");
+MODULE_FIRMWARE("i1480-phy-0.0.bin");
+
#define i1480_USB_DEV(v, p) \
{ \
.match_flags = USB_DEVICE_ID_MATCH_DEVICE \
@@ -430,7 +432,7 @@ error:
/** USB device ID's that we handle */
-static struct usb_device_id i1480_usb_id_table[] = {
+static const struct usb_device_id i1480_usb_id_table[] = {
i1480_USB_DEV(0x8086, 0xdf3b),
i1480_USB_DEV(0x15a9, 0x0005),
i1480_USB_DEV(0x07d1, 0x3802),
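The i1480 hunks above drop cpu_to_le16(): usb_control_msg() takes wValue and wIndex in CPU byte order and the USB core byte-swaps the setup packet itself, so converting first double-swaps on big-endian hosts. Splitting a 32-bit address then needs only shifts; a fragment of the corrected call shape (assumes addr, result, udev, buf, and len are in scope):

u16 value = addr & 0xffff;		/* low half  -> wValue */
u16 index = (addr >> 16) & 0xffff;	/* high half -> wIndex */

result = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
			 0xf0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
			 value, index, buf, len, 100 /* ms, arbitrary */);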
diff --git a/drivers/uwb/wlp/messages.c b/drivers/uwb/wlp/messages.c
index aa42fcee4c4..75164866c2d 100644
--- a/drivers/uwb/wlp/messages.c
+++ b/drivers/uwb/wlp/messages.c
@@ -259,6 +259,63 @@ out:
}
+static ssize_t wlp_get_attribute(struct wlp *wlp, u16 type_code,
+ struct wlp_attr_hdr *attr_hdr, void *value, ssize_t value_len,
+ ssize_t buflen)
+{
+ struct device *dev = &wlp->rc->uwb_dev.dev;
+ ssize_t attr_len = sizeof(*attr_hdr) + value_len;
+ if (buflen < 0)
+ return -EINVAL;
+ if (buflen < attr_len) {
+ dev_err(dev, "WLP: Not enough space in buffer to parse"
+ " attribute field. Need %d, received %zu\n",
+ (int)attr_len, buflen);
+ return -EIO;
+ }
+ if (wlp_check_attr_hdr(wlp, attr_hdr, type_code, value_len) < 0) {
+ dev_err(dev, "WLP: Header verification failed.\n");
+ return -EINVAL;
+ }
+ memcpy(value, (void *)attr_hdr + sizeof(*attr_hdr), value_len);
+ return attr_len;
+}
+
+static ssize_t wlp_vget_attribute(struct wlp *wlp, u16 type_code,
+ struct wlp_attr_hdr *attr_hdr, void *value, ssize_t max_value_len,
+ ssize_t buflen)
+{
+ struct device *dev = &wlp->rc->uwb_dev.dev;
+ size_t len;
+ if (buflen < 0)
+ return -EINVAL;
+ if (buflen < sizeof(*attr_hdr)) {
+ dev_err(dev, "WLP: Not enough space in buffer to parse"
+ " header.\n");
+ return -EIO;
+ }
+ if (le16_to_cpu(attr_hdr->type) != type_code) {
+ dev_err(dev, "WLP: Unexpected attribute type. Got %u, "
+ "expected %u.\n", le16_to_cpu(attr_hdr->type),
+ type_code);
+ return -EINVAL;
+ }
+ len = le16_to_cpu(attr_hdr->length);
+ if (len > max_value_len) {
+ dev_err(dev, "WLP: Attribute larger than maximum "
+ "allowed. Received %zu, max is %d.\n", len,
+ (int)max_value_len);
+ return -EFBIG;
+ }
+ if (buflen < sizeof(*attr_hdr) + len) {
+ dev_err(dev, "WLP: Not enough space in buffer to parse "
+ "variable data.\n");
+ return -EIO;
+ }
+ memcpy(value, (void *)attr_hdr + sizeof(*attr_hdr), len);
+ return sizeof(*attr_hdr) + len;
+}
+
/**
* Get value of attribute from fixed size attribute field.
*
@@ -274,22 +331,8 @@ out:
ssize_t wlp_get_##name(struct wlp *wlp, struct wlp_attr_##name *attr, \
type *value, ssize_t buflen) \
{ \
- struct device *dev = &wlp->rc->uwb_dev.dev; \
- if (buflen < 0) \
- return -EINVAL; \
- if (buflen < sizeof(*attr)) { \
- dev_err(dev, "WLP: Not enough space in buffer to parse" \
- " attribute field. Need %d, received %zu\n", \
- (int)sizeof(*attr), buflen); \
- return -EIO; \
- } \
- if (wlp_check_attr_hdr(wlp, &attr->hdr, type_code, \
- sizeof(attr->name)) < 0) { \
- dev_err(dev, "WLP: Header verification failed. \n"); \
- return -EINVAL; \
- } \
- *value = attr->name; \
- return sizeof(*attr); \
+ return wlp_get_attribute(wlp, (type_code), &attr->hdr, \
+ value, sizeof(*value), buflen); \
}
#define wlp_get_sparse(type, type_code, name) \
@@ -313,35 +356,8 @@ static ssize_t wlp_get_##name(struct wlp *wlp, \
struct wlp_attr_##name *attr, \
type_val *value, ssize_t buflen) \
{ \
- struct device *dev = &wlp->rc->uwb_dev.dev; \
- size_t len; \
- if (buflen < 0) \
- return -EINVAL; \
- if (buflen < sizeof(*attr)) { \
- dev_err(dev, "WLP: Not enough space in buffer to parse" \
- " header.\n"); \
- return -EIO; \
- } \
- if (le16_to_cpu(attr->hdr.type) != type_code) { \
- dev_err(dev, "WLP: Unexpected attribute type. Got %u, " \
- "expected %u.\n", le16_to_cpu(attr->hdr.type), \
- type_code); \
- return -EINVAL; \
- } \
- len = le16_to_cpu(attr->hdr.length); \
- if (len > max) { \
- dev_err(dev, "WLP: Attribute larger than maximum " \
- "allowed. Received %zu, max is %d.\n", len, \
- (int)max); \
- return -EFBIG; \
- } \
- if (buflen < sizeof(*attr) + len) { \
- dev_err(dev, "WLP: Not enough space in buffer to parse "\
- "variable data.\n"); \
- return -EIO; \
- } \
- memcpy(value, (void *) attr + sizeof(*attr), len); \
- return sizeof(*attr) + len; \
+ return wlp_vget_attribute(wlp, (type_code), &attr->hdr, \
+ value, (max), buflen); \
}
wlp_get(u8, WLP_ATTR_WLP_VER, version)
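The wlp_get()/wlp_get_sparse() rewrite is a standard de-macroization: the validation logic moves into ordinary functions compiled once, and the per-type macros shrink to thin, type-safe wrappers. A generic sketch of the shape (all names hypothetical; attr_##name types are assumed to exist wherever the macro is instantiated):

struct attr_hdr { unsigned short type, length; };

/* One shared copy of the checking/copying logic. */
static long get_fixed(struct attr_hdr *hdr, unsigned short type_code,
		      void *value, long value_len, long buflen)
{
	if (buflen < (long)sizeof(*hdr) + value_len)
		return -1;	/* truncated buffer */
	/* ... header checks, then copy value_len bytes after *hdr ... */
	return (long)sizeof(*hdr) + value_len;
}

/* Per-type wrapper: only the types differ, so only they stay macroized. */
#define DEFINE_GET(type, code, name)					\
static long get_##name(struct attr_##name *attr, type *value,		\
		       long buflen)					\
{									\
	return get_fixed(&attr->hdr, (code), value,			\
			 sizeof(*value), buflen);			\
}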
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ad37da2b6cb..a6a88dfd502 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -125,7 +125,7 @@ static void handle_tx(struct vhost_net *net)
mutex_lock(&vq->mutex);
vhost_disable_notify(vq);
- if (wmem < sock->sk->sk_sndbuf * 2)
+ if (wmem < sock->sk->sk_sndbuf / 2)
tx_poll_stop(net);
hdr_size = vq->hdr_size;
@@ -508,12 +508,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
/* Verify that ring has been setup correctly. */
if (!vhost_vq_access_ok(vq)) {
r = -EFAULT;
- goto err;
+ goto err_vq;
}
sock = get_socket(fd);
if (IS_ERR(sock)) {
r = PTR_ERR(sock);
- goto err;
+ goto err_vq;
}
/* start polling new socket */
@@ -524,12 +524,14 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
vhost_net_disable_vq(n, vq);
rcu_assign_pointer(vq->private_data, sock);
vhost_net_enable_vq(n, vq);
- mutex_unlock(&vq->mutex);
done:
if (oldsock) {
vhost_net_flush_vq(n, index);
fput(oldsock->file);
}
+
+err_vq:
+ mutex_unlock(&vq->mutex);
err:
mutex_unlock(&n->dev.mutex);
return r;
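Both vhost fixes funnel every failure through labels so locks taken earlier are always dropped, in reverse order, on one exit path; the early return statements after mutex_lock() were the leak. The shape, schematically (struct and check names illustrative):

#include <linux/mutex.h>

struct some_dev { struct mutex dev_lock, vq_lock; };

static long check_owner(struct some_dev *d);	/* hypothetical checks */
static long validate_ring(struct some_dev *d);

static long set_backend(struct some_dev *d)
{
	long r;

	mutex_lock(&d->dev_lock);
	r = check_owner(d);
	if (r < 0)
		goto err;		/* only dev_lock held so far */

	mutex_lock(&d->vq_lock);
	r = validate_ring(d);
	if (r < 0)
		goto err_vq;		/* both locks held: unwind both */

	/* ... commit the new backend ... */
	r = 0;
err_vq:
	mutex_unlock(&d->vq_lock);
err:
	mutex_unlock(&d->dev_lock);
	return r;
}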
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 7cd55e07879..7bd7a1e4409 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -476,8 +476,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
if (r < 0)
break;
eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
- if (IS_ERR(eventfp))
- return PTR_ERR(eventfp);
+ if (IS_ERR(eventfp)) {
+ r = PTR_ERR(eventfp);
+ break;
+ }
if (eventfp != vq->kick) {
pollstop = filep = vq->kick;
pollstart = vq->kick = eventfp;
@@ -489,8 +491,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
if (r < 0)
break;
eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
- if (IS_ERR(eventfp))
- return PTR_ERR(eventfp);
+ if (IS_ERR(eventfp)) {
+ r = PTR_ERR(eventfp);
+ break;
+ }
if (eventfp != vq->call) {
filep = vq->call;
ctx = vq->call_ctx;
@@ -505,8 +509,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
if (r < 0)
break;
eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
- if (IS_ERR(eventfp))
- return PTR_ERR(eventfp);
+ if (IS_ERR(eventfp)) {
+ r = PTR_ERR(eventfp);
+ break;
+ }
if (eventfp != vq->error) {
filep = vq->error;
vq->error = eventfp;
diff --git a/drivers/video/geode/lxfb.h b/drivers/video/geode/lxfb.h
index cc781c00f75..e4c4d89b786 100644
--- a/drivers/video/geode/lxfb.h
+++ b/drivers/video/geode/lxfb.h
@@ -365,6 +365,8 @@ enum fp_registers {
FP_CRC, /* 0x458 */
};
+#define FP_PT2_HSP (1 << 22)
+#define FP_PT2_VSP (1 << 23)
#define FP_PT2_SCRC (1 << 27) /* shfclk free */
#define FP_PM_P (1 << 24) /* panel power ctl */
diff --git a/drivers/video/geode/lxfb_ops.c b/drivers/video/geode/lxfb_ops.c
index 0e5d8c7c3eb..bc35a95e59d 100644
--- a/drivers/video/geode/lxfb_ops.c
+++ b/drivers/video/geode/lxfb_ops.c
@@ -274,7 +274,15 @@ static void lx_graphics_enable(struct fb_info *info)
u32 msrlo, msrhi;
write_fp(par, FP_PT1, 0);
- write_fp(par, FP_PT2, FP_PT2_SCRC);
+ temp = FP_PT2_SCRC;
+
+ if (info->var.sync & FB_SYNC_HOR_HIGH_ACT)
+ temp |= FP_PT2_HSP;
+
+ if (info->var.sync & FB_SYNC_VERT_HIGH_ACT)
+ temp |= FP_PT2_VSP;
+
+ write_fp(par, FP_PT2, temp);
write_fp(par, FP_DFC, FP_DFC_BC);
msrlo = MSR_LX_MSR_PADSEL_TFT_SEL_LOW;
diff --git a/drivers/video/omap2/displays/panel-generic.c b/drivers/video/omap2/displays/panel-generic.c
index c59e4baed8b..300eff5de1b 100644
--- a/drivers/video/omap2/displays/panel-generic.c
+++ b/drivers/video/omap2/displays/panel-generic.c
@@ -116,6 +116,24 @@ static int generic_panel_resume(struct omap_dss_device *dssdev)
return 0;
}
+static void generic_panel_set_timings(struct omap_dss_device *dssdev,
+ struct omap_video_timings *timings)
+{
+ dpi_set_timings(dssdev, timings);
+}
+
+static void generic_panel_get_timings(struct omap_dss_device *dssdev,
+ struct omap_video_timings *timings)
+{
+ *timings = dssdev->panel.timings;
+}
+
+static int generic_panel_check_timings(struct omap_dss_device *dssdev,
+ struct omap_video_timings *timings)
+{
+ return dpi_check_timings(dssdev, timings);
+}
+
static struct omap_dss_driver generic_driver = {
.probe = generic_panel_probe,
.remove = generic_panel_remove,
@@ -125,6 +143,10 @@ static struct omap_dss_driver generic_driver = {
.suspend = generic_panel_suspend,
.resume = generic_panel_resume,
+ .set_timings = generic_panel_set_timings,
+ .get_timings = generic_panel_get_timings,
+ .check_timings = generic_panel_check_timings,
+
.driver = {
.name = "generic_panel",
.owner = THIS_MODULE,
diff --git a/drivers/video/omap2/dss/dss.c b/drivers/video/omap2/dss/dss.c
index 8254a4232a5..54344184dd7 100644
--- a/drivers/video/omap2/dss/dss.c
+++ b/drivers/video/omap2/dss/dss.c
@@ -590,6 +590,9 @@ int dss_init(bool skip_init)
}
}
+ dss.dsi_clk_source = DSS_SRC_DSS1_ALWON_FCLK;
+ dss.dispc_clk_source = DSS_SRC_DSS1_ALWON_FCLK;
+
dss_save_context();
rev = dss_read_reg(DSS_REVISION);
diff --git a/drivers/video/omap2/vram.c b/drivers/video/omap2/vram.c
index 55a4de5e5d1..b266ffae0bd 100644
--- a/drivers/video/omap2/vram.c
+++ b/drivers/video/omap2/vram.c
@@ -511,13 +511,14 @@ static u32 omap_vram_sdram_size __initdata;
static u32 omap_vram_def_sdram_size __initdata;
static u32 omap_vram_def_sdram_start __initdata;
-static void __init omap_vram_early_vram(char **p)
+static int __init omap_vram_early_vram(char *p)
{
- omap_vram_def_sdram_size = memparse(*p, p);
- if (**p == ',')
- omap_vram_def_sdram_start = simple_strtoul((*p) + 1, p, 16);
+ omap_vram_def_sdram_size = memparse(p, &p);
+ if (*p == ',')
+ omap_vram_def_sdram_start = simple_strtoul(p + 1, &p, 16);
+ return 0;
}
-__early_param("vram=", omap_vram_early_vram);
+early_param("vram", omap_vram_early_vram);
/*
* Called from map_io. We need to call to this early enough so that we
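The conversion above follows the generic early_param() convention: the handler receives the text after "vram=" and returns 0 once the option is consumed, and memparse() understands the usual K/M/G size suffixes. A condensed sketch of the same parser shape (variable names illustrative):

static u64 vram_size __initdata;
static u64 vram_start __initdata;

static int __init parse_vram(char *p)
{
	vram_size = memparse(p, &p);	/* e.g. "8M" -> 8 << 20 */
	if (*p == ',')			/* optional ",<start>" in hex */
		vram_start = simple_strtoul(p + 1, &p, 16);
	return 0;			/* 0: option handled */
}
early_param("vram", parse_vram);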
diff --git a/drivers/video/pxa168fb.c b/drivers/video/pxa168fb.c
index 75285d3f393..c91a7f70f7b 100644
--- a/drivers/video/pxa168fb.c
+++ b/drivers/video/pxa168fb.c
@@ -668,7 +668,7 @@ static int __init pxa168fb_probe(struct platform_device *pdev)
/*
* Map LCD controller registers.
*/
- fbi->reg_base = ioremap_nocache(res->start, res->end - res->start);
+ fbi->reg_base = ioremap_nocache(res->start, resource_size(res));
if (fbi->reg_base == NULL) {
ret = -ENOMEM;
goto failed;
diff --git a/fs/Kconfig b/fs/Kconfig
index 7405f071be6..5f85b594761 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config NFS_COMMON
source "net/sunrpc/Kconfig"
source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
source "fs/cifs/Kconfig"
source "fs/ncpfs/Kconfig"
source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c3633aa4691..97f340f14ba 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_EXOFS_FS) += exofs/
+obj-$(CONFIG_CEPH_FS) += ceph/
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef50437003..bb4ed144d0e 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
if (!permits)
goto out_unlock;
- memcpy(permits->permits, xpermits->permits,
- count * sizeof(struct afs_permit));
+ if (xpermits)
+ memcpy(permits->permits, xpermits->permits,
+ count * sizeof(struct afs_permit));
_debug("key %x access %x",
key_serial(key), vnode->status.caller_access);
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 15d80bb35d6..9b6aef0f75e 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -75,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
struct file *file = cprm->file;
mm_segment_t fs;
int has_dumped = 0;
- unsigned long dump_start, dump_size;
+ void __user *dump_start;
+ int dump_size;
struct user dump;
#ifdef __alpha__
-# define START_DATA(u) (u.start_data)
+# define START_DATA(u) ((void __user *)u.start_data)
#else
-# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code)
+# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
+ u.start_code))
#endif
-# define START_STACK(u) (u.start_stack)
+# define START_STACK(u) ((void __user *)u.start_stack)
fs = get_fs();
set_fs(KERNEL_DS);
@@ -104,9 +106,9 @@ static int aout_core_dump(struct coredump_params *cprm)
/* make sure we actually have a data and stack area to dump */
set_fs(USER_DS);
- if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+ if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
dump.u_dsize = 0;
- if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+ if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
dump.u_ssize = 0;
set_fs(KERNEL_DS);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c32d00a669..7ab23e006e4 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1590,7 +1590,7 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
struct vm_area_struct *vma;
size_t size = 0;
- for (vma = current->mm->mmap; vma; vma->vm_next)
+ for (vma = current->mm->mmap; vma; vma = vma->vm_next)
if (maydump(vma, mm_flags))
size += vma->vm_end - vma->vm_start;
return size;
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 00000000000..04b8280582a
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
+config CEPH_FS
+ tristate "Ceph distributed file system (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
+ select LIBCRC32C
+ select CRYPTO_AES
+ help
+ Choose Y or M here to include support for mounting the
+ experimental Ceph distributed file system. Ceph is an extremely
+ scalable file system designed to provide high performance,
+ reliable access to petabytes of storage.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
+config CEPH_FS_PRETTYDEBUG
+ bool "Include file:line in ceph debug output"
+ depends on CEPH_FS
+ default n
+ help
+ If you say Y here, debug output will include a filename and
+ line to aid debugging. This increases kernel size and slows
+ execution slightly when debug call sites are enabled (e.g.,
+ via CONFIG_DYNAMIC_DEBUG).
+
+ If unsure, say N.
+
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 00000000000..6a660e610be
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_FS) += ceph.o
+
+ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
+ export.o caps.o snap.o xattr.o \
+ messenger.o msgpool.o buffer.o pagelist.o \
+ mds_client.o mdsmap.o \
+ mon_client.o \
+ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+ debugfs.o \
+ auth.o auth_none.o \
+ crypto.o armor.o \
+ auth_x.o \
+ ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+
+else
+# Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
+
+modules_install:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
+
+clean:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 00000000000..18352fab37c
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
+#
+# The following files are shared by (and manually synchronized
+# between) the Ceph userland and kernel client.
+#
+# userland kernel
+src/include/ceph_fs.h fs/ceph/ceph_fs.h
+src/include/ceph_fs.cc fs/ceph/ceph_fs.c
+src/include/msgr.h fs/ceph/msgr.h
+src/include/rados.h fs/ceph/rados.h
+src/include/ceph_strings.cc fs/ceph/ceph_strings.c
+src/include/ceph_frag.h fs/ceph/ceph_frag.h
+src/include/ceph_frag.cc fs/ceph/ceph_frag.c
+src/include/ceph_hash.h fs/ceph/ceph_hash.h
+src/include/ceph_hash.cc fs/ceph/ceph_hash.c
+src/crush/crush.c fs/ceph/crush/crush.c
+src/crush/crush.h fs/ceph/crush/crush.h
+src/crush/mapper.c fs/ceph/crush/mapper.c
+src/crush/mapper.h fs/ceph/crush/mapper.h
+src/crush/hash.h fs/ceph/crush/hash.h
+src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 00000000000..ce8ef610772
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1194 @@
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h> /* generic_writepages */
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "super.h"
+#include "osd_client.h"
+
+/*
+ * Ceph address space ops.
+ *
+ * There are a few funny things going on here.
+ *
+ * The page->private field is used to reference a struct
+ * ceph_snap_context for _every_ dirty page. This indicates which
+ * snapshot the page was logically dirtied in, and thus which snap
+ * context needs to be associated with the osd write during writeback.
+ *
+ * Similarly, struct ceph_inode_info maintains a set of counters to
+ * count dirty pages on the inode. In the absence of snapshots,
+ * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
+ *
+ * When a snapshot is taken (that is, when the client receives
+ * notification that a snapshot was taken), each inode with caps and
+ * with dirty pages (dirty pages implies there is a cap) gets a new
+ * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
+ * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
+ * moved to capsnap->dirty. (Unless a sync write is currently in
+ * progress. In that case, the capsnap is said to be "pending", new
+ * writes cannot start, and the capsnap isn't "finalized" until the
+ * write completes (or fails) and a final size/mtime for the inode for
+ * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
+ *
+ * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
+ * we look for the first capsnap in i_cap_snaps and write out pages in
+ * that snap context _only_. Then we move on to the next capsnap,
+ * eventually reaching the "live" or "head" context (i.e., pages that
+ * are not yet snapped) and are writing the most recently dirtied
+ * pages.
+ *
+ * Invalidate and so forth must take care to ensure the dirty page
+ * accounting is preserved.
+ */
+
+#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb) \
+ (CONGESTION_ON_THRESH(congestion_kb) - \
+ (CONGESTION_ON_THRESH(congestion_kb) >> 2))
+
+
+
+/*
+ * Dirty a page. Optimistically adjust accounting, on the assumption
+ * that we won't race with invalidate. If we do, readjust.
+ */
+static int ceph_set_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ int undo = 0;
+ struct ceph_snap_context *snapc;
+
+ if (unlikely(!mapping))
+ return !TestSetPageDirty(page);
+
+ if (TestSetPageDirty(page)) {
+ dout("%p set_page_dirty %p idx %lu -- already dirty\n",
+ mapping->host, page, page->index);
+ return 0;
+ }
+
+ inode = mapping->host;
+ ci = ceph_inode(inode);
+
+ /*
+ * Note that we're grabbing a snapc ref here without holding
+ * any locks!
+ */
+ snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+
+ /* dirty the head */
+ spin_lock(&inode->i_lock);
+ if (ci->i_wrbuffer_ref_head == 0)
+ ci->i_head_snapc = ceph_get_snap_context(snapc);
+ ++ci->i_wrbuffer_ref_head;
+ if (ci->i_wrbuffer_ref == 0)
+ igrab(inode);
+ ++ci->i_wrbuffer_ref;
+ dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+ "snapc %p seq %lld (%d snaps)\n",
+ mapping->host, page, page->index,
+ ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ snapc, snapc->seq, snapc->num_snaps);
+ spin_unlock(&inode->i_lock);
+
+ /* now adjust page */
+ spin_lock_irq(&mapping->tree_lock);
+ if (page->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(!PageUptodate(page));
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
+ task_io_account_write(PAGE_CACHE_SIZE);
+ }
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+
+ /*
+ * Reference snap context in page->private. Also set
+ * PagePrivate so that we get invalidatepage callback.
+ */
+ page->private = (unsigned long)snapc;
+ SetPagePrivate(page);
+ } else {
+ dout("ANON set_page_dirty %p (raced truncate?)\n", page);
+ undo = 1;
+ }
+
+ spin_unlock_irq(&mapping->tree_lock);
+
+ if (undo)
+ /* whoops, we failed to dirty the page */
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+ BUG_ON(!PageDirty(page));
+ return 1;
+}
+
+/*
+ * If we are truncating the full page (i.e. offset == 0), adjust the
+ * dirty page counters appropriately. Only called if there is private
+ * data on the page.
+ */
+static void ceph_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_context *snapc = (void *)page->private;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!page->private);
+ BUG_ON(!PagePrivate(page));
+ BUG_ON(!page->mapping);
+
+ inode = page->mapping->host;
+
+ /*
+ * We can get non-dirty pages here due to races between
+ * set_page_dirty and truncate_complete_page; just spit out a
+ * warning, in case we end up with accounting problems later.
+ */
+ if (!PageDirty(page))
+ pr_err("%p invalidatepage %p page not dirty\n", inode, page);
+
+ if (offset == 0)
+ ClearPageChecked(page);
+
+ ci = ceph_inode(inode);
+ if (offset == 0) {
+ dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
+ inode, page, page->index, offset);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ page->private = 0;
+ ClearPagePrivate(page);
+ } else {
+ dout("%p invalidatepage %p idx %lu partial dirty page\n",
+ inode, page, page->index);
+ }
+}
+
+/* just a sanity check */
+static int ceph_releasepage(struct page *page, gfp_t g)
+{
+ struct inode *inode = page->mapping ? page->mapping->host : NULL;
+ dout("%p releasepage %p idx %lu\n", inode, page, page->index);
+ WARN_ON(PageDirty(page));
+ WARN_ON(page->private);
+ WARN_ON(PagePrivate(page));
+ return 0;
+}
+
+/*
+ * read a single page, without unlocking it.
+ */
+static int readpage_nounlock(struct file *filp, struct page *page)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ int err = 0;
+ u64 len = PAGE_CACHE_SIZE;
+
+ dout("readpage inode %p file %p page %p index %lu\n",
+ inode, filp, page, page->index);
+ err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+ page->index << PAGE_CACHE_SHIFT, &len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &page, 1);
+ if (err == -ENOENT)
+ err = 0;
+ if (err < 0) {
+ SetPageError(page);
+ goto out;
+ } else if (err < PAGE_CACHE_SIZE) {
+ /* zero fill remainder of page */
+ zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ }
+ SetPageUptodate(page);
+
+out:
+ return err < 0 ? err : 0;
+}
+
+static int ceph_readpage(struct file *filp, struct page *page)
+{
+ int r = readpage_nounlock(filp, page);
+ unlock_page(page);
+ return r;
+}
+
+/*
+ * Build a vector of contiguous pages from the provided page list.
+ */
+static struct page **page_vector_from_list(struct list_head *page_list,
+ unsigned *nr_pages)
+{
+ struct page **pages;
+ struct page *page;
+ int next_index, contig_pages = 0;
+
+ /* build page vector */
+ pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ BUG_ON(list_empty(page_list));
+ next_index = list_entry(page_list->prev, struct page, lru)->index;
+ list_for_each_entry_reverse(page, page_list, lru) {
+ if (page->index == next_index) {
+ dout("readpages page %d %p\n", contig_pages, page);
+ pages[contig_pages] = page;
+ contig_pages++;
+ next_index++;
+ } else {
+ break;
+ }
+ }
+ *nr_pages = contig_pages;
+ return pages;
+}
+
+/*
+ * Read multiple pages. Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *page_list, unsigned nr_pages)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ int rc = 0;
+ struct page **pages;
+ struct pagevec pvec;
+ loff_t offset;
+ u64 len;
+
+ dout("readpages %p file %p nr_pages %d\n",
+ inode, file, nr_pages);
+
+ pages = page_vector_from_list(page_list, &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ /* guess read extent */
+ offset = pages[0]->index << PAGE_CACHE_SHIFT;
+ len = nr_pages << PAGE_CACHE_SHIFT;
+ rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+ offset, &len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ pages, nr_pages);
+ if (rc == -ENOENT)
+ rc = 0;
+ if (rc < 0)
+ goto out;
+
+ /* set uptodate and add to lru in pagevec-sized chunks */
+ pagevec_init(&pvec, 0);
+ for (; !list_empty(page_list) && len > 0;
+ rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
+ struct page *page =
+ list_entry(page_list->prev, struct page, lru);
+
+ list_del(&page->lru);
+
+ if (rc < (int)PAGE_CACHE_SIZE) {
+ /* zero (remainder of) page */
+ int s = rc < 0 ? 0 : rc;
+ zero_user_segment(page, s, PAGE_CACHE_SIZE);
+ }
+
+ if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+ page_cache_release(page);
+ dout("readpages %p add_to_page_cache failed %p\n",
+ inode, page);
+ continue;
+ }
+ dout("readpages %p adding %p idx %lu\n", inode, page,
+ page->index);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ if (pagevec_add(&pvec, page) == 0)
+ pagevec_lru_add_file(&pvec); /* add to lru */
+ }
+ pagevec_lru_add_file(&pvec);
+ rc = 0;
+
+out:
+ kfree(pages);
+ return rc;
+}
+
+/*
+ * Get ref for the oldest snapc for an inode with dirty data... that is, the
+ * only snap context we are allowed to write back.
+ *
+ * Caller holds i_lock.
+ */
+static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
+ u64 *snap_size)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc = NULL;
+ struct ceph_cap_snap *capsnap = NULL;
+
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
+ capsnap->context, capsnap->dirty_pages);
+ if (capsnap->dirty_pages) {
+ snapc = ceph_get_snap_context(capsnap->context);
+ if (snap_size)
+ *snap_size = capsnap->size;
+ break;
+ }
+ }
+ if (!snapc && ci->i_snap_realm) {
+ snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+ dout(" head snapc %p has %d dirty pages\n",
+ snapc, ci->i_wrbuffer_ref_head);
+ }
+ return snapc;
+}
+
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
+ u64 *snap_size)
+{
+ struct ceph_snap_context *snapc = NULL;
+
+ spin_lock(&inode->i_lock);
+ snapc = __get_oldest_context(inode, snap_size);
+ spin_unlock(&inode->i_lock);
+ return snapc;
+}
+
+/*
+ * Write a single page, but leave the page locked.
+ *
+ * If we get a write error, set the page error bit, but still adjust the
+ * dirty page accounting (i.e., page is no longer dirty).
+ */
+static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_client *client;
+ struct ceph_osd_client *osdc;
+ loff_t page_off = (u64)page->index << PAGE_CACHE_SHIFT;
+ int len = PAGE_CACHE_SIZE;
+ loff_t i_size;
+ int err = 0;
+ struct ceph_snap_context *snapc;
+ u64 snap_size = 0;
+ long writeback_stat;
+
+ dout("writepage %p idx %lu\n", page, page->index);
+
+ if (!page->mapping || !page->mapping->host) {
+ dout("writepage %p - no mapping\n", page);
+ return -EFAULT;
+ }
+ inode = page->mapping->host;
+ ci = ceph_inode(inode);
+ client = ceph_inode_to_client(inode);
+ osdc = &client->osdc;
+
+ /* verify this is a writeable snap context */
+ snapc = (void *)page->private;
+ if (snapc == NULL) {
+ dout("writepage %p page %p not dirty?\n", inode, page);
+ goto out;
+ }
+ if (snapc != get_oldest_context(inode, &snap_size)) {
+ dout("writepage %p page %p snapc %p not writeable - noop\n",
+ inode, page, (void *)page->private);
+ /* we should only noop if called by kswapd */
+ WARN_ON((current->flags & PF_MEMALLOC) == 0);
+ goto out;
+ }
+
+ /* is this a partial page at end of file? */
+ if (snap_size)
+ i_size = snap_size;
+ else
+ i_size = i_size_read(inode);
+ if (i_size < page_off + len)
+ len = i_size - page_off;
+
+ dout("writepage %p page %p index %lu on %llu~%u\n",
+ inode, page, page->index, page_off, len);
+
+ writeback_stat = atomic_long_inc_return(&client->writeback_count);
+ if (writeback_stat >
+ CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
+ set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+
+ set_page_writeback(page);
+ err = ceph_osdc_writepages(osdc, ceph_vino(inode),
+ &ci->i_layout, snapc,
+ page_off, len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &inode->i_mtime,
+ &page, 1, 0, 0, true);
+ if (err < 0) {
+ dout("writepage setting page/mapping error %d %p\n", err, page);
+ SetPageError(page);
+ mapping_set_error(&inode->i_data, err);
+ if (wbc)
+ wbc->pages_skipped++;
+ } else {
+ dout("writepage cleaned page %p\n", page);
+ err = 0; /* vfs expects us to return 0 */
+ }
+ page->private = 0;
+ ClearPagePrivate(page);
+ end_page_writeback(page);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+out:
+ return err;
+}
+
+static int ceph_writepage(struct page *page, struct writeback_control *wbc)
+{
+ int err;
+ struct inode *inode = page->mapping->host;
+ BUG_ON(!inode);
+ igrab(inode);
+ err = writepage_nounlock(page, wbc);
+ unlock_page(page);
+ iput(inode);
+ return err;
+}
+
+
+/*
+ * lame release_pages helper. release_pages() isn't exported to
+ * modules.
+ */
+static void ceph_release_pages(struct page **pages, int num)
+{
+ struct pagevec pvec;
+ int i;
+
+ pagevec_init(&pvec, 0);
+ for (i = 0; i < num; i++) {
+ if (pagevec_add(&pvec, pages[i]) == 0)
+ pagevec_release(&pvec);
+ }
+ pagevec_release(&pvec);
+}
+
+
+/*
+ * async writeback completion handler.
+ *
+ * If we get an error, set the mapping error bit, but not the individual
+ * page error bits.
+ */
+static void writepages_finish(struct ceph_osd_request *req,
+ struct ceph_msg *msg)
+{
+ struct inode *inode = req->r_inode;
+ struct ceph_osd_reply_head *replyhead;
+ struct ceph_osd_op *op;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ unsigned wrote;
+ struct page *page;
+ int i;
+ struct ceph_snap_context *snapc = req->r_snapc;
+ struct address_space *mapping = inode->i_mapping;
+ struct writeback_control *wbc = req->r_wbc;
+ __s32 rc = -EIO;
+ u64 bytes = 0;
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ long writeback_stat;
+ unsigned issued = __ceph_caps_issued(ci, NULL);
+
+ /* parse reply */
+ replyhead = msg->front.iov_base;
+ WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+ op = (void *)(replyhead + 1);
+ rc = le32_to_cpu(replyhead->result);
+ bytes = le64_to_cpu(op->extent.length);
+
+ if (rc >= 0) {
+ /*
+ * Assume we wrote the pages we originally sent. The
+ * osd might reply with fewer pages if our writeback
+ * raced with a truncation and was adjusted at the osd,
+ * so don't believe the reply.
+ */
+ wrote = req->r_num_pages;
+ } else {
+ wrote = 0;
+ mapping_set_error(mapping, rc);
+ }
+ dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
+ inode, rc, bytes, wrote);
+
+ /* clean all pages */
+ for (i = 0; i < req->r_num_pages; i++) {
+ page = req->r_pages[i];
+ BUG_ON(!page);
+ WARN_ON(!PageUptodate(page));
+
+ writeback_stat =
+ atomic_long_dec_return(&client->writeback_count);
+ if (writeback_stat <
+ CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
+ clear_bdi_congested(&client->backing_dev_info,
+ BLK_RW_ASYNC);
+
+ if (i >= wrote) {
+ dout("inode %p skipping page %p\n", inode, page);
+ wbc->pages_skipped++;
+ }
+ page->private = 0;
+ ClearPagePrivate(page);
+ ceph_put_snap_context(snapc);
+ dout("unlocking %d %p\n", i, page);
+ end_page_writeback(page);
+
+ /*
+ * We lost the cache cap, need to truncate the page before
+ * it is unlocked, otherwise we'd truncate it later in the
+ * page truncation thread, possibly losing some data that
+ * raced its way in
+ */
+ if ((issued & CEPH_CAP_FILE_CACHE) == 0)
+ generic_error_remove_page(inode->i_mapping, page);
+
+ unlock_page(page);
+ }
+ dout("%p wrote+cleaned %d pages\n", inode, wrote);
+ ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+
+ ceph_release_pages(req->r_pages, req->r_num_pages);
+ if (req->r_pages_from_pool)
+ mempool_free(req->r_pages,
+ ceph_client(inode->i_sb)->wb_pagevec_pool);
+ else
+ kfree(req->r_pages);
+ ceph_osdc_put_request(req);
+}
+
+/*
+ * allocate a page vec, either directly, or if necessary, via the
+ * mempool. we avoid the mempool if we can because req->r_num_pages
+ * may be less than the maximum write size.
+ */
+static void alloc_page_vec(struct ceph_client *client,
+ struct ceph_osd_request *req)
+{
+ req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
+ GFP_NOFS);
+ if (!req->r_pages) {
+ req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+ req->r_pages_from_pool = 1;
+ WARN_ON(!req->r_pages);
+ }
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *client;
+ pgoff_t index, start, end;
+ int range_whole = 0;
+ int should_loop = 1;
+ pgoff_t max_pages = 0, max_pages_ever = 0;
+ struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
+ struct pagevec pvec;
+ int done = 0;
+ int rc = 0;
+ unsigned wsize = 1 << inode->i_blkbits;
+ struct ceph_osd_request *req = NULL;
+ int do_sync;
+ u64 snap_size = 0;
+
+ /*
+ * Include a 'sync' in the OSD request if this is a data
+ * integrity write (e.g., O_SYNC write or fsync()), or if our
+ * cap is being revoked.
+ */
+ do_sync = wbc->sync_mode == WB_SYNC_ALL;
+ if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+ do_sync = 1;
+ dout("writepages_start %p dosync=%d (mode=%s)\n",
+ inode, do_sync,
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+ client = ceph_inode_to_client(inode);
+ if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ pr_warning("writepage_start %p on forced umount\n", inode);
+ return -EIO; /* we're in a forced umount, don't write! */
+ }
+ if (client->mount_args->wsize && client->mount_args->wsize < wsize)
+ wsize = client->mount_args->wsize;
+ if (wsize < PAGE_CACHE_SIZE)
+ wsize = PAGE_CACHE_SIZE;
+ max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
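+ /*
+ * e.g. (illustrative, assuming 4 KB pages): a 4 MB wsize gives
+ * max_pages_ever = (4 << 20) >> 12 = 1024 pages per request.
+ */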
+
+ pagevec_init(&pvec, 0);
+
+ /* bail early if we're nonblocking and the bdi is congested */
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ dout(" writepages congested\n");
+ wbc->encountered_congestion = 1;
+ goto out_final;
+ }
+
+ /* where to start/end? */
+ if (wbc->range_cyclic) {
+ start = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ dout(" cyclic, start at %lu\n", start);
+ } else {
+ start = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ should_loop = 0;
+ dout(" not cyclic, %lu to %lu\n", start, end);
+ }
+ index = start;
+
+retry:
+ /* find oldest snap context with dirty data */
+ ceph_put_snap_context(snapc);
+ snapc = get_oldest_context(inode, &snap_size);
+ if (!snapc) {
+ /* hmm, why does writepages get called when there
+ * is no dirty data? */
+ dout(" no snap context with dirty data?\n");
+ goto out;
+ }
+ dout(" oldest snapc is %p seq %lld (%d snaps)\n",
+ snapc, snapc->seq, snapc->num_snaps);
+ if (last_snapc && snapc != last_snapc) {
+ /* if we switched to a newer snapc, restart our scan at the
+ * start of the original file range. */
+ dout(" snapc differs from last pass, restarting at %lu\n",
+ index);
+ index = start;
+ }
+ last_snapc = snapc;
+
+ while (!done && index <= end) {
+ unsigned i;
+ int first;
+ pgoff_t next;
+ int pvec_pages, locked_pages;
+ struct page *page;
+ int want;
+ u64 offset, len;
+ struct ceph_osd_request_head *reqhead;
+ struct ceph_osd_op *op;
+ long writeback_stat;
+
+ next = 0;
+ locked_pages = 0;
+ max_pages = max_pages_ever;
+
+get_more_pages:
+ first = -1;
+ want = min(end - index,
+ min((pgoff_t)PAGEVEC_SIZE,
+ max_pages - (pgoff_t)locked_pages) - 1)
+ + 1;
+ pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ want);
+ dout("pagevec_lookup_tag got %d\n", pvec_pages);
+ if (!pvec_pages && !locked_pages)
+ break;
+ for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
+ page = pvec.pages[i];
+ dout("? %p idx %lu\n", page, page->index);
+ if (locked_pages == 0)
+ lock_page(page); /* first page */
+ else if (!trylock_page(page))
+ break;
+
+ /* only dirty pages, or our accounting breaks */
+ if (unlikely(!PageDirty(page)) ||
+ unlikely(page->mapping != mapping)) {
+ dout("!dirty or !mapping %p\n", page);
+ unlock_page(page);
+ break;
+ }
+ if (!wbc->range_cyclic && page->index > end) {
+ dout("end of range %p\n", page);
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (next && (page->index != next)) {
+ dout("not consecutive %p\n", page);
+ unlock_page(page);
+ break;
+ }
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ dout("waiting on writeback %p\n", page);
+ wait_on_page_writeback(page);
+ }
+ if ((snap_size && page_offset(page) > snap_size) ||
+ (!snap_size &&
+ page_offset(page) > i_size_read(inode))) {
+ dout("%p page eof %llu\n", page, snap_size ?
+ snap_size : i_size_read(inode));
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (PageWriteback(page)) {
+ dout("%p under writeback\n", page);
+ unlock_page(page);
+ break;
+ }
+
+ /* only if matching snap context */
+ if (snapc != (void *)page->private) {
+ dout("page snapc %p != oldest %p\n",
+ (void *)page->private, snapc);
+ unlock_page(page);
+ if (!locked_pages)
+ continue; /* keep looking for snap */
+ break;
+ }
+
+ if (!clear_page_dirty_for_io(page)) {
+ dout("%p !clear_page_dirty_for_io\n", page);
+ unlock_page(page);
+ break;
+ }
+
+ /* ok */
+ if (locked_pages == 0) {
+ /* prepare async write request */
+ offset = (u64)page->index << PAGE_CACHE_SHIFT;
+ len = wsize;
+ req = ceph_osdc_new_request(&client->osdc,
+ &ci->i_layout,
+ ceph_vino(inode),
+ offset, &len,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, do_sync,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ &inode->i_mtime, true, 1);
+ max_pages = req->r_num_pages;
+
+ alloc_page_vec(client, req);
+ req->r_callback = writepages_finish;
+ req->r_inode = inode;
+ req->r_wbc = wbc;
+ }
+
+ /* note position of first page in pvec */
+ if (first < 0)
+ first = i;
+ dout("%p will write page %p idx %lu\n",
+ inode, page, page->index);
+
+ writeback_stat = atomic_long_inc_return(&client->writeback_count);
+ if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
+ set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+ }
+
+ set_page_writeback(page);
+ req->r_pages[locked_pages] = page;
+ locked_pages++;
+ next = page->index + 1;
+ }
+
+ /* did we get anything? */
+ if (!locked_pages)
+ goto release_pvec_pages;
+ if (i) {
+ int j;
+ BUG_ON(!locked_pages || first < 0);
+
+ if (pvec_pages && i == pvec_pages &&
+ locked_pages < max_pages) {
+ dout("reached end pvec, trying for more\n");
+ pagevec_reinit(&pvec);
+ goto get_more_pages;
+ }
+
+ /* shift unused pages over in the pvec... we
+ * will need to release them below. */
+ for (j = i; j < pvec_pages; j++) {
+ dout(" pvec leftover page %p\n",
+ pvec.pages[j]);
+ pvec.pages[j-i+first] = pvec.pages[j];
+ }
+ pvec.nr -= i-first;
+ }
+
+ /* submit the write */
+ offset = (u64)req->r_pages[0]->index << PAGE_CACHE_SHIFT;
+ len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+ (u64)locked_pages << PAGE_CACHE_SHIFT);
+ dout("writepages got %d pages at %llu~%llu\n",
+ locked_pages, offset, len);
+
+ /* revise final length, page count */
+ req->r_num_pages = locked_pages;
+ reqhead = req->r_request->front.iov_base;
+ op = (void *)(reqhead + 1);
+ op->extent.length = cpu_to_le64(len);
+ op->payload_len = cpu_to_le32(len);
+ req->r_request->hdr.data_len = cpu_to_le32(len);
+
+ ceph_osdc_start_request(&client->osdc, req, true);
+ req = NULL;
+
+ /* continue? */
+ index = next;
+ wbc->nr_to_write -= locked_pages;
+ if (wbc->nr_to_write <= 0)
+ done = 1;
+
+release_pvec_pages:
+ dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
+ pvec.nr ? pvec.pages[0] : NULL);
+ pagevec_release(&pvec);
+
+ if (locked_pages && !done)
+ goto retry;
+ }
+
+ if (should_loop && !done) {
+ /* more to do; loop back to beginning of file */
+ dout("writepages looping back to beginning of file\n");
+ should_loop = 0;
+ index = 0;
+ goto retry;
+ }
+
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = index;
+
+out:
+ if (req)
+ ceph_osdc_put_request(req);
+ if (rc > 0)
+ rc = 0; /* vfs expects us to return 0 */
+ ceph_put_snap_context(snapc);
+ dout("writepages done, rc = %d\n", rc);
+out_final:
+ return rc;
+}
+
+
+
+/*
+ * See if a given @snapc is either writeable, or already written.
+ */
+static int context_is_writeable_or_written(struct inode *inode,
+ struct ceph_snap_context *snapc)
+{
+ struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
+ return !oldest || snapc->seq <= oldest->seq;
+}
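+
+/*
+ * That is, @snapc may be written when it is no newer than the oldest
+ * snap context that still has dirty data (if it is strictly older, it
+ * has already been written back); dirtying pages under a newer snapc
+ * must wait for the older ones to flush first.
+ */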
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ *
+ * called with page locked.
+ * return success with page locked,
+ * or any failure (incl -EAGAIN) with page unlocked.
+ */
+static int ceph_update_writeable_page(struct file *file,
+ loff_t pos, unsigned len,
+ struct page *page)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ loff_t page_off = pos & PAGE_CACHE_MASK;
+ int pos_in_page = pos & ~PAGE_CACHE_MASK;
+ int end_in_page = pos_in_page + len;
+ loff_t i_size;
+ struct ceph_snap_context *snapc;
+ int r;
+
+retry_locked:
+ /* writepages currently holds the page lock, but wait for
+ * writeback anyway in case that ever changes */
+ wait_on_page_writeback(page);
+
+ /* check snap context */
+ BUG_ON(!ci->i_snap_realm);
+ down_read(&mdsc->snap_rwsem);
+ BUG_ON(!ci->i_snap_realm->cached_context);
+ if (page->private &&
+ (void *)page->private != ci->i_snap_realm->cached_context) {
+ /*
+ * this page is already dirty in another (older) snap
+ * context! is it writeable now?
+ */
+ snapc = get_oldest_context(inode, NULL);
+ up_read(&mdsc->snap_rwsem);
+
+ if (snapc != (void *)page->private) {
+ dout(" page %p snapc %p not current or oldest\n",
+ page, (void *)page->private);
+ /*
+ * queue for writeback, and wait for snapc to
+ * be writeable or written
+ */
+ snapc = ceph_get_snap_context((void *)page->private);
+ unlock_page(page);
+ ceph_queue_writeback(inode);
+ r = wait_event_interruptible(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ if (r == -ERESTARTSYS)
+ return r;
+ return -EAGAIN;
+ }
+
+ /* yay, writeable, do it now (without dropping page lock) */
+ dout(" page %p snapc %p not current, but oldest\n",
+ page, snapc);
+ if (!clear_page_dirty_for_io(page))
+ goto retry_locked;
+ r = writepage_nounlock(page, NULL);
+ if (r < 0)
+ goto fail_nosnap;
+ goto retry_locked;
+ }
+
+ if (PageUptodate(page)) {
+ dout(" page %p already uptodate\n", page);
+ return 0;
+ }
+
+ /* full page? */
+ if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
+ return 0;
+
+ /* past end of file? */
+ i_size = inode->i_size; /* caller holds i_mutex */
+
+ if (i_size + len > inode->i_sb->s_maxbytes) {
+ /* file is too big */
+ r = -EINVAL;
+ goto fail;
+ }
+
+ if (page_off >= i_size ||
+ (pos_in_page == 0 && (pos+len) >= i_size &&
+ end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
+ dout(" zeroing %p 0 - %d and %d - %d\n",
+ page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
+ zero_user_segments(page,
+ 0, pos_in_page,
+ end_in_page, PAGE_CACHE_SIZE);
+ return 0;
+ }
+
+ /* we need to read it. */
+ up_read(&mdsc->snap_rwsem);
+ r = readpage_nounlock(file, page);
+ if (r < 0)
+ goto fail_nosnap;
+ goto retry_locked;
+
+fail:
+ up_read(&mdsc->snap_rwsem);
+fail_nosnap:
+ unlock_page(page);
+ return r;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ */
+static int ceph_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct page *page;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ int r;
+
+ do {
+ /* get a page */
+ page = grab_cache_page_write_begin(mapping, index, 0);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ dout("write_begin file %p inode %p page %p %d~%d\n", file,
+ inode, page, (int)pos, (int)len);
+
+ r = ceph_update_writeable_page(file, pos, len, page);
+ } while (r == -EAGAIN);
+
+ return r;
+}
+
+/*
+ * we don't do anything in here that simple_write_end doesn't do
+ * except adjust dirty page accounting and drop the read lock on
+ * mdsc->snap_rwsem.
+ */
+static int ceph_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ int check_cap = 0;
+
+ dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
+ inode, page, (int)pos, (int)copied, (int)len);
+
+ /* zero the stale part of the page if we did a short copy */
+ if (copied < len)
+ zero_user_segment(page, from+copied, len);
+
+ /* did file size increase? */
+ /* (no need for i_size_read(); the caller holds i_mutex) */
+ if (pos+copied > inode->i_size)
+ check_cap = ceph_inode_set_size(inode, pos+copied);
+
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+
+ set_page_dirty(page);
+
+ unlock_page(page);
+ up_read(&mdsc->snap_rwsem);
+ page_cache_release(page);
+
+ if (check_cap)
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+
+ return copied;
+}
+
+/*
+ * we set .direct_IO to indicate direct io is supported, but since we
+ * intercept O_DIRECT reads and writes early, this function should
+ * never get called.
+ */
+static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
+ const struct iovec *iov,
+ loff_t pos, unsigned long nr_segs)
+{
+ WARN_ON(1);
+ return -EINVAL;
+}
+
+const struct address_space_operations ceph_aops = {
+ .readpage = ceph_readpage,
+ .readpages = ceph_readpages,
+ .writepage = ceph_writepage,
+ .writepages = ceph_writepages_start,
+ .write_begin = ceph_write_begin,
+ .write_end = ceph_write_end,
+ .set_page_dirty = ceph_set_page_dirty,
+ .invalidatepage = ceph_invalidatepage,
+ .releasepage = ceph_releasepage,
+ .direct_IO = ceph_direct_io,
+};
+
+
+/*
+ * vm ops
+ */
+
+/*
+ * Reuse write_begin here for simplicity.
+ */
+static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct page *page = vmf->page;
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ loff_t off = (u64)page->index << PAGE_CACHE_SHIFT;
+ loff_t size, len;
+ int ret;
+
+ size = i_size_read(inode);
+ if (off + PAGE_CACHE_SIZE <= size)
+ len = PAGE_CACHE_SIZE;
+ else
+ len = size & ~PAGE_CACHE_MASK;
+
+ dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
+ off, len, page, page->index);
+
+ lock_page(page);
+
+ ret = VM_FAULT_NOPAGE;
+ if ((off > size) ||
+ (page->mapping != inode->i_mapping))
+ goto out;
+
+ ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+ if (ret == 0) {
+ /* success. we'll keep the page locked. */
+ set_page_dirty(page);
+ up_read(&mdsc->snap_rwsem);
+ ret = VM_FAULT_LOCKED;
+ } else {
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
+ }
+out:
+ dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
+ if (ret != VM_FAULT_LOCKED)
+ unlock_page(page);
+ return ret;
+}
+
+static struct vm_operations_struct ceph_vmops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ceph_page_mkwrite,
+};
+
+int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ceph_vmops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 00000000000..67b2c030924
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
+
+#include <linux/errno.h>
+
+/*
+ * base64 encode/decode.
+ */
+
+static const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static int encode_bits(int c)
+{
+ return pem_key[c];
+}
+
+static int decode_bits(char c)
+{
+ if (c >= 'A' && c <= 'Z')
+ return c - 'A';
+ if (c >= 'a' && c <= 'z')
+ return c - 'a' + 26;
+ if (c >= '0' && c <= '9')
+ return c - '0' + 52;
+ if (c == '+')
+ return 62;
+ if (c == '/')
+ return 63;
+ if (c == '=')
+ return 0; /* just non-negative, please */
+ return -EINVAL;
+}
+
+int ceph_armor(char *dst, const char *src, const char *end)
+{
+ int olen = 0;
+ int line = 0;
+
+ while (src < end) {
+ unsigned char a, b, c;
+
+ a = *src++;
+ *dst++ = encode_bits(a >> 2);
+ if (src < end) {
+ b = *src++;
+ *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
+ if (src < end) {
+ c = *src++;
+ *dst++ = encode_bits(((b & 15) << 2) |
+ (c >> 6));
+ *dst++ = encode_bits(c & 63);
+ } else {
+ *dst++ = encode_bits((b & 15) << 2);
+ *dst++ = '=';
+ }
+ } else {
+ *dst++ = encode_bits(((a & 3) << 4));
+ *dst++ = '=';
+ *dst++ = '=';
+ }
+ olen += 4;
+ line += 4;
+ if (line == 64) {
+ line = 0;
+ *(dst++) = '\n';
+ olen++;
+ }
+ }
+ return olen;
+}
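+
+/*
+ * Worked example (standard base64, which the table above implements):
+ * the three bytes "Cat" encode to "Q2F0"; a two-byte tail gets one
+ * '=' pad and a one-byte tail gets two, so "Ca" -> "Q2E=" and
+ * "C" -> "Qw==".  ceph_armor() additionally inserts a '\n' after
+ * every 64 output characters, which ceph_unarmor() skips.
+ */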
+
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+ int olen = 0;
+
+ while (src < end) {
+ int a, b, c, d;
+
+ if (src < end && src[0] == '\n')
+ src++;
+ if (src + 4 > end)
+ return -EINVAL;
+ a = decode_bits(src[0]);
+ b = decode_bits(src[1]);
+ c = decode_bits(src[2]);
+ d = decode_bits(src[3]);
+ if (a < 0 || b < 0 || c < 0 || d < 0)
+ return -EINVAL;
+
+ *dst++ = (a << 2) | (b >> 4);
+ if (src[2] == '=')
+ return olen + 1;
+ *dst++ = ((b & 15) << 4) | (c >> 2);
+ if (src[3] == '=')
+ return olen + 2;
+ *dst++ = ((c & 3) << 6) | d;
+ olen += 3;
+ src += 4;
+ }
+ return olen;
+}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 00000000000..abb204fea6c
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,257 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/err.h>
+
+#include "types.h"
+#include "auth_none.h"
+#include "auth_x.h"
+#include "decode.h"
+#include "super.h"
+
+#include "messenger.h"
+
+/*
+ * get protocol handler
+ */
+static u32 supported_protocols[] = {
+ CEPH_AUTH_NONE,
+ CEPH_AUTH_CEPHX
+};
+
+int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+{
+ switch (protocol) {
+ case CEPH_AUTH_NONE:
+ return ceph_auth_none_init(ac);
+ case CEPH_AUTH_CEPHX:
+ return ceph_x_init(ac);
+ default:
+ return -ENOENT;
+ }
+}
+
+/*
+ * setup, teardown.
+ */
+struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
+{
+ struct ceph_auth_client *ac;
+ int ret;
+
+ dout("auth_init name '%s' secret '%s'\n", name, secret);
+
+ ret = -ENOMEM;
+ ac = kzalloc(sizeof(*ac), GFP_NOFS);
+ if (!ac)
+ goto out;
+
+ ac->negotiating = true;
+ if (name)
+ ac->name = name;
+ else
+ ac->name = CEPH_AUTH_NAME_DEFAULT;
+ dout("auth_init name %s secret %s\n", ac->name, secret);
+ ac->secret = secret;
+ return ac;
+
+out:
+ return ERR_PTR(ret);
+}
+
+void ceph_auth_destroy(struct ceph_auth_client *ac)
+{
+ dout("auth_destroy %p\n", ac);
+ if (ac->ops)
+ ac->ops->destroy(ac);
+ kfree(ac);
+}
+
+/*
+ * Reset occurs when reconnecting to the monitor.
+ */
+void ceph_auth_reset(struct ceph_auth_client *ac)
+{
+ dout("auth_reset %p\n", ac);
+ if (ac->ops && !ac->negotiating)
+ ac->ops->reset(ac);
+ ac->negotiating = true;
+}
+
+int ceph_entity_name_encode(const char *name, void **p, void *end)
+{
+ int len = strlen(name);
+
+ if (*p + 2*sizeof(u32) + len > end)
+ return -ERANGE;
+ ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
+ ceph_encode_32(p, len);
+ ceph_encode_copy(p, name, len);
+ return 0;
+}
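+
+/*
+ * Wire format of the encoded name: __le32 entity type (always
+ * CEPH_ENTITY_TYPE_CLIENT here), __le32 name length, then the name
+ * bytes with no NUL terminator.
+ */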
+
+/*
+ * Initiate protocol negotiation with the monitor. Include our entity
+ * name and the list of protocols we support.
+ */
+int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
+{
+ struct ceph_mon_request_header *monhdr = buf;
+ void *p = monhdr + 1, *end = buf + len, *lenp;
+ int i, num;
+ int ret;
+
+ dout("auth_build_hello\n");
+ monhdr->have_version = 0;
+ monhdr->session_mon = cpu_to_le16(-1);
+ monhdr->session_mon_tid = 0;
+
+ ceph_encode_32(&p, 0); /* no protocol, yet */
+
+ lenp = p;
+ p += sizeof(u32);
+
+ ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+ ceph_encode_8(&p, 1);
+ num = ARRAY_SIZE(supported_protocols);
+ ceph_encode_32(&p, num);
+ ceph_decode_need(&p, end, num * sizeof(u32), bad);
+ for (i = 0; i < num; i++)
+ ceph_encode_32(&p, supported_protocols[i]);
+
+ ret = ceph_entity_name_encode(ac->name, &p, end);
+ if (ret < 0)
+ return ret;
+ ceph_decode_need(&p, end, sizeof(u64), bad);
+ ceph_encode_64(&p, ac->global_id);
+
+ ceph_encode_32(&lenp, p - lenp - sizeof(u32));
+ return p - buf;
+
+bad:
+ return -ERANGE;
+}
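+
+/*
+ * After the mon request header, the hello built above is: __le32 0
+ * (no protocol chosen yet), then a __le32 length-prefixed blob
+ * holding a one-byte version, the supported protocol list, our
+ * entity name, and our global_id (0 until the monitor assigns one).
+ */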
+
+int ceph_build_auth_request(struct ceph_auth_client *ac,
+ void *msg_buf, size_t msg_len)
+{
+ struct ceph_mon_request_header *monhdr = msg_buf;
+ void *p = monhdr + 1;
+ void *end = msg_buf + msg_len;
+ int ret;
+
+ monhdr->have_version = 0;
+ monhdr->session_mon = cpu_to_le16(-1);
+ monhdr->session_mon_tid = 0;
+
+ ceph_encode_32(&p, ac->protocol);
+
+ ret = ac->ops->build_request(ac, p + sizeof(u32), end);
+ if (ret < 0) {
+ pr_err("error %d building request\n", ret);
+ return ret;
+ }
+ dout(" built request %d bytes\n", ret);
+ ceph_encode_32(&p, ret);
+ return p + ret - msg_buf;
+}
+
+/*
+ * Handle auth message from monitor.
+ */
+int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+ void *buf, size_t len,
+ void *reply_buf, size_t reply_len)
+{
+ void *p = buf;
+ void *end = buf + len;
+ int protocol;
+ s32 result;
+ u64 global_id;
+ void *payload, *payload_end;
+ int payload_len;
+ char *result_msg;
+ int result_msg_len;
+ int ret = -EINVAL;
+
+ dout("handle_auth_reply %p %p\n", p, end);
+ ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
+ protocol = ceph_decode_32(&p);
+ result = ceph_decode_32(&p);
+ global_id = ceph_decode_64(&p);
+ payload_len = ceph_decode_32(&p);
+ payload = p;
+ p += payload_len;
+ ceph_decode_need(&p, end, sizeof(u32), bad);
+ result_msg_len = ceph_decode_32(&p);
+ result_msg = p;
+ p += result_msg_len;
+ if (p != end)
+ goto bad;
+
+ dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
+ result_msg, global_id, payload_len);
+
+ payload_end = payload + payload_len;
+
+ if (global_id && ac->global_id != global_id) {
+ dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
+ ac->global_id = global_id;
+ }
+
+ if (ac->negotiating) {
+ /* server does not support our protocols? */
+ if (!protocol && result < 0) {
+ ret = result;
+ goto out;
+ }
+ /* set up (new) protocol handler? */
+ if (ac->protocol && ac->protocol != protocol) {
+ ac->ops->destroy(ac);
+ ac->protocol = 0;
+ ac->ops = NULL;
+ }
+ if (ac->protocol != protocol) {
+ ret = ceph_auth_init_protocol(ac, protocol);
+ if (ret) {
+ pr_err("error %d on auth protocol %d init\n",
+ ret, protocol);
+ goto out;
+ }
+ }
+
+ ac->negotiating = false;
+ }
+
+ ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+ if (ret == -EAGAIN) {
+ return ceph_build_auth_request(ac, reply_buf, reply_len);
+ } else if (ret) {
+ pr_err("authentication error %d\n", ret);
+ return ret;
+ }
+ return 0;
+
+bad:
+ pr_err("failed to decode auth msg\n");
+out:
+ return ret;
+}
+
+int ceph_build_auth(struct ceph_auth_client *ac,
+ void *msg_buf, size_t msg_len)
+{
+ if (!ac->protocol)
+ return ceph_auth_build_hello(ac, msg_buf, msg_len);
+ BUG_ON(!ac->ops);
+ if (!ac->ops->is_authenticated(ac))
+ return ceph_build_auth_request(ac, msg_buf, msg_len);
+ return 0;
+}
+
+int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
+{
+ if (!ac->ops)
+ return 0;
+ return ac->ops->is_authenticated(ac);
+}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 00000000000..ca4f57cfb26
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
+#ifndef _FS_CEPH_AUTH_H
+#define _FS_CEPH_AUTH_H
+
+#include "types.h"
+#include "buffer.h"
+
+/*
+ * Abstract interface for communicating with the authentication module.
+ * There is some handshake that takes place between us and the monitor
+ * to acquire the necessary keys. These are used to generate an
+ * 'authorizer' that we use when connecting to a service (mds, osd).
+ */
+
+struct ceph_auth_client;
+struct ceph_authorizer;
+
+struct ceph_auth_client_ops {
+ /*
+ * true if we are authenticated and can connect to
+ * services.
+ */
+ int (*is_authenticated)(struct ceph_auth_client *ac);
+
+ /*
+ * build requests and process replies during monitor
+ * handshake. if handle_reply returns -EAGAIN, we build
+ * another request.
+ */
+ int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
+ int (*handle_reply)(struct ceph_auth_client *ac, int result,
+ void *buf, void *end);
+
+ /*
+ * Create authorizer for connecting to a service, and verify
+ * the response to authenticate the service.
+ */
+ int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
+ struct ceph_authorizer **a,
+ void **buf, size_t *len,
+ void **reply_buf, size_t *reply_len);
+ int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a, size_t len);
+ void (*destroy_authorizer)(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a);
+ void (*invalidate_authorizer)(struct ceph_auth_client *ac,
+ int peer_type);
+
+ /* reset when we (re)connect to a monitor */
+ void (*reset)(struct ceph_auth_client *ac);
+
+ void (*destroy)(struct ceph_auth_client *ac);
+};
+
+struct ceph_auth_client {
+ u32 protocol; /* CEPH_AUTH_* */
+ void *private; /* for use by protocol implementation */
+ const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
+
+ bool negotiating; /* true if negotiating protocol */
+ const char *name; /* entity name */
+ u64 global_id; /* our unique id in system */
+ const char *secret; /* our secret key */
+ unsigned want_keys; /* which services we want */
+};
+
+extern struct ceph_auth_client *ceph_auth_init(const char *name,
+ const char *secret);
+extern void ceph_auth_destroy(struct ceph_auth_client *ac);
+
+extern void ceph_auth_reset(struct ceph_auth_client *ac);
+
+extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
+ void *buf, size_t len);
+extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
+ void *buf, size_t len,
+ void *reply_buf, size_t reply_len);
+extern int ceph_entity_name_encode(const char *name, void **p, void *end);
+
+extern int ceph_build_auth(struct ceph_auth_client *ac,
+ void *msg_buf, size_t msg_len);
+
+extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
+
+#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 00000000000..b4ef6f0a6c8
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,121 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include "auth_none.h"
+#include "auth.h"
+#include "decode.h"
+
+static void reset(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ xi->starting = true;
+ xi->built_authorizer = false;
+}
+
+static void destroy(struct ceph_auth_client *ac)
+{
+ kfree(ac->private);
+ ac->private = NULL;
+}
+
+static int is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ return !xi->starting;
+}
+
+/*
+ * the generic auth code decodes the global_id, and we carry no actual
+ * authentication state, so nothing else happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+ void *buf, void *end)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ xi->starting = false;
+ return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id. we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ */
+static int ceph_auth_none_create_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_authorizer **a,
+ void **buf, size_t *len,
+ void **reply_buf, size_t *reply_len)
+{
+ struct ceph_auth_none_info *ai = ac->private;
+ struct ceph_none_authorizer *au = &ai->au;
+ void *p, *end;
+ int ret;
+
+ if (!ai->built_authorizer) {
+ p = au->buf;
+ end = p + sizeof(au->buf);
+ ceph_encode_8(&p, 1);
+ ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+ if (ret < 0)
+ goto bad;
+ ceph_decode_need(&p, end, sizeof(u64), bad2);
+ ceph_encode_64(&p, ac->global_id);
+ au->buf_len = p - (void *)au->buf;
+ ai->built_authorizer = true;
+ dout("built authorizer len %d\n", au->buf_len);
+ }
+
+ *a = (struct ceph_authorizer *)au;
+ *buf = au->buf;
+ *len = au->buf_len;
+ *reply_buf = au->reply_buf;
+ *reply_len = sizeof(au->reply_buf);
+ return 0;
+
+bad2:
+ ret = -ERANGE;
+bad:
+ return ret;
+}
+
+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a)
+{
+ /* nothing to do */
+}
+
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+ .reset = reset,
+ .destroy = destroy,
+ .is_authenticated = is_authenticated,
+ .handle_reply = handle_reply,
+ .create_authorizer = ceph_auth_none_create_authorizer,
+ .destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi;
+
+ dout("ceph_auth_none_init %p\n", ac);
+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)
+ return -ENOMEM;
+
+ xi->starting = true;
+ xi->built_authorizer = false;
+
+ ac->protocol = CEPH_AUTH_NONE;
+ ac->private = xi;
+ ac->ops = &ceph_auth_none_ops;
+ return 0;
+}
+
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 00000000000..56c05533a31
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include "auth.h"
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+ char buf[128];
+ int buf_len;
+ char reply_buf[0];
+};
+
+struct ceph_auth_none_info {
+ bool starting;
+ bool built_authorizer;
+ struct ceph_none_authorizer au; /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 00000000000..8d8a8496476
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,679 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+#include "crypto.h"
+#include "auth.h"
+#include "decode.h"
+
+struct kmem_cache *ceph_x_ticketbuf_cachep;
+
+#define TEMP_TICKET_BUF_LEN 256
+
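+/*
+ * Rough cephx flow, as implemented below: the monitor first sends a
+ * server challenge; we respond with CEPHX_GET_AUTH_SESSION_KEY to
+ * obtain an AUTH ticket, then use that ticket's session key with
+ * CEPHX_GET_PRINCIPAL_SESSION_KEY to fetch tickets for the other
+ * services we want (see ceph_x_build_request/ceph_x_handle_reply).
+ */
+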
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+
+ ceph_x_validate_tickets(ac, &need);
+ dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+ ac->want_keys, need, xi->have_keys);
+ return (ac->want_keys & xi->have_keys) == ac->want_keys;
+}
+
+static int ceph_x_encrypt_buflen(int ilen)
+{
+ return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+ sizeof(u32);
+}
+
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+ void *ibuf, int ilen, void *obuf, size_t olen)
+{
+ struct ceph_x_encrypt_header head = {
+ .struct_v = 1,
+ .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+ };
+ size_t len = olen - sizeof(u32);
+ int ret;
+
+ ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+ &head, sizeof(head), ibuf, ilen);
+ if (ret)
+ return ret;
+ ceph_encode_32(&obuf, len);
+ return len + sizeof(u32);
+}
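+
+/*
+ * On-wire format produced above: a __le32 ciphertext length followed
+ * by the ciphertext of { struct ceph_x_encrypt_header, payload }.
+ * The extra 16 bytes in ceph_x_encrypt_buflen() leave headroom for
+ * the cipher's block padding.
+ */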
+
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+ void **p, void *end, void *obuf, size_t olen)
+{
+ struct ceph_x_encrypt_header head;
+ size_t head_len = sizeof(head);
+ int len, ret;
+
+ len = ceph_decode_32(p);
+ if (*p + len > end)
+ return -EINVAL;
+
+ dout("ceph_x_decrypt len %d\n", len);
+ ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
+ *p, len);
+ if (ret)
+ return ret;
+ if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+ return -EPERM;
+ *p += len;
+ return olen;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ */
+struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
+ int service)
+{
+ struct ceph_x_ticket_handler *th;
+ struct ceph_x_info *xi = ac->private;
+ struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
+
+ while (*p) {
+ parent = *p;
+ th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+ if (service < th->service)
+ p = &(*p)->rb_left;
+ else if (service > th->service)
+ p = &(*p)->rb_right;
+ else
+ return th;
+ }
+
+ /* add it */
+ th = kzalloc(sizeof(*th), GFP_NOFS);
+ if (!th)
+ return ERR_PTR(-ENOMEM);
+ th->service = service;
+ rb_link_node(&th->node, parent, p);
+ rb_insert_color(&th->node, &xi->ticket_handlers);
+ return th;
+}
+
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+ struct ceph_x_ticket_handler *th)
+{
+ struct ceph_x_info *xi = ac->private;
+
+ dout("remove_ticket_handler %p %d\n", th, th->service);
+ rb_erase(&th->node, &xi->ticket_handlers);
+ ceph_crypto_key_destroy(&th->session_key);
+ if (th->ticket_blob)
+ ceph_buffer_put(th->ticket_blob);
+ kfree(th);
+}
+
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+ struct ceph_crypto_key *secret,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ int num;
+ void *p = buf;
+ int ret;
+ char *dbuf;
+ char *ticket_buf;
+ u8 struct_v;
+
+ dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
+ if (!dbuf)
+ return -ENOMEM;
+
+ ret = -ENOMEM;
+ ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
+ GFP_NOFS | GFP_ATOMIC);
+ if (!ticket_buf)
+ goto out_dbuf;
+
+ ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+ struct_v = ceph_decode_8(&p);
+ if (struct_v != 1)
+ goto bad;
+ num = ceph_decode_32(&p);
+ dout("%d tickets\n", num);
+ while (num--) {
+ int type;
+ u8 struct_v;
+ struct ceph_x_ticket_handler *th;
+ void *dp, *dend;
+ int dlen;
+ char is_enc;
+ struct timespec validity;
+ struct ceph_crypto_key old_key;
+ void *tp, *tpend;
+ struct ceph_timespec new_validity;
+ struct ceph_crypto_key new_session_key;
+ struct ceph_buffer *new_ticket_blob;
+ unsigned long new_expires, new_renew_after;
+ u64 new_secret_id;
+
+ ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
+
+ type = ceph_decode_32(&p);
+ dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+ struct_v = ceph_decode_8(&p);
+ if (struct_v != 1)
+ goto bad;
+
+ th = get_ticket_handler(ac, type);
+ if (IS_ERR(th)) {
+ ret = PTR_ERR(th);
+ goto out;
+ }
+
+ /* blob for me */
+ dlen = ceph_x_decrypt(secret, &p, end, dbuf,
+ TEMP_TICKET_BUF_LEN);
+ if (dlen <= 0) {
+ ret = dlen;
+ goto out;
+ }
+ dout(" decrypted %d bytes\n", dlen);
+ dend = dbuf + dlen;
+ dp = dbuf;
+
+ struct_v = ceph_decode_8(&dp);
+ if (struct_v != 1)
+ goto bad;
+
+ memcpy(&old_key, &th->session_key, sizeof(old_key));
+ ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
+ if (ret)
+ goto out;
+
+ ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+ ceph_decode_timespec(&validity, &new_validity);
+ new_expires = get_seconds() + validity.tv_sec;
+ new_renew_after = new_expires - (validity.tv_sec / 4);
+ dout(" expires=%lu renew_after=%lu\n", new_expires,
+ new_renew_after);
+
+ /* ticket blob for service */
+ ceph_decode_8_safe(&p, end, is_enc, bad);
+ tp = ticket_buf;
+ if (is_enc) {
+ /* encrypted */
+ dout(" encrypted ticket\n");
+ dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
+ TEMP_TICKET_BUF_LEN);
+ if (dlen < 0) {
+ ret = dlen;
+ goto out;
+ }
+ dlen = ceph_decode_32(&tp);
+ } else {
+ /* unencrypted */
+ ceph_decode_32_safe(&p, end, dlen, bad);
+ ceph_decode_need(&p, end, dlen, bad);
+ ceph_decode_copy(&p, ticket_buf, dlen);
+ }
+ tpend = tp + dlen;
+ dout(" ticket blob is %d bytes\n", dlen);
+ ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
+ struct_v = ceph_decode_8(&tp);
+ new_secret_id = ceph_decode_64(&tp);
+ ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
+ if (ret)
+ goto out;
+
+ /* all is well, update our ticket */
+ ceph_crypto_key_destroy(&th->session_key);
+ if (th->ticket_blob)
+ ceph_buffer_put(th->ticket_blob);
+ th->session_key = new_session_key;
+ th->ticket_blob = new_ticket_blob;
+ th->validity = new_validity;
+ th->secret_id = new_secret_id;
+ th->expires = new_expires;
+ th->renew_after = new_renew_after;
+ dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+ type, ceph_entity_type_name(type), th->secret_id,
+ (int)th->ticket_blob->vec.iov_len);
+ xi->have_keys |= th->service;
+ }
+
+ ret = 0;
+out:
+ kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
+out_dbuf:
+ kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
+ return ret;
+
+bad:
+ ret = -EINVAL;
+ goto out;
+}
+
+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+ struct ceph_x_ticket_handler *th,
+ struct ceph_x_authorizer *au)
+{
+ int maxlen;
+ struct ceph_x_authorize_a *msg_a;
+ struct ceph_x_authorize_b msg_b;
+ void *p, *end;
+ int ret;
+ int ticket_blob_len =
+ (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+ dout("build_authorizer for %s %p\n",
+ ceph_entity_type_name(th->service), au);
+
+ maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+ ceph_x_encrypt_buflen(ticket_blob_len);
+ dout(" need len %d\n", maxlen);
+ if (au->buf && au->buf->alloc_len < maxlen) {
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ }
+ if (!au->buf) {
+ au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
+ if (!au->buf)
+ return -ENOMEM;
+ }
+ au->service = th->service;
+
+ msg_a = au->buf->vec.iov_base;
+ msg_a->struct_v = 1;
+ msg_a->global_id = cpu_to_le64(ac->global_id);
+ msg_a->service_id = cpu_to_le32(th->service);
+ msg_a->ticket_blob.struct_v = 1;
+ msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+ msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+ if (ticket_blob_len) {
+ memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+ th->ticket_blob->vec.iov_len);
+ }
+ dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+ le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+ p = msg_a + 1;
+ p += ticket_blob_len;
+ end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+ get_random_bytes(&au->nonce, sizeof(au->nonce));
+ msg_b.struct_v = 1;
+ msg_b.nonce = cpu_to_le64(au->nonce);
+ ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
+ p, end - p);
+ if (ret < 0)
+ goto out_buf;
+ p += ret;
+ au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+ dout(" built authorizer nonce %llx len %d\n", au->nonce,
+ (int)au->buf->vec.iov_len);
+ BUG_ON(au->buf->vec.iov_len > maxlen);
+ return 0;
+
+out_buf:
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ return ret;
+}
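+
+/*
+ * Authorizer wire layout built above: struct ceph_x_authorize_a
+ * (global_id, service_id, ticket blob) followed by an encrypted
+ * struct ceph_x_authorize_b carrying a random nonce; the service
+ * proves itself by replying with nonce + 1 (checked in
+ * ceph_x_verify_authorizer_reply()).
+ */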
+
+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+ void **p, void *end)
+{
+ ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+ ceph_encode_8(p, 1);
+ ceph_encode_64(p, th->secret_id);
+ if (th->ticket_blob) {
+ const char *buf = th->ticket_blob->vec.iov_base;
+ u32 len = th->ticket_blob->vec.iov_len;
+
+ ceph_encode_32_safe(p, end, len, bad);
+ ceph_encode_copy_safe(p, end, buf, len, bad);
+ } else {
+ ceph_encode_32_safe(p, end, 0, bad);
+ }
+
+ return 0;
+bad:
+ return -ERANGE;
+}
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+ int want = ac->want_keys;
+ struct ceph_x_info *xi = ac->private;
+ int service;
+
+ *pneed = ac->want_keys & ~(xi->have_keys);
+
+ for (service = 1; service <= want; service <<= 1) {
+ struct ceph_x_ticket_handler *th;
+
+ if (!(ac->want_keys & service))
+ continue;
+
+ if (*pneed & service)
+ continue;
+
+ th = get_ticket_handler(ac, service);
+
+ if (!th) {
+ *pneed |= service;
+ continue;
+ }
+
+ if (get_seconds() >= th->renew_after)
+ *pneed |= service;
+ if (get_seconds() >= th->expires)
+ xi->have_keys &= ~service;
+ }
+}
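+
+/*
+ * Note that want_keys/have_keys are CEPH_ENTITY_TYPE_* bitmasks, so
+ * the loop above walks one service bit at a time (1, 2, 4, ...).
+ */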
+
+
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+ struct ceph_x_request_header *head = buf;
+ int ret;
+ struct ceph_x_ticket_handler *th =
+ get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+ ceph_x_validate_tickets(ac, &need);
+
+ dout("build_request want %x have %x need %x\n",
+ ac->want_keys, xi->have_keys, need);
+
+ if (need & CEPH_ENTITY_TYPE_AUTH) {
+ struct ceph_x_authenticate *auth = (void *)(head + 1);
+ void *p = auth + 1;
+ struct ceph_x_challenge_blob tmp;
+ char tmp_enc[40];
+ u64 *u;
+
+ if (p > end)
+ return -ERANGE;
+
+ dout(" get_auth_session_key\n");
+ head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+ /* encrypt and hash */
+ get_random_bytes(&auth->client_challenge, sizeof(u64));
+ tmp.client_challenge = auth->client_challenge;
+ tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+ ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+ tmp_enc, sizeof(tmp_enc));
+ if (ret < 0)
+ return ret;
+
+ auth->struct_v = 1;
+ auth->key = 0;
+ for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+ auth->key ^= *u;
+ dout(" server_challenge %llx client_challenge %llx key %llx\n",
+ xi->server_challenge, le64_to_cpu(auth->client_challenge),
+ le64_to_cpu(auth->key));
+
+ /* now encode the old ticket, if one exists */
+ ret = ceph_x_encode_ticket(th, &p, end);
+ if (ret < 0)
+ return ret;
+
+ return p - buf;
+ }
+
+ if (need) {
+ void *p = head + 1;
+ struct ceph_x_service_ticket_request *req;
+
+ if (p > end)
+ return -ERANGE;
+ head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+ BUG_ON(!th);
+ ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+ if (ret)
+ return ret;
+ ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+ xi->auth_authorizer.buf->vec.iov_len);
+
+ req = p;
+ req->keys = cpu_to_le32(need);
+ p += sizeof(*req);
+ return p - buf;
+ }
+
+ return 0;
+}
+
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct ceph_x_reply_header *head = buf;
+ struct ceph_x_ticket_handler *th;
+ int len = end - buf;
+ int op;
+ int ret;
+
+ if (result)
+ return result; /* XXX hmm? */
+
+ if (xi->starting) {
+ /* it's a hello */
+ struct ceph_x_server_challenge *sc = buf;
+
+ if (len != sizeof(*sc))
+ return -EINVAL;
+ xi->server_challenge = le64_to_cpu(sc->server_challenge);
+ dout("handle_reply got server challenge %llx\n",
+ xi->server_challenge);
+ xi->starting = false;
+ xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+ return -EAGAIN;
+ }
+
+ op = le32_to_cpu(head->op);
+ result = le32_to_cpu(head->result);
+ dout("handle_reply op %d result %d\n", op, result);
+ switch (op) {
+ case CEPHX_GET_AUTH_SESSION_KEY:
+ /* verify auth key */
+ ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+ buf + sizeof(*head), end);
+ break;
+
+ case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+ th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+ BUG_ON(!th);
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+ buf + sizeof(*head), end);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (ret)
+ return ret;
+ if (ac->want_keys == xi->have_keys)
+ return 0;
+ return -EAGAIN;
+}
+
+static int ceph_x_create_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_authorizer **a,
+ void **buf, size_t *len,
+ void **reply_buf, size_t *reply_len)
+{
+ struct ceph_x_authorizer *au;
+ struct ceph_x_ticket_handler *th;
+ int ret;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ au = kzalloc(sizeof(*au), GFP_NOFS);
+ if (!au)
+ return -ENOMEM;
+
+ ret = ceph_x_build_authorizer(ac, th, au);
+ if (ret) {
+ kfree(au);
+ return ret;
+ }
+
+ *a = (struct ceph_authorizer *)au;
+ *buf = au->buf->vec.iov_base;
+ *len = au->buf->vec.iov_len;
+ *reply_buf = au->reply_buf;
+ *reply_len = sizeof(au->reply_buf);
+ return 0;
+}
+
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a, size_t len)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+ struct ceph_x_ticket_handler *th;
+ int ret = 0;
+ struct ceph_x_authorize_reply reply;
+ void *p = au->reply_buf;
+ void *end = p + sizeof(au->reply_buf);
+
+ th = get_ticket_handler(ac, au->service);
+ if (!th)
+ return -EIO; /* hrm! */
+ ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
+ if (ret < 0)
+ return ret;
+ if (ret != sizeof(reply))
+ return -EPERM;
+
+ if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+ ret = -EPERM;
+ else
+ ret = 0;
+ dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+ au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
+ return ret;
+}
+
+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+
+ ceph_buffer_put(au->buf);
+ kfree(au);
+}
+
+
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+
+ dout("reset\n");
+ xi->starting = true;
+ xi->server_challenge = 0;
+}
+
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct rb_node *p;
+
+ dout("ceph_x_destroy %p\n", ac);
+ ceph_crypto_key_destroy(&xi->secret);
+
+ while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
+ struct ceph_x_ticket_handler *th =
+ rb_entry(p, struct ceph_x_ticket_handler, node);
+ remove_ticket_handler(ac, th);
+ }
+
+ kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+
+ kfree(ac->private);
+ ac->private = NULL;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+ int peer_type)
+{
+ struct ceph_x_ticket_handler *th;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (th && !IS_ERR(th))
+ remove_ticket_handler(ac, th);
+}
+
+
+static const struct ceph_auth_client_ops ceph_x_ops = {
+ .is_authenticated = ceph_x_is_authenticated,
+ .build_request = ceph_x_build_request,
+ .handle_reply = ceph_x_handle_reply,
+ .create_authorizer = ceph_x_create_authorizer,
+ .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+ .destroy_authorizer = ceph_x_destroy_authorizer,
+ .invalidate_authorizer = ceph_x_invalidate_authorizer,
+ .reset = ceph_x_reset,
+ .destroy = ceph_x_destroy,
+};
+
+
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi;
+ int ret;
+
+ dout("ceph_x_init %p\n", ac);
+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)
+ return -ENOMEM;
+
+ ret = -ENOMEM;
+ ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
+ TEMP_TICKET_BUF_LEN, 8,
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ NULL);
+ if (!ceph_x_ticketbuf_cachep)
+ goto done_nomem;
+ ret = -EINVAL;
+ if (!ac->secret) {
+ pr_err("no secret set (for auth_x protocol)\n");
+ goto done_nomem;
+ }
+
+ ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
+ if (ret)
+ goto done_nomem;
+
+ xi->starting = true;
+ xi->ticket_handlers = RB_ROOT;
+
+ ac->protocol = CEPH_AUTH_CEPHX;
+ ac->private = xi;
+ ac->ops = &ceph_x_ops;
+ return 0;
+
+done_nomem:
+ kfree(xi);
+ if (ceph_x_ticketbuf_cachep)
+ kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+ return ret;
+}
+
+
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 00000000000..ff6f8180e68
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include "crypto.h"
+#include "auth.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+ struct rb_node node;
+ unsigned service;
+
+ struct ceph_crypto_key session_key;
+ struct ceph_timespec validity;
+
+ u64 secret_id;
+ struct ceph_buffer *ticket_blob;
+
+ unsigned long renew_after, expires;
+};
+
+
+struct ceph_x_authorizer {
+ struct ceph_buffer *buf;
+ unsigned service;
+ u64 nonce;
+ char reply_buf[128]; /* big enough for encrypted blob */
+};
+
+struct ceph_x_info {
+ struct ceph_crypto_key secret;
+
+ bool starting;
+ u64 server_challenge;
+
+ unsigned have_keys;
+ struct rb_root ticket_handlers;
+
+ struct ceph_x_authorizer auth_authorizer;
+};
+
+extern int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 00000000000..671d30576c4
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY 0x0400
+
+/* common bits */
+struct ceph_x_ticket_blob {
+ __u8 struct_v;
+ __le64 secret_id;
+ __le32 blob_len;
+ char blob[];
+} __attribute__ ((packed));
+
+
+/* common request/reply headers */
+struct ceph_x_request_header {
+ __le16 op;
+} __attribute__ ((packed));
+
+struct ceph_x_reply_header {
+ __le16 op;
+ __le32 result;
+} __attribute__ ((packed));
+
+
+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+struct ceph_x_server_challenge {
+ __u8 struct_v;
+ __le64 server_challenge;
+} __attribute__ ((packed));
+
+struct ceph_x_authenticate {
+ __u8 struct_v;
+ __le64 client_challenge;
+ __le64 key;
+ /* ticket blob */
+} __attribute__ ((packed));
+
+struct ceph_x_service_ticket_request {
+ __u8 struct_v;
+ __le32 keys;
+} __attribute__ ((packed));
+
+struct ceph_x_challenge_blob {
+ __le64 server_challenge;
+ __le64 client_challenge;
+} __attribute__ ((packed));
+
+
+
+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ * a - service id, ticket blob
+ * b - encrypted with session key
+ */
+struct ceph_x_authorize_a {
+ __u8 struct_v;
+ __le64 global_id;
+ __le32 service_id;
+ struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_b {
+ __u8 struct_v;
+ __le64 nonce;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_reply {
+ __u8 struct_v;
+ __le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encryption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+struct ceph_x_encrypt_header {
+ __u8 struct_v;
+ __le64 magic;
+} __attribute__ ((packed));
+
+#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 00000000000..b98086c7aeb
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,78 @@
+
+#include "ceph_debug.h"
+#include "buffer.h"
+#include "decode.h"
+
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+ struct ceph_buffer *b;
+
+ b = kmalloc(sizeof(*b), gfp);
+ if (!b)
+ return NULL;
+
+ b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+ if (b->vec.iov_base) {
+ b->is_vmalloc = false;
+ } else {
+ b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+ if (!b->vec.iov_base) {
+ kfree(b);
+ return NULL;
+ }
+ b->is_vmalloc = true;
+ }
+
+ kref_init(&b->kref);
+ b->alloc_len = len;
+ b->vec.iov_len = len;
+ dout("buffer_new %p\n", b);
+ return b;
+}
+
+void ceph_buffer_release(struct kref *kref)
+{
+ struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+
+ dout("buffer_release %p\n", b);
+ if (b->vec.iov_base) {
+ if (b->is_vmalloc)
+ vfree(b->vec.iov_base);
+ else
+ kfree(b->vec.iov_base);
+ }
+ kfree(b);
+}
+
+int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
+{
+ b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+ if (b->vec.iov_base) {
+ b->is_vmalloc = false;
+ } else {
+ b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+ b->is_vmalloc = true;
+ }
+ if (!b->vec.iov_base)
+ return -ENOMEM;
+ b->alloc_len = len;
+ b->vec.iov_len = len;
+ return 0;
+}
+
+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+ size_t len;
+
+ ceph_decode_need(p, end, sizeof(u32), bad);
+ len = ceph_decode_32(p);
+ dout("decode_buffer len %d\n", (int)len);
+ ceph_decode_need(p, end, len, bad);
+ *b = ceph_buffer_new(len, GFP_NOFS);
+ if (!*b)
+ return -ENOMEM;
+ ceph_decode_copy(p, (*b)->vec.iov_base, len);
+ return 0;
+bad:
+ return -EINVAL;
+}
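+
+/*
+ * Decode sketch (illustrative; 'payload' and 'payload_len' are
+ * hypothetical names for an incoming message fragment):
+ *
+ *	void *p = payload, *end = payload + payload_len;
+ *	struct ceph_buffer *blob;
+ *	int err = ceph_decode_buffer(&blob, &p, end);
+ *
+ *	if (err < 0)
+ *		return err;	/* -EINVAL if truncated, -ENOMEM if OOM */
+ *	...
+ *	ceph_buffer_put(blob);
+ */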
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 00000000000..58d19014068
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
+#ifndef __FS_CEPH_BUFFER_H
+#define __FS_CEPH_BUFFER_H
+
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+
+/*
+ * a simple reference counted buffer.
+ *
+ * allocation tries kmalloc first and falls back to vmalloc when that
+ * fails (typically for sizes beyond a page or so).
+ */
+struct ceph_buffer {
+ struct kref kref;
+ struct kvec vec;
+ size_t alloc_len;
+ bool is_vmalloc;
+};
+
+extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+extern void ceph_buffer_release(struct kref *kref);
+
+static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
+{
+ kref_get(&b->kref);
+ return b;
+}
+
+static inline void ceph_buffer_put(struct ceph_buffer *b)
+{
+ kref_put(&b->kref, ceph_buffer_release);
+}
+
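+/*
+ * Typical lifecycle, as a minimal sketch (GFP flags are up to the
+ * caller; GFP_NOFS is assumed here):
+ *
+ *	struct ceph_buffer *b = ceph_buffer_new(len, GFP_NOFS);
+ *	if (!b)
+ *		return -ENOMEM;
+ *	...				// fill b->vec.iov_base; share
+ *					// with ceph_buffer_get(b)
+ *	ceph_buffer_put(b);		// freed on the final put
+ */
+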
+extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
+
+#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 00000000000..7d0a0d0adc1
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2932 @@
+#include "ceph_debug.h"
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+
+#include "super.h"
+#include "decode.h"
+#include "messenger.h"
+
+/*
+ * Capability management
+ *
+ * The Ceph metadata servers control client access to inode metadata
+ * and file data by issuing capabilities, granting clients permission
+ * to read and/or write both inode fields and file data to OSDs
+ * (storage nodes). Each capability consists of a set of bits
+ * indicating which operations are allowed.
+ *
+ * If the client holds a *_SHARED cap, the client has a coherent value
+ * that can be safely read from the cached inode.
+ *
+ * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
+ * client is allowed to change inode attributes (e.g., file size,
+ * mtime), note its dirty state in the ceph_cap, and asynchronously
+ * flush that metadata change to the MDS.
+ *
+ * In the event of a conflicting operation (perhaps by another
+ * client), the MDS will revoke the conflicting client capabilities.
+ *
+ * In order for a client to cache an inode, it must hold a capability
+ * with at least one MDS server. When inodes are released, release
+ * notifications are batched and periodically sent en masse to the MDS
+ * cluster to release server state.
+ */
+
+
+/*
+ * Generate readable cap strings for debugging output.
+ */
+#define MAX_CAP_STR 20
+static char cap_str[MAX_CAP_STR][40];
+static DEFINE_SPINLOCK(cap_str_lock);
+static int last_cap_str;
+
+static char *gcap_string(char *s, int c)
+{
+ if (c & CEPH_CAP_GSHARED)
+ *s++ = 's';
+ if (c & CEPH_CAP_GEXCL)
+ *s++ = 'x';
+ if (c & CEPH_CAP_GCACHE)
+ *s++ = 'c';
+ if (c & CEPH_CAP_GRD)
+ *s++ = 'r';
+ if (c & CEPH_CAP_GWR)
+ *s++ = 'w';
+ if (c & CEPH_CAP_GBUFFER)
+ *s++ = 'b';
+ if (c & CEPH_CAP_GLAZYIO)
+ *s++ = 'l';
+ return s;
+}
+
+const char *ceph_cap_string(int caps)
+{
+ int i;
+ char *s;
+ int c;
+
+ spin_lock(&cap_str_lock);
+ i = last_cap_str++;
+ if (last_cap_str == MAX_CAP_STR)
+ last_cap_str = 0;
+ spin_unlock(&cap_str_lock);
+
+ s = cap_str[i];
+
+ if (caps & CEPH_CAP_PIN)
+ *s++ = 'p';
+
+ c = (caps >> CEPH_CAP_SAUTH) & 3;
+ if (c) {
+ *s++ = 'A';
+ s = gcap_string(s, c);
+ }
+
+ c = (caps >> CEPH_CAP_SLINK) & 3;
+ if (c) {
+ *s++ = 'L';
+ s = gcap_string(s, c);
+ }
+
+ c = (caps >> CEPH_CAP_SXATTR) & 3;
+ if (c) {
+ *s++ = 'X';
+ s = gcap_string(s, c);
+ }
+
+ c = caps >> CEPH_CAP_SFILE;
+ if (c) {
+ *s++ = 'F';
+ s = gcap_string(s, c);
+ }
+
+ if (s == cap_str[i])
+ *s++ = '-';
+ *s = 0;
+ return cap_str[i];
+}
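+
+/*
+ * For example (illustrative): a cap word carrying PIN, AUTH_SHARED
+ * and FILE_CACHE|FILE_RD renders as "pAsFcr" -- one upper-case letter
+ * per cap class, followed by that class's generic bits from
+ * gcap_string() above.
+ */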
+
+/*
+ * Cap reservations
+ *
+ * Maintain a global pool of preallocated struct ceph_cap objects,
+ * referenced by struct ceph_cap_reservation contexts. This ensures
+ * that we preallocate
+ * memory needed to successfully process an MDS response. (If an MDS
+ * sends us cap information and we fail to process it, we will have
+ * problems due to the client and MDS being out of sync.)
+ *
+ * Reservations are 'owned' by a ceph_cap_reservation context.
+ */
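+
+/*
+ * A usage sketch (illustrative only): a caller about to process an
+ * MDS reply that may add caps to 'need' inodes reserves up front,
+ * draws caps from the reservation as it goes, and returns whatever
+ * is left:
+ *
+ *	struct ceph_cap_reservation ctx = { .count = 0 };
+ *
+ *	if (ceph_reserve_caps(&ctx, need) < 0)
+ *		return -ENOMEM;		// preallocation failed
+ *	...				// get_cap(&ctx) per new cap
+ *	ceph_unreserve_caps(&ctx);	// return unused preallocations
+ */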
+static spinlock_t caps_list_lock;
+static struct list_head caps_list; /* unused (reserved or unreserved) */
+static int caps_total_count; /* total caps allocated */
+static int caps_use_count; /* in use */
+static int caps_reserve_count; /* unused, reserved */
+static int caps_avail_count; /* unused, unreserved */
+static int caps_min_count; /* keep at least this many (unreserved) */
+
+void __init ceph_caps_init(void)
+{
+ INIT_LIST_HEAD(&caps_list);
+ spin_lock_init(&caps_list_lock);
+}
+
+void ceph_caps_finalize(void)
+{
+ struct ceph_cap *cap;
+
+ spin_lock(&caps_list_lock);
+ while (!list_empty(&caps_list)) {
+ cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+ kmem_cache_free(ceph_cap_cachep, cap);
+ }
+ caps_total_count = 0;
+ caps_avail_count = 0;
+ caps_use_count = 0;
+ caps_reserve_count = 0;
+ caps_min_count = 0;
+ spin_unlock(&caps_list_lock);
+}
+
+void ceph_adjust_min_caps(int delta)
+{
+ spin_lock(&caps_list_lock);
+ caps_min_count += delta;
+ BUG_ON(caps_min_count < 0);
+ spin_unlock(&caps_list_lock);
+}
+
+int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
+{
+ int i;
+ struct ceph_cap *cap;
+ int have;
+ int alloc = 0;
+ LIST_HEAD(newcaps);
+ int ret = 0;
+
+ dout("reserve caps ctx=%p need=%d\n", ctx, need);
+
+ /* first reserve any caps that are already allocated */
+ spin_lock(&caps_list_lock);
+ if (caps_avail_count >= need)
+ have = need;
+ else
+ have = caps_avail_count;
+ caps_avail_count -= have;
+ caps_reserve_count += have;
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+
+ for (i = have; i < need; i++) {
+ cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+ if (!cap) {
+ ret = -ENOMEM;
+ goto out_alloc_count;
+ }
+ list_add(&cap->caps_item, &newcaps);
+ alloc++;
+ }
+ BUG_ON(have + alloc != need);
+
+ spin_lock(&caps_list_lock);
+ caps_total_count += alloc;
+ caps_reserve_count += alloc;
+ list_splice(&newcaps, &caps_list);
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+
+ ctx->count = need;
+ dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
+ ctx, caps_total_count, caps_use_count, caps_reserve_count,
+ caps_avail_count);
+ return 0;
+
+out_alloc_count:
+ /* we didn't manage to reserve as much as we needed */
+ pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have);
+ return ret;
+}
+
+int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
+{
+ dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+ if (ctx->count) {
+ spin_lock(&caps_list_lock);
+ BUG_ON(caps_reserve_count < ctx->count);
+ caps_reserve_count -= ctx->count;
+ caps_avail_count += ctx->count;
+ ctx->count = 0;
+ dout("unreserve caps %d = %d used + %d resv + %d avail\n",
+ caps_total_count, caps_use_count, caps_reserve_count,
+ caps_avail_count);
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+ }
+ return 0;
+}
+
+static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
+{
+ struct ceph_cap *cap = NULL;
+
+ /* temporary, until we do something about cap import/export */
+ if (!ctx)
+ return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+
+ spin_lock(&caps_list_lock);
+ dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+ ctx, ctx->count, caps_total_count, caps_use_count,
+ caps_reserve_count, caps_avail_count);
+ BUG_ON(!ctx->count);
+ BUG_ON(ctx->count > caps_reserve_count);
+ BUG_ON(list_empty(&caps_list));
+
+ ctx->count--;
+ caps_reserve_count--;
+ caps_use_count++;
+
+ cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+ return cap;
+}
+
+void ceph_put_cap(struct ceph_cap *cap)
+{
+ spin_lock(&caps_list_lock);
+ dout("put_cap %p %d = %d used + %d resv + %d avail\n",
+ cap, caps_total_count, caps_use_count,
+ caps_reserve_count, caps_avail_count);
+ caps_use_count--;
+ /*
+ * Keep some preallocated caps around (caps_min_count), to
+ * avoid lots of free/alloc churn.
+ */
+ if (caps_avail_count >= caps_reserve_count + caps_min_count) {
+ caps_total_count--;
+ kmem_cache_free(ceph_cap_cachep, cap);
+ } else {
+ caps_avail_count++;
+ list_add(&cap->caps_item, &caps_list);
+ }
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+}
+
+void ceph_reservation_status(struct ceph_client *client,
+ int *total, int *avail, int *used, int *reserved,
+ int *min)
+{
+ if (total)
+ *total = caps_total_count;
+ if (avail)
+ *avail = caps_avail_count;
+ if (used)
+ *used = caps_use_count;
+ if (reserved)
+ *reserved = caps_reserve_count;
+ if (min)
+ *min = caps_min_count;
+}
+
+/*
+ * Find ceph_cap for given mds, if any.
+ *
+ * Called with i_lock held.
+ */
+static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+ struct ceph_cap *cap;
+ struct rb_node *n = ci->i_caps.rb_node;
+
+ while (n) {
+ cap = rb_entry(n, struct ceph_cap, ci_node);
+ if (mds < cap->mds)
+ n = n->rb_left;
+ else if (mds > cap->mds)
+ n = n->rb_right;
+ else
+ return cap;
+ }
+ return NULL;
+}
+
+/*
+ * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
+ * -1.
+ */
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
+{
+ struct ceph_cap *cap;
+ int mds = -1;
+ struct rb_node *p;
+
+ /* prefer mds with WR|WRBUFFER|EXCL caps */
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ mds = cap->mds;
+ if (mseq)
+ *mseq = cap->mseq;
+ if (cap->issued & (CEPH_CAP_FILE_WR |
+ CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_FILE_EXCL))
+ break;
+ }
+ return mds;
+}
+
+int ceph_get_cap_mds(struct inode *inode)
+{
+ int mds;
+ spin_lock(&inode->i_lock);
+ mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
+ spin_unlock(&inode->i_lock);
+ return mds;
+}
+
+/*
+ * Called under i_lock.
+ */
+static void __insert_cap_node(struct ceph_inode_info *ci,
+ struct ceph_cap *new)
+{
+ struct rb_node **p = &ci->i_caps.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_cap *cap = NULL;
+
+ while (*p) {
+ parent = *p;
+ cap = rb_entry(parent, struct ceph_cap, ci_node);
+ if (new->mds < cap->mds)
+ p = &(*p)->rb_left;
+ else if (new->mds > cap->mds)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->ci_node, parent, p);
+ rb_insert_color(&new->ci_node, &ci->i_caps);
+}
+
+/*
+ * (re)set cap hold timeouts, which control the delayed release
+ * of unused caps back to the MDS. Should be called on cap use.
+ */
+static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ struct ceph_mount_args *ma = mdsc->client->mount_args;
+
+ ci->i_hold_caps_min = round_jiffies(jiffies +
+ ma->caps_wanted_delay_min * HZ);
+ ci->i_hold_caps_max = round_jiffies(jiffies +
+ ma->caps_wanted_delay_max * HZ);
+ dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
+ ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+}
+
+/*
+ * (Re)queue cap at the end of the delayed cap release list.
+ *
+ * If I_FLUSH is set, leave the inode at the front of the list.
+ *
+ * Caller holds i_lock
+ * -> we take mdsc->cap_delay_lock
+ */
+static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ __cap_set_timeouts(mdsc, ci);
+ dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+ ci->i_ceph_flags, ci->i_hold_caps_max);
+ if (!mdsc->stopping) {
+ spin_lock(&mdsc->cap_delay_lock);
+ if (!list_empty(&ci->i_cap_delay_list)) {
+ if (ci->i_ceph_flags & CEPH_I_FLUSH)
+ goto no_change;
+ list_del_init(&ci->i_cap_delay_list);
+ }
+ list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+no_change:
+ spin_unlock(&mdsc->cap_delay_lock);
+ }
+}
+
+/*
+ * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
+ * indicating we should send a cap message to flush dirty metadata
+ * asap, and move to the front of the delayed cap list.
+ */
+static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ spin_lock(&mdsc->cap_delay_lock);
+ ci->i_ceph_flags |= CEPH_I_FLUSH;
+ if (!list_empty(&ci->i_cap_delay_list))
+ list_del_init(&ci->i_cap_delay_list);
+ list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Cancel delayed work on cap.
+ *
+ * Caller must hold i_lock.
+ */
+static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ if (list_empty(&ci->i_cap_delay_list))
+ return;
+ spin_lock(&mdsc->cap_delay_lock);
+ list_del_init(&ci->i_cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Common issue checks for add_cap, handle_cap_grant.
+ */
+static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
+ unsigned issued)
+{
+ unsigned had = __ceph_caps_issued(ci, NULL);
+
+ /*
+ * Each time we receive FILE_CACHE anew, we increment
+ * i_rdcache_gen.
+ */
+ if ((issued & CEPH_CAP_FILE_CACHE) &&
+ (had & CEPH_CAP_FILE_CACHE) == 0)
+ ci->i_rdcache_gen++;
+
+ /*
+ * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
+ * don't know what happened to this directory while we didn't
+ * have the cap.
+ */
+ if ((issued & CEPH_CAP_FILE_SHARED) &&
+ (had & CEPH_CAP_FILE_SHARED) == 0) {
+ ci->i_shared_gen++;
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ }
+ }
+}
+
+/*
+ * Add a capability under the given MDS session.
+ *
+ * Caller should hold session snap_rwsem (read) and s_mutex.
+ *
+ * @fmode is the open file mode, if we are opening a file, otherwise
+ * it is < 0. (This is so we can atomically add the cap and add an
+ * open file reference to it.)
+ */
+int ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned seq, unsigned mseq, u64 realmino, int flags,
+ struct ceph_cap_reservation *caps_reservation)
+{
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *new_cap = NULL;
+ struct ceph_cap *cap;
+ int mds = session->s_mds;
+ int actual_wanted;
+
+ dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
+ session->s_mds, cap_id, ceph_cap_string(issued), seq);
+
+ /*
+ * If we are opening the file, include file mode wanted bits
+ * in wanted.
+ */
+ if (fmode >= 0)
+ wanted |= ceph_caps_for_mode(fmode);
+
+retry:
+ spin_lock(&inode->i_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ if (new_cap) {
+ cap = new_cap;
+ new_cap = NULL;
+ } else {
+ spin_unlock(&inode->i_lock);
+ new_cap = get_cap(caps_reservation);
+ if (new_cap == NULL)
+ return -ENOMEM;
+ goto retry;
+ }
+
+ cap->issued = 0;
+ cap->implemented = 0;
+ cap->mds = mds;
+ cap->mds_wanted = 0;
+
+ cap->ci = ci;
+ __insert_cap_node(ci, cap);
+
+ /* clear out old exporting info? (i.e. on cap import) */
+ if (ci->i_cap_exporting_mds == mds) {
+ ci->i_cap_exporting_issued = 0;
+ ci->i_cap_exporting_mseq = 0;
+ ci->i_cap_exporting_mds = -1;
+ }
+
+ /* add to session cap list */
+ cap->session = session;
+ spin_lock(&session->s_cap_lock);
+ list_add_tail(&cap->session_caps, &session->s_caps);
+ session->s_nr_caps++;
+ spin_unlock(&session->s_cap_lock);
+ }
+
+ if (!ci->i_snap_realm) {
+ /*
+ * add this inode to the appropriate snap realm
+ */
+ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
+ realmino);
+ if (realm) {
+ ceph_get_snap_realm(mdsc, realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ ci->i_snap_realm = realm;
+ list_add(&ci->i_snap_realm_item,
+ &realm->inodes_with_caps);
+ spin_unlock(&realm->inodes_with_caps_lock);
+ } else {
+ pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
+ realmino);
+ }
+ }
+
+ __check_cap_issue(ci, cap, issued);
+
+ /*
+ * If we are issued caps we don't want, or the mds' wanted
+ * value appears to be off, queue a check so we'll release
+ * later and/or update the mds wanted value.
+ */
+ actual_wanted = __ceph_caps_wanted(ci);
+ if ((wanted & ~actual_wanted) ||
+ (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
+ dout(" issued %s, mds wanted %s, actual %s, queueing\n",
+ ceph_cap_string(issued), ceph_cap_string(wanted),
+ ceph_cap_string(actual_wanted));
+ __cap_delay_requeue(mdsc, ci);
+ }
+
+ if (flags & CEPH_CAP_FLAG_AUTH)
+ ci->i_auth_cap = cap;
+ else if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
+ dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+ ceph_cap_string(issued|cap->issued), seq, mds);
+ cap->cap_id = cap_id;
+ cap->issued = issued;
+ cap->implemented |= issued;
+ cap->mds_wanted |= wanted;
+ cap->seq = seq;
+ cap->issue_seq = seq;
+ cap->mseq = mseq;
+ cap->cap_gen = session->s_cap_gen;
+
+ if (fmode >= 0)
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&inode->i_lock);
+ wake_up(&ci->i_cap_wq);
+ return 0;
+}
+
+/*
+ * Return true if cap has not timed out and belongs to the current
+ * generation of the MDS session (i.e. has not gone 'stale' due to
+ * us losing touch with the mds).
+ */
+static int __cap_is_valid(struct ceph_cap *cap)
+{
+ unsigned long ttl;
+ u32 gen;
+
+ spin_lock(&cap->session->s_cap_lock);
+ gen = cap->session->s_cap_gen;
+ ttl = cap->session->s_cap_ttl;
+ spin_unlock(&cap->session->s_cap_lock);
+
+ if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
+ dout("__cap_is_valid %p cap %p issued %s "
+ "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+ cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Return set of valid cap bits issued to us. Note that caps time
+ * out, and may be invalidated in bulk if the client session times out
+ * and session->s_cap_gen is bumped.
+ */
+int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
+{
+ int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+
+ if (implemented)
+ *implemented = 0;
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ dout("__ceph_caps_issued %p cap %p issued %s\n",
+ &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ have |= cap->issued;
+ if (implemented)
+ *implemented |= cap->implemented;
+ }
+ return have;
+}
+
+/*
+ * Get cap bits issued by caps other than @ocap
+ */
+int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
+{
+ int have = ci->i_snap_caps;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (cap == ocap)
+ continue;
+ if (!__cap_is_valid(cap))
+ continue;
+ have |= cap->issued;
+ }
+ return have;
+}
+
+/*
+ * Move a cap to the end of the LRU (oldest caps at list head, newest
+ * at list tail).
+ */
+static void __touch_cap(struct ceph_cap *cap)
+{
+ struct ceph_mds_session *s = cap->session;
+
+ spin_lock(&s->s_cap_lock);
+ if (s->s_cap_iterator == NULL) {
+ dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+ s->s_mds);
+ list_move_tail(&cap->session_caps, &s->s_caps);
+ } else {
+ dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
+ &cap->ci->vfs_inode, cap, s->s_mds);
+ }
+ spin_unlock(&s->s_cap_lock);
+}
+
+/*
+ * Check if we hold the given mask. If so, touch the cap(s) so they
+ * move to the tail (most recently used end) of their respective
+ * session cap LRUs. (This is the preferred way for
+ * callers to check for caps they want.)
+ */
+int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int have = ci->i_snap_caps;
+
+ if ((have & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p snap issued %s"
+ " (mask %s)\n", &ci->vfs_inode,
+ ceph_cap_string(have),
+ ceph_cap_string(mask));
+ return 1;
+ }
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ if ((cap->issued & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p cap %p issued %s"
+ " (mask %s)\n", &ci->vfs_inode, cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
+ if (touch)
+ __touch_cap(cap);
+ return 1;
+ }
+
+ /* does a combination of caps satisfy mask? */
+ have |= cap->issued;
+ if ((have & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p combo issued %s"
+ " (mask %s)\n", &ci->vfs_inode,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
+ if (touch) {
+ struct rb_node *q;
+
+ /* touch this + preceding caps */
+ __touch_cap(cap);
+ for (q = rb_first(&ci->i_caps); q != p;
+ q = rb_next(q)) {
+ cap = rb_entry(q, struct ceph_cap,
+ ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ __touch_cap(cap);
+ }
+ }
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Return true if mask caps are currently being revoked by an MDS.
+ */
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int ret = 0;
+
+ spin_lock(&inode->i_lock);
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (__cap_is_valid(cap) &&
+ (cap->implemented & ~cap->issued & mask)) {
+ ret = 1;
+ break;
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ dout("ceph_caps_revoking %p %s = %d\n", inode,
+ ceph_cap_string(mask), ret);
+ return ret;
+}
+
+int __ceph_caps_used(struct ceph_inode_info *ci)
+{
+ int used = 0;
+ if (ci->i_pin_ref)
+ used |= CEPH_CAP_PIN;
+ if (ci->i_rd_ref)
+ used |= CEPH_CAP_FILE_RD;
+ if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+ used |= CEPH_CAP_FILE_CACHE;
+ if (ci->i_wr_ref)
+ used |= CEPH_CAP_FILE_WR;
+ if (ci->i_wrbuffer_ref)
+ used |= CEPH_CAP_FILE_BUFFER;
+ return used;
+}
+
+/*
+ * wanted, by virtue of open file modes
+ */
+int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
+{
+ int want = 0;
+ int mode;
+ for (mode = 0; mode < 4; mode++)
+ if (ci->i_nr_by_mode[mode])
+ want |= ceph_caps_for_mode(mode);
+ return want;
+}
+
+/*
+ * Return caps we have registered with the MDS(s) as 'wanted'.
+ */
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int mds_wanted = 0;
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ mds_wanted |= cap->mds_wanted;
+ }
+ return mds_wanted;
+}
+
+/*
+ * called under i_lock
+ */
+static int __ceph_is_any_caps(struct ceph_inode_info *ci)
+{
+ return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
+}
+
+/*
+ * caller should hold i_lock.
+ * caller will not hold session s_mutex if called from destroy_inode.
+ */
+void __ceph_remove_cap(struct ceph_cap *cap)
+{
+ struct ceph_mds_session *session = cap->session;
+ struct ceph_inode_info *ci = cap->ci;
+ struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+
+ dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+
+ /* remove from inode list */
+ rb_erase(&cap->ci_node, &ci->i_caps);
+ cap->ci = NULL;
+ if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
+ /* remove from session list */
+ spin_lock(&session->s_cap_lock);
+ if (session->s_cap_iterator == cap) {
+ /* not yet, we are iterating over this very cap */
+ dout("__ceph_remove_cap delaying %p removal from session %p\n",
+ cap, cap->session);
+ } else {
+ list_del_init(&cap->session_caps);
+ session->s_nr_caps--;
+ cap->session = NULL;
+ }
+ spin_unlock(&session->s_cap_lock);
+
+ if (cap->session == NULL)
+ ceph_put_cap(cap);
+
+ if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm_counter++;
+ ci->i_snap_realm = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, realm);
+ }
+ if (!__ceph_is_any_real_caps(ci))
+ __cap_delay_cancel(mdsc, ci);
+}
+
+/*
+ * Build and send a cap message to the given MDS.
+ *
+ * Caller should be holding s_mutex.
+ */
+static int send_cap_msg(struct ceph_mds_session *session,
+ u64 ino, u64 cid, int op,
+ int caps, int wanted, int dirty,
+ u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
+ u64 size, u64 max_size,
+ struct timespec *mtime, struct timespec *atime,
+ u64 time_warp_seq,
+ uid_t uid, gid_t gid, mode_t mode,
+ u64 xattr_version,
+ struct ceph_buffer *xattrs_buf,
+ u64 follows)
+{
+ struct ceph_mds_caps *fc;
+ struct ceph_msg *msg;
+
+ dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
+ " seq %u/%u mseq %u follows %lld size %llu/%llu"
+ " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
+ cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
+ ceph_cap_string(dirty),
+ seq, issue_seq, mseq, follows, size, max_size,
+ xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ msg->hdr.tid = cpu_to_le64(flush_tid);
+
+ fc = msg->front.iov_base;
+ memset(fc, 0, sizeof(*fc));
+
+ fc->cap_id = cpu_to_le64(cid);
+ fc->op = cpu_to_le32(op);
+ fc->seq = cpu_to_le32(seq);
+ fc->issue_seq = cpu_to_le32(issue_seq);
+ fc->migrate_seq = cpu_to_le32(mseq);
+ fc->caps = cpu_to_le32(caps);
+ fc->wanted = cpu_to_le32(wanted);
+ fc->dirty = cpu_to_le32(dirty);
+ fc->ino = cpu_to_le64(ino);
+ fc->snap_follows = cpu_to_le64(follows);
+
+ fc->size = cpu_to_le64(size);
+ fc->max_size = cpu_to_le64(max_size);
+ if (mtime)
+ ceph_encode_timespec(&fc->mtime, mtime);
+ if (atime)
+ ceph_encode_timespec(&fc->atime, atime);
+ fc->time_warp_seq = cpu_to_le32(time_warp_seq);
+
+ fc->uid = cpu_to_le32(uid);
+ fc->gid = cpu_to_le32(gid);
+ fc->mode = cpu_to_le32(mode);
+
+ fc->xattr_version = cpu_to_le64(xattr_version);
+ if (xattrs_buf) {
+ msg->middle = ceph_buffer_get(xattrs_buf);
+ fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+ msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+ }
+
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+/*
+ * Queue cap releases when an inode is dropped from our cache. Since
+ * the inode is about to be destroyed, there is no need for i_lock.
+ */
+void ceph_queue_caps_release(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct rb_node *p;
+
+ p = rb_first(&ci->i_caps);
+ while (p) {
+ struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+ struct ceph_mds_session *session = cap->session;
+ struct ceph_msg *msg;
+ struct ceph_mds_cap_release *head;
+ struct ceph_mds_cap_item *item;
+
+ spin_lock(&session->s_cap_lock);
+ BUG_ON(!session->s_num_cap_releases);
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+
+ dout(" adding %p release to mds%d msg %p (%d left)\n",
+ inode, session->s_mds, msg, session->s_num_cap_releases);
+
+ BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
+ head = msg->front.iov_base;
+ head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+ item = msg->front.iov_base + msg->front.iov_len;
+ item->ino = cpu_to_le64(ceph_ino(inode));
+ item->cap_id = cpu_to_le64(cap->cap_id);
+ item->migrate_seq = cpu_to_le32(cap->mseq);
+ item->seq = cpu_to_le32(cap->issue_seq);
+
+ session->s_num_cap_releases--;
+
+ msg->front.iov_len += sizeof(*item);
+ if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ dout(" release msg %p full\n", msg);
+ list_move_tail(&msg->list_head,
+ &session->s_cap_releases_done);
+ } else {
+ dout(" release msg %p at %d/%d (%d)\n", msg,
+ (int)le32_to_cpu(head->num),
+ (int)CEPH_CAPS_PER_RELEASE,
+ (int)msg->front.iov_len);
+ }
+ spin_unlock(&session->s_cap_lock);
+ p = rb_next(p);
+ __ceph_remove_cap(cap);
+ }
+}
+
+/*
+ * Send a cap msg on the given inode. Update our caps state, then
+ * drop i_lock and send the message.
+ *
+ * Make note of max_size reported/requested from mds, revoked caps
+ * that have now been implemented.
+ *
+ * Make a half-hearted attempt to invalidate the page cache if we are
+ * dropping RDCACHE. Note that this will leave behind locked pages
+ * that we'll then need to deal with elsewhere.
+ *
+ * Return non-zero if the release was delayed, or if we hit an error
+ * such that the caller should requeue and retry later.
+ *
+ * called with i_lock, then drops it.
+ * caller should hold snap_rwsem (read), s_mutex.
+ */
+static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ int op, int used, int want, int retain, int flushing,
+ unsigned *pflush_tid)
+ __releases(cap->ci->vfs_inode->i_lock)
+{
+ struct ceph_inode_info *ci = cap->ci;
+ struct inode *inode = &ci->vfs_inode;
+ u64 cap_id = cap->cap_id;
+ int held, revoking, dropping, keep;
+ u64 seq, issue_seq, mseq, time_warp_seq, follows;
+ u64 size, max_size;
+ struct timespec mtime, atime;
+ int wake = 0;
+ mode_t mode;
+ uid_t uid;
+ gid_t gid;
+ struct ceph_mds_session *session;
+ u64 xattr_version = 0;
+ int delayed = 0;
+ u64 flush_tid = 0;
+ int i;
+ int ret;
+
+ held = cap->issued | cap->implemented;
+ revoking = cap->implemented & ~cap->issued;
+ retain &= ~revoking;
+ dropping = cap->issued & ~retain;
+
+ dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
+ inode, cap, cap->session,
+ ceph_cap_string(held), ceph_cap_string(held & retain),
+ ceph_cap_string(revoking));
+ BUG_ON((retain & CEPH_CAP_PIN) == 0);
+
+ session = cap->session;
+
+ /* don't release wanted unless we've waited a bit. */
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_min)) {
+ dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & retain),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(want));
+ want |= cap->mds_wanted;
+ retain |= cap->issued;
+ delayed = 1;
+ }
+ ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+
+ cap->issued &= retain; /* drop bits we don't want */
+ if (cap->implemented & ~cap->issued) {
+ /*
+ * Wake up any waiters on wanted -> needed transition.
+ * This is due to the weird transition from buffered
+ * to sync IO... we need to flush dirty pages _before_
+ * allowing sync writes to avoid reordering.
+ */
+ wake = 1;
+ }
+ cap->implemented &= cap->issued | used;
+ cap->mds_wanted = want;
+
+ if (flushing) {
+ /*
+ * assign a tid for flush operations so we can avoid
+ * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
+ * clean type races. track latest tid for every bit
+ * so we can handle flush AxFw, flush Fw, and have the
+ * first ack clean Ax.
+ */
+ flush_tid = ++ci->i_cap_flush_last_tid;
+ if (pflush_tid)
+ *pflush_tid = flush_tid;
+ dout(" cap_flush_tid %d\n", (int)flush_tid);
+ for (i = 0; i < CEPH_CAP_BITS; i++)
+ if (flushing & (1 << i))
+ ci->i_cap_flush_tid[i] = flush_tid;
+ }
+
+ keep = cap->implemented;
+ seq = cap->seq;
+ issue_seq = cap->issue_seq;
+ mseq = cap->mseq;
+ size = inode->i_size;
+ ci->i_reported_size = size;
+ max_size = ci->i_wanted_max_size;
+ ci->i_requested_max_size = max_size;
+ mtime = inode->i_mtime;
+ atime = inode->i_atime;
+ time_warp_seq = ci->i_time_warp_seq;
+ follows = ci->i_snap_realm->cached_context->seq;
+ uid = inode->i_uid;
+ gid = inode->i_gid;
+ mode = inode->i_mode;
+
+ if (dropping & CEPH_CAP_XATTR_EXCL) {
+ __ceph_build_xattrs_blob(ci);
+ xattr_version = ci->i_xattrs.version + 1;
+ }
+
+ spin_unlock(&inode->i_lock);
+
+ ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
+ op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+ size, max_size, &mtime, &atime, time_warp_seq,
+ uid, gid, mode,
+ xattr_version,
+ (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+ follows);
+ if (ret < 0) {
+ dout("error sending cap msg, must requeue %p\n", inode);
+ delayed = 1;
+ }
+
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+
+ return delayed;
+}
+
+/*
+ * When a snapshot is taken, clients accumulate dirty metadata on
+ * inodes with capabilities in ceph_cap_snaps to describe the file
+ * state at the time the snapshot was taken. This must be flushed
+ * asynchronously back to the MDS once sync writes complete and dirty
+ * data is written out.
+ *
+ * Called under i_lock. Takes s_mutex as needed.
+ */
+void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int mds;
+ struct ceph_cap_snap *capsnap;
+ u32 mseq;
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
+ session->s_mutex */
+ u64 next_follows = 0; /* keep track of how far we've gotten through the
+ i_cap_snaps list, and skip these entries next time
+ around to avoid an infinite loop */
+
+ if (psession)
+ session = *psession;
+
+ dout("__flush_snaps %p\n", inode);
+retry:
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ /* avoid an infinite loop after retry */
+ if (capsnap->follows < next_follows)
+ continue;
+ /*
+ * we need to wait for sync writes to complete and for dirty
+ * pages to be written out.
+ */
+ if (capsnap->dirty_pages || capsnap->writing)
+ continue;
+
+ /* pick mds, take s_mutex */
+ mds = __ceph_get_cap_mds(ci, &mseq);
+ if (session && session->s_mds != mds) {
+ dout("oops, wrong session %p mutex\n", session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&inode->i_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (session) {
+ dout("inverting session/ino locks on %p\n",
+ session);
+ mutex_lock(&session->s_mutex);
+ }
+ /*
+ * if session == NULL, we raced against a cap
+ * deletion. retry, and we'll get a better
+ * @mds value next time.
+ */
+ spin_lock(&inode->i_lock);
+ goto retry;
+ }
+
+ capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+ atomic_inc(&capsnap->nref);
+ if (!list_empty(&capsnap->flushing_item))
+ list_del_init(&capsnap->flushing_item);
+ list_add_tail(&capsnap->flushing_item,
+ &session->s_cap_snaps_flushing);
+ spin_unlock(&inode->i_lock);
+
+ dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
+ inode, capsnap, next_follows, capsnap->size);
+ send_cap_msg(session, ceph_vino(inode).ino, 0,
+ CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+ capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
+ capsnap->size, 0,
+ &capsnap->mtime, &capsnap->atime,
+ capsnap->time_warp_seq,
+ capsnap->uid, capsnap->gid, capsnap->mode,
+ 0, NULL,
+ capsnap->follows);
+
+ next_follows = capsnap->follows + 1;
+ ceph_put_cap_snap(capsnap);
+
+ spin_lock(&inode->i_lock);
+ goto retry;
+ }
+
+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
+
+ if (psession)
+ *psession = session;
+ else if (session) {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+}
+
+static void ceph_flush_snaps(struct ceph_inode_info *ci)
+{
+ struct inode *inode = &ci->vfs_inode;
+
+ spin_lock(&inode->i_lock);
+ __ceph_flush_snaps(ci, NULL);
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Mark caps dirty. If inode is newly dirty, add to the global dirty
+ * list.
+ */
+void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+ struct inode *inode = &ci->vfs_inode;
+ int was = ci->i_dirty_caps;
+ int dirty = 0;
+
+ dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+ ceph_cap_string(mask), ceph_cap_string(was),
+ ceph_cap_string(was | mask));
+ ci->i_dirty_caps |= mask;
+ if (was == 0) {
+ dout(" inode %p now dirty\n", &ci->vfs_inode);
+ BUG_ON(!list_empty(&ci->i_dirty_item));
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ if (ci->i_flushing_caps == 0) {
+ igrab(inode);
+ dirty |= I_DIRTY_SYNC;
+ }
+ }
+ BUG_ON(list_empty(&ci->i_dirty_item));
+ if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
+ (mask & CEPH_CAP_FILE_BUFFER))
+ dirty |= I_DIRTY_DATASYNC;
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ __cap_delay_requeue(mdsc, ci);
+}
+
+/*
+ * Add dirty inode to the flushing list. Assign it a seq number so we
+ * can wait for caps to flush without starving.
+ *
+ * Called under i_lock.
+ */
+static int __mark_caps_flushing(struct inode *inode,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int flushing;
+
+ BUG_ON(ci->i_dirty_caps == 0);
+ BUG_ON(list_empty(&ci->i_dirty_item));
+
+ flushing = ci->i_dirty_caps;
+ dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
+ ceph_cap_string(flushing),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps | flushing));
+ ci->i_flushing_caps |= flushing;
+ ci->i_dirty_caps = 0;
+ dout(" inode %p now !dirty\n", inode);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_del_init(&ci->i_dirty_item);
+
+ ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ mdsc->num_cap_flushing++;
+ dout(" inode %p now flushing seq %lld\n", inode,
+ ci->i_cap_flush_seq);
+ } else {
+ list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ dout(" inode %p now flushing (more) seq %lld\n", inode,
+ ci->i_cap_flush_seq);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ return flushing;
+}
+
+/*
+ * try to invalidate mapping pages without blocking.
+ */
+static int mapping_is_empty(struct address_space *mapping)
+{
+ struct page *page = find_get_page(mapping, 0);
+
+ if (!page)
+ return 1;
+
+ put_page(page);
+ return 0;
+}
+
+static int try_nonblocking_invalidate(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u32 invalidating_gen = ci->i_rdcache_gen;
+
+ spin_unlock(&inode->i_lock);
+ invalidate_mapping_pages(&inode->i_data, 0, -1);
+ spin_lock(&inode->i_lock);
+
+ if (mapping_is_empty(&inode->i_data) &&
+ invalidating_gen == ci->i_rdcache_gen) {
+ /* success. */
+ dout("try_nonblocking_invalidate %p success\n", inode);
+ ci->i_rdcache_gen = 0;
+ ci->i_rdcache_revoking = 0;
+ return 0;
+ }
+ dout("try_nonblocking_invalidate %p failed\n", inode);
+ return -1;
+}
+
+/*
+ * Swiss army knife function to examine currently used and wanted
+ * versus held caps. Release, flush, ack revoked caps to mds as
+ * appropriate.
+ *
+ * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
+ * cap release further.
+ * CHECK_CAPS_AUTHONLY - we should only check the auth cap
+ * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
+ * further delay.
+ */
+void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+ struct ceph_mds_session *session)
+ __releases(session->s_mutex)
+{
+ struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ int file_wanted, used;
+ int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
+ int issued, implemented, want, retain, revoking, flushing = 0;
+ int mds = -1; /* keep track of how far we've gone through i_caps list
+ to avoid an infinite loop on retry */
+ struct rb_node *p;
+ int tried_invalidate = 0;
+ int delayed = 0, sent = 0, force_requeue = 0, num;
+ int queue_invalidate = 0;
+ int is_delayed = flags & CHECK_CAPS_NODELAY;
+
+ /* if we are unmounting, flush any unused caps immediately. */
+ if (mdsc->stopping)
+ is_delayed = 1;
+
+ spin_lock(&inode->i_lock);
+
+ if (ci->i_ceph_flags & CEPH_I_FLUSH)
+ flags |= CHECK_CAPS_FLUSH;
+
+ /* flush snaps first time around only */
+ if (!list_empty(&ci->i_cap_snaps))
+ __ceph_flush_snaps(ci, &session);
+ goto retry_locked;
+retry:
+ spin_lock(&inode->i_lock);
+retry_locked:
+ file_wanted = __ceph_caps_file_wanted(ci);
+ used = __ceph_caps_used(ci);
+ want = file_wanted | used;
+ issued = __ceph_caps_issued(ci, &implemented);
+ revoking = implemented & ~issued;
+
+ retain = want | CEPH_CAP_PIN;
+ if (!mdsc->stopping && inode->i_nlink > 0) {
+ if (want) {
+ retain |= CEPH_CAP_ANY; /* be greedy */
+ } else {
+ retain |= CEPH_CAP_ANY_SHARED;
+ /*
+ * keep RD only if we didn't have the file open RW,
+ * because then the mds would revoke it anyway to
+ * journal max_size=0.
+ */
+ if (ci->i_max_size == 0)
+ retain |= CEPH_CAP_ANY_RD;
+ }
+ }
+
+ dout("check_caps %p file_want %s used %s dirty %s flushing %s"
+ " issued %s revoking %s retain %s %s%s%s\n", inode,
+ ceph_cap_string(file_wanted),
+ ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(issued), ceph_cap_string(revoking),
+ ceph_cap_string(retain),
+ (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
+ (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
+ (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+
+ /*
+ * If we no longer need to hold onto our old caps, and we may
+ * have cached pages, but don't want them, then try to invalidate.
+ * If we fail, it's because pages are locked.... try again later.
+ */
+ if ((!is_delayed || mdsc->stopping) &&
+ ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
+ ci->i_rdcache_gen && /* may have cached pages */
+ (file_wanted == 0 || /* no open files */
+ (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
+ !tried_invalidate) {
+ dout("check_caps trying to invalidate on %p\n", inode);
+ if (try_nonblocking_invalidate(inode) < 0) {
+ if (revoking & CEPH_CAP_FILE_CACHE) {
+ dout("check_caps queuing invalidate\n");
+ queue_invalidate = 1;
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
+ } else {
+ dout("check_caps failed to invalidate pages\n");
+ /* we failed to invalidate pages. check these
+ caps again later. */
+ force_requeue = 1;
+ __cap_set_timeouts(mdsc, ci);
+ }
+ }
+ tried_invalidate = 1;
+ goto retry_locked;
+ }
+
+ num = 0;
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ num++;
+
+ /* avoid looping forever */
+ if (mds >= cap->mds ||
+ ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
+ continue;
+
+ /* NOTE: no side-effects allowed, until we take s_mutex */
+
+ revoking = cap->implemented & ~cap->issued;
+ if (revoking)
+ dout(" mds%d revoking %s\n", cap->mds,
+ ceph_cap_string(revoking));
+
+ if (cap == ci->i_auth_cap &&
+ (cap->issued & CEPH_CAP_FILE_WR)) {
+ /* request larger max_size from MDS? */
+ if (ci->i_wanted_max_size > ci->i_max_size &&
+ ci->i_wanted_max_size > ci->i_requested_max_size) {
+ dout("requesting new max_size\n");
+ goto ack;
+ }
+
+ /* approaching file_max? */
+ if ((inode->i_size << 1) >= ci->i_max_size &&
+ (ci->i_reported_size << 1) < ci->i_max_size) {
+ dout("i_size approaching max_size\n");
+ goto ack;
+ }
+ }
+ /* flush anything dirty? */
+ if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
+ ci->i_dirty_caps) {
+ dout("flushing dirty caps\n");
+ goto ack;
+ }
+
+ /* completed revocation? going down and there are no caps? */
+ if (revoking && (revoking & used) == 0) {
+ dout("completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /* want more caps from mds? */
+ if (want & ~(cap->mds_wanted | cap->issued))
+ goto ack;
+
+ /* things we might delay */
+ if ((cap->issued & ~retain) == 0 &&
+ cap->mds_wanted == want)
+ continue; /* nope, all good */
+
+ if (is_delayed)
+ goto ack;
+
+ /* delay? */
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_max)) {
+ dout(" delaying issued %s -> %s, wanted %s -> %s\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & retain),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(want));
+ delayed++;
+ continue;
+ }
+
+ack:
+ if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+ dout(" skipping %p I_NOFLUSH set\n", inode);
+ continue;
+ }
+
+ if (session && session != cap->session) {
+ dout("oops, wrong session %p mutex\n", session);
+ mutex_unlock(&session->s_mutex);
+ session = NULL;
+ }
+ if (!session) {
+ session = cap->session;
+ if (mutex_trylock(&session->s_mutex) == 0) {
+ dout("inverting session/ino locks on %p\n",
+ session);
+ spin_unlock(&inode->i_lock);
+ if (took_snap_rwsem) {
+ up_read(&mdsc->snap_rwsem);
+ took_snap_rwsem = 0;
+ }
+ mutex_lock(&session->s_mutex);
+ goto retry;
+ }
+ }
+ /* take snap_rwsem after session mutex */
+ if (!took_snap_rwsem) {
+ if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
+ dout("inverting snap/in locks on %p\n",
+ inode);
+ spin_unlock(&inode->i_lock);
+ down_read(&mdsc->snap_rwsem);
+ took_snap_rwsem = 1;
+ goto retry;
+ }
+ took_snap_rwsem = 1;
+ }
+
+ if (cap == ci->i_auth_cap && ci->i_dirty_caps)
+ flushing = __mark_caps_flushing(inode, session);
+
+ mds = cap->mds; /* remember mds, so we don't repeat */
+ sent++;
+
+ /* __send_cap drops i_lock */
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
+ retain, flushing, NULL);
+ goto retry; /* retake i_lock and restart our cap scan. */
+ }
+
+ /*
+ * Reschedule delayed caps release if we delayed anything,
+ * otherwise cancel.
+ */
+ if (delayed && is_delayed)
+ force_requeue = 1; /* __send_cap delayed release; requeue */
+ if (!delayed && !is_delayed)
+ __cap_delay_cancel(mdsc, ci);
+ else if (!is_delayed || force_requeue)
+ __cap_delay_requeue(mdsc, ci);
+
+ spin_unlock(&inode->i_lock);
+
+ if (queue_invalidate)
+ ceph_queue_invalidate(inode);
+
+ if (session)
+ mutex_unlock(&session->s_mutex);
+ if (took_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
+}
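+
+/*
+ * Call-site sketch (illustrative): a max_size bump only needs the
+ * auth cap examined,
+ *
+ *	ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ *
+ * while a caller that must push dirty metadata out immediately would
+ * pass CHECK_CAPS_FLUSH (possibly with CHECK_CAPS_NODELAY).
+ */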
+
+/*
+ * Try to flush dirty caps back to the auth mds.
+ */
+static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
+ unsigned *flush_tid)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int unlock_session = session ? 0 : 1;
+ int flushing = 0;
+
+retry:
+ spin_lock(&inode->i_lock);
+ if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+ dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
+ goto out;
+ }
+ if (ci->i_dirty_caps && ci->i_auth_cap) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ int used = __ceph_caps_used(ci);
+ int want = __ceph_caps_wanted(ci);
+ int delayed;
+
+ if (!session) {
+ spin_unlock(&inode->i_lock);
+ session = cap->session;
+ mutex_lock(&session->s_mutex);
+ goto retry;
+ }
+ BUG_ON(session != cap->session);
+ if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
+ goto out;
+
+ flushing = __mark_caps_flushing(inode, session);
+
+ /* __send_cap drops i_lock */
+ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
+ cap->issued | cap->implemented, flushing,
+ flush_tid);
+ if (!delayed)
+ goto out_unlocked;
+
+ spin_lock(&inode->i_lock);
+ __cap_delay_requeue(mdsc, ci);
+ }
+out:
+ spin_unlock(&inode->i_lock);
+out_unlocked:
+ if (session && unlock_session)
+ mutex_unlock(&session->s_mutex);
+ return flushing;
+}
+
+/*
+ * Return true if we've flushed caps through the given flush_tid.
+ */
+static int caps_are_flushed(struct inode *inode, unsigned tid)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int dirty, i, ret = 1;
+
+ spin_lock(&inode->i_lock);
+ dirty = __ceph_caps_dirty(ci);
+ for (i = 0; i < CEPH_CAP_BITS; i++)
+ if ((ci->i_flushing_caps & (1 << i)) &&
+ ci->i_cap_flush_tid[i] <= tid) {
+ /* still flushing this bit */
+ ret = 0;
+ break;
+ }
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+/*
+ * Wait on any unsafe replies for the given inode. First wait on the
+ * newest request, and make that the upper bound. Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the upper bound.
+ */
+static void sync_write_wait(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct list_head *head = &ci->i_unsafe_writes;
+ struct ceph_osd_request *req;
+ u64 last_tid;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ /* set upper bound as _last_ entry in chain */
+ req = list_entry(head->prev, struct ceph_osd_request,
+ r_unsafe_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_osdc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+ dout("sync_write_wait on tid %llu (until %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ spin_lock(&ci->i_unsafe_lock);
+ ceph_osdc_put_request(req);
+
+ /*
+ * from here on look at first entry in chain, since we
+ * only want to wait for anything older than last_tid
+ */
+ if (list_empty(head))
+ break;
+ req = list_entry(head->next, struct ceph_osd_request,
+ r_unsafe_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+}
+
+int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ unsigned flush_tid;
+ int ret;
+ int dirty;
+
+ dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+ sync_write_wait(inode);
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret < 0)
+ return ret;
+
+ dirty = try_flush_caps(inode, NULL, &flush_tid);
+ dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+
+ /*
+ * only wait on non-file metadata writeback (the mds
+ * can recover size and mtime, so we don't need to
+ * wait for that)
+ */
+ if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+ dout("fsync waiting for flush_tid %u\n", flush_tid);
+ ret = wait_event_interruptible(ci->i_cap_wq,
+ caps_are_flushed(inode, flush_tid));
+ }
+
+ dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
+ return ret;
+}
+
+/*
+ * Flush any dirty caps back to the mds. If we aren't asked to wait,
+ * queue inode for flush but don't do so immediately, because we can
+ * get by with fewer MDS messages if we wait for data writeback to
+ * complete first.
+ */
+int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ unsigned flush_tid;
+ int err = 0;
+ int dirty;
+ int wait = wbc->sync_mode == WB_SYNC_ALL;
+
+ dout("write_inode %p wait=%d\n", inode, wait);
+ if (wait) {
+ dirty = try_flush_caps(inode, NULL, &flush_tid);
+ if (dirty)
+ err = wait_event_interruptible(ci->i_cap_wq,
+ caps_are_flushed(inode, flush_tid));
+ } else {
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+
+ spin_lock(&inode->i_lock);
+ if (__ceph_caps_dirty(ci))
+ __cap_delay_requeue_front(mdsc, ci);
+ spin_unlock(&inode->i_lock);
+ }
+ return err;
+}
+
+/*
+ * After a recovering MDS goes active, we need to resend any caps
+ * we were flushing.
+ *
+ * Caller holds session->s_mutex.
+ */
+static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_cap_snap *capsnap;
+
+ dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
+ list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
+ flushing_item) {
+ struct ceph_inode_info *ci = capsnap->ci;
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+
+ spin_lock(&inode->i_lock);
+ cap = ci->i_auth_cap;
+ if (cap && cap->session == session) {
+ dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
+ cap, capsnap);
+ __ceph_flush_snaps(ci, &session);
+ } else {
+ pr_err("%p auth cap %p not mds%d ???\n", inode,
+ cap, session->s_mds);
+ spin_unlock(&inode->i_lock);
+ }
+ }
+}
+
+void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci;
+
+ kick_flushing_capsnaps(mdsc, session);
+
+ dout("kick_flushing_caps mds%d\n", session->s_mds);
+ list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ int delayed = 0;
+
+ spin_lock(&inode->i_lock);
+ cap = ci->i_auth_cap;
+ if (cap && cap->session == session) {
+ dout("kick_flushing_caps %p cap %p %s\n", inode,
+ cap, ceph_cap_string(ci->i_flushing_caps));
+ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ ci->i_flushing_caps, NULL);
+ if (delayed) {
+ spin_lock(&inode->i_lock);
+ __cap_delay_requeue(mdsc, ci);
+ spin_unlock(&inode->i_lock);
+ }
+ } else {
+ pr_err("%p auth cap %p not mds%d ???\n", inode,
+ cap, session->s_mds);
+ spin_unlock(&inode->i_lock);
+ }
+ }
+}
+
+
+/*
+ * Take references to capabilities we hold, so that we don't release
+ * them to the MDS prematurely.
+ *
+ * Protected by i_lock.
+ */
+static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+{
+ if (got & CEPH_CAP_PIN)
+ ci->i_pin_ref++;
+ if (got & CEPH_CAP_FILE_RD)
+ ci->i_rd_ref++;
+ if (got & CEPH_CAP_FILE_CACHE)
+ ci->i_rdcache_ref++;
+ if (got & CEPH_CAP_FILE_WR)
+ ci->i_wr_ref++;
+ if (got & CEPH_CAP_FILE_BUFFER) {
+ if (ci->i_wrbuffer_ref == 0)
+ igrab(&ci->vfs_inode);
+ ci->i_wrbuffer_ref++;
+ dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
+ &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
+ }
+}
+
+/*
+ * Try to grab cap references. Specify those refs we @want, and the
+ * minimal set we @need. Also include the larger offset we are writing
+ * to (when applicable), and check against max_size here as well.
+ * Note that caller is responsible for ensuring max_size increases are
+ * requested from the MDS.
+ */
+static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
+ int *got, loff_t endoff, int *check_max, int *err)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int ret = 0;
+ int have, implemented;
+ int file_wanted;
+
+ dout("get_cap_refs %p need %s want %s\n", inode,
+ ceph_cap_string(need), ceph_cap_string(want));
+ spin_lock(&inode->i_lock);
+
+ /* make sure file is actually open */
+ file_wanted = __ceph_caps_file_wanted(ci);
+ if ((file_wanted & need) == 0) {
+ dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
+ ceph_cap_string(need), ceph_cap_string(file_wanted));
+ *err = -EBADF;
+ ret = 1;
+ goto out;
+ }
+
+ if (need & CEPH_CAP_FILE_WR) {
+ if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
+ dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
+ inode, endoff, ci->i_max_size);
+ if (endoff > ci->i_wanted_max_size) {
+ *check_max = 1;
+ ret = 1;
+ }
+ goto out;
+ }
+ /*
+ * If a sync write is in progress, we must wait, so that we
+ * can get a final snapshot value for size+mtime.
+ */
+ if (__ceph_have_pending_cap_snap(ci)) {
+ dout("get_cap_refs %p cap_snap_pending\n", inode);
+ goto out;
+ }
+ }
+ have = __ceph_caps_issued(ci, &implemented);
+
+ /*
+ * disallow writes while a truncate is pending
+ */
+ if (ci->i_truncate_pending)
+ have &= ~CEPH_CAP_FILE_WR;
+
+ if ((have & need) == need) {
+ /*
+ * Look at (implemented & ~have & not) so that we keep waiting
+ * on transition from wanted -> needed caps. This is needed
+ * for WRBUFFER|WR -> WR to avoid a new WR sync write from
+ * going before a prior buffered writeback happens.
+ */
+ int not = want & ~(have & need);
+ int revoking = implemented & ~have;
+ dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
+ inode, ceph_cap_string(have), ceph_cap_string(not),
+ ceph_cap_string(revoking));
+ if ((revoking & not) == 0) {
+ *got = need | (have & want);
+ __take_cap_refs(ci, *got);
+ ret = 1;
+ }
+ } else {
+ dout("get_cap_refs %p have %s needed %s\n", inode,
+ ceph_cap_string(have), ceph_cap_string(need));
+ }
+out:
+ spin_unlock(&inode->i_lock);
+ dout("get_cap_refs %p ret %d got %s\n", inode,
+ ret, ceph_cap_string(*got));
+ return ret;
+}
+
+/*
+ * Check the offset we are writing up to against our current
+ * max_size. If necessary, tell the MDS we want to write to
+ * a larger offset.
+ */
+static void check_max_size(struct inode *inode, loff_t endoff)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int check = 0;
+
+ /* do we need to explicitly request a larger max_size? */
+ spin_lock(&inode->i_lock);
+ if ((endoff >= ci->i_max_size ||
+ endoff > (inode->i_size << 1)) &&
+ endoff > ci->i_wanted_max_size) {
+ dout("write %p at large endoff %llu, req max_size\n",
+ inode, endoff);
+ ci->i_wanted_max_size = endoff;
+ check = 1;
+ }
+ spin_unlock(&inode->i_lock);
+ if (check)
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+}
+
+/*
+ * Wait for caps, and take cap references. If we can't get a WR cap
+ * due to a small max_size, make sure we check_max_size (and possibly
+ * ask the mds) so we don't get hung up indefinitely.
+ */
+int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
+ loff_t endoff)
+{
+ int check_max, ret, err;
+
+retry:
+ if (endoff > 0)
+ check_max_size(&ci->vfs_inode, endoff);
+ check_max = 0;
+ err = 0;
+ ret = wait_event_interruptible(ci->i_cap_wq,
+ try_get_cap_refs(ci, need, want,
+ got, endoff,
+ &check_max, &err));
+ if (err)
+ ret = err;
+ if (check_max)
+ goto retry;
+ return ret;
+}
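[Editor's note: a typical caller pattern, as a hedged sketch only — the fs/ceph write path is not part of this hunk, so `ci`, `pos`, and `count` are assumed local variables and error handling is elided:

        /* sketch: take WR (and opportunistically BUFFER) refs around a write */
        int got = 0;
        int err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
                                &got, pos + count);
        if (err < 0)
                return err;
        /* ... perform the write while holding the refs in 'got' ... */
        ceph_put_cap_refs(ci, got);

Note how the endoff argument (pos + count) lets try_get_cap_refs() trigger a max_size request instead of blocking forever on a too-small max_size.]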
+
+/*
+ * Take cap refs. Caller must already know we hold at least one ref
+ * on the caps in question or we don't know this is safe.
+ */
+void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
+{
+ spin_lock(&ci->vfs_inode.i_lock);
+ __take_cap_refs(ci, caps);
+ spin_unlock(&ci->vfs_inode.i_lock);
+}
+
+/*
+ * Release cap refs.
+ *
+ * If we released the last ref on any given cap, call ceph_check_caps
+ * to release (or schedule a release).
+ *
+ * If we are releasing a WR cap (from a sync write), finalize any affected
+ * cap_snap, and wake up any waiters.
+ */
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int last = 0, put = 0, flushsnaps = 0, wake = 0;
+ struct ceph_cap_snap *capsnap;
+
+ spin_lock(&inode->i_lock);
+ if (had & CEPH_CAP_PIN)
+ --ci->i_pin_ref;
+ if (had & CEPH_CAP_FILE_RD)
+ if (--ci->i_rd_ref == 0)
+ last++;
+ if (had & CEPH_CAP_FILE_CACHE)
+ if (--ci->i_rdcache_ref == 0)
+ last++;
+ if (had & CEPH_CAP_FILE_BUFFER) {
+ if (--ci->i_wrbuffer_ref == 0) {
+ last++;
+ put++;
+ }
+ dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
+ inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
+ }
+ if (had & CEPH_CAP_FILE_WR)
+ if (--ci->i_wr_ref == 0) {
+ last++;
+ if (!list_empty(&ci->i_cap_snaps)) {
+ capsnap = list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ if (capsnap->writing) {
+ capsnap->writing = 0;
+ flushsnaps =
+ __ceph_finish_cap_snap(ci,
+ capsnap);
+ wake = 1;
+ }
+ }
+ }
+ spin_unlock(&inode->i_lock);
+
+ dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
+ last ? "last" : "");
+
+ if (last && !flushsnaps)
+ ceph_check_caps(ci, 0, NULL);
+ else if (flushsnaps)
+ ceph_flush_snaps(ci);
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+ if (put)
+ iput(inode);
+}
+
+/*
+ * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
+ * context. Adjust per-snap dirty page accounting as appropriate.
+ * Once all dirty data for a cap_snap is flushed, flush snapped file
+ * metadata back to the MDS. If we dropped the last ref, call
+ * ceph_check_caps.
+ */
+void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+ struct ceph_snap_context *snapc)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int last = 0;
+ int last_snap = 0;
+ int found = 0;
+ struct ceph_cap_snap *capsnap = NULL;
+
+ spin_lock(&inode->i_lock);
+ ci->i_wrbuffer_ref -= nr;
+ last = !ci->i_wrbuffer_ref;
+
+ if (ci->i_head_snapc == snapc) {
+ ci->i_wrbuffer_ref_head -= nr;
+ if (!ci->i_wrbuffer_ref_head) {
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
+ inode,
+ ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ last ? " LAST" : "");
+ } else {
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->context == snapc) {
+ found = 1;
+ capsnap->dirty_pages -= nr;
+ last_snap = !capsnap->dirty_pages;
+ break;
+ }
+ }
+ BUG_ON(!found);
+ dout("put_wrbuffer_cap_refs on %p cap_snap %p "
+ " snap %lld %d/%d -> %d/%d %s%s\n",
+ inode, capsnap, capsnap->context->seq,
+ ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+ ci->i_wrbuffer_ref, capsnap->dirty_pages,
+ last ? " (wrbuffer last)" : "",
+ last_snap ? " (capsnap last)" : "");
+ }
+
+ spin_unlock(&inode->i_lock);
+
+ if (last) {
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ iput(inode);
+ } else if (last_snap) {
+ ceph_flush_snaps(ci);
+ wake_up(&ci->i_cap_wq);
+ }
+}
+
+/*
+ * Handle a cap GRANT message from the MDS. (Note that a GRANT may
+ * actually be a revocation if it specifies a smaller cap set.)
+ *
+ * caller holds s_mutex and i_lock, we drop both.
+ *
+ * The function returns void; the local check_caps flag selects the
+ * follow-up action:
+ * 0 - ok
+ * 1 - check_caps on auth cap only (writeback)
+ * 2 - check_caps (ack revoke)
+ */
+static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+ struct ceph_mds_session *session,
+ struct ceph_cap *cap,
+ struct ceph_buffer *xattr_buf)
+ __releases(inode->i_lock)
+ __releases(session->s_mutex)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int mds = session->s_mds;
+ int seq = le32_to_cpu(grant->seq);
+ int newcaps = le32_to_cpu(grant->caps);
+ int issued, implemented, used, wanted, dirty;
+ u64 size = le64_to_cpu(grant->size);
+ u64 max_size = le64_to_cpu(grant->max_size);
+ struct timespec mtime, atime, ctime;
+ int check_caps = 0;
+ int wake = 0;
+ int writeback = 0;
+ int revoked_rdcache = 0;
+ int queue_invalidate = 0;
+
+ dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+ inode, cap, mds, seq, ceph_cap_string(newcaps));
+ dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
+ inode->i_size);
+
+ /*
+ * If CACHE is being revoked, and we have no dirty buffers,
+ * try to invalidate (once). (If there are dirty buffers, we
+ * will invalidate _after_ writeback.)
+ */
+ if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+ !ci->i_wrbuffer_ref) {
+ if (try_nonblocking_invalidate(inode) == 0) {
+ revoked_rdcache = 1;
+ } else {
+ /* there were locked pages.. invalidate later
+ in a separate thread. */
+ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+ queue_invalidate = 1;
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
+ }
+ }
+ }
+
+ /* side effects now are allowed */
+
+ issued = __ceph_caps_issued(ci, &implemented);
+ issued |= implemented | __ceph_caps_dirty(ci);
+
+ cap->cap_gen = session->s_cap_gen;
+
+ __check_cap_issue(ci, cap, newcaps);
+
+ if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ inode->i_mode = le32_to_cpu(grant->mode);
+ inode->i_uid = le32_to_cpu(grant->uid);
+ inode->i_gid = le32_to_cpu(grant->gid);
+ dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+ inode->i_uid, inode->i_gid);
+ }
+
+ if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ inode->i_nlink = le32_to_cpu(grant->nlink);
+
+ if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
+ int len = le32_to_cpu(grant->xattr_len);
+ u64 version = le64_to_cpu(grant->xattr_version);
+
+ if (version > ci->i_xattrs.version) {
+ dout(" got new xattrs v%llu on %p len %d\n",
+ version, inode, len);
+ if (ci->i_xattrs.blob)
+ ceph_buffer_put(ci->i_xattrs.blob);
+ ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
+ ci->i_xattrs.version = version;
+ }
+ }
+
+ /* size/ctime/mtime/atime? */
+ ceph_fill_file_size(inode, issued,
+ le32_to_cpu(grant->truncate_seq),
+ le64_to_cpu(grant->truncate_size), size);
+ ceph_decode_timespec(&mtime, &grant->mtime);
+ ceph_decode_timespec(&atime, &grant->atime);
+ ceph_decode_timespec(&ctime, &grant->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
+ &atime);
+
+ /* max size increase? */
+ if (max_size != ci->i_max_size) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
+ ci->i_max_size = max_size;
+ if (max_size >= ci->i_wanted_max_size) {
+ ci->i_wanted_max_size = 0; /* reset */
+ ci->i_requested_max_size = 0;
+ }
+ wake = 1;
+ }
+
+ /* check cap bits */
+ wanted = __ceph_caps_wanted(ci);
+ used = __ceph_caps_used(ci);
+ dirty = __ceph_caps_dirty(ci);
+ dout(" my wanted = %s, used = %s, dirty %s\n",
+ ceph_cap_string(wanted),
+ ceph_cap_string(used),
+ ceph_cap_string(dirty));
+ if (wanted != le32_to_cpu(grant->wanted)) {
+ dout("mds wanted %s -> %s\n",
+ ceph_cap_string(le32_to_cpu(grant->wanted)),
+ ceph_cap_string(wanted));
+ grant->wanted = cpu_to_le32(wanted);
+ }
+
+ cap->seq = seq;
+
+ /* file layout may have changed */
+ ci->i_layout = grant->layout;
+
+ /* revocation, grant, or no-op? */
+ if (cap->issued & ~newcaps) {
+ dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
+ if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
+ writeback = 1; /* will delay ack */
+ else if (dirty & ~newcaps)
+ check_caps = 1; /* initiate writeback in check_caps */
+ else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
+ revoked_rdcache)
+ check_caps = 2; /* send revoke ack in check_caps */
+ cap->issued = newcaps;
+ cap->implemented |= newcaps;
+ } else if (cap->issued == newcaps) {
+ dout("caps unchanged: %s -> %s\n",
+ ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+ } else {
+ dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
+ cap->issued = newcaps;
+ cap->implemented |= newcaps; /* add bits only, to
+ * avoid stepping on a
+ * pending revocation */
+ wake = 1;
+ }
+ BUG_ON(cap->issued & ~cap->implemented);
+
+ spin_unlock(&inode->i_lock);
+ if (writeback)
+ /*
+ * queue inode for writeback: we can't actually call
+ * filemap_write_and_wait, etc. from message handler
+ * context.
+ */
+ ceph_queue_writeback(inode);
+ if (queue_invalidate)
+ ceph_queue_invalidate(inode);
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+
+ if (check_caps == 1)
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+ session);
+ else if (check_caps == 2)
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+ else
+ mutex_unlock(&session->s_mutex);
+}
+
+/*
+ * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
+ * MDS has been safely committed.
+ */
+static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
+ struct ceph_mds_caps *m,
+ struct ceph_mds_session *session,
+ struct ceph_cap *cap)
+ __releases(inode->i_lock)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ unsigned seq = le32_to_cpu(m->seq);
+ int dirty = le32_to_cpu(m->dirty);
+ int cleaned = 0;
+ int drop = 0;
+ int i;
+
+ for (i = 0; i < CEPH_CAP_BITS; i++)
+ if ((dirty & (1 << i)) &&
+ flush_tid == ci->i_cap_flush_tid[i])
+ cleaned |= 1 << i;
+
+ dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
+ " flushing %s -> %s\n",
+ inode, session->s_mds, seq, ceph_cap_string(dirty),
+ ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+
+ if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+ goto out;
+
+ ci->i_flushing_caps &= ~cleaned;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (ci->i_flushing_caps == 0) {
+ list_del_init(&ci->i_flushing_item);
+ if (!list_empty(&session->s_cap_flushing))
+ dout(" mds%d still flushing cap on %p\n",
+ session->s_mds,
+ &list_entry(session->s_cap_flushing.next,
+ struct ceph_inode_info,
+ i_flushing_item)->vfs_inode);
+ mdsc->num_cap_flushing--;
+ wake_up(&mdsc->cap_flushing_wq);
+ dout(" inode %p now !flushing\n", inode);
+
+ if (ci->i_dirty_caps == 0) {
+ dout(" inode %p now clean\n", inode);
+ BUG_ON(!list_empty(&ci->i_dirty_item));
+ drop = 1;
+ } else {
+ BUG_ON(list_empty(&ci->i_dirty_item));
+ }
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ wake_up(&ci->i_cap_wq);
+
+out:
+ spin_unlock(&inode->i_lock);
+ if (drop)
+ iput(inode);
+}
+
+/*
+ * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
+ * throw away our cap_snap.
+ *
+ * Caller holds s_mutex.
+ */
+static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
+ struct ceph_mds_caps *m,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 follows = le64_to_cpu(m->snap_follows);
+ struct ceph_cap_snap *capsnap;
+ int drop = 0;
+
+ dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
+ inode, ci, session->s_mds, follows);
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->follows == follows) {
+ if (capsnap->flush_tid != flush_tid) {
+ dout(" cap_snap %p follows %lld tid %lld !="
+ " %lld\n", capsnap, follows,
+ flush_tid, capsnap->flush_tid);
+ break;
+ }
+ WARN_ON(capsnap->dirty_pages || capsnap->writing);
+ dout(" removing cap_snap %p follows %lld\n",
+ capsnap, follows);
+ ceph_put_snap_context(capsnap->context);
+ list_del(&capsnap->ci_item);
+ list_del(&capsnap->flushing_item);
+ ceph_put_cap_snap(capsnap);
+ drop = 1;
+ break;
+ } else {
+ dout(" skipping cap_snap %p follows %lld\n",
+ capsnap, capsnap->follows);
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ if (drop)
+ iput(inode);
+}
+
+/*
+ * Handle TRUNC from MDS, indicating file truncation.
+ *
+ * caller holds s_mutex.
+ */
+static void handle_cap_trunc(struct inode *inode,
+ struct ceph_mds_caps *trunc,
+ struct ceph_mds_session *session)
+ __releases(inode->i_lock)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int mds = session->s_mds;
+ int seq = le32_to_cpu(trunc->seq);
+ u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
+ u64 truncate_size = le64_to_cpu(trunc->truncate_size);
+ u64 size = le64_to_cpu(trunc->size);
+ int implemented = 0;
+ int dirty = __ceph_caps_dirty(ci);
+ int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
+ int queue_trunc = 0;
+
+ issued |= implemented | dirty;
+
+ dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
+ inode, mds, seq, truncate_size, truncate_seq);
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ truncate_seq, truncate_size, size);
+ spin_unlock(&inode->i_lock);
+
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+}
+
+/*
+ * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
+ * different one. If we are the most recent migration we've seen (as
+ * indicated by mseq), make note of the migrating cap bits for the
+ * duration (until we see the corresponding IMPORT).
+ *
+ * caller holds s_mutex
+ */
+static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int mds = session->s_mds;
+ unsigned mseq = le32_to_cpu(ex->migrate_seq);
+ struct ceph_cap *cap = NULL, *t;
+ struct rb_node *p;
+ int remember = 1;
+
+ dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
+ inode, ci, mds, mseq);
+
+ spin_lock(&inode->i_lock);
+
+ /* make sure we haven't seen a higher mseq */
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ t = rb_entry(p, struct ceph_cap, ci_node);
+ if (ceph_seq_cmp(t->mseq, mseq) > 0) {
+ dout(" higher mseq on cap from mds%d\n",
+ t->session->s_mds);
+ remember = 0;
+ }
+ if (t->session->s_mds == mds)
+ cap = t;
+ }
+
+ if (cap) {
+ if (remember) {
+ /* make note */
+ ci->i_cap_exporting_mds = mds;
+ ci->i_cap_exporting_mseq = mseq;
+ ci->i_cap_exporting_issued = cap->issued;
+ }
+ __ceph_remove_cap(cap);
+ }
+ /* else, we already released it */
+
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Handle cap IMPORT. If there are temp bits from an older EXPORT,
+ * clean them up.
+ *
+ * caller holds s_mutex.
+ */
+static void handle_cap_import(struct ceph_mds_client *mdsc,
+ struct inode *inode, struct ceph_mds_caps *im,
+ struct ceph_mds_session *session,
+ void *snaptrace, int snaptrace_len)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int mds = session->s_mds;
+ unsigned issued = le32_to_cpu(im->caps);
+ unsigned wanted = le32_to_cpu(im->wanted);
+ unsigned seq = le32_to_cpu(im->seq);
+ unsigned mseq = le32_to_cpu(im->migrate_seq);
+ u64 realmino = le64_to_cpu(im->realm);
+ u64 cap_id = le64_to_cpu(im->cap_id);
+
+ if (ci->i_cap_exporting_mds >= 0 &&
+ ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
+ dout("handle_cap_import inode %p ci %p mds%d mseq %d"
+ " - cleared exporting from mds%d\n",
+ inode, ci, mds, mseq,
+ ci->i_cap_exporting_mds);
+ ci->i_cap_exporting_issued = 0;
+ ci->i_cap_exporting_mseq = 0;
+ ci->i_cap_exporting_mds = -1;
+ } else {
+ dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
+ inode, ci, mds, mseq);
+ }
+
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
+ false);
+ downgrade_write(&mdsc->snap_rwsem);
+ ceph_add_cap(inode, session, cap_id, -1,
+ issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
+ NULL /* no caps context */);
+ try_flush_caps(inode, session, NULL);
+ up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Handle a caps message from the MDS.
+ *
+ * Identify the appropriate session, inode, and call the right handler
+ * based on the cap op.
+ */
+void ceph_handle_caps(struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct super_block *sb = mdsc->client->sb;
+ struct inode *inode;
+ struct ceph_cap *cap;
+ struct ceph_mds_caps *h;
+ int mds = session->s_mds;
+ int op;
+ u32 seq;
+ struct ceph_vino vino;
+ u64 cap_id;
+ u64 size, max_size;
+ u64 tid;
+ void *snaptrace;
+
+ dout("handle_caps from mds%d\n", mds);
+
+ /* decode */
+ tid = le64_to_cpu(msg->hdr.tid);
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ h = msg->front.iov_base;
+ snaptrace = h + 1;
+ op = le32_to_cpu(h->op);
+ vino.ino = le64_to_cpu(h->ino);
+ vino.snap = CEPH_NOSNAP;
+ cap_id = le64_to_cpu(h->cap_id);
+ seq = le32_to_cpu(h->seq);
+ size = le64_to_cpu(h->size);
+ max_size = le64_to_cpu(h->max_size);
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+ dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
+ (unsigned)seq);
+
+ /* lookup ino */
+ inode = ceph_find_inode(sb, vino);
+ dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
+ vino.snap, inode);
+ if (!inode) {
+ dout(" i don't have ino %llx\n", vino.ino);
+ goto done;
+ }
+
+ /* these will work even if we don't have a cap yet */
+ switch (op) {
+ case CEPH_CAP_OP_FLUSHSNAP_ACK:
+ handle_cap_flushsnap_ack(inode, tid, h, session);
+ goto done;
+
+ case CEPH_CAP_OP_EXPORT:
+ handle_cap_export(inode, h, session);
+ goto done;
+
+ case CEPH_CAP_OP_IMPORT:
+ handle_cap_import(mdsc, inode, h, session,
+ snaptrace, le32_to_cpu(h->snap_trace_len));
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
+ session);
+ goto done_unlocked;
+ }
+
+ /* the rest require a cap */
+ spin_lock(&inode->i_lock);
+ cap = __get_cap_for_mds(ceph_inode(inode), mds);
+ if (!cap) {
+ dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
+ inode, ceph_ino(inode), ceph_snap(inode), mds);
+ spin_unlock(&inode->i_lock);
+ goto done;
+ }
+
+ /* note that each of these drops i_lock for us */
+ switch (op) {
+ case CEPH_CAP_OP_REVOKE:
+ case CEPH_CAP_OP_GRANT:
+ handle_cap_grant(inode, h, session, cap, msg->middle);
+ goto done_unlocked;
+
+ case CEPH_CAP_OP_FLUSH_ACK:
+ handle_cap_flush_ack(inode, tid, h, session, cap);
+ break;
+
+ case CEPH_CAP_OP_TRUNC:
+ handle_cap_trunc(inode, h, session);
+ break;
+
+ default:
+ spin_unlock(&inode->i_lock);
+ pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
+ ceph_cap_op_name(op));
+ }
+
+done:
+ mutex_unlock(&session->s_mutex);
+done_unlocked:
+ if (inode)
+ iput(inode);
+ return;
+
+bad:
+ pr_err("ceph_handle_caps: corrupt message\n");
+ ceph_msg_dump(msg);
+ return;
+}
+
+/*
+ * Delayed work handler: process the end of the delayed cap release
+ * LRU list.
+ */
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci;
+ int flags = CHECK_CAPS_NODELAY;
+
+ dout("check_delayed_caps\n");
+ while (1) {
+ spin_lock(&mdsc->cap_delay_lock);
+ if (list_empty(&mdsc->cap_delay_list))
+ break;
+ ci = list_first_entry(&mdsc->cap_delay_list,
+ struct ceph_inode_info,
+ i_cap_delay_list);
+ if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_max))
+ break;
+ list_del_init(&ci->i_cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+ dout("check_delayed_caps on %p\n", &ci->vfs_inode);
+ ceph_check_caps(ci, flags, NULL);
+ }
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Flush all dirty caps to the mds
+ */
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci, *nci = NULL;
+ struct inode *inode, *ninode = NULL;
+ struct list_head *p, *n;
+
+ dout("flush_dirty_caps\n");
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_for_each_safe(p, n, &mdsc->cap_dirty) {
+ if (nci) {
+ ci = nci;
+ inode = ninode;
+ ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
+ dout("flush_dirty_caps inode %p (was next inode)\n",
+ inode);
+ } else {
+ ci = list_entry(p, struct ceph_inode_info,
+ i_dirty_item);
+ inode = igrab(&ci->vfs_inode);
+ BUG_ON(!inode);
+ dout("flush_dirty_caps inode %p\n", inode);
+ }
+ if (n != &mdsc->cap_dirty) {
+ nci = list_entry(n, struct ceph_inode_info,
+ i_dirty_item);
+ ninode = igrab(&nci->vfs_inode);
+ BUG_ON(!ninode);
+ nci->i_ceph_flags |= CEPH_I_NOFLUSH;
+ dout("flush_dirty_caps next inode %p, noflush\n",
+ ninode);
+ } else {
+ nci = NULL;
+ ninode = NULL;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ if (inode) {
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
+ NULL);
+ iput(inode);
+ }
+ spin_lock(&mdsc->cap_dirty_lock);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+/*
+ * Drop open file reference. If we were the last open file,
+ * we may need to release capabilities to the MDS (or schedule
+ * their delayed release).
+ */
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int last = 0;
+
+ spin_lock(&inode->i_lock);
+ dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
+ ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
+ BUG_ON(ci->i_nr_by_mode[fmode] == 0);
+ if (--ci->i_nr_by_mode[fmode] == 0)
+ last++;
+ spin_unlock(&inode->i_lock);
+
+ if (last && ci->i_vino.snap == CEPH_NOSNAP)
+ ceph_check_caps(ci, 0, NULL);
+}
+
+/*
+ * Helpers for embedding cap and dentry lease releases into mds
+ * requests.
+ *
+ * @force is used by dentry_release (below) to force inclusion of a
+ * record for the directory inode, even when there aren't any caps to
+ * drop.
+ */
+int ceph_encode_inode_release(void **p, struct inode *inode,
+ int mds, int drop, int unless, int force)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *cap;
+ struct ceph_mds_request_release *rel = *p;
+ int ret = 0;
+ int used = 0;
+
+ spin_lock(&inode->i_lock);
+ used = __ceph_caps_used(ci);
+
+ dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
+ mds, ceph_cap_string(used), ceph_cap_string(drop),
+ ceph_cap_string(unless));
+
+ /* only drop unused caps */
+ drop &= ~used;
+
+ cap = __get_cap_for_mds(ci, mds);
+ if (cap && __cap_is_valid(cap)) {
+ if (force ||
+ ((cap->issued & drop) &&
+ (cap->issued & unless) == 0)) {
+ if ((cap->issued & drop) &&
+ (cap->issued & unless) == 0) {
+ dout("encode_inode_release %p cap %p %s -> "
+ "%s\n", inode, cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & ~drop));
+ cap->issued &= ~drop;
+ cap->implemented &= ~drop;
+ if (ci->i_ceph_flags & CEPH_I_NODELAY) {
+ int wanted = __ceph_caps_wanted(ci);
+ dout(" wanted %s -> %s (act %s)\n",
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(cap->mds_wanted &
+ ~wanted),
+ ceph_cap_string(wanted));
+ cap->mds_wanted &= wanted;
+ }
+ } else {
+ dout("encode_inode_release %p cap %p %s"
+ " (force)\n", inode, cap,
+ ceph_cap_string(cap->issued));
+ }
+
+ rel->ino = cpu_to_le64(ceph_ino(inode));
+ rel->cap_id = cpu_to_le64(cap->cap_id);
+ rel->seq = cpu_to_le32(cap->seq);
+ rel->issue_seq = cpu_to_le32(cap->issue_seq);
+ rel->mseq = cpu_to_le32(cap->mseq);
+ rel->caps = cpu_to_le32(cap->issued);
+ rel->wanted = cpu_to_le32(cap->mds_wanted);
+ rel->dname_len = 0;
+ rel->dname_seq = 0;
+ *p += sizeof(*rel);
+ ret = 1;
+ } else {
+ dout("encode_inode_release %p cap %p %s\n",
+ inode, cap, ceph_cap_string(cap->issued));
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
+
+int ceph_encode_dentry_release(void **p, struct dentry *dentry,
+ int mds, int drop, int unless)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct ceph_mds_request_release *rel = *p;
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int force = 0;
+ int ret;
+
+ /*
+ * force a record for the directory caps if we have a dentry lease.
+ * this is racy (can't take i_lock and d_lock together), but it
+ * doesn't have to be perfect; the mds will revoke anything we don't
+ * release.
+ */
+ spin_lock(&dentry->d_lock);
+ if (di->lease_session && di->lease_session->s_mds == mds)
+ force = 1;
+ spin_unlock(&dentry->d_lock);
+
+ ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+
+ spin_lock(&dentry->d_lock);
+ if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+ dout("encode_dentry_release %p mds%d seq %d\n",
+ dentry, mds, (int)di->lease_seq);
+ rel->dname_len = cpu_to_le32(dentry->d_name.len);
+ memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+ *p += dentry->d_name.len;
+ rel->dname_seq = cpu_to_le32(di->lease_seq);
+ }
+ spin_unlock(&dentry->d_lock);
+ return ret;
+}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 00000000000..1818c230561
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
+#ifndef _FS_CEPH_DEBUG_H
+#define _FS_CEPH_DEBUG_H
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
+
+/*
+ * wrap pr_debug to include a filename:lineno prefix on each line.
+ * this incurs some overhead (kernel size and execution time) due to
+ * the extra function call at each call site.
+ */
+
+# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+extern const char *ceph_file_part(const char *s, int len);
+# define dout(fmt, ...) \
+ pr_debug(" %12.12s:%-4d : " fmt, \
+ ceph_file_part(__FILE__, sizeof(__FILE__)), \
+ __LINE__, ##__VA_ARGS__)
+# else
+/* faux printk call just to see any compiler warnings. */
+# define dout(fmt, ...) do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+ } while (0)
+# endif
+
+#else
+
+/*
+ * or, just wrap pr_debug
+ */
+# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
+
+#endif
+
+#endif
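[Editor's note: either way, call sites use dout() exactly like printk, minus the level prefix — for example, from the caps code above:

        dout("handle_caps from mds%d\n", mds);
]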
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 00000000000..ab6cf35c409
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
+/*
+ * Ceph 'frag' type
+ */
+#include "types.h"
+
+int ceph_frag_compare(__u32 a, __u32 b)
+{
+ unsigned va = ceph_frag_value(a);
+ unsigned vb = ceph_frag_value(b);
+ if (va < vb)
+ return -1;
+ if (va > vb)
+ return 1;
+ va = ceph_frag_bits(a);
+ vb = ceph_frag_bits(b);
+ if (va < vb)
+ return -1;
+ if (va > vb)
+ return 1;
+ return 0;
+}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 00000000000..793f50cb7c2
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef _FS_CEPH_FRAG_H
+#define _FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask. Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ * 8 upper bits = "bits"
+ * 24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value. This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically. However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+ return (b << 24) |
+ (v & (0xffffffu << (24-b)) & 0xffffffu);
+}
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+ return f >> 24;
+}
+static inline __u32 ceph_frag_value(__u32 f)
+{
+ return f & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+ return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+ return 24 - ceph_frag_bits(f);
+}
+
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+ return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+ /* is sub as specific as us, and contained by us? */
+ return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
+ (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
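[Editor's note: a worked example of the encoding — a standalone userspace sketch, assuming the inline helpers above have been copied out of this header and __u32 typedef'd to unsigned int. A frag with bits=1 and the top value bit set covers the upper half of the 24-bit space:

        #include <assert.h>

        int main(void)
        {
                __u32 f = ceph_frag_make(1, 0x800000);  /* f == 0x01800000 */

                assert(ceph_frag_bits(f) == 1);
                assert(ceph_frag_value(f) == 0x800000);
                assert(ceph_frag_contains_value(f, 0xc00000));  /* upper half */
                assert(!ceph_frag_contains_value(f, 0x400000)); /* lower half */
                return 0;
        }
]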
+
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f) - 1,
+ ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
+}
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+ return ceph_frag_bits(f) > 0 &&
+ (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+ return ceph_frag_bits(f) > 0 &&
+ (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
+}
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f),
+ ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
+}
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
+}
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f)+1,
+ ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
+}
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+ int newbits = ceph_frag_bits(f) + by;
+ return ceph_frag_make(newbits,
+ ceph_frag_value(f) | (i << (24 - newbits)));
+}
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+ return ceph_frag_value(f) == 0;
+}
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+ return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+static inline __u32 ceph_frag_next(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f),
+ ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 00000000000..79d76bc4303
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include "types.h"
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+ __u32 su = le32_to_cpu(layout->fl_stripe_unit);
+ __u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ __u32 os = le32_to_cpu(layout->fl_object_size);
+
+ /* stripe unit, object size must be non-zero, 64k increment */
+ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ /* object size must be a multiple of stripe unit */
+ if (os < su || os % su)
+ return 0;
+ /* stripe count must be non-zero */
+ if (!sc)
+ return 0;
+ return 1;
+}
+
+
+int ceph_flags_to_mode(int flags)
+{
+#ifdef O_DIRECTORY /* fixme */
+ if ((flags & O_DIRECTORY) == O_DIRECTORY)
+ return CEPH_FILE_MODE_PIN;
+#endif
+#ifdef O_LAZY
+ if (flags & O_LAZY)
+ return CEPH_FILE_MODE_LAZY;
+#endif
+ if ((flags & O_APPEND) == O_APPEND)
+ flags |= O_WRONLY;
+
+ flags &= O_ACCMODE;
+ if ((flags & O_RDWR) == O_RDWR)
+ return CEPH_FILE_MODE_RDWR;
+ if ((flags & O_WRONLY) == O_WRONLY)
+ return CEPH_FILE_MODE_WR;
+ return CEPH_FILE_MODE_RD;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+ switch (mode) {
+ case CEPH_FILE_MODE_PIN:
+ return CEPH_CAP_PIN;
+ case CEPH_FILE_MODE_RD:
+ return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+ case CEPH_FILE_MODE_RDWR:
+ return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
+ CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+ case CEPH_FILE_MODE_WR:
+ return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+ }
+ return 0;
+}
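[Editor's note: for example — illustrative only, not from this patch — a read-only open resolves to CEPH_FILE_MODE_RD, which in turn wants the pin/shared/read/cache file caps:

        int mode = ceph_flags_to_mode(O_RDONLY); /* CEPH_FILE_MODE_RD */
        int caps = ceph_caps_for_mode(mode);     /* PIN|FILE_SHARED|FILE_RD|FILE_CACHE */
]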
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 00000000000..0c2241ef365
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef _FS_CEPH_CEPH_FS_H
+#define _FS_CEPH_CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * Ceph release version
+ */
+#define CEPH_VERSION_MAJOR 0
+#define CEPH_VERSION_MINOR 19
+#define CEPH_VERSION_PATCH 0
+
+#define _CEPH_STRINGIFY(x) #x
+#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
+#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
+ "." CEPH_STRINGIFY(z)
+#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
+ CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
+
+/*
+ * subprotocol versions. when specific message types or high-level
+ * protocols change, bump the affected components. we rev internal
+ * cluster protocols separately from the public, client-facing
+ * protocol.
+ */
+#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
+#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
+#define CEPH_MON_PROTOCOL 5 /* cluster internal */
+#define CEPH_OSDC_PROTOCOL 24 /* server/client */
+#define CEPH_MDSC_PROTOCOL 32 /* server/client */
+#define CEPH_MONC_PROTOCOL 15 /* server/client */
+
+
+#define CEPH_INO_ROOT 1
+#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON 31
+
+
+/*
+ * feature bits
+ */
+#define CEPH_FEATURE_SUPPORTED 0
+#define CEPH_FEATURE_REQUIRED 0
+
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ __le32 fl_stripe_count; /* over this many objects */
+ __le32 fl_object_size; /* until objects are this big, then move to
+ new objects */
+ __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ __le32 fl_object_stripe_unit; /* for per-object parity, if any */
+
+ /* object -> pg layout */
+ __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+ __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
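[Editor's note: to make the three file-mapping fields concrete, here is a sketch of the usual Ceph striping arithmetic, inferred from the layout's semantics (the kernel's actual mapping code lives elsewhere): su-byte stripe units round-robin across sc objects, and each object holds os bytes before the next object set begins.

        static unsigned object_for_offset(__u64 off, __u32 su, __u32 sc, __u32 os)
        {
                __u64 blockno = off / su;               /* which stripe unit */
                __u64 stripeno = blockno / sc;          /* which stripe (row) */
                unsigned stripepos = blockno % sc;      /* object within the set */
                __u64 objectsetno = stripeno / (os / su);

                return objectsetno * sc + stripepos;
        }
]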
+
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES 0x1
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN 0x0
+#define CEPH_AUTH_NONE 0x1
+#define CEPH_AUTH_CEPHX 0x2
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN 1
+#define CEPH_MSG_PING 2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP 4
+#define CEPH_MSG_MON_GET_MAP 5
+#define CEPH_MSG_STATFS 13
+#define CEPH_MSG_STATFS_REPLY 14
+#define CEPH_MSG_MON_SUBSCRIBE 15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
+#define CEPH_MSG_AUTH 17
+#define CEPH_MSG_AUTH_REPLY 18
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP 21
+
+#define CEPH_MSG_CLIENT_SESSION 22
+#define CEPH_MSG_CLIENT_RECONNECT 23
+
+#define CEPH_MSG_CLIENT_REQUEST 24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_CAPS 0x310
+#define CEPH_MSG_CLIENT_LEASE 0x311
+#define CEPH_MSG_CLIENT_SNAP 0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
+
+/* osd */
+#define CEPH_MSG_OSD_MAP 41
+#define CEPH_MSG_OSD_OP 42
+#define CEPH_MSG_OSD_OPREPLY 43
+
+struct ceph_mon_request_header {
+ __le64 have_version;
+ __le16 session_mon;
+ __le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+ __le64 kb, kb_used, kb_avail;
+ __le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+ struct ceph_fsid fsid;
+ __le64 version;
+ struct ceph_statfs st;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+ struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_item {
+ __le64 have_version; __le64 have;
+ __u8 onetime;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+ __le32 duration; /* seconds */
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mds states
+ * > 0 -> in
+ * <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
+ empty log. */
+#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+
+#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
+ operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ * - these are bitmasks.. we can compose them
+ * - they also define the lock ordering by the MDS
+ * - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DN 1
+#define CEPH_LOCK_ISNAP 2
+#define CEPH_LOCK_IVERSION 4 /* mds internal */
+#define CEPH_LOCK_IFILE 8 /* mds internal */
+#define CEPH_LOCK_IAUTH 32
+#define CEPH_LOCK_ILINK 64
+#define CEPH_LOCK_IDFT 128 /* dir frag tree */
+#define CEPH_LOCK_INEST 256 /* mds internal */
+#define CEPH_LOCK_IXATTR 512
+#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
+
+/* client_session ops */
+enum {
+ CEPH_SESSION_REQUEST_OPEN,
+ CEPH_SESSION_OPEN,
+ CEPH_SESSION_REQUEST_CLOSE,
+ CEPH_SESSION_CLOSE,
+ CEPH_SESSION_REQUEST_RENEWCAPS,
+ CEPH_SESSION_RENEWCAPS,
+ CEPH_SESSION_STALE,
+ CEPH_SESSION_RECALL_STATE,
+};
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+ __le32 op;
+ __le64 seq;
+ struct ceph_timespec stamp;
+ __le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ * & 0x001000 -> write op
+ * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ * & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE 0x001000
+enum {
+ CEPH_MDS_OP_LOOKUP = 0x00100,
+ CEPH_MDS_OP_GETATTR = 0x00101,
+ CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+ CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+
+ CEPH_MDS_OP_SETXATTR = 0x01105,
+ CEPH_MDS_OP_RMXATTR = 0x01106,
+ CEPH_MDS_OP_SETLAYOUT = 0x01107,
+ CEPH_MDS_OP_SETATTR = 0x01108,
+
+ CEPH_MDS_OP_MKNOD = 0x01201,
+ CEPH_MDS_OP_LINK = 0x01202,
+ CEPH_MDS_OP_UNLINK = 0x01203,
+ CEPH_MDS_OP_RENAME = 0x01204,
+ CEPH_MDS_OP_MKDIR = 0x01220,
+ CEPH_MDS_OP_RMDIR = 0x01221,
+ CEPH_MDS_OP_SYMLINK = 0x01222,
+
+ CEPH_MDS_OP_CREATE = 0x01301,
+ CEPH_MDS_OP_OPEN = 0x00302,
+ CEPH_MDS_OP_READDIR = 0x00305,
+
+ CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+ CEPH_MDS_OP_MKSNAP = 0x01400,
+ CEPH_MDS_OP_RMSNAP = 0x01401,
+ CEPH_MDS_OP_LSSNAP = 0x00402,
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+
+#define CEPH_SETATTR_MODE 1
+#define CEPH_SETATTR_UID 2
+#define CEPH_SETATTR_GID 4
+#define CEPH_SETATTR_MTIME 8
+#define CEPH_SETATTR_ATIME 16
+#define CEPH_SETATTR_SIZE 32
+#define CEPH_SETATTR_CTIME 64
+
+union ceph_mds_request_args {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 file_replication;
+ __le32 preferred;
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout layout;
+ } __attribute__ ((packed)) setlayout;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+
+struct ceph_mds_request_head {
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+ __le64 ino, cap_id; /* ino and unique cap id */
+ __le32 caps, wanted; /* new issued, wanted */
+ __le32 seq, issue_seq, mseq;
+ __le32 dname_seq; /* if releasing a dentry lease, a */
+ __le32 dname_len; /* string follows. */
+} __attribute__ ((packed));
+
+/* client reply */
+struct ceph_mds_reply_head {
+ __le32 op;
+ __le32 result;
+ __le32 mdsmap_epoch;
+ __u8 safe; /* true if committed to disk */
+ __u8 is_dentry, is_target; /* true if dentry, target inode records
+ are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+ __le32 frag; /* this frag splits... */
+ __le32 by; /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+ __le32 nsplits; /* num ceph_frag_tree_split records */
+ struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+ __le32 caps, wanted; /* caps issued, wanted */
+ __le64 cap_id;
+ __le32 seq, mseq;
+ __le64 realm; /* snap realm */
+ __u8 flags; /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
+
+/* inode record, for bundling with mds reply */
+struct ceph_mds_reply_inode {
+ __le64 ino;
+ __le64 snapid;
+ __le32 rdev;
+ __le64 version; /* inode version */
+ __le64 xattr_version; /* version for xattr blob */
+ struct ceph_mds_reply_cap cap; /* caps issued for this inode */
+ struct ceph_file_layout layout;
+ struct ceph_timespec ctime, mtime, atime;
+ __le32 time_warp_seq;
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ __le32 mode, uid, gid;
+ __le32 nlink;
+ __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
+ struct ceph_timespec rctime;
+ struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
+} __attribute__ ((packed));
+/* followed by frag array, then symlink string, then xattr blob */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+ __le16 mask; /* lease type(s) */
+ __le32 duration_ms; /* lease duration */
+ __le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+ __le32 frag; /* fragment */
+ __le32 auth; /* auth mds, if this is a delegation point */
+ __le32 ndist; /* number of mds' this is replicated on */
+ __le32 dist[];
+} __attribute__ ((packed));
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN 0
+#define CEPH_FILE_MODE_RD 1
+#define CEPH_FILE_MODE_WR 2
+#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
+#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
+#define CEPH_FILE_MODE_NUM 8 /* because these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+
+/* capability bits */
+#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+#define CEPH_CAP_GSHARED 1 /* client can read */
+#define CEPH_CAP_GEXCL 2 /* client can read and update */
+#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
+#define CEPH_CAP_GRD 8 /* (file) client can read */
+#define CEPH_CAP_GWR 16 /* (file) client can write */
+#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH 2
+#define CEPH_CAP_SLINK 4
+#define CEPH_CAP_SXATTR 6
+#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
+
+#define CEPH_CAP_BITS 16
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
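[Editor's note: each composed value is just a generic bit shifted into its lock's range, e.g.:

        CEPH_CAP_AUTH_SHARED == (1 << 2)  /* 0x0004 */
        CEPH_CAP_FILE_RD     == (8 << 8)  /* 0x0800 */
        CEPH_CAP_FILE_WR     == (16 << 8) /* 0x1000 */
]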
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
+#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
+ CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_XATTR_SHARED)
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_XATTR_SHARED | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
+ CEPH_CAP_LINK_EXCL | \
+ CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+ CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+ CEPH_CAP_OP_GRANT, /* mds->client grant */
+ CEPH_CAP_OP_REVOKE, /* mds->client revoke */
+ CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
+ CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
+ CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
+ CEPH_CAP_OP_UPDATE, /* client->mds update */
+ CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
+ CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
+ CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
+ CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
+ CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+ CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
+ CEPH_CAP_OP_RENEW, /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps {
+ __le32 op; /* CEPH_CAP_OP_* */
+ __le64 ino, realm;
+ __le64 cap_id;
+ __le32 seq, issue_seq;
+ __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+ __le32 migrate_seq;
+ __le64 snap_follows;
+ __le32 snap_trace_len;
+
+ /* authlock */
+ __le32 uid, gid, mode;
+
+ /* linklock */
+ __le32 nlink;
+
+ /* xattrlock */
+ __le32 xattr_len;
+ __le64 xattr_version;
+
+ /* filelock */
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ struct ceph_timespec mtime, atime, ctime;
+ struct ceph_file_layout layout;
+ __le32 time_warp_seq;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+ __le32 num; /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+ __le64 ino;
+ __le64 cap_id;
+ __le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
+#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
+#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
+#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+ __u8 action; /* CEPH_MDS_LEASE_* */
+ __le16 mask; /* which lease */
+ __le64 ino;
+ __le64 first, last; /* snap range */
+ __le32 seq;
+ __le32 duration_ms; /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 size;
+ struct ceph_timespec mtime, atime;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+} __attribute__ ((packed));
+/* followed by encoded string */
+
+struct ceph_mds_snaprealm_reconnect {
+ __le64 ino; /* snap realm base */
+ __le64 seq; /* snap seq for this snap realm */
+ __le64 parent; /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+ CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
+ CEPH_SNAP_OP_CREATE,
+ CEPH_SNAP_OP_DESTROY,
+ CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+ __le32 op; /* CEPH_SNAP_OP_* */
+ __le64 split; /* ino to split off, if any */
+ __le32 num_split_inos; /* # inos belonging to new child realm */
+ __le32 num_split_realms; /* # child realms under new child realm */
+ __le32 trace_len; /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+ __le64 ino; /* ino */
+ __le64 created; /* snap: when created */
+ __le64 parent; /* ino: parent realm */
+ __le64 parent_since; /* snap: same parent since */
+ __le64 seq; /* snap: version */
+ __le32 num_snaps;
+ __le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 00000000000..bd570015d14
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
+
+#include "types.h"
+
+/*
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c) \
+ do { \
+ a = a - b; a = a - c; a = a ^ (c >> 13); \
+ b = b - c; b = b - a; b = b ^ (a << 8); \
+ c = c - a; c = c - b; c = c ^ (b >> 13); \
+ a = a - b; a = a - c; a = a ^ (c >> 12); \
+ b = b - c; b = b - a; b = b ^ (a << 16); \
+ c = c - a; c = c - b; c = c ^ (b >> 5); \
+ a = a - b; a = a - c; a = a ^ (c >> 3); \
+ b = b - c; b = b - a; b = b ^ (a << 10); \
+ c = c - a; c = c - b; c = c ^ (b >> 15); \
+ } while (0)
+
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+ const unsigned char *k = (const unsigned char *)str;
+ __u32 a, b, c; /* the internal state */
+ __u32 len; /* how many key bytes still need mixing */
+
+ /* Set up the internal state */
+ len = length;
+ a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
+ b = a;
+ c = 0; /* variable initialization of internal state */
+
+ /* handle most of the key */
+ while (len >= 12) {
+ a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+ ((__u32)k[3] << 24));
+ b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+ ((__u32)k[7] << 24));
+ c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+ ((__u32)k[11] << 24));
+ mix(a, b, c);
+ k = k + 12;
+ len = len - 12;
+ }
+
+ /* handle the last 11 bytes */
+ c = c + length;
+ switch (len) { /* all the case statements fall through */
+ case 11:
+ c = c + ((__u32)k[10] << 24);
+ case 10:
+ c = c + ((__u32)k[9] << 16);
+ case 9:
+ c = c + ((__u32)k[8] << 8);
+ /* the first byte of c is reserved for the length */
+ case 8:
+ b = b + ((__u32)k[7] << 24);
+ case 7:
+ b = b + ((__u32)k[6] << 16);
+ case 6:
+ b = b + ((__u32)k[5] << 8);
+ case 5:
+ b = b + k[4];
+ case 4:
+ a = a + ((__u32)k[3] << 24);
+ case 3:
+ a = a + ((__u32)k[2] << 16);
+ case 2:
+ a = a + ((__u32)k[1] << 8);
+ case 1:
+ a = a + k[0];
+ /* case 0: nothing left to add */
+ }
+ mix(a, b, c);
+
+ return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+ unsigned long hash = 0;
+ unsigned char c;
+
+ while (length--) {
+ c = *str++;
+ hash = (hash + (c << 4) + (c >> 4)) * 11;
+ }
+ return hash;
+}
+
+
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return ceph_str_hash_linux(s, len);
+ case CEPH_STR_HASH_RJENKINS:
+ return ceph_str_hash_rjenkins(s, len);
+ default:
+ return -1;
+ }
+}
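[Editor's note: a hypothetical caller picks whichever hash type the directory is configured with and feeds it the name bytes, e.g.:

        unsigned h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, "foo", 3);
]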
+
+const char *ceph_str_hash_name(int type)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return "linux";
+ case CEPH_STR_HASH_RJENKINS:
+ return "rjenkins";
+ default:
+ return "unknown";
+ }
+}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 00000000000..5ac470c433c
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
+#ifndef _FS_CEPH_HASH_H
+#define _FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+
+#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 00000000000..8e4be6a80c6
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
+/*
+ * Ceph string constants
+ */
+#include "types.h"
+
+const char *ceph_entity_type_name(int type)
+{
+ switch (type) {
+ case CEPH_ENTITY_TYPE_MDS: return "mds";
+ case CEPH_ENTITY_TYPE_OSD: return "osd";
+ case CEPH_ENTITY_TYPE_MON: return "mon";
+ case CEPH_ENTITY_TYPE_CLIENT: return "client";
+ case CEPH_ENTITY_TYPE_ADMIN: return "admin";
+ case CEPH_ENTITY_TYPE_AUTH: return "auth";
+ default: return "unknown";
+ }
+}
+
+const char *ceph_osd_op_name(int op)
+{
+ switch (op) {
+ case CEPH_OSD_OP_READ: return "read";
+ case CEPH_OSD_OP_STAT: return "stat";
+
+ case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+ case CEPH_OSD_OP_WRITE: return "write";
+ case CEPH_OSD_OP_DELETE: return "delete";
+ case CEPH_OSD_OP_TRUNCATE: return "truncate";
+ case CEPH_OSD_OP_ZERO: return "zero";
+ case CEPH_OSD_OP_WRITEFULL: return "writefull";
+
+ case CEPH_OSD_OP_APPEND: return "append";
+ case CEPH_OSD_OP_STARTSYNC: return "startsync";
+ case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+ case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+ case CEPH_OSD_OP_TMAPUP: return "tmapup";
+ case CEPH_OSD_OP_TMAPGET: return "tmapget";
+ case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+ case CEPH_OSD_OP_GETXATTR: return "getxattr";
+ case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+ case CEPH_OSD_OP_SETXATTR: return "setxattr";
+ case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+ case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+ case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+
+ case CEPH_OSD_OP_PULL: return "pull";
+ case CEPH_OSD_OP_PUSH: return "push";
+ case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+ case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+ case CEPH_OSD_OP_SCRUB: return "scrub";
+
+ case CEPH_OSD_OP_WRLOCK: return "wrlock";
+ case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+ case CEPH_OSD_OP_RDLOCK: return "rdlock";
+ case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+ case CEPH_OSD_OP_UPLOCK: return "uplock";
+ case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+ case CEPH_OSD_OP_CALL: return "call";
+
+ case CEPH_OSD_OP_PGLS: return "pgls";
+ }
+ return "???";
+}
+
+const char *ceph_mds_state_name(int s)
+{
+ switch (s) {
+ /* down and out */
+ case CEPH_MDS_STATE_DNE: return "down:dne";
+ case CEPH_MDS_STATE_STOPPED: return "down:stopped";
+ /* up and out */
+ case CEPH_MDS_STATE_BOOT: return "up:boot";
+ case CEPH_MDS_STATE_STANDBY: return "up:standby";
+ case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
+ case CEPH_MDS_STATE_CREATING: return "up:creating";
+ case CEPH_MDS_STATE_STARTING: return "up:starting";
+ /* up and in */
+ case CEPH_MDS_STATE_REPLAY: return "up:replay";
+ case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
+ case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
+ case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
+ case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+ case CEPH_MDS_STATE_ACTIVE: return "up:active";
+ case CEPH_MDS_STATE_STOPPING: return "up:stopping";
+ }
+ return "???";
+}
+
+const char *ceph_session_op_name(int op)
+{
+ switch (op) {
+ case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+ case CEPH_SESSION_OPEN: return "open";
+ case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+ case CEPH_SESSION_CLOSE: return "close";
+ case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+ case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+ case CEPH_SESSION_STALE: return "stale";
+ case CEPH_SESSION_RECALL_STATE: return "recall_state";
+ }
+ return "???";
+}
+
+const char *ceph_mds_op_name(int op)
+{
+ switch (op) {
+ case CEPH_MDS_OP_LOOKUP: return "lookup";
+ case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
+ case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
+ case CEPH_MDS_OP_GETATTR: return "getattr";
+ case CEPH_MDS_OP_SETXATTR: return "setxattr";
+ case CEPH_MDS_OP_SETATTR: return "setattr";
+ case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+ case CEPH_MDS_OP_READDIR: return "readdir";
+ case CEPH_MDS_OP_MKNOD: return "mknod";
+ case CEPH_MDS_OP_LINK: return "link";
+ case CEPH_MDS_OP_UNLINK: return "unlink";
+ case CEPH_MDS_OP_RENAME: return "rename";
+ case CEPH_MDS_OP_MKDIR: return "mkdir";
+ case CEPH_MDS_OP_RMDIR: return "rmdir";
+ case CEPH_MDS_OP_SYMLINK: return "symlink";
+ case CEPH_MDS_OP_CREATE: return "create";
+ case CEPH_MDS_OP_OPEN: return "open";
+ case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+ case CEPH_MDS_OP_LSSNAP: return "lssnap";
+ case CEPH_MDS_OP_MKSNAP: return "mksnap";
+ case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+ }
+ return "???";
+}
+
+const char *ceph_cap_op_name(int op)
+{
+ switch (op) {
+ case CEPH_CAP_OP_GRANT: return "grant";
+ case CEPH_CAP_OP_REVOKE: return "revoke";
+ case CEPH_CAP_OP_TRUNC: return "trunc";
+ case CEPH_CAP_OP_EXPORT: return "export";
+ case CEPH_CAP_OP_IMPORT: return "import";
+ case CEPH_CAP_OP_UPDATE: return "update";
+ case CEPH_CAP_OP_DROP: return "drop";
+ case CEPH_CAP_OP_FLUSH: return "flush";
+ case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+ case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+ case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+ case CEPH_CAP_OP_RELEASE: return "release";
+ case CEPH_CAP_OP_RENEW: return "renew";
+ }
+ return "???";
+}
+
+const char *ceph_lease_op_name(int o)
+{
+ switch (o) {
+ case CEPH_MDS_LEASE_REVOKE: return "revoke";
+ case CEPH_MDS_LEASE_RELEASE: return "release";
+ case CEPH_MDS_LEASE_RENEW: return "renew";
+ case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+ }
+ return "???";
+}
+
+const char *ceph_snap_op_name(int o)
+{
+ switch (o) {
+ case CEPH_SNAP_OP_UPDATE: return "update";
+ case CEPH_SNAP_OP_CREATE: return "create";
+ case CEPH_SNAP_OP_DESTROY: return "destroy";
+ case CEPH_SNAP_OP_SPLIT: return "split";
+ }
+ return "???";
+}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 00000000000..fabd302e577
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include "crush.h"
+
+const char *crush_bucket_alg_name(int alg)
+{
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM: return "uniform";
+ case CRUSH_BUCKET_LIST: return "list";
+ case CRUSH_BUCKET_TREE: return "tree";
+ case CRUSH_BUCKET_STRAW: return "straw";
+ default: return "unknown";
+ }
+}
+
+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+ if (p >= b->size)
+ return 0;
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return ((struct crush_bucket_uniform *)b)->item_weight;
+ case CRUSH_BUCKET_LIST:
+ return ((struct crush_bucket_list *)b)->item_weights[p];
+ case CRUSH_BUCKET_TREE:
+ if (p & 1)
+ return ((struct crush_bucket_tree *)b)->node_weights[p];
+ return 0;
+ case CRUSH_BUCKET_STRAW:
+ return ((struct crush_bucket_straw *)b)->item_weights[p];
+ }
+ return 0;
+}
+
+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+ int i, b, c;
+
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ for (i = 0; i < map->buckets[b]->size; i++) {
+ c = map->buckets[b]->items[i];
+ BUG_ON(c >= map->max_devices ||
+ c < -map->max_buckets);
+ if (c >= 0)
+ map->device_parents[c] = map->buckets[b]->id;
+ else
+ map->bucket_parents[-1-c] = map->buckets[b]->id;
+ }
+ }
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+ kfree(b->item_weights);
+ kfree(b->sum_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+ kfree(b->node_weights);
+ kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+ kfree(b->straws);
+ kfree(b->item_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)
+{
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
+ break;
+ case CRUSH_BUCKET_LIST:
+ crush_destroy_bucket_list((struct crush_bucket_list *)b);
+ break;
+ case CRUSH_BUCKET_TREE:
+ crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
+ break;
+ }
+}
+
+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+ int b;
+
+ /* buckets */
+ if (map->buckets) {
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ crush_destroy_bucket(map->buckets[b]);
+ }
+ kfree(map->buckets);
+ }
+
+ /* rules */
+ if (map->rules) {
+ for (b = 0; b < map->max_rules; b++)
+ kfree(map->rules[b]);
+ kfree(map->rules);
+ }
+
+ kfree(map->bucket_parents);
+ kfree(map->device_parents);
+ kfree(map);
+}
+
+
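To make the parent bookkeeping concrete, here is a hedged userspace sketch
(relying on the non-kernel fallbacks above and a shim for the __u32/__s32
types; all ids and weights are invented) that builds one uniform bucket
holding devices 0 and 1 and lets crush_calc_parents() fill in the parent
tables:

    #include <stdlib.h>
    #include "crush.h"

    /* Illustrative only: one uniform bucket (id -1) with devices 0 and 1. */
    static struct crush_map *make_tiny_map(void)
    {
        struct crush_bucket_uniform *b = calloc(1, sizeof(*b));
        struct crush_map *map = calloc(1, sizeof(*map));

        b->h.id = -1;
        b->h.alg = CRUSH_BUCKET_UNIFORM;
        b->h.size = 2;
        b->h.items = calloc(2, sizeof(__s32));
        b->h.items[0] = 0;
        b->h.items[1] = 1;
        b->item_weight = 0x10000;    /* 16-bit fixed point: weight 1.0 */

        map->max_buckets = 1;
        map->max_devices = 2;
        map->buckets = calloc(1, sizeof(*map->buckets));
        map->buckets[0] = &b->h;
        map->bucket_parents = calloc(1, sizeof(__u32));
        map->device_parents = calloc(2, sizeof(__u32));

        /* both devices end up with parent -1 (the uniform bucket) */
        crush_calc_parents(map);
        return map;
    }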
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 00000000000..dcd7e752370
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
+#ifndef _CRUSH_CRUSH_H
+#define _CRUSH_CRUSH_H
+
+#include <linux/types.h>
+
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
+ *
+ * LGPL2
+ */
+
+
+#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
+
+
+#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
+#define CRUSH_MAX_SET 10 /* max size of a mapping result */
+
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices. A rule consists of sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+ __u32 op;
+ __s32 arg1;
+ __s32 arg2;
+};
+
+/* step op codes */
+enum {
+ CRUSH_RULE_NOOP = 0,
+ CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
+ CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
+ /* arg2 = type */
+ CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
+ CRUSH_RULE_EMIT = 4, /* no args */
+ CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
+ CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
+};
+
+/*
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
+ */
+#define CRUSH_CHOOSE_N 0
+#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
+
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a ruleset and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
+struct crush_rule_mask {
+ __u8 ruleset;
+ __u8 type;
+ __u8 min_size;
+ __u8 max_size;
+};
+
+struct crush_rule {
+ __u32 len;
+ struct crush_rule_mask mask;
+ struct crush_rule_step steps[0];
+};
+
+#define crush_rule_size(len) (sizeof(struct crush_rule) + \
+ (len)*sizeof(struct crush_rule_step))
+
+
+
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets). Items within a bucket are chosen using one of a
+ * few different algorithms. The table summarizes how the speed of
+ * each option measures up against mapping stability when items are
+ * added or removed.
+ *
+ * Bucket Alg Speed Additions Removals
+ * ------------------------------------------------
+ * uniform O(1) poor poor
+ * list O(n) optimal poor
+ * tree O(log n) good good
+ * straw O(n) optimal optimal
+ */
+enum {
+ CRUSH_BUCKET_UNIFORM = 1,
+ CRUSH_BUCKET_LIST = 2,
+ CRUSH_BUCKET_TREE = 3,
+ CRUSH_BUCKET_STRAW = 4
+};
+extern const char *crush_bucket_alg_name(int alg);
+
+struct crush_bucket {
+ __s32 id; /* this'll be negative */
+ __u16 type; /* non-zero; type=0 is reserved for devices */
+ __u8 alg; /* one of CRUSH_BUCKET_* */
+ __u8 hash; /* which hash function to use, CRUSH_HASH_* */
+ __u32 weight; /* 16-bit fixed point */
+ __u32 size; /* num items */
+ __s32 *items;
+
+ /*
+ * cached random permutation: used for uniform bucket and for
+ * the linear search fallback for the other bucket types.
+ */
+ __u32 perm_x; /* @x for which *perm is defined */
+ __u32 perm_n; /* num elements of *perm that are permuted/defined */
+ __u32 *perm;
+};
+
+struct crush_bucket_uniform {
+ struct crush_bucket h;
+ __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
+};
+
+struct crush_bucket_list {
+ struct crush_bucket h;
+ __u32 *item_weights; /* 16-bit fixed point */
+ __u32 *sum_weights; /* 16-bit fixed point. element i is sum
+ of weights 0..i, inclusive */
+};
+
+struct crush_bucket_tree {
+ struct crush_bucket h; /* note: h.size is _tree_ size, not number of
+ actual items */
+ __u8 num_nodes;
+ __u32 *node_weights;
+};
+
+struct crush_bucket_straw {
+ struct crush_bucket h;
+ __u32 *item_weights; /* 16-bit fixed point */
+ __u32 *straws; /* 16-bit fixed point */
+};
+
+
+
+/*
+ * CRUSH map includes all buckets, rules, etc.
+ */
+struct crush_map {
+ struct crush_bucket **buckets;
+ struct crush_rule **rules;
+
+ /*
+ * Parent pointers to identify the parent bucket of a device or
+ * bucket in the hierarchy. If an item appears more than
+ * once, this is the _last_ time it appeared (where buckets
+ * are processed in bucket id order, from -1 on down to
+ * -max_buckets).
+ */
+ __u32 *bucket_parents;
+ __u32 *device_parents;
+
+ __s32 max_buckets;
+ __u32 max_rules;
+ __s32 max_devices;
+};
+
+
+/* crush.c */
+extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
+extern void crush_calc_parents(struct crush_map *map);
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket(struct crush_bucket *b);
+extern void crush_destroy(struct crush_map *map);
+
+#endif
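Since struct crush_rule ends in a flexible steps[] array, allocations go
through crush_rule_size(). A hedged sketch of building a minimal two-step
rule (take a root bucket, then emit); the bucket id and mask values here are
invented:

    /* Illustrative: a rule that takes bucket -1 and emits it directly. */
    struct crush_rule *rule = kmalloc(crush_rule_size(2), GFP_NOFS);

    if (!rule)
        return -ENOMEM;
    rule->len = 2;
    rule->mask.ruleset = 0;
    rule->mask.type = 1;          /* user-defined, e.g. "replicated" */
    rule->mask.min_size = 1;
    rule->mask.max_size = 10;
    rule->steps[0].op = CRUSH_RULE_TAKE;
    rule->steps[0].arg1 = -1;     /* hypothetical root bucket id */
    rule->steps[0].arg2 = 0;
    rule->steps[1].op = CRUSH_RULE_EMIT;
    rule->steps[1].arg1 = 0;
    rule->steps[1].arg2 = 0;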
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 00000000000..5873aed694b
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include "hash.h"
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do { \
+ a = a-b; a = a-c; a = a^(c>>13); \
+ b = b-c; b = b-a; b = b^(a<<8); \
+ c = c-a; c = c-b; c = c^(b>>13); \
+ a = a-b; a = a-c; a = a^(c>>12); \
+ b = b-c; b = b-a; b = b^(a<<16); \
+ c = c-a; c = c-b; c = c^(b>>5); \
+ a = a-b; a = a-c; a = a^(c>>3); \
+ b = b-c; b = b-a; b = b^(a<<10); \
+ c = c-a; c = c-b; c = c^(b>>15); \
+ } while (0)
+
+#define crush_hash_seed 1315423911
+
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+ __u32 hash = crush_hash_seed ^ a;
+ __u32 b = a;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, a, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(x, a, hash);
+ crush_hashmix(b, y, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(a, x, hash);
+ crush_hashmix(y, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, d, hash);
+ return hash;
+}
+
+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(e, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ crush_hashmix(d, x, hash);
+ crush_hashmix(y, e, hash);
+ return hash;
+}
+
+
+__u32 crush_hash32(int type, __u32 a)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1(a);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_2(a, b);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_3(a, b, c);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_4(a, b, c, d);
+ default:
+ return 0;
+ }
+}
+
+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_5(a, b, c, d, e);
+ default:
+ return 0;
+ }
+}
+
+const char *crush_hash_name(int type)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return "rjenkins1";
+ default:
+ return "unknown";
+ }
+}
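The multi-input variants are what the mapper uses to derive bounded
pseudo-random draws; a sketch of the pattern used by is_out() and
bucket_list_choose() later in this series:

    /* 16-bit draw in [0, 0xffff], stable for a given (x, item) pair. */
    __u32 draw = crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff;

    /* scale by a 16-bit fixed-point weight to get a weighted draw */
    __u64 weighted = (__u64)draw * weight;    /* then weighted >>= 16 */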
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 00000000000..ff48e110e4b
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
+#ifndef _CRUSH_HASH_H
+#define _CRUSH_HASH_H
+
+#define CRUSH_HASH_RJENKINS1 0
+
+#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
+
+extern const char *crush_hash_name(int type);
+
+extern __u32 crush_hash32(int type, __u32 a);
+extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
+extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
+extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e);
+
+#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 00000000000..9ba54efb654
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+# define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include "crush.h"
+#include "hash.h"
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+{
+ int i;
+
+ for (i = 0; i < map->max_rules; i++) {
+ if (map->rules[i] &&
+ map->rules[i]->mask.ruleset == ruleset &&
+ map->rules[i]->mask.type == type &&
+ map->rules[i]->mask.min_size <= size &&
+ map->rules[i]->mask.max_size >= size)
+ return i;
+ }
+ return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors. Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+ int x, int r)
+{
+ unsigned pr = r % bucket->size;
+ unsigned i, s;
+
+ /* start a new permutation if @x has changed */
+ if (bucket->perm_x != x || bucket->perm_n == 0) {
+ dprintk("bucket %d new x=%d\n", bucket->id, x);
+ bucket->perm_x = x;
+
+ /* optimize common r=0 case */
+ if (pr == 0) {
+ s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+ bucket->size;
+ bucket->perm[0] = s;
+ bucket->perm_n = 0xffff; /* magic value, see below */
+ goto out;
+ }
+
+ for (i = 0; i < bucket->size; i++)
+ bucket->perm[i] = i;
+ bucket->perm_n = 0;
+ } else if (bucket->perm_n == 0xffff) {
+ /* clean up after the r=0 case above */
+ for (i = 1; i < bucket->size; i++)
+ bucket->perm[i] = i;
+ bucket->perm[bucket->perm[0]] = 0;
+ bucket->perm_n = 1;
+ }
+
+ /* calculate permutation up to pr */
+ for (i = 0; i < bucket->perm_n; i++)
+ dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+ while (bucket->perm_n <= pr) {
+ unsigned p = bucket->perm_n;
+ /* no point in swapping the final entry */
+ if (p < bucket->size - 1) {
+ i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+ (bucket->size - p);
+ if (i) {
+ unsigned t = bucket->perm[p + i];
+ bucket->perm[p + i] = bucket->perm[p];
+ bucket->perm[p] = t;
+ }
+ dprintk(" perm_choose swap %d with %d\n", p, p+i);
+ }
+ bucket->perm_n++;
+ }
+ for (i = 0; i < bucket->size; i++)
+ dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
+
+ s = bucket->perm[pr];
+out:
+ dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+ bucket->size, x, r, pr, s);
+ return bucket->items[s];
+}
+
+/* uniform */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+ int x, int r)
+{
+ return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/* list */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+ int x, int r)
+{
+ int i;
+
+ for (i = bucket->h.size-1; i >= 0; i--) {
+ __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
+ r, bucket->h.id);
+ w &= 0xffff;
+ dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+ "sw %x rand %llx",
+ i, x, r, bucket->h.items[i], bucket->item_weights[i],
+ bucket->sum_weights[i], w);
+ w *= bucket->sum_weights[i];
+ w = w >> 16;
+ /*dprintk(" scaled %llx\n", w);*/
+ if (w < bucket->item_weights[i])
+ return bucket->h.items[i];
+ }
+
+ BUG_ON(1);
+ return 0;
+}
+
+
+/* (binary) tree */
+static int height(int n)
+{
+ int h = 0;
+ while ((n & 1) == 0) {
+ h++;
+ n = n >> 1;
+ }
+ return h;
+}
+
+static int left(int x)
+{
+ int h = height(x);
+ return x - (1 << (h-1));
+}
+
+static int right(int x)
+{
+ int h = height(x);
+ return x + (1 << (h-1));
+}
+
+static int terminal(int x)
+{
+ return x & 1;
+}
+
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+ int x, int r)
+{
+ int n, l;
+ __u32 w;
+ __u64 t;
+
+ /* start at root */
+ n = bucket->num_nodes >> 1;
+
+ while (!terminal(n)) {
+ /* pick point in [0, w) */
+ w = bucket->node_weights[n];
+ t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+ bucket->h.id) * (__u64)w;
+ t = t >> 32;
+
+ /* descend to the left or right? */
+ l = left(n);
+ if (t < bucket->node_weights[l])
+ n = l;
+ else
+ n = right(n);
+ }
+
+ return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+ int x, int r)
+{
+ int i;
+ int high = 0;
+ __u64 high_draw = 0;
+ __u64 draw;
+
+ for (i = 0; i < bucket->h.size; i++) {
+ draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
+ draw &= 0xffff;
+ draw *= bucket->straws[i];
+ if (i == 0 || draw > high_draw) {
+ high = i;
+ high_draw = draw;
+ }
+ }
+ return bucket->h.items[high];
+}
+
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+ dprintk("choose %d x=%d r=%d\n", in->id, x, r);
+ switch (in->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+ x, r);
+ case CRUSH_BUCKET_LIST:
+ return bucket_list_choose((struct crush_bucket_list *)in,
+ x, r);
+ case CRUSH_BUCKET_TREE:
+ return bucket_tree_choose((struct crush_bucket_tree *)in,
+ x, r);
+ case CRUSH_BUCKET_STRAW:
+ return bucket_straw_choose((struct crush_bucket_straw *)in,
+ x, r);
+ default:
+ BUG_ON(1);
+ return in->items[0];
+ }
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+ if (weight[item] >= 0x10000) /* 16-bit fixed point: 1.0 is fully in */
+ return 0;
+ if (weight[item] == 0)
+ return 1;
+ if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+ < weight[item])
+ return 0;
+ return 1;
+}
+
+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choosing an item from
+ * @weight: device weight vector (16-bit fixed point)
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ */
+static int crush_choose(struct crush_map *map,
+ struct crush_bucket *bucket,
+ __u32 *weight,
+ int x, int numrep, int type,
+ int *out, int outpos,
+ int firstn, int recurse_to_leaf,
+ int *out2)
+{
+ int rep;
+ int ftotal, flocal;
+ int retry_descent, retry_bucket, skip_rep;
+ struct crush_bucket *in = bucket;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide, reject;
+ const int orig_tries = 5; /* attempts before we fall back to search */
+ dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
+
+ for (rep = outpos; rep < numrep; rep++) {
+ /* keep trying until we get a non-out, non-colliding item */
+ ftotal = 0;
+ skip_rep = 0;
+ do {
+ retry_descent = 0;
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ flocal = 0;
+ do {
+ collide = 0;
+ retry_bucket = 0;
+ r = rep;
+ if (in->alg == CRUSH_BUCKET_UNIFORM) {
+ /* be careful */
+ if (firstn || numrep >= in->size)
+ /* r' = r + f_total */
+ r += ftotal;
+ else if (in->size % numrep == 0)
+ /* r'=r+(n+1)*f_local */
+ r += (numrep+1) *
+ (flocal+ftotal);
+ else
+ /* r' = r + n*f_local */
+ r += numrep * (flocal+ftotal);
+ } else {
+ if (firstn)
+ /* r' = r + f_total */
+ r += ftotal;
+ else
+ /* r' = r + n*f_local */
+ r += numrep * (flocal+ftotal);
+ }
+
+ /* bucket choose */
+ if (in->size == 0) {
+ reject = 1;
+ goto reject;
+ }
+ if (flocal >= (in->size>>1) &&
+ flocal > orig_tries)
+ item = bucket_perm_choose(in, x, r);
+ else
+ item = crush_bucket_choose(in, x, r);
+ BUG_ON(item >= map->max_devices);
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ BUG_ON(item >= 0 ||
+ (-1-item) >= map->max_buckets);
+ in = map->buckets[-1-item];
+ retry_bucket = 1; /* descend into the child bucket */
+ continue;
+ }
+
+ /* collision? */
+ for (i = 0; i < outpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+
+ if (recurse_to_leaf &&
+ item < 0 &&
+ crush_choose(map, map->buckets[-1-item],
+ weight,
+ x, outpos+1, 0,
+ out2, outpos,
+ firstn, 0, NULL) <= outpos) {
+ reject = 1;
+ } else {
+ /* out? */
+ if (itemtype == 0)
+ reject = is_out(map, weight,
+ item, x);
+ else
+ reject = 0;
+ }
+
+reject:
+ if (reject || collide) {
+ ftotal++;
+ flocal++;
+
+ if (collide && flocal < 3)
+ /* retry locally a few times */
+ retry_bucket = 1;
+ else if (flocal < in->size + orig_tries)
+ /* exhaustive bucket search */
+ retry_bucket = 1;
+ else if (ftotal < 20)
+ /* then retry descent */
+ retry_descent = 1;
+ else
+ /* else give up */
+ skip_rep = 1;
+ dprintk(" reject %d collide %d "
+ "ftotal %d flocal %d\n",
+ reject, collide, ftotal,
+ flocal);
+ }
+ } while (retry_bucket);
+ } while (retry_descent);
+
+ if (skip_rep) {
+ dprintk("skip rep\n");
+ continue;
+ }
+
+ dprintk("choose got %d\n", item);
+ out[outpos] = item;
+ outpos++;
+ }
+
+ dprintk("choose returns %d\n", outpos);
+ return outpos;
+}
+
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ * @weight: device weight vector (16-bit fixed point)
+ */
+int crush_do_rule(struct crush_map *map,
+ int ruleno, int x, int *result, int result_max,
+ int force, __u32 *weight)
+{
+ int result_len;
+ int force_context[CRUSH_MAX_DEPTH];
+ int force_pos = -1;
+ int a[CRUSH_MAX_SET];
+ int b[CRUSH_MAX_SET];
+ int c[CRUSH_MAX_SET];
+ int recurse_to_leaf;
+ int *w;
+ int wsize = 0;
+ int *o;
+ int osize;
+ int *tmp;
+ struct crush_rule *rule;
+ int step;
+ int i, j;
+ int numrep;
+ int firstn;
+ int rc = -1;
+
+ BUG_ON(ruleno >= map->max_rules);
+
+ rule = map->rules[ruleno];
+ result_len = 0;
+ w = a;
+ o = b;
+
+ /*
+ * determine hierarchical context of force, if any. note
+ * that this may or may not correspond to the specific types
+ * referenced by the crush rule.
+ */
+ if (force >= 0) {
+ if (force >= map->max_devices ||
+ map->device_parents[force] == 0) {
+ /*dprintk("CRUSH: forcefed device dne\n");*/
+ rc = -1; /* force fed device dne */
+ goto out;
+ }
+ if (!is_out(map, weight, force, x)) {
+ while (1) {
+ force_context[++force_pos] = force;
+ if (force >= 0)
+ force = map->device_parents[force];
+ else
+ force = map->bucket_parents[-1-force];
+ if (force == 0)
+ break;
+ }
+ }
+ }
+
+ for (step = 0; step < rule->len; step++) {
+ firstn = 0;
+ switch (rule->steps[step].op) {
+ case CRUSH_RULE_TAKE:
+ w[0] = rule->steps[step].arg1;
+ if (force_pos >= 0) {
+ BUG_ON(force_context[force_pos] != w[0]);
+ force_pos--;
+ }
+ wsize = 1;
+ break;
+
+ case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+ case CRUSH_RULE_CHOOSE_FIRSTN:
+ firstn = 1;
+ case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+ case CRUSH_RULE_CHOOSE_INDEP:
+ BUG_ON(wsize == 0);
+
+ recurse_to_leaf =
+ rule->steps[step].op ==
+ CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+ rule->steps[step].op ==
+ CRUSH_RULE_CHOOSE_LEAF_INDEP;
+
+ /* reset output */
+ osize = 0;
+
+ for (i = 0; i < wsize; i++) {
+ /*
+ * see CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
+ * basically, numrep <= 0 means relative to
+ * the provided result_max
+ */
+ numrep = rule->steps[step].arg1;
+ if (numrep <= 0) {
+ numrep += result_max;
+ if (numrep <= 0)
+ continue;
+ }
+ j = 0;
+ if (osize == 0 && force_pos >= 0) {
+ /* skip any intermediate types */
+ while (force_pos &&
+ force_context[force_pos] < 0 &&
+ rule->steps[step].arg2 !=
+ map->buckets[-1 -
+ force_context[force_pos]]->type)
+ force_pos--;
+ o[osize] = force_context[force_pos];
+ if (recurse_to_leaf)
+ c[osize] = force_context[0];
+ j++;
+ force_pos--;
+ }
+ osize += crush_choose(map,
+ map->buckets[-1-w[i]],
+ weight,
+ x, numrep,
+ rule->steps[step].arg2,
+ o+osize, j,
+ firstn,
+ recurse_to_leaf, c+osize);
+ }
+
+ if (recurse_to_leaf)
+ /* copy final _leaf_ values to output set */
+ memcpy(o, c, osize*sizeof(*o));
+
+ /* swap o and w arrays */
+ tmp = o;
+ o = w;
+ w = tmp;
+ wsize = osize;
+ break;
+
+
+ case CRUSH_RULE_EMIT:
+ for (i = 0; i < wsize && result_len < result_max; i++) {
+ result[result_len] = w[i];
+ result_len++;
+ }
+ wsize = 0;
+ break;
+
+ default:
+ BUG_ON(1);
+ }
+ }
+ rc = result_len;
+
+out:
+ return rc;
+}
+
+
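Putting the mapper together, a hedged sketch of how a caller selects a rule
and computes a placement. It assumes a populated map and an object hash x;
the ruleset/type/size values and the error handling are invented:

    int result[CRUSH_MAX_SET];
    __u32 weight[] = { 0x10000, 0x10000, 0x10000 };  /* all devices fully in */
    int ruleno, n;

    ruleno = crush_find_rule(map, 0 /* ruleset */, 1 /* type */, 3 /* size */);
    if (ruleno < 0)
        return -EINVAL;        /* no rule matches; hypothetical handling */

    n = crush_do_rule(map, ruleno, x /* object hash */, result, 3,
                      -1 /* no forced device */, weight);
    /* result[0..n-1] now holds the chosen device ids */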
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 00000000000..98e90046fd9
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
+#ifndef _CRUSH_MAPPER_H
+#define _CRUSH_MAPPER_H
+
+/*
+ * CRUSH functions for finding rules and then mapping an input to an
+ * output set.
+ *
+ * LGPL2
+ */
+
+#include "crush.h"
+
+extern int crush_find_rule(struct crush_map *map, int ruleset, int type, int size);
+extern int crush_do_rule(struct crush_map *map,
+ int ruleno,
+ int x, int *result, int result_max,
+ int forcefeed, /* -1 for none */
+ __u32 *weights);
+
+#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 00000000000..291ac288e79
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,408 @@
+
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <crypto/hash.h>
+
+#include "crypto.h"
+#include "decode.h"
+
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
+{
+ if (*p + sizeof(u16) + sizeof(key->created) +
+ sizeof(u16) + key->len > end)
+ return -ERANGE;
+ ceph_encode_16(p, key->type);
+ ceph_encode_copy(p, &key->created, sizeof(key->created));
+ ceph_encode_16(p, key->len);
+ ceph_encode_copy(p, key->key, key->len);
+ return 0;
+}
+
+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+ ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+ key->type = ceph_decode_16(p);
+ ceph_decode_copy(p, &key->created, sizeof(key->created));
+ key->len = ceph_decode_16(p);
+ ceph_decode_need(p, end, key->len, bad);
+ key->key = kmalloc(key->len, GFP_NOFS);
+ if (!key->key)
+ return -ENOMEM;
+ ceph_decode_copy(p, key->key, key->len);
+ return 0;
+
+bad:
+ dout("failed to decode crypto key\n");
+ return -EINVAL;
+}
+
+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+ int inlen = strlen(inkey);
+ int blen = inlen * 3 / 4;
+ void *buf, *p;
+ int ret;
+
+ dout("crypto_key_unarmor %s\n", inkey);
+ buf = kmalloc(blen, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+ blen = ceph_unarmor(buf, inkey, inkey+inlen);
+ if (blen < 0) {
+ kfree(buf);
+ return blen;
+ }
+
+ p = buf;
+ ret = ceph_crypto_key_decode(key, &p, p + blen);
+ kfree(buf);
+ if (ret)
+ return ret;
+ dout("crypto_key_unarmor key %p type %d len %d\n", key,
+ key->type, key->len);
+ return 0;
+}
+
+
+
+#define AES_KEY_SIZE 16
+
+static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+{
+ return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+}
+
+const u8 *aes_iv = "cephsageyudagreg";
+
+int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ struct scatterlist sg_in[2], sg_out[1];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+ int ret;
+ void *iv;
+ int ivsize;
+ size_t zero_padding = (0x10 - (src_len & 0x0f));
+ char pad[16];
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ memset(pad, zero_padding, zero_padding);
+
+ *dst_len = src_len + zero_padding;
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ sg_init_table(sg_in, 2);
+ sg_set_buf(&sg_in[0], src, src_len);
+ sg_set_buf(&sg_in[1], pad, zero_padding);
+ sg_init_table(sg_out, 1);
+ sg_set_buf(sg_out, dst, *dst_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+ /*
+ print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+ pad, zero_padding, 1);
+ */
+ ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+ src_len + zero_padding);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0)
+ pr_err("ceph_aes_crypt failed %d\n", ret);
+ /*
+ print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+ return ret;
+}
+
+int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len)
+{
+ struct scatterlist sg_in[3], sg_out[1];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+ int ret;
+ void *iv;
+ int ivsize;
+ size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
+ char pad[16];
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ memset(pad, zero_padding, zero_padding);
+
+ *dst_len = src1_len + src2_len + zero_padding;
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ sg_init_table(sg_in, 3);
+ sg_set_buf(&sg_in[0], src1, src1_len);
+ sg_set_buf(&sg_in[1], src2, src2_len);
+ sg_set_buf(&sg_in[2], pad, zero_padding);
+ sg_init_table(sg_out, 1);
+ sg_set_buf(sg_out, dst, *dst_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+ /*
+ print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
+ src1, src1_len, 1);
+ print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
+ src2, src2_len, 1);
+ print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+ pad, zero_padding, 1);
+ */
+ ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+ src1_len + src2_len + zero_padding);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0)
+ pr_err("ceph_aes_crypt2 failed %d\n", ret);
+ /*
+ print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+ return ret;
+}
+
+int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ struct scatterlist sg_in[1], sg_out[2];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm };
+ char pad[16];
+ void *iv;
+ int ivsize;
+ int ret;
+ int last_byte;
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ sg_init_table(sg_in, 1);
+ sg_init_table(sg_out, 2);
+ sg_set_buf(sg_in, src, src_len);
+ sg_set_buf(&sg_out[0], dst, *dst_len);
+ sg_set_buf(&sg_out[1], pad, sizeof(pad));
+
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+
+ /*
+ print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ */
+
+ ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0) {
+ pr_err("ceph_aes_decrypt failed %d\n", ret);
+ return ret;
+ }
+
+ if (src_len <= *dst_len)
+ last_byte = ((char *)dst)[src_len - 1];
+ else
+ last_byte = pad[src_len - *dst_len - 1];
+ if (last_byte <= 16 && src_len >= last_byte) {
+ *dst_len = src_len - last_byte;
+ } else {
+ pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+ last_byte, (int)src_len);
+ return -EPERM; /* bad padding */
+ }
+ /*
+ print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+ return 0;
+}
+
+int ceph_aes_decrypt2(const void *key, int key_len,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len)
+{
+ struct scatterlist sg_in[1], sg_out[3];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm };
+ char pad[16];
+ void *iv;
+ int ivsize;
+ int ret;
+ int last_byte;
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ sg_init_table(sg_in, 1);
+ sg_set_buf(sg_in, src, src_len);
+ sg_init_table(sg_out, 3);
+ sg_set_buf(&sg_out[0], dst1, *dst1_len);
+ sg_set_buf(&sg_out[1], dst2, *dst2_len);
+ sg_set_buf(&sg_out[2], pad, sizeof(pad));
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+
+ /*
+ print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ */
+
+ ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0) {
+ pr_err("ceph_aes_decrypt failed %d\n", ret);
+ return ret;
+ }
+
+ if (src_len <= *dst1_len)
+ last_byte = ((char *)dst1)[src_len - 1];
+ else if (src_len <= *dst1_len + *dst2_len)
+ last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
+ else
+ last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
+ if (last_byte <= 16 && src_len >= last_byte) {
+ src_len -= last_byte;
+ } else {
+ pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+ last_byte, (int)src_len);
+ return -EPERM; /* bad padding */
+ }
+
+ if (src_len < *dst1_len) {
+ *dst1_len = src_len;
+ *dst2_len = 0;
+ } else {
+ *dst2_len = src_len - *dst1_len;
+ }
+ /*
+ print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
+ dst1, *dst1_len, 1);
+ print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
+ dst2, *dst2_len, 1);
+ */
+
+ return 0;
+}
+
+
+int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src_len)
+ return -ERANGE;
+ memcpy(dst, src, src_len);
+ *dst_len = src_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_decrypt(secret->key, secret->len, dst,
+ dst_len, src, src_len);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+int ceph_decrypt2(struct ceph_crypto_key *secret,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len)
+{
+ size_t t;
+
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst1_len + *dst2_len < src_len)
+ return -ERANGE;
+ t = min(*dst1_len, src_len);
+ memcpy(dst1, src, t);
+ *dst1_len = t;
+ src += t;
+ src_len -= t;
+ if (src_len) {
+ t = min(*dst2_len, src_len);
+ memcpy(dst2, src, t);
+ *dst2_len = t;
+ }
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_decrypt2(secret->key, secret->len,
+ dst1, dst1_len, dst2, dst2_len,
+ src, src_len);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src_len)
+ return -ERANGE;
+ memcpy(dst, src, src_len);
+ *dst_len = src_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_encrypt(secret->key, secret->len, dst,
+ dst_len, src, src_len);
+
+ default:
+ return -EINVAL;
+ }
+}
+
+int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src1_len + src2_len)
+ return -ERANGE;
+ memcpy(dst, src1, src1_len);
+ memcpy(dst + src1_len, src2, src2_len);
+ *dst_len = src1_len + src2_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
+ src1, src1_len, src2, src2_len);
+
+ default:
+ return -EINVAL;
+ }
+}
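A hedged roundtrip through the AES helpers above (kernel context; the
destination buffer must leave room for up to 16 bytes of padding, and the key
bytes here are placeholders, not a real secret):

    u8 key[AES_KEY_SIZE] = { 0 };    /* placeholder secret, 16 bytes */
    char plain[] = "hello ceph";
    char enc[sizeof(plain) + 16];    /* padded up to the AES block size */
    char dec[sizeof(enc)];
    size_t enc_len = sizeof(enc), dec_len = sizeof(dec);
    int err;

    err = ceph_aes_encrypt(key, sizeof(key), enc, &enc_len,
                           plain, sizeof(plain));
    if (!err)
        err = ceph_aes_decrypt(key, sizeof(key), dec, &dec_len,
                               enc, enc_len);
    /* on success dec_len == sizeof(plain) and dec matches plain */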
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 00000000000..40b502e6bd8
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H
+
+#include "types.h"
+#include "buffer.h"
+
+/*
+ * cryptographic secret
+ */
+struct ceph_crypto_key {
+ int type;
+ struct ceph_timespec created;
+ int len;
+ void *key;
+};
+
+static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+ kfree(key->key);
+}
+
+extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
+ void **p, void *end);
+extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
+ void **p, void *end);
+extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+
+/* crypto.c */
+extern int ceph_decrypt(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len);
+extern int ceph_encrypt(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len);
+extern int ceph_decrypt2(struct ceph_crypto_key *secret,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len);
+extern int ceph_encrypt2(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len);
+
+/* armor.c */
+extern int ceph_armor(char *dst, const void *src, const void *end);
+extern int ceph_unarmor(void *dst, const char *src, const char *end);
+
+#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 00000000000..e159f141511
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,483 @@
+#include "ceph_debug.h"
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "mon_client.h"
+#include "auth.h"
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client* - an instance of the ceph client
+ * .../osdmap - current osdmap
+ * .../mdsmap - current mdsmap
+ * .../monmap - current monmap
+ * .../osdc - active osd requests
+ * .../mdsc - active mds requests
+ * .../monc - mon client state
+ * .../dentry_lru - dump contents of dentry lru
+ * .../caps - expose cap (reservation) stats
+ * .../bdi - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+
+ if (client->monc.monmap == NULL)
+ return 0;
+
+ seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+ for (i = 0; i < client->monc.monmap->num_mon; i++) {
+ struct ceph_entity_inst *inst =
+ &client->monc.monmap->mon_inst[i];
+
+ seq_printf(s, "\t%s%lld\t%s\n",
+ ENTITY_NAME(inst->name),
+ pr_addr(&inst->addr.in_addr));
+ }
+ return 0;
+}
+
+static int mdsmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+
+ if (client->mdsc.mdsmap == NULL)
+ return 0;
+ seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+ seq_printf(s, "session_timeout %d\n",
+ client->mdsc.mdsmap->m_session_timeout);
+ seq_printf(s, "session_autoclose %d\n",
+ client->mdsc.mdsmap->m_session_autoclose);
+ for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+ struct ceph_entity_addr *addr =
+ &client->mdsc.mdsmap->m_info[i].addr;
+ int state = client->mdsc.mdsmap->m_info[i].state;
+
+ seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+ ceph_mds_state_name(state));
+ }
+ return 0;
+}
+
+static int osdmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+ struct rb_node *n;
+
+ if (client->osdc.osdmap == NULL)
+ return 0;
+ seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+ seq_printf(s, "flags%s%s\n",
+ (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+ " NEARFULL" : "",
+ (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+ " FULL" : "");
+ for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pool =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+ seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+ pool->id, pool->v.pg_num, pool->pg_num_mask,
+ pool->v.lpg_num, pool->lpg_num_mask);
+ }
+ for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+ struct ceph_entity_addr *addr =
+ &client->osdc.osdmap->osd_addr[i];
+ int state = client->osdc.osdmap->osd_state[i];
+ char sb[64];
+
+ seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+ i, pr_addr(&addr->in_addr),
+ ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+ ceph_osdmap_state_str(sb, sizeof(sb), state));
+ }
+ return 0;
+}
+
+static int monc_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_mon_statfs_request *req;
+ struct ceph_mon_client *monc = &client->monc;
+ struct rb_node *rp;
+
+ mutex_lock(&monc->mutex);
+
+ if (monc->have_mdsmap)
+ seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+ if (monc->have_osdmap)
+ seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+ if (monc->want_next_osdmap)
+ seq_printf(s, "want next osdmap\n");
+
+ for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
+ req = rb_entry(rp, struct ceph_mon_statfs_request, node);
+ seq_printf(s, "%lld statfs\n", req->tid);
+ }
+
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+static int mdsc_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ struct rb_node *rp;
+ int pathlen;
+ u64 pathbase;
+ char *path;
+
+ mutex_lock(&mdsc->mutex);
+ for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
+ req = rb_entry(rp, struct ceph_mds_request, r_node);
+
+ if (req->r_request)
+ seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
+ else
+ seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+
+ seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
+
+ if (req->r_got_unsafe)
+ seq_printf(s, "\t(unsafe)");
+ else
+ seq_printf(s, "\t");
+
+ if (req->r_inode) {
+ seq_printf(s, " #%llx", ceph_ino(req->r_inode));
+ } else if (req->r_dentry) {
+ path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &pathbase, 0);
+ spin_lock(&req->r_dentry->d_lock);
+ seq_printf(s, " #%llx/%.*s (%s)",
+ ceph_ino(req->r_dentry->d_parent->d_inode),
+ req->r_dentry->d_name.len,
+ req->r_dentry->d_name.name,
+ path ? path : "");
+ spin_unlock(&req->r_dentry->d_lock);
+ kfree(path);
+ } else if (req->r_path1) {
+ seq_printf(s, " #%llx/%s", req->r_ino1.ino,
+ req->r_path1);
+ }
+
+ if (req->r_old_dentry) {
+ path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+ &pathbase, 0);
+ spin_lock(&req->r_old_dentry->d_lock);
+ seq_printf(s, " #%llx/%.*s (%s)",
+ ceph_ino(req->r_old_dentry->d_parent->d_inode),
+ req->r_old_dentry->d_name.len,
+ req->r_old_dentry->d_name.name,
+ path ? path : "");
+ spin_unlock(&req->r_old_dentry->d_lock);
+ kfree(path);
+ } else if (req->r_path2) {
+ if (req->r_ino2.ino)
+ seq_printf(s, " #%llx/%s", req->r_ino2.ino,
+ req->r_path2);
+ else
+ seq_printf(s, " %s", req->r_path2);
+ }
+
+ seq_printf(s, "\n");
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ return 0;
+}
+
+static int osdc_show(struct seq_file *s, void *pp)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct rb_node *p;
+
+ mutex_lock(&osdc->request_mutex);
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req;
+ struct ceph_osd_request_head *head;
+ struct ceph_osd_op *op;
+ int num_ops;
+ int opcode, olen;
+ int i;
+
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1,
+ le32_to_cpu(req->r_pgid.pool),
+ le16_to_cpu(req->r_pgid.ps));
+
+ head = req->r_request->front.iov_base;
+ op = (void *)(head + 1);
+
+ num_ops = le16_to_cpu(head->num_ops);
+ olen = le32_to_cpu(head->object_len);
+ seq_printf(s, "%.*s", olen,
+ (const char *)(head->ops + num_ops));
+
+ if (req->r_reassert_version.epoch)
+ seq_printf(s, "\t%u'%llu",
+ (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+ le64_to_cpu(req->r_reassert_version.version));
+ else
+ seq_printf(s, "\t");
+
+ for (i = 0; i < num_ops; i++) {
+ opcode = le16_to_cpu(op->op);
+ seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+ op++;
+ }
+
+ seq_printf(s, "\n");
+ }
+ mutex_unlock(&osdc->request_mutex);
+ return 0;
+}
+
+static int caps_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ int total, avail, used, reserved, min;
+
+ ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+ seq_printf(s, "total\t\t%d\n"
+ "avail\t\t%d\n"
+ "used\t\t%d\n"
+ "reserved\t%d\n"
+ "min\t%d\n",
+ total, avail, used, reserved, min);
+ return 0;
+}
+
+static int dentry_lru_show(struct seq_file *s, void *ptr)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_dentry_info *di;
+
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_for_each_entry(di, &mdsc->dentry_lru, lru) {
+ struct dentry *dentry = di->dentry;
+ seq_printf(s, "%p %p\t%.*s\n",
+ di, dentry, dentry->d_name.len, dentry->d_name.name);
+ }
+ spin_unlock(&mdsc->dentry_lru_lock);
+
+ return 0;
+}
+
+#define DEFINE_SHOW_FUNC(name) \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+ struct seq_file *sf; \
+ int ret; \
+ \
+ ret = single_open(file, name, NULL); \
+ sf = file->private_data; \
+ sf->private = inode->i_private; \
+ return ret; \
+} \
+ \
+static const struct file_operations name##_fops = { \
+ .open = name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+};
+
+DEFINE_SHOW_FUNC(monmap_show)
+DEFINE_SHOW_FUNC(mdsmap_show)
+DEFINE_SHOW_FUNC(osdmap_show)
+DEFINE_SHOW_FUNC(monc_show)
+DEFINE_SHOW_FUNC(mdsc_show)
+DEFINE_SHOW_FUNC(osdc_show)
+DEFINE_SHOW_FUNC(dentry_lru_show)
+DEFINE_SHOW_FUNC(caps_show)
+
+static int congestion_kb_set(void *data, u64 val)
+{
+ struct ceph_client *client = (struct ceph_client *)data;
+
+ if (client)
+ client->mount_args->congestion_kb = (int)val;
+
+ return 0;
+}
+
+static int congestion_kb_get(void *data, u64 *val)
+{
+ struct ceph_client *client = (struct ceph_client *)data;
+
+ if (client)
+ *val = (u64)client->mount_args->congestion_kb;
+
+ return 0;
+}
+
+
+DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
+ congestion_kb_set, "%llu\n");
+
+int __init ceph_debugfs_init(void)
+{
+ ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+ if (!ceph_debugfs_dir)
+ return -ENOMEM;
+ return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+ debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+ int ret = -ENOMEM;
+ char name[80];
+
+ snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
+ PR_FSID(&client->fsid), client->monc.auth->global_id);
+
+ client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+ if (!client->debugfs_dir)
+ goto out;
+
+ client->monc.debugfs_file = debugfs_create_file("monc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &monc_show_fops);
+ if (!client->monc.debugfs_file)
+ goto out;
+
+ client->mdsc.debugfs_file = debugfs_create_file("mdsc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &mdsc_show_fops);
+ if (!client->mdsc.debugfs_file)
+ goto out;
+
+ client->osdc.debugfs_file = debugfs_create_file("osdc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &osdc_show_fops);
+ if (!client->osdc.debugfs_file)
+ goto out;
+
+ client->debugfs_monmap = debugfs_create_file("monmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &monmap_show_fops);
+ if (!client->debugfs_monmap)
+ goto out;
+
+ client->debugfs_mdsmap = debugfs_create_file("mdsmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &mdsmap_show_fops);
+ if (!client->debugfs_mdsmap)
+ goto out;
+
+ client->debugfs_osdmap = debugfs_create_file("osdmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &osdmap_show_fops);
+ if (!client->debugfs_osdmap)
+ goto out;
+
+ client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &dentry_lru_show_fops);
+ if (!client->debugfs_dentry_lru)
+ goto out;
+
+ client->debugfs_caps = debugfs_create_file("caps",
+ 0400,
+ client->debugfs_dir,
+ client,
+ &caps_show_fops);
+ if (!client->debugfs_caps)
+ goto out;
+
+ client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &congestion_kb_fops);
+ if (!client->debugfs_congestion_kb)
+ goto out;
+
+ sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
+ client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
+ name);
+
+ return 0;
+
+out:
+ ceph_debugfs_client_cleanup(client);
+ return ret;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+ debugfs_remove(client->debugfs_bdi);
+ debugfs_remove(client->debugfs_caps);
+ debugfs_remove(client->debugfs_dentry_lru);
+ debugfs_remove(client->debugfs_osdmap);
+ debugfs_remove(client->debugfs_mdsmap);
+ debugfs_remove(client->debugfs_monmap);
+ debugfs_remove(client->osdc.debugfs_file);
+ debugfs_remove(client->mdsc.debugfs_file);
+ debugfs_remove(client->monc.debugfs_file);
+ debugfs_remove(client->debugfs_congestion_kb);
+ debugfs_remove(client->debugfs_dir);
+}
+
+#else /* CONFIG_DEBUG_FS */
+
+int __init ceph_debugfs_init(void)
+{
+ return 0;
+}
+
+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+ return 0;
+}
+
+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif /* CONFIG_DEBUG_FS */
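Adding another read-only view follows the DEFINE_SHOW_FUNC pattern above; a
hedged sketch (the "fsid" file name and its contents are invented here, while
FSID_FORMAT/PR_FSID are the macros already used earlier in this file):

    /* Hypothetical extra debugfs view exposing the client fsid. */
    static int fsid_show(struct seq_file *s, void *p)
    {
        struct ceph_client *client = s->private;

        seq_printf(s, FSID_FORMAT "\n", PR_FSID(&client->fsid));
        return 0;
    }

    DEFINE_SHOW_FUNC(fsid_show)

    /* and in ceph_debugfs_client_init():
     *    debugfs_create_file("fsid", 0400, client->debugfs_dir,
     *                        client, &fsid_show_fops);
     */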
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 00000000000..65b3e022eaf
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
+#ifndef __CEPH_DECODE_H
+#define __CEPH_DECODE_H
+
+#include <asm/unaligned.h>
+#include <linux/time.h>
+
+#include "types.h"
+
+/*
+ * in all cases,
+ * void **p pointer to position pointer
+ * void *end pointer to end of buffer (last byte + 1)
+ */
+
+static inline u64 ceph_decode_64(void **p)
+{
+ u64 v = get_unaligned_le64(*p);
+ *p += sizeof(u64);
+ return v;
+}
+static inline u32 ceph_decode_32(void **p)
+{
+ u32 v = get_unaligned_le32(*p);
+ *p += sizeof(u32);
+ return v;
+}
+static inline u16 ceph_decode_16(void **p)
+{
+ u16 v = get_unaligned_le16(*p);
+ *p += sizeof(u16);
+ return v;
+}
+static inline u8 ceph_decode_8(void **p)
+{
+ u8 v = *(u8 *)*p;
+ (*p)++;
+ return v;
+}
+static inline void ceph_decode_copy(void **p, void *pv, size_t n)
+{
+ memcpy(pv, *p, n);
+ *p += n;
+}
+
+/*
+ * bounds check input.
+ */
+#define ceph_decode_need(p, end, n, bad) \
+ do { \
+ if (unlikely(*(p) + (n) > (end))) \
+ goto bad; \
+ } while (0)
+
+#define ceph_decode_64_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u64), bad); \
+ v = ceph_decode_64(p); \
+ } while (0)
+#define ceph_decode_32_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u32), bad); \
+ v = ceph_decode_32(p); \
+ } while (0)
+#define ceph_decode_16_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u16), bad); \
+ v = ceph_decode_16(p); \
+ } while (0)
+#define ceph_decode_8_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u8), bad); \
+ v = ceph_decode_8(p); \
+ } while (0)
+
+#define ceph_decode_copy_safe(p, end, pv, n, bad) \
+ do { \
+ ceph_decode_need(p, end, n, bad); \
+ ceph_decode_copy(p, pv, n); \
+ } while (0)
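+
+/*
+ * Typical usage walks *p forward through [buf, buf + buf_len), jumping
+ * to a local label on truncated input.  A minimal sketch (the decoded
+ * fields are illustrative only):
+ *
+ *	void *p = buf, *end = buf + buf_len;
+ *	u64 ino;
+ *	u32 namelen;
+ *
+ *	ceph_decode_64_safe(&p, end, ino, bad);
+ *	ceph_decode_32_safe(&p, end, namelen, bad);
+ *	ceph_decode_need(&p, end, namelen, bad);
+ *	...
+ *	return 0;
+ * bad:
+ *	return -ERANGE;
+ */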
+
+/*
+ * struct ceph_timespec <-> struct timespec
+ */
+static inline void ceph_decode_timespec(struct timespec *ts,
+ const struct ceph_timespec *tv)
+{
+ ts->tv_sec = le32_to_cpu(tv->tv_sec);
+ ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
+}
+static inline void ceph_encode_timespec(struct ceph_timespec *tv,
+ const struct timespec *ts)
+{
+ tv->tv_sec = cpu_to_le32(ts->tv_sec);
+ tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
+}
+
+/*
+ * sockaddr_storage <-> ceph_sockaddr
+ */
+static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+{
+ a->in_addr.ss_family = htons(a->in_addr.ss_family);
+}
+static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+{
+ a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
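+ /*
+ * 512 == ntohs(2) on a little-endian host: seeing it here most
+ * likely means the encoder never byte-swapped an AF_INET family.
+ */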
+ WARN_ON(a->in_addr.ss_family == 512);
+}
+
+/*
+ * encoders
+ */
+static inline void ceph_encode_64(void **p, u64 v)
+{
+ put_unaligned_le64(v, (__le64 *)*p);
+ *p += sizeof(u64);
+}
+static inline void ceph_encode_32(void **p, u32 v)
+{
+ put_unaligned_le32(v, (__le32 *)*p);
+ *p += sizeof(u32);
+}
+static inline void ceph_encode_16(void **p, u16 v)
+{
+ put_unaligned_le16(v, (__le16 *)*p);
+ *p += sizeof(u16);
+}
+static inline void ceph_encode_8(void **p, u8 v)
+{
+ *(u8 *)*p = v;
+ (*p)++;
+}
+static inline void ceph_encode_copy(void **p, const void *s, int len)
+{
+ memcpy(*p, s, len);
+ *p += len;
+}
+
+/*
+ * filepath, string encoders
+ */
+static inline void ceph_encode_filepath(void **p, void *end,
+ u64 ino, const char *path)
+{
+ u32 len = path ? strlen(path) : 0;
+ BUG_ON(*p + 1 + sizeof(ino) + sizeof(len) + len > end);
+ ceph_encode_8(p, 1);
+ ceph_encode_64(p, ino);
+ ceph_encode_32(p, len);
+ if (len)
+ memcpy(*p, path, len);
+ *p += len;
+}
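+
+/*
+ * The resulting wire format, in order: a one-byte encoding version (1),
+ * the little-endian 64-bit ino, a little-endian 32-bit length, then the
+ * unterminated path bytes.
+ */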
+
+static inline void ceph_encode_string(void **p, void *end,
+ const char *s, u32 len)
+{
+ BUG_ON(*p + sizeof(len) + len > end);
+ ceph_encode_32(p, len);
+ if (len)
+ memcpy(*p, s, len);
+ *p += len;
+}
+
+#define ceph_encode_need(p, end, n, bad) \
+ do { \
+ if (unlikely(*(p) + (n) > (end))) \
+ goto bad; \
+ } while (0)
+
+#define ceph_encode_64_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u64), bad); \
+ ceph_encode_64(p, v); \
+ } while (0)
+#define ceph_encode_32_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u32), bad); \
+ ceph_encode_32(p, v); \
+ } while (0)
+#define ceph_encode_16_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u16), bad); \
+ ceph_encode_16(p, v); \
+ } while (0)
+
+#define ceph_encode_copy_safe(p, end, pv, n, bad) \
+ do { \
+ ceph_encode_need(p, end, n, bad); \
+ ceph_encode_copy(p, pv, n); \
+ } while (0)
+
+
+#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 00000000000..8a9116e15b7
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1222 @@
+#include "ceph_debug.h"
+
+#include <linux/spinlock.h>
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#include "super.h"
+
+/*
+ * Directory operations: readdir, lookup, create, link, unlink,
+ * rename, etc.
+ */
+
+/*
+ * Ceph MDS operations are specified in terms of a base ino and
+ * relative path. Thus, the client can specify an operation on a
+ * specific inode (e.g., a getattr due to fstat(2)), or as a path
+ * relative to, say, the root directory.
+ *
+ * Normally, we limit ourselves to strict inode ops (no path component)
+ * or dentry operations (a single path component relative to an ino). The
+ * exception to this is open_root_dentry(), which will open the mount
+ * point by name.
+ */
+
+const struct inode_operations ceph_dir_iops;
+const struct file_operations ceph_dir_fops;
+struct dentry_operations ceph_dentry_ops;
+
+/*
+ * Initialize ceph dentry state.
+ */
+int ceph_init_dentry(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+
+ if (dentry->d_fsdata)
+ return 0;
+
+ if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+ dentry->d_op = &ceph_dentry_ops;
+ else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
+ dentry->d_op = &ceph_snapdir_dentry_ops;
+ else
+ dentry->d_op = &ceph_snap_dentry_ops;
+
+ di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+ if (!di)
+ return -ENOMEM; /* oh well */
+
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_fsdata) {
+ /* lost a race; free our unused allocation */
+ kmem_cache_free(ceph_dentry_cachep, di);
+ goto out_unlock;
+ }
+ di->dentry = dentry;
+ di->lease_session = NULL;
+ dentry->d_fsdata = di;
+ dentry->d_time = jiffies;
+ ceph_dentry_lru_add(dentry);
+out_unlock:
+ spin_unlock(&dentry->d_lock);
+ return 0;
+}
+
+
+
+/*
+ * for readdir, we encode the directory frag and offset within that
+ * frag into f_pos.
+ */
+static unsigned fpos_frag(loff_t p)
+{
+ return p >> 32;
+}
+static unsigned fpos_off(loff_t p)
+{
+ return p & 0xffffffff;
+}
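+
+/*
+ * The inverse packing is done by ceph_make_fpos() (a helper defined
+ * with the other inode utilities), effectively:
+ *
+ *	return ((loff_t)frag << 32) | (loff_t)off;
+ */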
+
+/*
+ * When possible, we try to satisfy a readdir by peeking at the
+ * dcache. We make this work by carefully ordering dentries on
+ * d_u.d_child when we initially get results back from the MDS, and
+ * falling back to a "normal" sync readdir if any dentries in the dir
+ * are dropped.
+ *
+ * I_COMPLETE indicates we have all dentries in the dir. It is
+ * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
+ * the MDS if/when the directory is modified).
+ */
+static int __dcache_readdir(struct file *filp,
+ void *dirent, filldir_t filldir)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_file_info *fi = filp->private_data;
+ struct dentry *parent = filp->f_dentry;
+ struct inode *dir = parent->d_inode;
+ struct list_head *p;
+ struct dentry *dentry, *last;
+ struct ceph_dentry_info *di;
+ int err = 0;
+
+ /* claim ref on last dentry we returned */
+ last = fi->dentry;
+ fi->dentry = NULL;
+
+ dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+ last);
+
+ spin_lock(&dcache_lock);
+
+ /* start at beginning? */
+ if (filp->f_pos == 2 || (last &&
+ filp->f_pos < ceph_dentry(last)->offset)) {
+ if (list_empty(&parent->d_subdirs))
+ goto out_unlock;
+ p = parent->d_subdirs.prev;
+ dout(" initial p %p/%p\n", p->prev, p->next);
+ } else {
+ p = last->d_u.d_child.prev;
+ }
+
+more:
+ dentry = list_entry(p, struct dentry, d_u.d_child);
+ di = ceph_dentry(dentry);
+ while (1) {
+ dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+ parent->d_subdirs.prev, parent->d_subdirs.next);
+ if (p == &parent->d_subdirs) {
+ fi->at_end = 1;
+ goto out_unlock;
+ }
+ if (!d_unhashed(dentry) && dentry->d_inode &&
+ ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
+ ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
+ filp->f_pos <= di->offset)
+ break;
+ dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, di->offset,
+ filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+ !dentry->d_inode ? " null" : "");
+ p = p->prev;
+ dentry = list_entry(p, struct dentry, d_u.d_child);
+ di = ceph_dentry(dentry);
+ }
+
+ atomic_inc(&dentry->d_count);
+ spin_unlock(&dcache_lock);
+ spin_unlock(&inode->i_lock);
+
+ dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+ dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+ filp->f_pos = di->offset;
+ err = filldir(dirent, dentry->d_name.name,
+ dentry->d_name.len, di->offset,
+ dentry->d_inode->i_ino,
+ dentry->d_inode->i_mode >> 12);
+
+ if (last) {
+ if (err < 0) {
+ /* remember our position */
+ fi->dentry = last;
+ fi->next_offset = di->offset;
+ } else {
+ dput(last);
+ }
+ last = NULL;
+ }
+
+ spin_lock(&inode->i_lock);
+ spin_lock(&dcache_lock);
+
+ if (err < 0)
+ goto out_unlock;
+
+ last = dentry;
+
+ p = p->prev;
+ filp->f_pos++;
+
+ /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
+ if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
+ goto more;
+ dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+ err = -EAGAIN;
+
+out_unlock:
+ spin_unlock(&dcache_lock);
+
+ if (last) {
+ spin_unlock(&inode->i_lock);
+ dput(last);
+ spin_lock(&inode->i_lock);
+ }
+
+ return err;
+}
+
+/*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+ int len)
+{
+ kfree(fi->last_name);
+ fi->last_name = kmalloc(len+1, GFP_NOFS);
+ if (!fi->last_name)
+ return -ENOMEM;
+ memcpy(fi->last_name, name, len);
+ fi->last_name[len] = 0;
+ dout("note_last_dentry '%s'\n", fi->last_name);
+ return 0;
+}
+
+static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct ceph_file_info *fi = filp->private_data;
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ unsigned frag = fpos_frag(filp->f_pos);
+ int off = fpos_off(filp->f_pos);
+ int err;
+ u32 ftype;
+ struct ceph_mds_reply_info_parsed *rinfo;
+ const int max_entries = client->mount_args->max_readdir;
+
+ dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+ if (fi->at_end)
+ return 0;
+
+ /* always start with . and .. */
+ if (filp->f_pos == 0) {
+ /* note dir version at start of readdir so we can tell
+ * if any dentries get dropped */
+ fi->dir_release_count = ci->i_release_count;
+
+ dout("readdir off 0 -> '.'\n");
+ if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+ inode->i_ino, inode->i_mode >> 12) < 0)
+ return 0;
+ filp->f_pos = 1;
+ off = 1;
+ }
+ if (filp->f_pos == 1) {
+ dout("readdir off 1 -> '..'\n");
+ if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+ filp->f_dentry->d_parent->d_inode->i_ino,
+ inode->i_mode >> 12) < 0)
+ return 0;
+ filp->f_pos = 2;
+ off = 2;
+ }
+
+ /* can we use the dcache? */
+ spin_lock(&inode->i_lock);
+ if ((filp->f_pos == 2 || fi->dentry) &&
+ !ceph_test_opt(client, NOASYNCREADDIR) &&
+ (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ err = __dcache_readdir(filp, dirent, filldir);
+ if (err != -EAGAIN) {
+ spin_unlock(&inode->i_lock);
+ return err;
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ if (fi->dentry) {
+ err = note_last_dentry(fi, fi->dentry->d_name.name,
+ fi->dentry->d_name.len);
+ if (err)
+ return err;
+ dput(fi->dentry);
+ fi->dentry = NULL;
+ }
+
+ /* proceed with a normal readdir */
+
+more:
+ /* do we have the correct frag content buffered? */
+ if (fi->frag != frag || fi->last_readdir == NULL) {
+ struct ceph_mds_request *req;
+ int op = ceph_snap(inode) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
+
+ /* discard old result, if any */
+ if (fi->last_readdir) {
+ ceph_mdsc_put_request(fi->last_readdir);
+ fi->last_readdir = NULL;
+ }
+
+ /* requery frag tree, as the frag topology may have changed */
+ frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
+
+ dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
+ ceph_vinop(inode), frag, fi->last_name);
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+ req->r_dentry = dget(filp->f_dentry);
+ /* hints to request -> mds selection code */
+ req->r_direct_mode = USE_AUTH_MDS;
+ req->r_direct_hash = ceph_frag_value(frag);
+ req->r_direct_is_hash = true;
+ req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+ req->r_readdir_offset = fi->next_offset;
+ req->r_args.readdir.frag = cpu_to_le32(frag);
+ req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
+ req->r_num_caps = max_entries;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
+ dout("readdir got and parsed readdir result=%d"
+ " on frag %x, end=%d, complete=%d\n", err, frag,
+ (int)req->r_reply_info.dir_end,
+ (int)req->r_reply_info.dir_complete);
+
+ if (!req->r_did_prepopulate) {
+ dout("readdir !did_prepopulate");
+ fi->dir_release_count--; /* preclude I_COMPLETE */
+ }
+
+ /* note next offset and last dentry name */
+ fi->offset = fi->next_offset;
+ fi->last_readdir = req;
+
+ if (req->r_reply_info.dir_end) {
+ kfree(fi->last_name);
+ fi->last_name = NULL;
+ fi->next_offset = 0;
+ } else {
+ rinfo = &req->r_reply_info;
+ err = note_last_dentry(fi,
+ rinfo->dir_dname[rinfo->dir_nr-1],
+ rinfo->dir_dname_len[rinfo->dir_nr-1]);
+ if (err)
+ return err;
+ fi->next_offset += rinfo->dir_nr;
+ }
+ }
+
+ rinfo = &fi->last_readdir->r_reply_info;
+ dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
+ rinfo->dir_nr, off, fi->offset);
+ while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
+ u64 pos = ceph_make_fpos(frag, off);
+ struct ceph_mds_reply_inode *in =
+ rinfo->dir_in[off - fi->offset].in;
+ dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
+ off, off - fi->offset, rinfo->dir_nr, pos,
+ rinfo->dir_dname_len[off - fi->offset],
+ rinfo->dir_dname[off - fi->offset], in);
+ BUG_ON(!in);
+ ftype = le32_to_cpu(in->mode) >> 12;
+ if (filldir(dirent,
+ rinfo->dir_dname[off - fi->offset],
+ rinfo->dir_dname_len[off - fi->offset],
+ pos,
+ le64_to_cpu(in->ino),
+ ftype) < 0) {
+ dout("filldir stopping us...\n");
+ return 0;
+ }
+ off++;
+ filp->f_pos = pos + 1;
+ }
+
+ if (fi->last_name) {
+ ceph_mdsc_put_request(fi->last_readdir);
+ fi->last_readdir = NULL;
+ goto more;
+ }
+
+ /* more frags? */
+ if (!ceph_frag_is_rightmost(frag)) {
+ frag = ceph_frag_next(frag);
+ off = 0;
+ filp->f_pos = ceph_make_fpos(frag, off);
+ dout("readdir next frag is %x\n", frag);
+ goto more;
+ }
+ fi->at_end = 1;
+
+ /*
+ * if dir_release_count still matches the dir, no dentries
+ * were released during the whole readdir, and we should have
+ * the complete dir contents in our cache.
+ */
+ spin_lock(&inode->i_lock);
+ if (ci->i_release_count == fi->dir_release_count) {
+ dout(" marking %p complete\n", inode);
+ ci->i_ceph_flags |= CEPH_I_COMPLETE;
+ ci->i_max_offset = filp->f_pos;
+ }
+ spin_unlock(&inode->i_lock);
+
+ dout("readdir %p filp %p done.\n", inode, filp);
+ return 0;
+}
+
+static void reset_readdir(struct ceph_file_info *fi)
+{
+ if (fi->last_readdir) {
+ ceph_mdsc_put_request(fi->last_readdir);
+ fi->last_readdir = NULL;
+ }
+ kfree(fi->last_name);
+ fi->next_offset = 2; /* compensate for . and .. */
+ if (fi->dentry) {
+ dput(fi->dentry);
+ fi->dentry = NULL;
+ }
+ fi->at_end = 0;
+}
+
+static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file->f_mapping->host;
+ loff_t old_offset = offset;
+ loff_t retval;
+
+ mutex_lock(&inode->i_mutex);
+ switch (origin) {
+ case SEEK_END:
+ offset += inode->i_size + 2; /* FIXME */
+ break;
+ case SEEK_CUR:
+ offset += file->f_pos;
+ }
+ retval = -EINVAL;
+ if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ fi->at_end = 0;
+ }
+ retval = offset;
+
+ /*
+ * discard buffered readdir content on seekdir(0), or
+ * seek to new frag, or seek prior to current chunk.
+ */
+ if (offset == 0 ||
+ fpos_frag(offset) != fpos_frag(old_offset) ||
+ fpos_off(offset) < fi->offset) {
+ dout("dir_llseek dropping %p content\n", file);
+ reset_readdir(fi);
+ }
+
+ /* bump dir_release_count if we did a forward seek */
+ if (offset > old_offset)
+ fi->dir_release_count--;
+ }
+ mutex_unlock(&inode->i_mutex);
+ return retval;
+}
+
+/*
+ * Process result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+ struct dentry *dentry, int err)
+{
+ struct ceph_client *client = ceph_client(dentry->d_sb);
+ struct inode *parent = dentry->d_parent->d_inode;
+
+ /* .snap dir? */
+ if (err == -ENOENT &&
+ ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
+ strcmp(dentry->d_name.name,
+ client->mount_args->snapdir_name) == 0) {
+ struct inode *inode = ceph_get_snapdir(parent);
+ dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
+ dentry, dentry->d_name.len, dentry->d_name.name, inode);
+ d_add(dentry, inode);
+ err = 0;
+ }
+
+ if (err == -ENOENT) {
+ /* no trace? */
+ err = 0;
+ if (!req->r_reply_info.head->is_dentry) {
+ dout("ENOENT and no trace, dentry %p inode %p\n",
+ dentry, dentry->d_inode);
+ if (dentry->d_inode) {
+ d_drop(dentry);
+ err = -ENOENT;
+ } else {
+ d_add(dentry, NULL);
+ }
+ }
+ }
+ if (err)
+ dentry = ERR_PTR(err);
+ else if (dentry != req->r_dentry)
+ dentry = dget(req->r_dentry); /* we got spliced */
+ else
+ dentry = NULL;
+ return dentry;
+}
+
+static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+{
+ return ceph_ino(inode) == CEPH_INO_ROOT &&
+ strncmp(dentry->d_name.name, ".ceph", 5) == 0;
+}
+
+/*
+ * Look up a single dir entry. If there is a lookup intent, inform
+ * the MDS so that it gets our 'caps wanted' value in a single op.
+ */
+static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int op;
+ int err;
+
+ dout("lookup %p dentry %p '%.*s'\n",
+ dir, dentry, dentry->d_name.len, dentry->d_name.name);
+
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ err = ceph_init_dentry(dentry);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /* open (but not create!) intent? */
+ if (nd &&
+ (nd->flags & LOOKUP_OPEN) &&
+ (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
+ !(nd->intent.open.flags & O_CREAT)) {
+ int mode = nd->intent.open.create_mode & ~current->fs->umask;
+ return ceph_lookup_open(dir, dentry, nd, mode, 1);
+ }
+
+ /* can we conclude ENOENT locally? */
+ if (dentry->d_inode == NULL) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ spin_lock(&dir->i_lock);
+ dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+ if (strncmp(dentry->d_name.name,
+ client->mount_args->snapdir_name,
+ dentry->d_name.len) &&
+ !is_root_ceph_dentry(dir, dentry) &&
+ (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+ di->offset = ci->i_max_offset++;
+ spin_unlock(&dir->i_lock);
+ dout(" dir %p complete, -ENOENT\n", dir);
+ d_add(dentry, NULL);
+ di->lease_shared_gen = ci->i_shared_gen;
+ return NULL;
+ }
+ spin_unlock(&dir->i_lock);
+ }
+
+ op = ceph_snap(dir) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+ req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ /* we only need inode linkage */
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+ req->r_locked_dir = dir;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ dentry = ceph_finish_lookup(req, dentry, err);
+ ceph_mdsc_put_request(req); /* will dput(dentry) */
+ dout("lookup result=%p\n", dentry);
+ return dentry;
+}
+
+/*
+ * If we do a create but get no trace back from the MDS, follow up with
+ * a lookup (the VFS expects us to link up the provided dentry).
+ */
+int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
+{
+ struct dentry *result = ceph_lookup(dir, dentry, NULL);
+
+ if (result && !IS_ERR(result)) {
+ /*
+ * We created the item, then did a lookup, and found
+ * it was already linked to another inode we already
+ * had in our cache (and thus got spliced). Link our
+ * dentry to that inode, but don't hash it, just in
+ * case the VFS wants to dereference it.
+ */
+ BUG_ON(!result->d_inode);
+ d_instantiate(dentry, result->d_inode);
+ return 0;
+ }
+ return PTR_ERR(result);
+}
+
+static int ceph_mknod(struct inode *dir, struct dentry *dentry,
+ int mode, dev_t rdev)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
+ dir, dentry, mode, rdev);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ d_drop(dentry);
+ return PTR_ERR(req);
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_locked_dir = dir;
+ req->r_args.mknod.mode = cpu_to_le32(mode);
+ req->r_args.mknod.rdev = cpu_to_le32(rdev);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+ if (err)
+ d_drop(dentry);
+ return err;
+}
+
+static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ dout("create in dir %p dentry %p name '%.*s'\n",
+ dir, dentry, dentry->d_name.len, dentry->d_name.name);
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (nd) {
+ BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
+ dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
+ /* hrm, what should i do here if we get aliased? */
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ return 0;
+ }
+
+ /* fall back to mknod */
+ return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
+}
+
+static int ceph_symlink(struct inode *dir, struct dentry *dentry,
+ const char *dest)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ d_drop(dentry);
+ return PTR_ERR(req);
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_path2 = kstrdup(dest, GFP_NOFS);
+ req->r_locked_dir = dir;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+ if (err)
+ d_drop(dentry);
+ return err;
+}
+
+static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err = -EROFS;
+ int op;
+
+ if (ceph_snap(dir) == CEPH_SNAPDIR) {
+ /* mkdir .snap/foo is a MKSNAP */
+ op = CEPH_MDS_OP_MKSNAP;
+ dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
+ dentry->d_name.len, dentry->d_name.name, dentry);
+ } else if (ceph_snap(dir) == CEPH_NOSNAP) {
+ dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
+ op = CEPH_MDS_OP_MKDIR;
+ } else {
+ goto out;
+ }
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_locked_dir = dir;
+ req->r_args.mkdir.mode = cpu_to_le32(mode);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+out:
+ if (err < 0)
+ d_drop(dentry);
+ return err;
+}
+
+static int ceph_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("link in dir %p old_dentry %p dentry %p\n", dir,
+ old_dentry, dentry);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ d_drop(dentry);
+ return PTR_ERR(req);
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
+ req->r_locked_dir = dir;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (err)
+ d_drop(dentry);
+ else if (!req->r_reply_info.head->is_dentry)
+ d_instantiate(dentry, igrab(old_dentry->d_inode));
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+/*
+ * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+static int drop_caps_for_unlink(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+ spin_lock(&inode->i_lock);
+ if (inode->i_nlink == 1) {
+ drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ }
+ spin_unlock(&inode->i_lock);
+ return drop;
+}
+
+/*
+ * rmdir and unlink differ only in the metadata op code
+ */
+static int ceph_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct inode *inode = dentry->d_inode;
+ struct ceph_mds_request *req;
+ int err = -EROFS;
+ int op;
+
+ if (ceph_snap(dir) == CEPH_SNAPDIR) {
+ /* rmdir .snap/foo is RMSNAP */
+ dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
+ dentry->d_name.name, dentry);
+ op = CEPH_MDS_OP_RMSNAP;
+ } else if (ceph_snap(dir) == CEPH_NOSNAP) {
+ dout("unlink/rmdir dir %p dn %p inode %p\n",
+ dir, dentry, inode);
+ op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
+ CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
+ } else
+ goto out;
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_locked_dir = dir;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ req->r_inode_drop = drop_caps_for_unlink(inode);
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ d_delete(dentry);
+ ceph_mdsc_put_request(req);
+out:
+ return err;
+}
+
+static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(old_dir) != ceph_snap(new_dir))
+ return -EXDEV;
+ if (ceph_snap(old_dir) != CEPH_NOSNAP ||
+ ceph_snap(new_dir) != CEPH_NOSNAP)
+ return -EROFS;
+ dout("rename dir %p dentry %p to dir %p dentry %p\n",
+ old_dir, old_dentry, new_dir, new_dentry);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_dentry = dget(new_dentry);
+ req->r_num_caps = 2;
+ req->r_old_dentry = dget(old_dentry);
+ req->r_locked_dir = new_dir;
+ req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ /* release LINK_RDCACHE on source inode (mds will lock it) */
+ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
+ if (new_dentry->d_inode)
+ req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
+ err = ceph_mdsc_do_request(mdsc, old_dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry) {
+ /*
+ * Normally d_move() is done by fill_trace (called by
+ * do_request, above). If there is no trace, we need
+ * to do it here.
+ */
+ d_move(old_dentry, new_dentry);
+ }
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+
+/*
+ * Check if dentry lease is valid. If not, delete the lease. Try to
+ * renew if the lease is more than half up.
+ */
+static int dentry_lease_is_valid(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+ struct ceph_mds_session *s;
+ int valid = 0;
+ u32 gen;
+ unsigned long ttl;
+ struct ceph_mds_session *session = NULL;
+ struct inode *dir = NULL;
+ u32 seq = 0;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (di && di->lease_session) {
+ s = di->lease_session;
+ spin_lock(&s->s_cap_lock);
+ gen = s->s_cap_gen;
+ ttl = s->s_cap_ttl;
+ spin_unlock(&s->s_cap_lock);
+
+ if (di->lease_gen == gen &&
+ time_before(jiffies, dentry->d_time) &&
+ time_before(jiffies, ttl)) {
+ valid = 1;
+ if (di->lease_renew_after &&
+ time_after(jiffies, di->lease_renew_after)) {
+ /* we should renew */
+ dir = dentry->d_parent->d_inode;
+ session = ceph_get_mds_session(s);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
+ }
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+
+ if (session) {
+ ceph_mdsc_lease_send_msg(session, dir, dentry,
+ CEPH_MDS_LEASE_RENEW, seq);
+ ceph_put_mds_session(session);
+ }
+ dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+ return valid;
+}
+
+/*
+ * Check if directory-wide content lease/cap is valid.
+ */
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int valid = 0;
+
+ spin_lock(&dir->i_lock);
+ if (ci->i_shared_gen == di->lease_shared_gen)
+ valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+ spin_unlock(&dir->i_lock);
+ dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
+ dir, (unsigned)ci->i_shared_gen, dentry,
+ (unsigned)di->lease_shared_gen, valid);
+ return valid;
+}
+
+/*
+ * Check if cached dentry can be trusted.
+ */
+static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ dout("d_revalidate %p '%.*s' inode %p\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+
+ /* always trust cached snapped dentries, snapdir dentry */
+ if (ceph_snap(dir) != CEPH_NOSNAP) {
+ dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+ goto out_touch;
+ }
+ if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
+ goto out_touch;
+
+ if (dentry_lease_is_valid(dentry) ||
+ dir_lease_is_valid(dir, dentry))
+ goto out_touch;
+
+ dout("d_revalidate %p invalid\n", dentry);
+ d_drop(dentry);
+ return 0;
+out_touch:
+ ceph_dentry_lru_touch(dentry);
+ return 1;
+}
+
+/*
+ * When a dentry is released, clear the dir I_COMPLETE if it was part
+ * of the current dir gen.
+ */
+static void ceph_dentry_release(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct inode *parent_inode = dentry->d_parent->d_inode;
+
+ if (parent_inode) {
+ struct ceph_inode_info *ci = ceph_inode(parent_inode);
+
+ spin_lock(&parent_inode->i_lock);
+ if (ci->i_shared_gen == di->lease_shared_gen) {
+ dout(" clearing %p complete (d_release)\n",
+ parent_inode);
+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ ci->i_release_count++;
+ }
+ spin_unlock(&parent_inode->i_lock);
+ }
+ if (di) {
+ ceph_dentry_lru_del(dentry);
+ if (di->lease_session)
+ ceph_put_mds_session(di->lease_session);
+ kmem_cache_free(ceph_dentry_cachep, di);
+ dentry->d_fsdata = NULL;
+ }
+}
+
+static int ceph_snapdir_d_revalidate(struct dentry *dentry,
+ struct nameidata *nd)
+{
+ /*
+ * Eventually, we'll want to revalidate snapped metadata
+ * too... probably...
+ */
+ return 1;
+}
+
+
+
+/*
+ * read() on a dir. This weird interface hack only works if mounted
+ * with '-o dirstat'.
+ */
+static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ struct ceph_file_info *cf = file->private_data;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int left;
+
+ if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+ return -EISDIR;
+
+ if (!cf->dir_info) {
+ cf->dir_info = kmalloc(1024, GFP_NOFS);
+ if (!cf->dir_info)
+ return -ENOMEM;
+ cf->dir_info_len =
+ sprintf(cf->dir_info,
+ "entries: %20lld\n"
+ " files: %20lld\n"
+ " subdirs: %20lld\n"
+ "rentries: %20lld\n"
+ " rfiles: %20lld\n"
+ " rsubdirs: %20lld\n"
+ "rbytes: %20lld\n"
+ "rctime: %10ld.%09ld\n",
+ ci->i_files + ci->i_subdirs,
+ ci->i_files,
+ ci->i_subdirs,
+ ci->i_rfiles + ci->i_rsubdirs,
+ ci->i_rfiles,
+ ci->i_rsubdirs,
+ ci->i_rbytes,
+ (long)ci->i_rctime.tv_sec,
+ (long)ci->i_rctime.tv_nsec);
+ }
+
+ if (*ppos >= cf->dir_info_len)
+ return 0;
+ size = min_t(unsigned, size, cf->dir_info_len-*ppos);
+ left = copy_to_user(buf, cf->dir_info + *ppos, size);
+ if (left == size)
+ return -EFAULT;
+ *ppos += (size - left);
+ return size - left;
+}
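+
+/*
+ * From userspace this looks like (a sketch, assuming a ceph mount at
+ * /mnt/ceph with -o dirstat):
+ *
+ *	int fd = open("/mnt/ceph/some/dir", O_RDONLY);
+ *	char buf[1024];
+ *	ssize_t n = read(fd, buf, sizeof(buf));
+ *
+ * Without -o dirstat the read fails with EISDIR, as on any other
+ * filesystem.
+ */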
+
+/*
+ * an fsync() on a dir will wait for any uncommitted directory
+ * operations to commit.
+ */
+static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
+ int datasync)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct list_head *head = &ci->i_unsafe_dirops;
+ struct ceph_mds_request *req;
+ u64 last_tid;
+ int ret = 0;
+
+ dout("dir_fsync %p\n", inode);
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ req = list_entry(head->prev,
+ struct ceph_mds_request, r_unsafe_dir_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_mdsc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+ dout("dir_fsync %p wait on tid %llu (until %llu)\n",
+ inode, req->r_tid, last_tid);
+ if (req->r_timeout) {
+ ret = wait_for_completion_timeout(
+ &req->r_safe_completion, req->r_timeout);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ ret = -EIO; /* timed out */
+ } else {
+ wait_for_completion(&req->r_safe_completion);
+ }
+ spin_lock(&ci->i_unsafe_lock);
+ ceph_mdsc_put_request(req);
+
+ if (ret || list_empty(head))
+ break;
+ req = list_entry(head->next,
+ struct ceph_mds_request, r_unsafe_dir_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+ return ret;
+}
+
+/*
+ * We maintain a private dentry LRU.
+ *
+ * FIXME: this needs to be changed to a per-mds lru to be useful.
+ */
+void ceph_dentry_lru_add(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
+ dn->d_name.len, dn->d_name.name);
+ if (di) {
+ mdsc = &ceph_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_add_tail(&di->lru, &mdsc->dentry_lru);
+ mdsc->num_dentry++;
+ spin_unlock(&mdsc->dentry_lru_lock);
+ }
+}
+
+void ceph_dentry_lru_touch(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
+ dn->d_name.len, dn->d_name.name);
+ if (di) {
+ mdsc = &ceph_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_move_tail(&di->lru, &mdsc->dentry_lru);
+ spin_unlock(&mdsc->dentry_lru_lock);
+ }
+}
+
+void ceph_dentry_lru_del(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
+ dn->d_name.len, dn->d_name.name);
+ if (di) {
+ mdsc = &ceph_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_del_init(&di->lru);
+ mdsc->num_dentry--;
+ spin_unlock(&mdsc->dentry_lru_lock);
+ }
+}
+
+const struct file_operations ceph_dir_fops = {
+ .read = ceph_read_dir,
+ .readdir = ceph_readdir,
+ .llseek = ceph_dir_llseek,
+ .open = ceph_open,
+ .release = ceph_release,
+ .unlocked_ioctl = ceph_ioctl,
+ .fsync = ceph_dir_fsync,
+};
+
+const struct inode_operations ceph_dir_iops = {
+ .lookup = ceph_lookup,
+ .permission = ceph_permission,
+ .getattr = ceph_getattr,
+ .setattr = ceph_setattr,
+ .setxattr = ceph_setxattr,
+ .getxattr = ceph_getxattr,
+ .listxattr = ceph_listxattr,
+ .removexattr = ceph_removexattr,
+ .mknod = ceph_mknod,
+ .symlink = ceph_symlink,
+ .mkdir = ceph_mkdir,
+ .link = ceph_link,
+ .unlink = ceph_unlink,
+ .rmdir = ceph_unlink,
+ .rename = ceph_rename,
+ .create = ceph_create,
+};
+
+struct dentry_operations ceph_dentry_ops = {
+ .d_revalidate = ceph_d_revalidate,
+ .d_release = ceph_dentry_release,
+};
+
+struct dentry_operations ceph_snapdir_dentry_ops = {
+ .d_revalidate = ceph_snapdir_d_revalidate,
+};
+
+struct dentry_operations ceph_snap_dentry_ops = {
+};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 00000000000..fc68e39cbad
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,223 @@
+#include "ceph_debug.h"
+
+#include <linux/exportfs.h>
+#include <asm/unaligned.h>
+
+#include "super.h"
+
+/*
+ * NFS export support
+ *
+ * NFS re-export of a ceph mount is, at present, only semireliable.
+ * The basic issue is that the Ceph architecture doesn't lend itself
+ * well to generating filehandles that will remain valid forever.
+ *
+ * So, we do our best. If you're lucky, your inode will be in the
+ * client's cache. If it's not, and you have a connectable fh, then
+ * the MDS server may be able to find it for you. Otherwise, you get
+ * ESTALE.
+ *
+ * There are ways to make this more reliable, but in the non-connectable
+ * fh case, we won't ever work perfectly, and in the connectable case,
+ * some changes are needed on the MDS side to work better.
+ */
+
+/*
+ * Basic fh
+ */
+struct ceph_nfs_fh {
+ u64 ino;
+} __attribute__ ((packed));
+
+/*
+ * Larger 'connectable' fh that includes parent ino and name hash.
+ * Use this whenever possible, as it works more reliably.
+ */
+struct ceph_nfs_confh {
+ u64 ino, parent_ino;
+ u32 parent_name_hash;
+} __attribute__ ((packed));
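+
+/*
+ * Since both structs are packed, a basic fh is 8 bytes on the wire and
+ * a connectable fh is 20; ceph_encode_fh() below prefers the larger
+ * form whenever the caller's buffer has room for it.
+ */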
+
+static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
+ int connectable)
+{
+ struct ceph_nfs_fh *fh = (void *)rawfh;
+ struct ceph_nfs_confh *cfh = (void *)rawfh;
+ struct dentry *parent = dentry->d_parent;
+ struct inode *inode = dentry->d_inode;
+ int type;
+
+ /* don't re-export snaps */
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EINVAL;
+
+ if (*max_len >= sizeof(*cfh)) {
+ dout("encode_fh %p connectable\n", dentry);
+ cfh->ino = ceph_ino(dentry->d_inode);
+ cfh->parent_ino = ceph_ino(parent->d_inode);
+ cfh->parent_name_hash = parent->d_name.hash;
+ *max_len = sizeof(*cfh);
+ type = 2;
+ } else if (*max_len >= sizeof(*fh)) {
+ if (connectable)
+ return -ENOSPC;
+ dout("encode_fh %p\n", dentry);
+ fh->ino = ceph_ino(dentry->d_inode);
+ *max_len = sizeof(*fh);
+ type = 1;
+ } else {
+ return -ENOSPC;
+ }
+ return type;
+}
+
+/*
+ * convert regular fh to dentry
+ *
+ * FIXME: we should try harder by querying the mds for the ino.
+ */
+static struct dentry *__fh_to_dentry(struct super_block *sb,
+ struct ceph_nfs_fh *fh)
+{
+ struct inode *inode;
+ struct dentry *dentry;
+ struct ceph_vino vino;
+ int err;
+
+ dout("__fh_to_dentry %llx\n", fh->ino);
+ vino.ino = fh->ino;
+ vino.snap = CEPH_NOSNAP;
+ inode = ceph_find_inode(sb, vino);
+ if (!inode)
+ return ERR_PTR(-ESTALE);
+
+ dentry = d_obtain_alias(inode);
+ if (IS_ERR(dentry)) {
+ /* d_obtain_alias() drops the inode ref on failure */
+ pr_err("fh_to_dentry %llx -- inode %p but no dentry\n",
+ fh->ino, inode);
+ return dentry;
+ }
+ err = ceph_init_dentry(dentry);
+
+ if (err < 0) {
+ iput(inode);
+ return ERR_PTR(err);
+ }
+ dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
+ return dentry;
+}
+
+/*
+ * convert connectable fh to dentry
+ */
+static struct dentry *__cfh_to_dentry(struct super_block *sb,
+ struct ceph_nfs_confh *cfh)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
+ struct inode *inode;
+ struct dentry *dentry;
+ struct ceph_vino vino;
+ int err;
+
+ dout("__cfh_to_dentry %llx (%llx/%x)\n",
+ cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
+
+ vino.ino = cfh->ino;
+ vino.snap = CEPH_NOSNAP;
+ inode = ceph_find_inode(sb, vino);
+ if (!inode) {
+ struct ceph_mds_request *req;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+
+ req->r_ino1 = vino;
+ req->r_ino2.ino = cfh->parent_ino;
+ req->r_ino2.snap = CEPH_NOSNAP;
+ req->r_path2 = kmalloc(16, GFP_NOFS);
+ if (!req->r_path2) {
+ ceph_mdsc_put_request(req);
+ return ERR_PTR(-ENOMEM);
+ }
+ snprintf(req->r_path2, 16, "%u", cfh->parent_name_hash);
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ ceph_mdsc_put_request(req);
+ inode = ceph_find_inode(sb, vino);
+ if (!inode)
+ return ERR_PTR(err ? err : -ESTALE);
+ }
+
+ dentry = d_obtain_alias(inode);
+ if (IS_ERR(dentry)) {
+ /* d_obtain_alias() drops the inode ref on failure */
+ pr_err("cfh_to_dentry %llx -- inode %p but no dentry\n",
+ cfh->ino, inode);
+ return dentry;
+ }
+ err = ceph_init_dentry(dentry);
+ if (err < 0) {
+ iput(inode);
+ return ERR_PTR(err);
+ }
+ dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
+ return dentry;
+}
+
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ if (fh_type == 1)
+ return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
+ else
+ return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
+}
+
+/*
+ * get parent, if possible.
+ *
+ * FIXME: we could do better by querying the mds to discover the
+ * parent.
+ */
+static struct dentry *ceph_fh_to_parent(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct ceph_nfs_confh *cfh = (void *)fid->raw;
+ struct ceph_vino vino;
+ struct inode *inode;
+ struct dentry *dentry;
+ int err;
+
+ if (fh_type == 1)
+ return ERR_PTR(-ESTALE);
+
+ pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
+ cfh->parent_name_hash);
+
+ vino.ino = cfh->ino;
+ vino.snap = CEPH_NOSNAP;
+ inode = ceph_find_inode(sb, vino);
+ if (!inode)
+ return ERR_PTR(-ESTALE);
+
+ dentry = d_obtain_alias(inode);
+ if (IS_ERR(dentry)) {
+ /* d_obtain_alias() drops the inode ref on failure */
+ pr_err("fh_to_parent %llx -- inode %p but no dentry\n",
+ cfh->ino, inode);
+ return dentry;
+ }
+ err = ceph_init_dentry(dentry);
+ if (err < 0) {
+ iput(inode);
+ return ERR_PTR(err);
+ }
+ dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
+ return dentry;
+}
+
+const struct export_operations ceph_export_ops = {
+ .encode_fh = ceph_encode_fh,
+ .fh_to_dentry = ceph_fh_to_dentry,
+ .fh_to_parent = ceph_fh_to_parent,
+};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 00000000000..5d2af8464f6
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,937 @@
+#include "ceph_debug.h"
+
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+
+#include "super.h"
+#include "mds_client.h"
+
+/*
+ * Ceph file operations
+ *
+ * Implement basic open/close functionality, and implement
+ * read/write.
+ *
+ * We implement three modes of file I/O:
+ * - buffered uses the generic_file_aio_{read,write} helpers
+ *
+ * - synchronous is used when there is multi-client read/write
+ * sharing, avoids the page cache, and synchronously waits for an
+ * ack from the OSD.
+ *
+ * - direct io takes the variant of the sync path that references
+ * user pages directly.
+ *
+ * fsync() flushes and waits on dirty pages, but just queues metadata
+ * for writeback: since the MDS can recover size and mtime there is no
+ * need to wait for MDS acknowledgement.
+ */
+
+
+/*
+ * Prepare an open request. Preallocate ceph_cap to avoid an
+ * inopportune ENOMEM later.
+ */
+static struct ceph_mds_request *
+prepare_open_request(struct super_block *sb, int flags, int create_mode)
+{
+ struct ceph_client *client = ceph_sb_to_client(sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int want_auth = USE_ANY_MDS;
+ int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
+
+ if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
+ want_auth = USE_AUTH_MDS;
+
+ req = ceph_mdsc_create_request(mdsc, op, want_auth);
+ if (IS_ERR(req))
+ goto out;
+ req->r_fmode = ceph_flags_to_mode(flags);
+ req->r_args.open.flags = cpu_to_le32(flags);
+ req->r_args.open.mode = cpu_to_le32(create_mode);
+ req->r_args.open.preferred = cpu_to_le32(-1);
+out:
+ return req;
+}
+
+/*
+ * initialize private struct file data.
+ * if we fail, clean up by dropping fmode reference on the ceph_inode
+ */
+static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
+{
+ struct ceph_file_info *cf;
+ int ret = 0;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ dout("init_file %p %p 0%o (regular)\n", inode, file,
+ inode->i_mode);
+ cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
+ if (cf == NULL) {
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ return -ENOMEM;
+ }
+ cf->fmode = fmode;
+ cf->next_offset = 2;
+ file->private_data = cf;
+ BUG_ON(inode->i_fop->release != ceph_release);
+ break;
+
+ case S_IFLNK:
+ dout("init_file %p %p 0%o (symlink)\n", inode, file,
+ inode->i_mode);
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ break;
+
+ default:
+ dout("init_file %p %p 0%o (special)\n", inode, file,
+ inode->i_mode);
+ /*
+ * we need to drop the open ref now, since we don't
+ * have .release set to ceph_release.
+ */
+ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
+ BUG_ON(inode->i_fop->release == ceph_release);
+
+ /* call the proper open fop */
+ ret = inode->i_fop->open(inode, file);
+ }
+ return ret;
+}
+
+/*
+ * If the filp already has private_data, that means the file was
+ * already opened by intent during lookup, and we do nothing.
+ *
+ * If we already have the requisite capabilities, we can satisfy
+ * the open request locally (no need to request new caps from the
+ * MDS). We do, however, need to inform the MDS (asynchronously)
+ * if our wanted caps set expands.
+ */
+int ceph_open(struct inode *inode, struct file *file)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_file_info *cf = file->private_data;
+ struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+ int err;
+ int flags, fmode, wanted;
+
+ if (cf) {
+ dout("open file %p is already opened\n", file);
+ return 0;
+ }
+
+ /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
+ flags = file->f_flags & ~(O_CREAT|O_EXCL);
+ if (S_ISDIR(inode->i_mode))
+ flags = O_DIRECTORY; /* mds likes to know */
+
+ dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
+ ceph_vinop(inode), file, flags, file->f_flags);
+ fmode = ceph_flags_to_mode(flags);
+ wanted = ceph_caps_for_mode(fmode);
+
+ /* snapped files are read-only */
+ if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
+ return -EROFS;
+
+ /* trivially open snapdir */
+ if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ spin_lock(&inode->i_lock);
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&inode->i_lock);
+ return ceph_init_file(inode, file, fmode);
+ }
+
+ /*
+ * No need to block if we have any caps. Update wanted set
+ * asynchronously.
+ */
+ spin_lock(&inode->i_lock);
+ if (__ceph_is_any_real_caps(ci)) {
+ int mds_wanted = __ceph_caps_mds_wanted(ci);
+ int issued = __ceph_caps_issued(ci, NULL);
+
+ dout("open %p fmode %d want %s issued %s using existing\n",
+ inode, fmode, ceph_cap_string(wanted),
+ ceph_cap_string(issued));
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&inode->i_lock);
+
+ /* adjust wanted? */
+ if ((issued & wanted) != wanted &&
+ (mds_wanted & wanted) != wanted &&
+ ceph_snap(inode) != CEPH_SNAPDIR)
+ ceph_check_caps(ci, 0, NULL);
+
+ return ceph_init_file(inode, file, fmode);
+ } else if (ceph_snap(inode) != CEPH_NOSNAP &&
+ (ci->i_snap_caps & wanted) == wanted) {
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&inode->i_lock);
+ return ceph_init_file(inode, file, fmode);
+ }
+ spin_unlock(&inode->i_lock);
+
+ dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
+ req = prepare_open_request(inode->i_sb, flags, 0);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_inode = igrab(inode);
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ if (!err)
+ err = ceph_init_file(inode, file, req->r_fmode);
+ ceph_mdsc_put_request(req);
+ dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
+out:
+ return err;
+}
+
+
+/*
+ * Do a lookup + open with a single request.
+ *
+ * If this succeeds, but some subsequent check in the vfs
+ * may_open() fails, the struct file gets cleaned up (i.e.
+ * ceph_release gets called). So fear not!
+ */
+/*
+ * flags
+ * path_lookup_open -> LOOKUP_OPEN
+ * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
+ */
+struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd, int mode,
+ int locked_dir)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct file *file = nd->intent.open.file;
+ struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
+ struct ceph_mds_request *req;
+ int err;
+ int flags = nd->intent.open.flags - 1; /* silly vfs! */
+
+ dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
+ dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
+
+ /* do the open */
+ req = prepare_open_request(dir->i_sb, flags, mode);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ if (flags & O_CREAT) {
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ }
+ req->r_locked_dir = dir; /* caller holds dir->i_mutex */
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ dentry = ceph_finish_lookup(req, dentry, err);
+ if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ if (!err)
+ err = ceph_init_file(req->r_dentry->d_inode, file,
+ req->r_fmode);
+ ceph_mdsc_put_request(req);
+ dout("ceph_lookup_open result=%p\n", dentry);
+ return dentry;
+}
+
+int ceph_release(struct inode *inode, struct file *file)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *cf = file->private_data;
+
+ dout("release inode %p file %p\n", inode, file);
+ ceph_put_fmode(ci, cf->fmode);
+ if (cf->last_readdir)
+ ceph_mdsc_put_request(cf->last_readdir);
+ kfree(cf->last_name);
+ kfree(cf->dir_info);
+ dput(cf->dentry);
+ kmem_cache_free(ceph_file_cachep, cf);
+
+ /* wake up anyone waiting for caps on this inode */
+ wake_up(&ci->i_cap_wq);
+ return 0;
+}
+
+/*
+ * build a vector of user pages
+ */
+static struct page **get_direct_page_vector(const char __user *data,
+ int num_pages,
+ loff_t off, size_t len)
+{
+ struct page **pages;
+ int rc;
+
+ pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ down_read(&current->mm->mmap_sem);
+ rc = get_user_pages(current, current->mm, (unsigned long)data,
+ num_pages, 0, 0, pages, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ goto fail;
+ return pages;
+
+fail:
+ kfree(pages);
+ return ERR_PTR(rc);
+}
+
+static void put_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ put_page(pages[i]);
+ kfree(pages);
+}
+
+void ceph_release_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ __free_pages(pages[i], 0);
+ kfree(pages);
+}
+
+/*
+ * allocate a vector of new pages
+ */
+static struct page **alloc_page_vector(int num_pages)
+{
+ struct page **pages;
+ int i;
+
+ pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; i < num_pages; i++) {
+ pages[i] = alloc_page(GFP_NOFS);
+ if (pages[i] == NULL) {
+ ceph_release_page_vector(pages, i);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ return pages;
+}
+
+/*
+ * copy user data into a page vector
+ */
+static int copy_user_to_page_vector(struct page **pages,
+ const char __user *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ int po = off & ~PAGE_CACHE_MASK;
+ int left = len;
+ int l, bad;
+
+ while (left > 0) {
+ l = min_t(int, PAGE_CACHE_SIZE-po, left);
+ bad = copy_from_user(page_address(pages[i]) + po, data, l);
+ if (bad == l)
+ return -EFAULT;
+ data += l - bad;
+ left -= l - bad;
+ po += l - bad;
+ if (po == PAGE_CACHE_SIZE) {
+ po = 0;
+ i++;
+ }
+ }
+ return len;
+}
+
+/*
+ * copy user data from a page vector into a user pointer
+ */
+static int copy_page_vector_to_user(struct page **pages, char __user *data,
+ loff_t off, size_t len)
+{
+ int i = 0;
+ int po = off & ~PAGE_CACHE_MASK;
+ int left = len;
+ int l, bad;
+
+ while (left > 0) {
+ l = min_t(int, left, PAGE_CACHE_SIZE-po);
+ bad = copy_to_user(data, page_address(pages[i]) + po, l);
+ if (bad == l)
+ return -EFAULT;
+ data += l - bad;
+ left -= l - bad;
+ if (po) {
+ po += l - bad;
+ if (po == PAGE_CACHE_SIZE)
+ po = 0;
+ }
+ i++;
+ }
+ return len;
+}
+
+/*
+ * Zero an extent within a page vector. Offset is relative to the
+ * start of the first page.
+ */
+static void zero_page_vector_range(int off, int len, struct page **pages)
+{
+ int i = off >> PAGE_CACHE_SHIFT;
+
+ off &= ~PAGE_CACHE_MASK;
+
+ dout("zero_page_vector_page %u~%u\n", off, len);
+
+ /* leading partial page? */
+ if (off) {
+ int end = min((int)PAGE_CACHE_SIZE, off + len);
+ dout("zeroing %d %p head from %d\n", i, pages[i],
+ (int)off);
+ zero_user_segment(pages[i], off, end);
+ len -= (end - off);
+ i++;
+ }
+ while (len >= PAGE_CACHE_SIZE) {
+ dout("zeroing %d %p len=%d\n", i, pages[i], len);
+ zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
+ len -= PAGE_CACHE_SIZE;
+ i++;
+ }
+ /* trailing partial page? */
+ if (len) {
+ dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
+ zero_user_segment(pages[i], 0, len);
+ }
+}
+
+
+/*
+ * Read a range of bytes striped over one or more objects. Iterate over
+ * objects we stripe over. (That's not atomic, but good enough for now.)
+ *
+ * If we get a short result from the OSD, check against i_size; we need to
+ * only return a short read to the caller if we hit EOF.
+ */
+static int striped_read(struct inode *inode,
+ u64 off, u64 len,
+ struct page **pages, int num_pages,
+ int *checkeof)
+{
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 pos, this_len;
+ int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
+ int left, pages_left;
+ int read;
+ struct page **page_pos;
+ int ret;
+ bool hit_stripe, was_short;
+
+ /*
+ * we may need to do multiple reads. not atomic, unfortunately.
+ */
+ pos = off;
+ left = len;
+ page_pos = pages;
+ pages_left = num_pages;
+ read = 0;
+
+more:
+ this_len = left;
+ ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
+ &ci->i_layout, pos, &this_len,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ page_pos, pages_left);
+ hit_stripe = this_len < left;
+ was_short = ret >= 0 && ret < this_len;
+ if (ret == -ENOENT)
+ ret = 0;
+ dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
+ ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
+
+ if (ret > 0) {
+ int didpages =
+ ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
+
+ if (read < pos - off) {
+ dout(" zero gap %llu to %llu\n", off + read, pos);
+ zero_page_vector_range(page_off + read,
+ pos - off - read, pages);
+ }
+ pos += ret;
+ read = pos - off;
+ left -= ret;
+ page_pos += didpages;
+ pages_left -= didpages;
+
+ /* hit stripe? */
+ if (left && hit_stripe)
+ goto more;
+ }
+
+ if (was_short) {
+ /* was original extent fully inside i_size? */
+ if (pos + left <= inode->i_size) {
+ dout("zero tail\n");
+ zero_page_vector_range(page_off + read, len - read,
+ pages);
+ read = len;
+ goto out;
+ }
+
+ /* check i_size */
+ *checkeof = 1;
+ }
+
+out:
+ if (ret >= 0)
+ ret = read;
+ dout("striped_read returns %d\n", ret);
+ return ret;
+}
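+
+/*
+ * Striping example for the loop above (illustrative; assumes the
+ * default layout of 4 MB objects, stripe count 1): a 6 MB read at
+ * offset 3 MB becomes three ceph_osdc_readpages() calls:
+ *
+ * 1st: [3 MB, 4 MB) -- this_len trimmed to the object end, HITSTRIPE
+ * 2nd: [4 MB, 8 MB) -- a whole object, HITSTRIPE
+ * 3rd: [8 MB, 9 MB) -- the final 1 MB
+ *
+ * A short result on the final extent is compared against i_size: if
+ * the extent lies entirely inside i_size the remainder is zero-filled
+ * (a hole); otherwise *checkeof tells the caller to verify EOF.
+ */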
+
+/*
+ * Completely synchronous read and write methods. Direct from __user
+ * buffer to osd, or directly to user pages (if O_DIRECT).
+ *
+ * If the read spans object boundary, just do multiple reads.
+ */
+static ssize_t ceph_sync_read(struct file *file, char __user *data,
+ unsigned len, loff_t *poff, int *checkeof)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct page **pages;
+ u64 off = *poff;
+ int num_pages = calc_pages_for(off, len);
+ int ret;
+
+ dout("sync_read on file %p %llu~%u %s\n", file, off, len,
+ (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+ if (file->f_flags & O_DIRECT) {
+ pages = get_direct_page_vector(data, num_pages, off, len);
+
+ /*
+ * flush any page cache pages in this range. this
+ * will make concurrent normal and O_DIRECT io slow,
+ * but it will at least behave sensibly when they are
+ * in sequence.
+ */
+ } else {
+ pages = alloc_page_vector(num_pages);
+ }
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret < 0)
+ goto done;
+
+ ret = striped_read(inode, off, len, pages, num_pages, checkeof);
+
+ if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
+ ret = copy_page_vector_to_user(pages, data, off, ret);
+ if (ret >= 0)
+ *poff = off + ret;
+
+done:
+ if (file->f_flags & O_DIRECT)
+ put_page_vector(pages, num_pages);
+ else
+ ceph_release_page_vector(pages, num_pages);
+ dout("sync_read result %d\n", ret);
+ return ret;
+}
+
+/*
+ * Write commit callback, called if we requested both an ACK and
+ * ONDISK commit reply from the OSD.
+ */
+static void sync_write_commit(struct ceph_osd_request *req,
+ struct ceph_msg *msg)
+{
+ struct ceph_inode_info *ci = ceph_inode(req->r_inode);
+
+ dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+}
+
+/*
+ * Synchronous write, straight from __user pointer or user pages (if
+ * O_DIRECT).
+ *
+ * If write spans object boundary, just do multiple writes. (For a
+ * correct atomic write, we should e.g. take write locks on all
+ * objects, rollback on failure, etc.)
+ */
+static ssize_t ceph_sync_write(struct file *file, const char __user *data,
+ size_t left, loff_t *offset)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_osd_request *req;
+ struct page **pages;
+ int num_pages;
+ long long unsigned pos;
+ u64 len;
+ int written = 0;
+ int flags;
+ int do_sync = 0;
+ int check_caps = 0;
+ int ret;
+ struct timespec mtime = CURRENT_TIME;
+
+ if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("sync_write on file %p %lld~%u %s\n", file, *offset,
+ (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+ if (file->f_flags & O_APPEND)
+ pos = i_size_read(inode);
+ else
+ pos = *offset;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
+ if (ret < 0)
+ return ret;
+
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ (pos + left) >> PAGE_CACHE_SHIFT);
+ if (ret < 0)
+ dout("invalidate_inode_pages2_range returned %d\n", ret);
+
+ flags = CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE;
+ if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
+ flags |= CEPH_OSD_FLAG_ACK;
+ else
+ do_sync = 1;
+
+ /*
+ * we may need to do multiple writes here if we span an object
+ * boundary. this isn't atomic, unfortunately. :(
+ */
+more:
+ len = left;
+ req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
+ ceph_vino(inode), pos, &len,
+ CEPH_OSD_OP_WRITE, flags,
+ ci->i_snap_realm->cached_context,
+ do_sync,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &mtime, false, 2);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ num_pages = calc_pages_for(pos, len);
+
+ if (file->f_flags & O_DIRECT) {
+ pages = get_direct_page_vector(data, num_pages, pos, len);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out;
+ }
+
+ /*
+ * throw out any page cache pages in this range. this
+ * may block.
+ */
+ truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
+ } else {
+ pages = alloc_page_vector(num_pages);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ goto out;
+ }
+ ret = copy_user_to_page_vector(pages, data, pos, len);
+ if (ret < 0) {
+ ceph_release_page_vector(pages, num_pages);
+ goto out;
+ }
+
+ if ((file->f_flags & O_SYNC) == 0) {
+ /* get a second commit callback */
+ req->r_safe_callback = sync_write_commit;
+ req->r_own_pages = 1;
+ }
+ }
+ req->r_pages = pages;
+ req->r_num_pages = num_pages;
+ req->r_inode = inode;
+
+ ret = ceph_osdc_start_request(&client->osdc, req, false);
+ if (!ret) {
+ if (req->r_safe_callback) {
+ /*
+ * Add to inode unsafe list only after we
+ * start_request so that a tid has been assigned.
+ */
+ spin_lock(&ci->i_unsafe_lock);
+ list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
+ spin_unlock(&ci->i_unsafe_lock);
+ ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
+ }
+ ret = ceph_osdc_wait_request(&client->osdc, req);
+ }
+
+ if (file->f_flags & O_DIRECT)
+ put_page_vector(pages, num_pages);
+ else if (file->f_flags & O_SYNC)
+ ceph_release_page_vector(pages, num_pages);
+
+out:
+ ceph_osdc_put_request(req);
+ if (ret == 0) {
+ pos += len;
+ written += len;
+ left -= len;
+ if (left)
+ goto more;
+
+ ret = written;
+ *offset = pos;
+ if (pos > i_size_read(inode))
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+ return ret;
+}
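+
+/*
+ * Example of the object-boundary loop above (illustrative; assumes
+ * 4 MB objects): a sync write of 6 MB at pos 3 MB comes back from
+ * ceph_osdc_new_request() with len trimmed to 1 MB, so the first
+ * request covers [3 MB, 4 MB); left is still 5 MB, so we jump back to
+ * more: for [4 MB, 8 MB) and again for [8 MB, 9 MB).  Each chunk is an
+ * independent OSD request, hence the warning above that the write as
+ * a whole is not atomic.
+ */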
+
+/*
+ * Wrap generic_file_aio_read with checks for cap bits on the inode.
+ * Atomically grab references, so that those bits are not released
+ * back to the MDS mid-read.
+ *
+ * Hmm, the sync read case isn't actually async... should it be?
+ */
+static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *filp = iocb->ki_filp;
+ loff_t *ppos = &iocb->ki_pos;
+ size_t len = iov->iov_len;
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ void *base = iov->iov_base;
+ ssize_t ret;
+ int got = 0;
+ int checkeof = 0, read = 0;
+
+ dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
+ inode, ceph_vinop(inode), pos, (unsigned)len, inode);
+again:
+ __ceph_do_pending_vmtruncate(inode);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
+ &got, -1);
+ if (ret < 0)
+ goto out;
+ dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)len,
+ ceph_cap_string(got));
+
+ if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
+ (iocb->ki_filp->f_flags & O_DIRECT) ||
+ (inode->i_sb->s_flags & MS_SYNCHRONOUS))
+ /* hmm, this isn't really async... */
+ ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
+ else
+ ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+
+out:
+ dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
+ inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
+ ceph_put_cap_refs(ci, got);
+
+ if (checkeof && ret >= 0) {
+ int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+
+ /* hit EOF or hole? */
+ if (statret == 0 && *ppos < inode->i_size) {
+ dout("aio_read sync_read hit hole, reading more\n");
+ read += ret;
+ base += ret;
+ len -= ret;
+ checkeof = 0;
+ goto again;
+ }
+ }
+ if (ret >= 0)
+ ret += read;
+
+ return ret;
+}
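+
+/*
+ * Sketch of the checkeof retry above: ceph_sync_read() can return
+ * short when an object holds less data than i_size implies (a hole,
+ * or i_size raced upward).  If a fresh getattr then shows *ppos still
+ * below i_size, the short read was not EOF, so we retry with base and
+ * len advanced, accumulating the partial count in 'read'.
+ */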
+
+/*
+ * Take cap references to avoid releasing caps to MDS mid-write.
+ *
+ * If we are synchronous, and write with an old snap context, the OSD
+ * may return EOLDSNAPC.  In that case, retry the write _after_
+ * dropping our cap refs and allowing the pending snap to logically
+ * complete _before_ this write occurs.
+ *
+ * If we are near ENOSPC, write synchronously.
+ */
+static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+ loff_t endoff = pos + iov->iov_len;
+ int got = 0;
+ int ret, err;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+retry_snap:
+ if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
+ return -ENOSPC;
+ __ceph_do_pending_vmtruncate(inode);
+ dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ inode->i_size);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+ &got, endoff);
+ if (ret < 0)
+ goto out;
+
+ dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ ceph_cap_string(got));
+
+ if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
+ (iocb->ki_filp->f_flags & O_DIRECT) ||
+ (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
+ ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
+ &iocb->ki_pos);
+ } else {
+ ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+ if ((ret >= 0 || ret == -EIOCBQUEUED) &&
+ ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
+ || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
+ err = vfs_fsync_range(file, file->f_path.dentry,
+ pos, pos + ret - 1, 1);
+ if (err < 0)
+ ret = err;
+ }
+ }
+ if (ret >= 0) {
+ spin_lock(&inode->i_lock);
+ __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&inode->i_lock);
+ }
+
+out:
+ dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ ceph_cap_string(got));
+ ceph_put_cap_refs(ci, got);
+
+ if (ret == -EOLDSNAPC) {
+ dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
+ goto retry_snap;
+ }
+
+ return ret;
+}
+
+/*
+ * llseek. be sure to verify file size on SEEK_END.
+ */
+static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct inode *inode = file->f_mapping->host;
+ int ret;
+
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ switch (origin) {
+ case SEEK_END:
+ ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+ if (ret < 0) {
+ offset = ret;
+ goto out;
+ }
+ offset += inode->i_size;
+ break;
+ case SEEK_CUR:
+ /*
+ * Here we special-case the lseek(fd, 0, SEEK_CUR)
+ * position-querying operation. Avoid rewriting the "same"
+ * f_pos value back to the file because a concurrent read(),
+ * write() or lseek() might have altered it
+ */
+ if (offset == 0) {
+ offset = file->f_pos;
+ goto out;
+ }
+ offset += file->f_pos;
+ break;
+ }
+
+ if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
+ offset = -EINVAL;
+ goto out;
+ }
+
+ /* Special lock needed here? */
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+
+out:
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
+
+const struct file_operations ceph_file_fops = {
+ .open = ceph_open,
+ .release = ceph_release,
+ .llseek = ceph_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = ceph_aio_read,
+ .aio_write = ceph_aio_write,
+ .mmap = ceph_mmap,
+ .fsync = ceph_fsync,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
+ .unlocked_ioctl = ceph_ioctl,
+ .compat_ioctl = ceph_ioctl,
+};
+
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 00000000000..aca82d55cc5
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1766 @@
+#include "ceph_debug.h"
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/namei.h>
+#include <linux/writeback.h>
+#include <linux/vmalloc.h>
+#include <linux/pagevec.h>
+
+#include "super.h"
+#include "decode.h"
+
+/*
+ * Ceph inode operations
+ *
+ * Implement basic inode helpers (get, alloc) and inode ops (getattr,
+ * setattr, etc.), xattr helpers, and helpers for assimilating
+ * metadata returned by the MDS into our cache.
+ *
+ * Also define helpers for doing asynchronous writeback, invalidation,
+ * and truncation for the benefit of those who can't afford to block
+ * (typically because they are in the message handler path).
+ */
+
+static const struct inode_operations ceph_symlink_iops;
+
+static void ceph_invalidate_work(struct work_struct *work);
+static void ceph_writeback_work(struct work_struct *work);
+static void ceph_vmtruncate_work(struct work_struct *work);
+
+/*
+ * find or create an inode, given the ceph ino number
+ */
+struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+{
+ struct inode *inode;
+ ino_t t = ceph_vino_to_ino(vino);
+
+ inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+ if (inode == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (inode->i_state & I_NEW) {
+ dout("get_inode created new inode %p %llx.%llx ino %llx\n",
+ inode, ceph_vinop(inode), (u64)inode->i_ino);
+ unlock_new_inode(inode);
+ }
+
+ dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
+ vino.snap, inode);
+ return inode;
+}
+
+/*
+ * get/construct snapdir inode for a given directory
+ */
+struct inode *ceph_get_snapdir(struct inode *parent)
+{
+ struct ceph_vino vino = {
+ .ino = ceph_ino(parent),
+ .snap = CEPH_SNAPDIR,
+ };
+ struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ BUG_ON(!S_ISDIR(parent->i_mode));
+ if (IS_ERR(inode))
+ return ERR_PTR(PTR_ERR(inode));
+ inode->i_mode = parent->i_mode;
+ inode->i_uid = parent->i_uid;
+ inode->i_gid = parent->i_gid;
+ inode->i_op = &ceph_dir_iops;
+ inode->i_fop = &ceph_dir_fops;
+ ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
+ ci->i_rbytes = 0;
+ return inode;
+}
+
+const struct inode_operations ceph_file_iops = {
+ .permission = ceph_permission,
+ .setattr = ceph_setattr,
+ .getattr = ceph_getattr,
+ .setxattr = ceph_setxattr,
+ .getxattr = ceph_getxattr,
+ .listxattr = ceph_listxattr,
+ .removexattr = ceph_removexattr,
+};
+
+
+/*
+ * We use a 'frag tree' to keep track of the MDS's directory fragments
+ * for a given inode (usually there is just a single fragment). We
+ * need to know when a child frag is delegated to a new MDS, or when
+ * it is flagged as replicated, so we can direct our requests
+ * accordingly.
+ */
+
+/*
+ * find/create a frag in the tree
+ */
+static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
+ u32 f)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct ceph_inode_frag *frag;
+ int c;
+
+ p = &ci->i_fragtree.rb_node;
+ while (*p) {
+ parent = *p;
+ frag = rb_entry(parent, struct ceph_inode_frag, node);
+ c = ceph_frag_compare(f, frag->frag);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else
+ return frag;
+ }
+
+ frag = kmalloc(sizeof(*frag), GFP_NOFS);
+ if (!frag) {
+ pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
+ "frag %x\n", &ci->vfs_inode,
+ ceph_vinop(&ci->vfs_inode), f);
+ return ERR_PTR(-ENOMEM);
+ }
+ frag->frag = f;
+ frag->split_by = 0;
+ frag->mds = -1;
+ frag->ndist = 0;
+
+ rb_link_node(&frag->node, parent, p);
+ rb_insert_color(&frag->node, &ci->i_fragtree);
+
+ dout("get_or_create_frag added %llx.%llx frag %x\n",
+ ceph_vinop(&ci->vfs_inode), f);
+ return frag;
+}
+
+/*
+ * find a specific frag @f
+ */
+struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
+{
+ struct rb_node *n = ci->i_fragtree.rb_node;
+
+ while (n) {
+ struct ceph_inode_frag *frag =
+ rb_entry(n, struct ceph_inode_frag, node);
+ int c = ceph_frag_compare(f, frag->frag);
+ if (c < 0)
+ n = n->rb_left;
+ else if (c > 0)
+ n = n->rb_right;
+ else
+ return frag;
+ }
+ return NULL;
+}
+
+/*
+ * Choose frag containing the given value @v. If @pfrag is
+ * specified, copy the frag delegation info to the caller if
+ * it is present.
+ */
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag,
+ int *found)
+{
+ u32 t = ceph_frag_make(0, 0);
+ struct ceph_inode_frag *frag;
+ unsigned nway, i;
+ u32 n;
+
+ if (found)
+ *found = 0;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ while (1) {
+ WARN_ON(!ceph_frag_contains_value(t, v));
+ frag = __ceph_find_frag(ci, t);
+ if (!frag)
+ break; /* t is a leaf */
+ if (frag->split_by == 0) {
+ if (pfrag)
+ memcpy(pfrag, frag, sizeof(*pfrag));
+ if (found)
+ *found = 1;
+ break;
+ }
+
+ /* choose child */
+ nway = 1 << frag->split_by;
+ dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
+ frag->split_by, nway);
+ for (i = 0; i < nway; i++) {
+ n = ceph_frag_make_child(t, frag->split_by, i);
+ if (ceph_frag_contains_value(n, v)) {
+ t = n;
+ break;
+ }
+ }
+ BUG_ON(i == nway);
+ }
+ dout("choose_frag(%x) = %x\n", v, t);
+
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return t;
+}
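+
+/*
+ * Fragtree walk example (illustrative; frags written as
+ * ceph_frag_make(bits, value)): if the root (0, 0) has split_by = 1
+ * and its child (1, 0) has split_by = 1 again, a value hashing into
+ * the low quarter of the space descends
+ *
+ * (0, 0) -> (1, 0) -> (2, 0)
+ *
+ * and stops at (2, 0) because no node exists for it (a leaf).  A node
+ * found with split_by == 0 instead ends the walk with its delegation
+ * info copied out via *pfrag.
+ */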
+
+/*
+ * Process dirfrag (delegation) info from the mds. Include leaf
+ * fragment in tree ONLY if ndist > 0. Otherwise, only
+ * branches/splits are included in i_fragtree)
+ */
+static int ceph_fill_dirfrag(struct inode *inode,
+ struct ceph_mds_reply_dirfrag *dirinfo)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag;
+ u32 id = le32_to_cpu(dirinfo->frag);
+ int mds = le32_to_cpu(dirinfo->auth);
+ int ndist = le32_to_cpu(dirinfo->ndist);
+ int i;
+ int err = 0;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ if (ndist == 0) {
+ /* no delegation info needed. */
+ frag = __ceph_find_frag(ci, id);
+ if (!frag)
+ goto out;
+ if (frag->split_by == 0) {
+ /* tree leaf, remove */
+ dout("fill_dirfrag removed %llx.%llx frag %x"
+ " (no ref)\n", ceph_vinop(inode), id);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ } else {
+ /* tree branch, keep and clear */
+ dout("fill_dirfrag cleared %llx.%llx frag %x"
+ " referral\n", ceph_vinop(inode), id);
+ frag->mds = -1;
+ frag->ndist = 0;
+ }
+ goto out;
+ }
+
+
+ /* find/add this frag to store mds delegation info */
+ frag = __get_or_create_frag(ci, id);
+ if (IS_ERR(frag)) {
+ /* this is not the end of the world; we can continue
+ with bad/inaccurate delegation info */
+ pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
+ ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
+ err = -ENOMEM;
+ goto out;
+ }
+
+ frag->mds = mds;
+ frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
+ for (i = 0; i < frag->ndist; i++)
+ frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
+ dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
+ ceph_vinop(inode), frag->frag, frag->ndist);
+
+out:
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return err;
+}
+
+
+/*
+ * initialize a newly allocated inode.
+ */
+struct inode *ceph_alloc_inode(struct super_block *sb)
+{
+ struct ceph_inode_info *ci;
+ int i;
+
+ ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+ if (!ci)
+ return NULL;
+
+ dout("alloc_inode %p\n", &ci->vfs_inode);
+
+ ci->i_version = 0;
+ ci->i_time_warp_seq = 0;
+ ci->i_ceph_flags = 0;
+ ci->i_release_count = 0;
+ ci->i_symlink = NULL;
+
+ ci->i_fragtree = RB_ROOT;
+ mutex_init(&ci->i_fragtree_mutex);
+
+ ci->i_xattrs.blob = NULL;
+ ci->i_xattrs.prealloc_blob = NULL;
+ ci->i_xattrs.dirty = false;
+ ci->i_xattrs.index = RB_ROOT;
+ ci->i_xattrs.count = 0;
+ ci->i_xattrs.names_size = 0;
+ ci->i_xattrs.vals_size = 0;
+ ci->i_xattrs.version = 0;
+ ci->i_xattrs.index_version = 0;
+
+ ci->i_caps = RB_ROOT;
+ ci->i_auth_cap = NULL;
+ ci->i_dirty_caps = 0;
+ ci->i_flushing_caps = 0;
+ INIT_LIST_HEAD(&ci->i_dirty_item);
+ INIT_LIST_HEAD(&ci->i_flushing_item);
+ ci->i_cap_flush_seq = 0;
+ ci->i_cap_flush_last_tid = 0;
+ memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+ init_waitqueue_head(&ci->i_cap_wq);
+ ci->i_hold_caps_min = 0;
+ ci->i_hold_caps_max = 0;
+ INIT_LIST_HEAD(&ci->i_cap_delay_list);
+ ci->i_cap_exporting_mds = 0;
+ ci->i_cap_exporting_mseq = 0;
+ ci->i_cap_exporting_issued = 0;
+ INIT_LIST_HEAD(&ci->i_cap_snaps);
+ ci->i_head_snapc = NULL;
+ ci->i_snap_caps = 0;
+
+ for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+ ci->i_nr_by_mode[i] = 0;
+
+ ci->i_truncate_seq = 0;
+ ci->i_truncate_size = 0;
+ ci->i_truncate_pending = 0;
+
+ ci->i_max_size = 0;
+ ci->i_reported_size = 0;
+ ci->i_wanted_max_size = 0;
+ ci->i_requested_max_size = 0;
+
+ ci->i_pin_ref = 0;
+ ci->i_rd_ref = 0;
+ ci->i_rdcache_ref = 0;
+ ci->i_wr_ref = 0;
+ ci->i_wrbuffer_ref = 0;
+ ci->i_wrbuffer_ref_head = 0;
+ ci->i_shared_gen = 0;
+ ci->i_rdcache_gen = 0;
+ ci->i_rdcache_revoking = 0;
+
+ INIT_LIST_HEAD(&ci->i_unsafe_writes);
+ INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+ spin_lock_init(&ci->i_unsafe_lock);
+
+ ci->i_snap_realm = NULL;
+ INIT_LIST_HEAD(&ci->i_snap_realm_item);
+ INIT_LIST_HEAD(&ci->i_snap_flush_item);
+
+ INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
+ INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
+
+ INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
+
+ return &ci->vfs_inode;
+}
+
+void ceph_destroy_inode(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag;
+ struct rb_node *n;
+
+ dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+
+ ceph_queue_caps_release(inode);
+
+ /*
+ * we may still have a snap_realm reference if there are stray
+ * caps in i_cap_exporting_issued or i_snap_caps.
+ */
+ if (ci->i_snap_realm) {
+ struct ceph_mds_client *mdsc =
+ &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+
+ dout(" dropping residual ref to snap realm %p\n", realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, realm);
+ }
+
+ kfree(ci->i_symlink);
+ while ((n = rb_first(&ci->i_fragtree)) != NULL) {
+ frag = rb_entry(n, struct ceph_inode_frag, node);
+ rb_erase(n, &ci->i_fragtree);
+ kfree(frag);
+ }
+
+ __ceph_destroy_xattrs(ci);
+ if (ci->i_xattrs.blob)
+ ceph_buffer_put(ci->i_xattrs.blob);
+ if (ci->i_xattrs.prealloc_blob)
+ ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+
+ kmem_cache_free(ceph_inode_cachep, ci);
+}
+
+
+/*
+ * Helpers to fill in size, ctime, mtime, and atime. We have to be
+ * careful because either the client or MDS may have more up to date
+ * info, depending on which capabilities are held, and whether
+ * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
+ * and size are monotonically increasing, except when utimes() or
+ * truncate() increments the corresponding _seq values.)
+ */
+int ceph_fill_file_size(struct inode *inode, int issued,
+ u32 truncate_seq, u64 truncate_size, u64 size)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int queue_trunc = 0;
+
+ if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
+ (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
+ dout("size %lld -> %llu\n", inode->i_size, size);
+ inode->i_size = size;
+ inode->i_blocks = (size + (1<<9) - 1) >> 9;
+ ci->i_reported_size = size;
+ if (truncate_seq != ci->i_truncate_seq) {
+ dout("truncate_seq %u -> %u\n",
+ ci->i_truncate_seq, truncate_seq);
+ ci->i_truncate_seq = truncate_seq;
+ /*
+ * If we hold relevant caps, or if the file may be open or
+ * mmapped by someone else (we're not the only client
+ * referencing it and don't hold those caps), we need to
+ * queue a vmtruncate so the page cache catches up
+ */
+ if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
+ CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
+ CEPH_CAP_FILE_EXCL)) ||
+ mapping_mapped(inode->i_mapping) ||
+ __ceph_caps_file_wanted(ci)) {
+ ci->i_truncate_pending++;
+ queue_trunc = 1;
+ }
+ }
+ }
+ if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
+ ci->i_truncate_size != truncate_size) {
+ dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
+ truncate_size);
+ ci->i_truncate_size = truncate_size;
+ }
+ return queue_trunc;
+}
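+
+/*
+ * Example of the truncate_seq logic above (illustrative numbers): if
+ * we have i_truncate_seq = 3 with i_size = 8192 and the MDS reports
+ * truncate_seq = 4, size = 0, the new values win even though the size
+ * shrank, because the seq bump proves an intervening truncate().
+ * With an unchanged seq only a larger size is accepted, since writes
+ * only ever grow the file between truncates.
+ */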
+
+void ceph_fill_file_time(struct inode *inode, int issued,
+ u64 time_warp_seq, struct timespec *ctime,
+ struct timespec *mtime, struct timespec *atime)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int warn = 0;
+
+ if (issued & (CEPH_CAP_FILE_EXCL|
+ CEPH_CAP_FILE_WR|
+ CEPH_CAP_FILE_BUFFER)) {
+ if (timespec_compare(ctime, &inode->i_ctime) > 0) {
+ dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
+ inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+ ctime->tv_sec, ctime->tv_nsec);
+ inode->i_ctime = *ctime;
+ }
+ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
+ /* the MDS did a utimes() */
+ dout("mtime %ld.%09ld -> %ld.%09ld "
+ "tw %d -> %d\n",
+ inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+ mtime->tv_sec, mtime->tv_nsec,
+ ci->i_time_warp_seq, (int)time_warp_seq);
+
+ inode->i_mtime = *mtime;
+ inode->i_atime = *atime;
+ ci->i_time_warp_seq = time_warp_seq;
+ } else if (time_warp_seq == ci->i_time_warp_seq) {
+ /* nobody did utimes(); take the max */
+ if (timespec_compare(mtime, &inode->i_mtime) > 0) {
+ dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
+ inode->i_mtime.tv_sec,
+ inode->i_mtime.tv_nsec,
+ mtime->tv_sec, mtime->tv_nsec);
+ inode->i_mtime = *mtime;
+ }
+ if (timespec_compare(atime, &inode->i_atime) > 0) {
+ dout("atime %ld.%09ld -> %ld.%09ld inc\n",
+ inode->i_atime.tv_sec,
+ inode->i_atime.tv_nsec,
+ atime->tv_sec, atime->tv_nsec);
+ inode->i_atime = *atime;
+ }
+ } else if (issued & CEPH_CAP_FILE_EXCL) {
+ /* we did a utimes(); ignore mds values */
+ } else {
+ warn = 1;
+ }
+ } else {
+ /* we have no write caps; whatever the MDS says is true */
+ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
+ inode->i_ctime = *ctime;
+ inode->i_mtime = *mtime;
+ inode->i_atime = *atime;
+ ci->i_time_warp_seq = time_warp_seq;
+ } else {
+ warn = 1;
+ }
+ }
+ if (warn) /* time_warp_seq shouldn't go backwards */
+ dout("%p mds time_warp_seq %llu < %u\n",
+ inode, time_warp_seq, ci->i_time_warp_seq);
+}
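+
+/*
+ * time_warp_seq example: mtime normally moves only forward, so with
+ * equal seqs the newer of the client's and MDS's times wins.  A
+ * utimes() call may set mtime backwards, and whoever performs it
+ * bumps time_warp_seq; a larger seq from the MDS therefore means
+ * "take these times verbatim, even if they go backwards".
+ */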
+
+/*
+ * Populate an inode based on info from mds. May be called on new or
+ * existing inodes.
+ */
+static int fill_inode(struct inode *inode,
+ struct ceph_mds_reply_info_in *iinfo,
+ struct ceph_mds_reply_dirfrag *dirinfo,
+ struct ceph_mds_session *session,
+ unsigned long ttl_from, int cap_fmode,
+ struct ceph_cap_reservation *caps_reservation)
+{
+ struct ceph_mds_reply_inode *info = iinfo->in;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int i;
+ int issued, implemented;
+ struct timespec mtime, atime, ctime;
+ u32 nsplits;
+ struct ceph_buffer *xattr_blob = NULL;
+ int err = 0;
+ int queue_trunc = 0;
+
+ dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
+ inode, ceph_vinop(inode), le64_to_cpu(info->version),
+ ci->i_version);
+
+ /*
+ * prealloc xattr data, if it looks like we'll need it. only
+ * if len > 4 (meaning there are actually xattrs; the first 4
+ * bytes are the xattr count).
+ */
+ if (iinfo->xattr_len > 4) {
+ xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
+ if (!xattr_blob)
+ pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
+ iinfo->xattr_len);
+ }
+
+ spin_lock(&inode->i_lock);
+
+ /*
+ * provided version will be odd if inode value is projected,
+ * even if stable. skip the update if we have a newer info
+ * (e.g., due to inode info racing from multiple MDSs), or if
+ * we are getting projected (unstable) inode info.
+ */
+ if (le64_to_cpu(info->version) > 0 &&
+ (ci->i_version & ~1) > le64_to_cpu(info->version))
+ goto no_change;
+
+ issued = __ceph_caps_issued(ci, &implemented);
+ issued |= implemented | __ceph_caps_dirty(ci);
+
+ /* update inode */
+ ci->i_version = le64_to_cpu(info->version);
+ inode->i_version++;
+ inode->i_rdev = le32_to_cpu(info->rdev);
+
+ if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ inode->i_mode = le32_to_cpu(info->mode);
+ inode->i_uid = le32_to_cpu(info->uid);
+ inode->i_gid = le32_to_cpu(info->gid);
+ dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+ inode->i_uid, inode->i_gid);
+ }
+
+ if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ inode->i_nlink = le32_to_cpu(info->nlink);
+
+ /* be careful with mtime, atime, size */
+ ceph_decode_timespec(&atime, &info->atime);
+ ceph_decode_timespec(&mtime, &info->mtime);
+ ceph_decode_timespec(&ctime, &info->ctime);
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(info->truncate_seq),
+ le64_to_cpu(info->truncate_size),
+ le64_to_cpu(info->size));
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(info->time_warp_seq),
+ &ctime, &mtime, &atime);
+
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ ci->i_layout = info->layout;
+ inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+
+ /* xattrs */
+ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
+ if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
+ le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
+ if (ci->i_xattrs.blob)
+ ceph_buffer_put(ci->i_xattrs.blob);
+ ci->i_xattrs.blob = xattr_blob;
+ if (xattr_blob)
+ memcpy(ci->i_xattrs.blob->vec.iov_base,
+ iinfo->xattr_data, iinfo->xattr_len);
+ ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+ }
+
+ inode->i_mapping->a_ops = &ceph_aops;
+ inode->i_mapping->backing_dev_info =
+ &ceph_client(inode->i_sb)->backing_dev_info;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFBLK:
+ case S_IFCHR:
+ case S_IFSOCK:
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ inode->i_op = &ceph_file_iops;
+ break;
+ case S_IFREG:
+ inode->i_op = &ceph_file_iops;
+ inode->i_fop = &ceph_file_fops;
+ break;
+ case S_IFLNK:
+ inode->i_op = &ceph_symlink_iops;
+ if (!ci->i_symlink) {
+ int symlen = iinfo->symlink_len;
+ char *sym;
+
+ BUG_ON(symlen != inode->i_size);
+ spin_unlock(&inode->i_lock);
+
+ err = -ENOMEM;
+ sym = kmalloc(symlen+1, GFP_NOFS);
+ if (!sym)
+ goto out;
+ memcpy(sym, iinfo->symlink, symlen);
+ sym[symlen] = 0;
+
+ spin_lock(&inode->i_lock);
+ if (!ci->i_symlink)
+ ci->i_symlink = sym;
+ else
+ kfree(sym); /* lost a race */
+ }
+ break;
+ case S_IFDIR:
+ inode->i_op = &ceph_dir_iops;
+ inode->i_fop = &ceph_dir_fops;
+
+ ci->i_files = le64_to_cpu(info->files);
+ ci->i_subdirs = le64_to_cpu(info->subdirs);
+ ci->i_rbytes = le64_to_cpu(info->rbytes);
+ ci->i_rfiles = le64_to_cpu(info->rfiles);
+ ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
+ ceph_decode_timespec(&ci->i_rctime, &info->rctime);
+
+ /* set dir completion flag? */
+ if (ci->i_files == 0 && ci->i_subdirs == 0 &&
+ ceph_snap(inode) == CEPH_NOSNAP &&
+ (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
+ dout(" marking %p complete (empty)\n", inode);
+ ci->i_ceph_flags |= CEPH_I_COMPLETE;
+ ci->i_max_offset = 2;
+ }
+
+ /* it may be better to set st_size in getattr instead? */
+ if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
+ inode->i_size = ci->i_rbytes;
+ break;
+ default:
+ pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
+ ceph_vinop(inode), inode->i_mode);
+ }
+
+no_change:
+ spin_unlock(&inode->i_lock);
+
+ /* queue truncate if we saw i_size decrease */
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+
+ /* populate frag tree */
+ /* FIXME: move me up, if/when version reflects fragtree changes */
+ nsplits = le32_to_cpu(info->fragtree.nsplits);
+ mutex_lock(&ci->i_fragtree_mutex);
+ for (i = 0; i < nsplits; i++) {
+ u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
+ struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
+
+ if (IS_ERR(frag))
+ continue;
+ frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
+ dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ }
+ mutex_unlock(&ci->i_fragtree_mutex);
+
+ /* were we issued a capability? */
+ if (info->cap.caps) {
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ ceph_add_cap(inode, session,
+ le64_to_cpu(info->cap.cap_id),
+ cap_fmode,
+ le32_to_cpu(info->cap.caps),
+ le32_to_cpu(info->cap.wanted),
+ le32_to_cpu(info->cap.seq),
+ le32_to_cpu(info->cap.mseq),
+ le64_to_cpu(info->cap.realm),
+ info->cap.flags,
+ caps_reservation);
+ } else {
+ spin_lock(&inode->i_lock);
+ dout(" %p got snap_caps %s\n", inode,
+ ceph_cap_string(le32_to_cpu(info->cap.caps)));
+ ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
+ if (cap_fmode >= 0)
+ __ceph_get_fmode(ci, cap_fmode);
+ spin_unlock(&inode->i_lock);
+ }
+ }
+
+ /* update delegation info? */
+ if (dirinfo)
+ ceph_fill_dirfrag(inode, dirinfo);
+
+ err = 0;
+
+out:
+ if (xattr_blob)
+ ceph_buffer_put(xattr_blob);
+ return err;
+}
+
+/*
+ * caller should hold session s_mutex.
+ */
+static void update_dentry_lease(struct dentry *dentry,
+ struct ceph_mds_reply_lease *lease,
+ struct ceph_mds_session *session,
+ unsigned long from_time)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ long unsigned duration = le32_to_cpu(lease->duration_ms);
+ long unsigned ttl = from_time + (duration * HZ) / 1000;
+ long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
+ struct inode *dir;
+
+ /* only track leases on regular dentries */
+ if (dentry->d_op != &ceph_dentry_ops)
+ return;
+
+ spin_lock(&dentry->d_lock);
+ dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
+ dentry, le16_to_cpu(lease->mask), duration, ttl);
+
+ /* make lease_shared_gen match directory */
+ dir = dentry->d_parent->d_inode;
+ di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
+
+ if (lease->mask == 0)
+ goto out_unlock;
+
+ if (di->lease_gen == session->s_cap_gen &&
+ time_before(ttl, dentry->d_time))
+ goto out_unlock; /* we already have a newer lease. */
+
+ if (di->lease_session && di->lease_session != session)
+ goto out_unlock;
+
+ ceph_dentry_lru_touch(dentry);
+
+ if (!di->lease_session)
+ di->lease_session = ceph_get_mds_session(session);
+ di->lease_gen = session->s_cap_gen;
+ di->lease_seq = le32_to_cpu(lease->seq);
+ di->lease_renew_after = half_ttl;
+ di->lease_renew_from = 0;
+ dentry->d_time = ttl;
+out_unlock:
+ spin_unlock(&dentry->d_lock);
+ return;
+}
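+
+/*
+ * Lease timing example (illustrative; assumes HZ == 1000): for
+ * duration_ms = 30000 granted at from_time = j,
+ *
+ * ttl = j + 30000 jiffies (lease expires)
+ * half_ttl = j + 15000 jiffies (renew on use past this point)
+ *
+ * A dentry already holding a still-valid lease (gen matches the
+ * session) with a later d_time keeps it; a shorter new lease never
+ * clobbers a longer existing one.
+ */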
+
+/*
+ * splice a dentry to an inode.
+ * caller must hold directory i_mutex for this to be safe.
+ *
+ * we will only rehash the resulting dentry if @prehash is
+ * true; @prehash will be set to false (for the benefit of
+ * the caller) if we fail.
+ */
+static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
+ bool *prehash)
+{
+ struct dentry *realdn;
+
+ /* dn must be unhashed */
+ if (!d_unhashed(dn))
+ d_drop(dn);
+ realdn = d_materialise_unique(dn, in);
+ if (IS_ERR(realdn)) {
+ pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
+ dn, in, ceph_vinop(in));
+ if (prehash)
+ *prehash = false; /* don't rehash on error */
+ dn = realdn; /* note realdn contains the error */
+ goto out;
+ } else if (realdn) {
+ dout("dn %p (%d) spliced with %p (%d) "
+ "inode %p ino %llx.%llx\n",
+ dn, atomic_read(&dn->d_count),
+ realdn, atomic_read(&realdn->d_count),
+ realdn->d_inode, ceph_vinop(realdn->d_inode));
+ dput(dn);
+ dn = realdn;
+ } else {
+ BUG_ON(!ceph_dentry(dn));
+
+ dout("dn %p attached to %p ino %llx.%llx\n",
+ dn, dn->d_inode, ceph_vinop(dn->d_inode));
+ }
+ if ((!prehash || *prehash) && d_unhashed(dn))
+ d_rehash(dn);
+out:
+ return dn;
+}
+
+/*
+ * Set dentry's directory position based on the current dir's max, and
+ * order it in d_subdirs, so that dcache_readdir behaves.
+ */
+static void ceph_set_dentry_offset(struct dentry *dn)
+{
+ struct dentry *dir = dn->d_parent;
+ struct inode *inode = dn->d_parent->d_inode;
+ struct ceph_dentry_info *di;
+
+ BUG_ON(!inode);
+
+ di = ceph_dentry(dn);
+
+ spin_lock(&inode->i_lock);
+ di->offset = ceph_inode(inode)->i_max_offset++;
+ spin_unlock(&inode->i_lock);
+
+ spin_lock(&dcache_lock);
+ spin_lock(&dn->d_lock);
+ list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
+ dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
+ dn->d_u.d_child.prev, dn->d_u.d_child.next);
+ spin_unlock(&dn->d_lock);
+ spin_unlock(&dcache_lock);
+}
+
+/*
+ * Incorporate results into the local cache. This is either just
+ * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
+ * after a lookup).
+ *
+ * A reply may contain:
+ * - a directory inode along with a dentry, and/or
+ * - a target inode.
+ *
+ * Called with snap_rwsem (read).
+ */
+int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct inode *in = NULL;
+ struct ceph_mds_reply_inode *ininfo;
+ struct ceph_vino vino;
+ int i = 0;
+ int err = 0;
+
+ dout("fill_trace %p is_dentry %d is_target %d\n", req,
+ rinfo->head->is_dentry, rinfo->head->is_target);
+
+#if 0
+ /*
+ * Debugging hook:
+ *
+ * If we resend completed ops to a recovering mds, we get no
+ * trace. Since that is very rare, pretend this is the case
+ * to ensure the 'no trace' handlers in the callers behave.
+ *
+ * Fill in inodes unconditionally to avoid breaking cap
+ * invariants.
+ */
+ if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
+ pr_info("fill_trace faking empty trace on %lld %s\n",
+ req->r_tid, ceph_mds_op_name(rinfo->head->op));
+ if (rinfo->head->is_dentry) {
+ rinfo->head->is_dentry = 0;
+ err = fill_inode(req->r_locked_dir,
+ &rinfo->diri, rinfo->dirfrag,
+ session, req->r_request_started, -1);
+ }
+ if (rinfo->head->is_target) {
+ rinfo->head->is_target = 0;
+ ininfo = rinfo->targeti.in;
+ vino.ino = le64_to_cpu(ininfo->ino);
+ vino.snap = le64_to_cpu(ininfo->snapid);
+ in = ceph_get_inode(sb, vino);
+ err = fill_inode(in, &rinfo->targeti, NULL,
+ session, req->r_request_started,
+ req->r_fmode);
+ iput(in);
+ }
+ }
+#endif
+
+ if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
+ dout("fill_trace reply is empty!\n");
+ if (rinfo->head->result == 0 && req->r_locked_dir) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_locked_dir);
+ dout(" clearing %p complete (empty trace)\n",
+ req->r_locked_dir);
+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ ci->i_release_count++;
+ }
+ return 0;
+ }
+
+ if (rinfo->head->is_dentry) {
+ struct inode *dir = req->r_locked_dir;
+
+ err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+ session, req->r_request_started, -1,
+ &req->r_caps_reservation);
+ if (err < 0)
+ return err;
+ }
+
+ if (rinfo->head->is_dentry && !req->r_aborted) {
+ /*
+ * lookup link rename : null -> possibly existing inode
+ * mknod symlink mkdir : null -> new inode
+ * unlink : linked -> null
+ */
+ struct inode *dir = req->r_locked_dir;
+ struct dentry *dn = req->r_dentry;
+ bool have_dir_cap, have_lease;
+
+ BUG_ON(!dn);
+ BUG_ON(!dir);
+ BUG_ON(dn->d_parent->d_inode != dir);
+ BUG_ON(ceph_ino(dir) !=
+ le64_to_cpu(rinfo->diri.in->ino));
+ BUG_ON(ceph_snap(dir) !=
+ le64_to_cpu(rinfo->diri.in->snapid));
+
+ /* do we have a lease on the whole dir? */
+ have_dir_cap =
+ (le32_to_cpu(rinfo->diri.in->cap.caps) &
+ CEPH_CAP_FILE_SHARED);
+
+ /* do we have a dn lease? */
+ have_lease = have_dir_cap ||
+ (le16_to_cpu(rinfo->dlease->mask) &
+ CEPH_LOCK_DN);
+
+ if (!have_lease)
+ dout("fill_trace no dentry lease or dir cap\n");
+
+ /* rename? */
+ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+ dout(" src %p '%.*s' dst %p '%.*s'\n",
+ req->r_old_dentry,
+ req->r_old_dentry->d_name.len,
+ req->r_old_dentry->d_name.name,
+ dn, dn->d_name.len, dn->d_name.name);
+ dout("fill_trace doing d_move %p -> %p\n",
+ req->r_old_dentry, dn);
+ d_move(req->r_old_dentry, dn);
+ dout(" src %p '%.*s' dst %p '%.*s'\n",
+ req->r_old_dentry,
+ req->r_old_dentry->d_name.len,
+ req->r_old_dentry->d_name.name,
+ dn, dn->d_name.len, dn->d_name.name);
+ /* ensure target dentry is invalidated, despite
+ rehashing bug in vfs_rename_dir */
+ dn->d_time = jiffies;
+ ceph_dentry(dn)->lease_shared_gen = 0;
+ /* take overwritten dentry's readdir offset */
+ ceph_dentry(req->r_old_dentry)->offset =
+ ceph_dentry(dn)->offset;
+ dn = req->r_old_dentry; /* use old_dentry */
+ in = dn->d_inode;
+ }
+
+ /* null dentry? */
+ if (!rinfo->head->is_target) {
+ dout("fill_trace null dentry\n");
+ if (dn->d_inode) {
+ dout("d_delete %p\n", dn);
+ d_delete(dn);
+ } else {
+ dout("d_instantiate %p NULL\n", dn);
+ d_instantiate(dn, NULL);
+ if (have_lease && d_unhashed(dn))
+ d_rehash(dn);
+ update_dentry_lease(dn, rinfo->dlease,
+ session,
+ req->r_request_started);
+ }
+ goto done;
+ }
+
+ /* attach proper inode */
+ ininfo = rinfo->targeti.in;
+ vino.ino = le64_to_cpu(ininfo->ino);
+ vino.snap = le64_to_cpu(ininfo->snapid);
+ if (!dn->d_inode) {
+ in = ceph_get_inode(sb, vino);
+ if (IS_ERR(in)) {
+ pr_err("fill_trace bad get_inode "
+ "%llx.%llx\n", vino.ino, vino.snap);
+ err = PTR_ERR(in);
+ d_delete(dn);
+ goto done;
+ }
+ dn = splice_dentry(dn, in, &have_lease);
+ if (IS_ERR(dn)) {
+ err = PTR_ERR(dn);
+ goto done;
+ }
+ req->r_dentry = dn; /* may have spliced */
+ ceph_set_dentry_offset(dn);
+ igrab(in);
+ } else if (ceph_ino(in) == vino.ino &&
+ ceph_snap(in) == vino.snap) {
+ igrab(in);
+ } else {
+ dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
+ dn, in, ceph_ino(in), ceph_snap(in),
+ vino.ino, vino.snap);
+ have_lease = false;
+ in = NULL;
+ }
+
+ if (have_lease)
+ update_dentry_lease(dn, rinfo->dlease, session,
+ req->r_request_started);
+ dout(" final dn %p\n", dn);
+ i++;
+ } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+ req->r_op == CEPH_MDS_OP_MKSNAP) {
+ struct dentry *dn = req->r_dentry;
+
+ /* fill out a snapdir LOOKUPSNAP dentry */
+ BUG_ON(!dn);
+ BUG_ON(!req->r_locked_dir);
+ BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
+ ininfo = rinfo->targeti.in;
+ vino.ino = le64_to_cpu(ininfo->ino);
+ vino.snap = le64_to_cpu(ininfo->snapid);
+ in = ceph_get_inode(sb, vino);
+ if (IS_ERR(in)) {
+ pr_err("fill_inode get_inode badness %llx.%llx\n",
+ vino.ino, vino.snap);
+ err = PTR_ERR(in);
+ d_delete(dn);
+ goto done;
+ }
+ dout(" linking snapped dir %p to dn %p\n", in, dn);
+ dn = splice_dentry(dn, in, NULL);
+ if (IS_ERR(dn)) {
+ err = PTR_ERR(dn);
+ goto done;
+ }
+ ceph_set_dentry_offset(dn);
+ req->r_dentry = dn; /* may have spliced */
+ igrab(in);
+ rinfo->head->is_dentry = 1; /* fool notrace handlers */
+ }
+
+ if (rinfo->head->is_target) {
+ vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+ if (in == NULL || ceph_ino(in) != vino.ino ||
+ ceph_snap(in) != vino.snap) {
+ in = ceph_get_inode(sb, vino);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ goto done;
+ }
+ }
+ req->r_target_inode = in;
+
+ err = fill_inode(in,
+ &rinfo->targeti, NULL,
+ session, req->r_request_started,
+ (le32_to_cpu(rinfo->head->result) == 0) ?
+ req->r_fmode : -1,
+ &req->r_caps_reservation);
+ if (err < 0) {
+ pr_err("fill_inode badness %p %llx.%llx\n",
+ in, ceph_vinop(in));
+ goto done;
+ }
+ }
+
+done:
+ dout("fill_trace done err=%d\n", err);
+ return err;
+}
+
+/*
+ * Prepopulate our cache with readdir results, leases, etc.
+ */
+int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+ struct ceph_mds_session *session)
+{
+ struct dentry *parent = req->r_dentry;
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct qstr dname;
+ struct dentry *dn;
+ struct inode *in;
+ int err = 0, i;
+ struct inode *snapdir = NULL;
+ struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
+ u64 frag = le32_to_cpu(rhead->args.readdir.frag);
+ struct ceph_dentry_info *di;
+
+ if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
+ snapdir = ceph_get_snapdir(parent->d_inode);
+ parent = d_find_alias(snapdir);
+ dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
+ rinfo->dir_nr, parent);
+ } else {
+ dout("readdir_prepopulate %d items under dn %p\n",
+ rinfo->dir_nr, parent);
+ if (rinfo->dir_dir)
+ ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
+ }
+
+ for (i = 0; i < rinfo->dir_nr; i++) {
+ struct ceph_vino vino;
+
+ dname.name = rinfo->dir_dname[i];
+ dname.len = rinfo->dir_dname_len[i];
+ dname.hash = full_name_hash(dname.name, dname.len);
+
+ vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
+ vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
+
+retry_lookup:
+ dn = d_lookup(parent, &dname);
+ dout("d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
+
+ if (!dn) {
+ dn = d_alloc(parent, &dname);
+ dout("d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
+ if (dn == NULL) {
+ dout("d_alloc badness\n");
+ err = -ENOMEM;
+ goto out;
+ }
+ err = ceph_init_dentry(dn);
+ if (err < 0)
+ goto out;
+ } else if (dn->d_inode &&
+ (ceph_ino(dn->d_inode) != vino.ino ||
+ ceph_snap(dn->d_inode) != vino.snap)) {
+ dout(" dn %p points to wrong inode %p\n",
+ dn, dn->d_inode);
+ d_delete(dn);
+ dput(dn);
+ goto retry_lookup;
+ } else {
+ /* reorder parent's d_subdirs */
+ spin_lock(&dcache_lock);
+ spin_lock(&dn->d_lock);
+ list_move(&dn->d_u.d_child, &parent->d_subdirs);
+ spin_unlock(&dn->d_lock);
+ spin_unlock(&dcache_lock);
+ }
+
+ di = dn->d_fsdata;
+ di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
+
+ /* inode */
+ if (dn->d_inode) {
+ in = dn->d_inode;
+ } else {
+ in = ceph_get_inode(parent->d_sb, vino);
+ if (in == NULL) {
+ dout("new_inode badness\n");
+ d_delete(dn);
+ dput(dn);
+ err = -ENOMEM;
+ goto out;
+ }
+ dn = splice_dentry(dn, in, NULL);
+ }
+
+ if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
+ req->r_request_started, -1,
+ &req->r_caps_reservation) < 0) {
+ pr_err("fill_inode badness on %p\n", in);
+ dput(dn);
+ continue;
+ }
+ update_dentry_lease(dn, rinfo->dir_dlease[i],
+ req->r_session, req->r_request_started);
+ dput(dn);
+ }
+ req->r_did_prepopulate = true;
+
+out:
+ if (snapdir) {
+ iput(snapdir);
+ dput(parent);
+ }
+ dout("readdir_prepopulate done\n");
+ return err;
+}
+
+int ceph_inode_set_size(struct inode *inode, loff_t size)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret = 0;
+
+ spin_lock(&inode->i_lock);
+ dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
+ inode->i_size = size;
+ inode->i_blocks = (size + (1 << 9) - 1) >> 9;
+
+ /* tell the MDS if we are approaching max_size */
+ if ((size << 1) >= ci->i_max_size &&
+ (ci->i_reported_size << 1) < ci->i_max_size)
+ ret = 1;
+
+ spin_unlock(&inode->i_lock);
+ return ret;
+}
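+
+/*
+ * The shift arithmetic above fires when we cross the halfway mark
+ * (illustrative numbers): with i_max_size = 4 MB and i_reported_size
+ * = 1 MB, growing to size = 3 MB gives 6 MB >= 4 MB while 2 MB < 4 MB,
+ * so we return 1 and the caller (e.g. ceph_sync_write()) checks caps
+ * to ask the MDS for more room before actually hitting the limit.
+ */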
+
+/*
+ * Write back inode data in a worker thread. (This can't be done
+ * in the message handler context.)
+ */
+void ceph_queue_writeback(struct inode *inode)
+{
+ if (queue_work(ceph_inode_to_client(inode)->wb_wq,
+ &ceph_inode(inode)->i_wb_work)) {
+ dout("ceph_queue_writeback %p\n", inode);
+ igrab(inode);
+ } else {
+ dout("ceph_queue_writeback %p failed\n", inode);
+ }
+}
+
+static void ceph_writeback_work(struct work_struct *work)
+{
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_wb_work);
+ struct inode *inode = &ci->vfs_inode;
+
+ dout("writeback %p\n", inode);
+ filemap_fdatawrite(&inode->i_data);
+ iput(inode);
+}
+
+/*
+ * queue an async invalidation
+ */
+void ceph_queue_invalidate(struct inode *inode)
+{
+ if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
+ &ceph_inode(inode)->i_pg_inv_work)) {
+ dout("ceph_queue_invalidate %p\n", inode);
+ igrab(inode);
+ } else {
+ dout("ceph_queue_invalidate %p failed\n", inode);
+ }
+}
+
+/*
+ * invalidate any pages that are not dirty or under writeback. this
+ * includes pages that are clean and mapped.
+ */
+static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
+{
+ struct pagevec pvec;
+ pgoff_t next = 0;
+ int i;
+
+ pagevec_init(&pvec, 0);
+ while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ pgoff_t index;
+ int skip_page =
+ (PageDirty(page) || PageWriteback(page));
+
+ if (!skip_page)
+ skip_page = !trylock_page(page);
+
+ /*
+ * We really shouldn't be looking at the ->index of an
+ * unlocked page. But we're not allowed to lock these
+ * pages. So we rely upon nobody altering the ->index
+ * of this (pinned-by-us) page.
+ */
+ index = page->index;
+ if (index > next)
+ next = index;
+ next++;
+
+ if (skip_page)
+ continue;
+
+ generic_error_remove_page(mapping, page);
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+/*
+ * Invalidate inode pages in a worker thread. (This can't be done
+ * in the message handler context.)
+ */
+static void ceph_invalidate_work(struct work_struct *work)
+{
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_pg_inv_work);
+ struct inode *inode = &ci->vfs_inode;
+ u32 orig_gen;
+ int check = 0;
+
+ spin_lock(&inode->i_lock);
+ dout("invalidate_pages %p gen %d revoking %d\n", inode,
+ ci->i_rdcache_gen, ci->i_rdcache_revoking);
+ if (ci->i_rdcache_gen == 0 ||
+ ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+ BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
+ /* nevermind! */
+ ci->i_rdcache_revoking = 0;
+ spin_unlock(&inode->i_lock);
+ goto out;
+ }
+ orig_gen = ci->i_rdcache_gen;
+ spin_unlock(&inode->i_lock);
+
+ ceph_invalidate_nondirty_pages(inode->i_mapping);
+
+ spin_lock(&inode->i_lock);
+ if (orig_gen == ci->i_rdcache_gen) {
+ dout("invalidate_pages %p gen %d successful\n", inode,
+ ci->i_rdcache_gen);
+ ci->i_rdcache_gen = 0;
+ ci->i_rdcache_revoking = 0;
+ check = 1;
+ } else {
+ dout("invalidate_pages %p gen %d raced, gen now %d\n",
+ inode, orig_gen, ci->i_rdcache_gen);
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (check)
+ ceph_check_caps(ci, 0, NULL);
+out:
+ iput(inode);
+}
+
+
+/*
+ * called by trunc_wq; take i_mutex ourselves
+ *
+ * We also truncate in a separate thread.
+ */
+static void ceph_vmtruncate_work(struct work_struct *work)
+{
+ struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
+ i_vmtruncate_work);
+ struct inode *inode = &ci->vfs_inode;
+
+ dout("vmtruncate_work %p\n", inode);
+ mutex_lock(&inode->i_mutex);
+ __ceph_do_pending_vmtruncate(inode);
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
+}
+
+/*
+ * Queue an async vmtruncate. If we fail to queue work, we will handle
+ * the truncation the next time we call __ceph_do_pending_vmtruncate.
+ */
+void ceph_queue_vmtruncate(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+ &ci->i_vmtruncate_work)) {
+ dout("ceph_queue_vmtruncate %p\n", inode);
+ igrab(inode);
+ } else {
+ dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
+ inode, ci->i_truncate_pending);
+ }
+}
+
+/*
+ * called with i_mutex held.
+ *
+ * Make sure any pending truncation is applied before doing anything
+ * that may depend on it.
+ */
+void __ceph_do_pending_vmtruncate(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 to;
+ int wrbuffer_refs, wake = 0;
+
+retry:
+ spin_lock(&inode->i_lock);
+ if (ci->i_truncate_pending == 0) {
+ dout("__do_pending_vmtruncate %p none pending\n", inode);
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ /*
+ * make sure any dirty snapped pages are flushed before we
+ * possibly truncate them... so write AND block!
+ */
+ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
+ dout("__do_pending_vmtruncate %p flushing snaps first\n",
+ inode);
+ spin_unlock(&inode->i_lock);
+ filemap_write_and_wait_range(&inode->i_data, 0,
+ inode->i_sb->s_maxbytes);
+ goto retry;
+ }
+
+ to = ci->i_truncate_size;
+ wrbuffer_refs = ci->i_wrbuffer_ref;
+ dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
+ ci->i_truncate_pending, to);
+ spin_unlock(&inode->i_lock);
+
+ truncate_inode_pages(inode->i_mapping, to);
+
+ spin_lock(&inode->i_lock);
+ ci->i_truncate_pending--;
+ if (ci->i_truncate_pending == 0)
+ wake = 1;
+ spin_unlock(&inode->i_lock);
+
+ if (wrbuffer_refs == 0)
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+}
+
+
+/*
+ * symlinks
+ */
+static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
+ nd_set_link(nd, ci->i_symlink);
+ return NULL;
+}
+
+static const struct inode_operations ceph_symlink_iops = {
+ .readlink = generic_readlink,
+ .follow_link = ceph_sym_follow_link,
+};
+
+/*
+ * setattr
+ */
+int ceph_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct inode *parent_inode = dentry->d_parent->d_inode;
+ const unsigned int ia_valid = attr->ia_valid;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
+ int issued;
+ int release = 0, dirtied = 0;
+ int mask = 0;
+ int err = 0;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ __ceph_do_pending_vmtruncate(inode);
+
+ err = inode_change_ok(inode, attr);
+ if (err != 0)
+ return err;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
+ USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ spin_lock(&inode->i_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+ if (ia_valid & ATTR_UID) {
+ dout("setattr %p uid %d -> %d\n", inode,
+ inode->i_uid, attr->ia_uid);
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ inode->i_uid = attr->ia_uid;
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ attr->ia_uid != inode->i_uid) {
+ req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
+ mask |= CEPH_SETATTR_UID;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ }
+ if (ia_valid & ATTR_GID) {
+ dout("setattr %p gid %d -> %d\n", inode,
+ inode->i_gid, attr->ia_gid);
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ inode->i_gid = attr->ia_gid;
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ attr->ia_gid != inode->i_gid) {
+ req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
+ mask |= CEPH_SETATTR_GID;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ }
+ if (ia_valid & ATTR_MODE) {
+ dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
+ attr->ia_mode);
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ inode->i_mode = attr->ia_mode;
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ attr->ia_mode != inode->i_mode) {
+ req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
+ mask |= CEPH_SETATTR_MODE;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ }
+
+ if (ia_valid & ATTR_ATIME) {
+ dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
+ inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
+ attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
+ if (issued & CEPH_CAP_FILE_EXCL) {
+ ci->i_time_warp_seq++;
+ inode->i_atime = attr->ia_atime;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ } else if ((issued & CEPH_CAP_FILE_WR) &&
+ timespec_compare(&inode->i_atime,
+ &attr->ia_atime) < 0) {
+ inode->i_atime = attr->ia_atime;
+ dirtied |= CEPH_CAP_FILE_WR;
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+ !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
+ ceph_encode_timespec(&req->r_args.setattr.atime,
+ &attr->ia_atime);
+ mask |= CEPH_SETATTR_ATIME;
+ release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
+ CEPH_CAP_FILE_WR;
+ }
+ }
+ if (ia_valid & ATTR_MTIME) {
+ dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
+ inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
+ attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
+ if (issued & CEPH_CAP_FILE_EXCL) {
+ ci->i_time_warp_seq++;
+ inode->i_mtime = attr->ia_mtime;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ } else if ((issued & CEPH_CAP_FILE_WR) &&
+ timespec_compare(&inode->i_mtime,
+ &attr->ia_mtime) < 0) {
+ inode->i_mtime = attr->ia_mtime;
+ dirtied |= CEPH_CAP_FILE_WR;
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+ !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
+ ceph_encode_timespec(&req->r_args.setattr.mtime,
+ &attr->ia_mtime);
+ mask |= CEPH_SETATTR_MTIME;
+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+ CEPH_CAP_FILE_WR;
+ }
+ }
+ if (ia_valid & ATTR_SIZE) {
+ dout("setattr %p size %lld -> %lld\n", inode,
+ inode->i_size, attr->ia_size);
+ if (attr->ia_size > inode->i_sb->s_maxbytes) {
+ err = -EINVAL;
+ goto out;
+ }
+ if ((issued & CEPH_CAP_FILE_EXCL) &&
+ attr->ia_size > inode->i_size) {
+ inode->i_size = attr->ia_size;
+ inode->i_blocks =
+ (attr->ia_size + (1 << 9) - 1) >> 9;
+ inode->i_ctime = attr->ia_ctime;
+ ci->i_reported_size = attr->ia_size;
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
+ attr->ia_size != inode->i_size) {
+ req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+ req->r_args.setattr.old_size =
+ cpu_to_le64(inode->i_size);
+ mask |= CEPH_SETATTR_SIZE;
+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
+ CEPH_CAP_FILE_WR;
+ }
+ }
+
+ /* these do nothing */
+ if (ia_valid & ATTR_CTIME) {
+ bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
+ ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
+ dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
+ inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+ attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
+ only ? "ctime only" : "ignored");
+ inode->i_ctime = attr->ia_ctime;
+ if (only) {
+ /*
+ * if kernel wants to dirty ctime but nothing else,
+ * we need to choose a cap to dirty under, or do
+			 * an almost-no-op setattr
+ */
+ if (issued & CEPH_CAP_AUTH_EXCL)
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ else if (issued & CEPH_CAP_FILE_EXCL)
+ dirtied |= CEPH_CAP_FILE_EXCL;
+ else if (issued & CEPH_CAP_XATTR_EXCL)
+ dirtied |= CEPH_CAP_XATTR_EXCL;
+ else
+ mask |= CEPH_SETATTR_CTIME;
+ }
+ }
+ if (ia_valid & ATTR_FILE)
+ dout("setattr %p ATTR_FILE ... hrm!\n", inode);
+
+ if (dirtied) {
+ __ceph_mark_dirty_caps(ci, dirtied);
+ inode->i_ctime = CURRENT_TIME;
+ }
+
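+	/* only ask the mds to release caps it has actually issued to us */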
+ release &= issued;
+ spin_unlock(&inode->i_lock);
+
+ if (mask) {
+ req->r_inode = igrab(inode);
+ req->r_inode_drop = release;
+ req->r_args.setattr.mask = cpu_to_le32(mask);
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ }
+ dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
+ ceph_cap_string(dirtied), mask);
+
+ ceph_mdsc_put_request(req);
+ __ceph_do_pending_vmtruncate(inode);
+ return err;
+out:
+ spin_unlock(&inode->i_lock);
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+/*
+ * Verify that we have a lease on the given mask. If not,
+ * do a getattr against an mds.
+ */
+int ceph_do_getattr(struct inode *inode, int mask)
+{
+ struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(inode) == CEPH_SNAPDIR) {
+ dout("do_getattr inode %p SNAPDIR\n", inode);
+ return 0;
+ }
+
+ dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
+ if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
+ return 0;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+ req->r_num_caps = 1;
+ req->r_args.getattr.mask = cpu_to_le32(mask);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ ceph_mdsc_put_request(req);
+ dout("do_getattr result=%d\n", err);
+ return err;
+}
+
+
+/*
+ * Check inode permissions. We verify we have a valid value for
+ * the AUTH cap, then call the generic handler.
+ */
+int ceph_permission(struct inode *inode, int mask)
+{
+ int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+
+ if (!err)
+ err = generic_permission(inode, mask, NULL);
+ return err;
+}
+
+/*
+ * Get all attributes. Hopefully someday we'll have a statlite()
+ * and can limit the fields we require to be accurate.
+ */
+int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int err;
+
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
+ if (!err) {
+ generic_fillattr(inode, stat);
+ stat->ino = inode->i_ino;
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ stat->dev = ceph_snap(inode);
+ else
+ stat->dev = 0;
+ if (S_ISDIR(inode->i_mode)) {
+ stat->size = ci->i_rbytes;
+ stat->blocks = 0;
+ stat->blksize = 65536;
+ }
+ }
+ return err;
+}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 00000000000..8a5bcae6284
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
+#include <linux/in.h>
+
+#include "ioctl.h"
+#include "super.h"
+#include "ceph_debug.h"
+
+
+/*
+ * ioctls
+ */
+
+/*
+ * get and set the file layout
+ */
+static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+ struct ceph_ioctl_layout l;
+ int err;
+
+ err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+ if (!err) {
+ l.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ l.object_size = ceph_file_layout_object_size(ci->i_layout);
+ l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+ l.preferred_osd =
+ (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
+ if (copy_to_user(arg, &l, sizeof(l)))
+ return -EFAULT;
+ }
+
+ return err;
+}
+
+static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+ struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_ioctl_layout l;
+ int err, i;
+
+ /* copy and validate */
+ if (copy_from_user(&l, arg, sizeof(l)))
+ return -EFAULT;
+
+ if ((l.object_size & ~PAGE_MASK) ||
+ (l.stripe_unit & ~PAGE_MASK) ||
+ !l.stripe_unit ||
+ (l.object_size &&
+ (unsigned)l.object_size % (unsigned)l.stripe_unit))
+ return -EINVAL;
+
+ /* make sure it's a valid data pool */
+ if (l.data_pool > 0) {
+ mutex_lock(&mdsc->mutex);
+ err = -EINVAL;
+ for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+ if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+ err = 0;
+ break;
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (err)
+ return err;
+ }
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
+ USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+ req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
+
+ req->r_args.setlayout.layout.fl_stripe_unit =
+ cpu_to_le32(l.stripe_unit);
+ req->r_args.setlayout.layout.fl_stripe_count =
+ cpu_to_le32(l.stripe_count);
+ req->r_args.setlayout.layout.fl_object_size =
+ cpu_to_le32(l.object_size);
+ req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
+ req->r_args.setlayout.layout.fl_pg_preferred =
+ cpu_to_le32(l.preferred_osd);
+
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+/*
+ * Return object name, size/offset information, and location (OSD
+ * number, network address) for a given file offset.
+ */
+static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
+{
+ struct ceph_ioctl_dataloc dl;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+ u64 len = 1, olen;
+ u64 tmp;
+ struct ceph_object_layout ol;
+ struct ceph_pg pgid;
+
+ /* copy and validate */
+ if (copy_from_user(&dl, arg, sizeof(dl)))
+ return -EFAULT;
+
+ down_read(&osdc->map_sem);
+ ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+ &dl.object_no, &dl.object_offset, &olen);
+ dl.file_offset -= dl.object_offset;
+ dl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ dl.block_size = ceph_file_layout_su(ci->i_layout);
+
+	/*
+	 * block_offset = object_offset % block_size; do_div() divides
+	 * its first argument in place and returns the remainder, hence
+	 * the scratch copy of object_offset.
+	 */
+	tmp = dl.object_offset;
+	dl.block_offset = do_div(tmp, dl.block_size);
+
+ snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
+ ceph_ino(inode), dl.object_no);
+ ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
+ osdc->osdmap);
+
+ pgid = ol.ol_pgid;
+ dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+ if (dl.osd >= 0) {
+ struct ceph_entity_addr *a =
+ ceph_osd_addr(osdc->osdmap, dl.osd);
+ if (a)
+ memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
+ } else {
+ memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
+ }
+ up_read(&osdc->map_sem);
+
+ /* send result back to user */
+ if (copy_to_user(arg, &dl, sizeof(dl)))
+ return -EFAULT;
+
+ return 0;
+}
+
+long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
+ switch (cmd) {
+ case CEPH_IOC_GET_LAYOUT:
+ return ceph_ioctl_get_layout(file, (void __user *)arg);
+
+ case CEPH_IOC_SET_LAYOUT:
+ return ceph_ioctl_set_layout(file, (void __user *)arg);
+
+ case CEPH_IOC_GET_DATALOC:
+ return ceph_ioctl_get_dataloc(file, (void __user *)arg);
+ }
+ return -ENOTTY;
+}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 00000000000..25e4f1a9d05
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
+#ifndef FS_CEPH_IOCTL_H
+#define FS_CEPH_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define CEPH_IOCTL_MAGIC 0x97
+
+/* just use u64 to align sanely on all archs */
+struct ceph_ioctl_layout {
+ __u64 stripe_unit, stripe_count, object_size;
+ __u64 data_pool;
+ __s64 preferred_osd;
+};
+
+#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
+ struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
+ struct ceph_ioctl_layout)
+
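+/*
+ * Illustrative userspace use (hypothetical file descriptor fd on a
+ * ceph mount):
+ *
+ *	struct ceph_ioctl_layout l;
+ *	if (ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) == 0)
+ *		printf("stripe_unit %llu\n",
+ *		       (unsigned long long)l.stripe_unit);
+ */
+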
+/*
+ * Extract identity, address of the OSD and object storing a given
+ * file offset.
+ */
+struct ceph_ioctl_dataloc {
+ __u64 file_offset; /* in+out: file offset */
+ __u64 object_offset; /* out: offset in object */
+ __u64 object_no; /* out: object # */
+ __u64 object_size; /* out: object size */
+ char object_name[64]; /* out: object name */
+ __u64 block_offset; /* out: offset in block */
+ __u64 block_size; /* out: block length */
+ __s64 osd; /* out: osd # */
+ struct sockaddr_storage osd_addr; /* out: osd address */
+};
+
+#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
+ struct ceph_ioctl_dataloc)
+
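+/*
+ * Illustrative userspace use (hypothetical fd and offset off):
+ *
+ *	struct ceph_ioctl_dataloc dl = { .file_offset = off };
+ *	if (ioctl(fd, CEPH_IOC_GET_DATALOC, &dl) == 0)
+ *		printf("%s on osd%lld\n", dl.object_name,
+ *		       (long long)dl.osd);
+ */
+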
+#endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 00000000000..5c7920be642
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3042 @@
+#include "ceph_debug.h"
+
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include "mds_client.h"
+#include "mon_client.h"
+#include "super.h"
+#include "messenger.h"
+#include "decode.h"
+#include "auth.h"
+#include "pagelist.h"
+
+/*
+ * A cluster of MDS (metadata server) daemons is responsible for
+ * managing the file system namespace (the directory hierarchy and
+ * inodes) and for coordinating shared access to storage. Metadata is
+ * partitioned hierarchically across a number of servers, and that
+ * partition varies over time as the cluster adjusts the distribution
+ * in order to balance load.
+ *
+ * The MDS client is primarily responsible for managing synchronous
+ * metadata requests for operations like open, unlink, and so forth.
+ * If there is an MDS failure, we find out about it when we (possibly
+ * request and) receive a new MDS map, and can resubmit affected
+ * requests.
+ *
+ * For the most part, though, we take advantage of a lossless
+ * communications channel to the MDS, and do not need to worry about
+ * timing out or resubmitting requests.
+ *
+ * We maintain a stateful "session" with each MDS we interact with.
+ * Within each session, we send periodic heartbeat messages to ensure
+ * any capabilities or leases we have been issued remain valid. If
+ * the session times out and goes stale, our leases and capabilities
+ * are no longer valid.
+ */
+
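+/*
+ * A rough sketch of the synchronous request flow implemented below,
+ * for orientation only (all names refer to functions in this file):
+ *
+ *	req = ceph_mdsc_create_request(mdsc, op, mode);
+ *	... fill in r_inode/r_dentry/r_args ...
+ *	err = ceph_mdsc_do_request(mdsc, dir, req);
+ *	ceph_mdsc_put_request(req);
+ *
+ * ceph_mdsc_do_request() registers the request, picks an mds via
+ * __choose_mds(), sends it, and waits; the reply arrives in
+ * handle_reply(), which completes r_completion to wake the waiter.
+ */
+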
+static void __wake_requests(struct ceph_mds_client *mdsc,
+ struct list_head *head);
+
+static const struct ceph_connection_operations mds_con_ops;
+
+
+/*
+ * mds reply parsing
+ */
+
+/*
+ * parse individual inode info
+ */
+static int parse_reply_info_in(void **p, void *end,
+ struct ceph_mds_reply_info_in *info)
+{
+ int err = -EIO;
+
+ info->in = *p;
+ *p += sizeof(struct ceph_mds_reply_inode) +
+ sizeof(*info->in->fragtree.splits) *
+ le32_to_cpu(info->in->fragtree.nsplits);
+
+ ceph_decode_32_safe(p, end, info->symlink_len, bad);
+ ceph_decode_need(p, end, info->symlink_len, bad);
+ info->symlink = *p;
+ *p += info->symlink_len;
+
+ ceph_decode_32_safe(p, end, info->xattr_len, bad);
+ ceph_decode_need(p, end, info->xattr_len, bad);
+ info->xattr_data = *p;
+ *p += info->xattr_len;
+ return 0;
+bad:
+ return err;
+}
+
+/*
+ * parse a normal reply, which may contain a (dir+)dentry and/or a
+ * target inode.
+ */
+static int parse_reply_info_trace(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ int err;
+
+ if (info->head->is_dentry) {
+ err = parse_reply_info_in(p, end, &info->diri);
+ if (err < 0)
+ goto out_bad;
+
+ if (unlikely(*p + sizeof(*info->dirfrag) > end))
+ goto bad;
+ info->dirfrag = *p;
+ *p += sizeof(*info->dirfrag) +
+ sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
+ if (unlikely(*p > end))
+ goto bad;
+
+ ceph_decode_32_safe(p, end, info->dname_len, bad);
+ ceph_decode_need(p, end, info->dname_len, bad);
+ info->dname = *p;
+ *p += info->dname_len;
+ info->dlease = *p;
+ *p += sizeof(*info->dlease);
+ }
+
+ if (info->head->is_target) {
+ err = parse_reply_info_in(p, end, &info->targeti);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("problem parsing mds trace %d\n", err);
+ return err;
+}
+
+/*
+ * parse readdir results
+ */
+static int parse_reply_info_dir(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ u32 num, i = 0;
+ int err;
+
+ info->dir_dir = *p;
+ if (*p + sizeof(*info->dir_dir) > end)
+ goto bad;
+ *p += sizeof(*info->dir_dir) +
+ sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
+ if (*p > end)
+ goto bad;
+
+ ceph_decode_need(p, end, sizeof(num) + 2, bad);
+ num = ceph_decode_32(p);
+ info->dir_end = ceph_decode_8(p);
+ info->dir_complete = ceph_decode_8(p);
+ if (num == 0)
+ goto done;
+
+	/* allocate one block and carve it into parallel arrays: inode
+	 * info, dentry name pointers, dentry name lengths, and leases */
+ info->dir_nr = num;
+ info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
+ sizeof(*info->dir_dname) +
+ sizeof(*info->dir_dname_len) +
+ sizeof(*info->dir_dlease),
+ GFP_NOFS);
+ if (info->dir_in == NULL) {
+ err = -ENOMEM;
+ goto out_bad;
+ }
+ info->dir_dname = (void *)(info->dir_in + num);
+ info->dir_dname_len = (void *)(info->dir_dname + num);
+ info->dir_dlease = (void *)(info->dir_dname_len + num);
+
+ while (num) {
+ /* dentry */
+ ceph_decode_need(p, end, sizeof(u32)*2, bad);
+ info->dir_dname_len[i] = ceph_decode_32(p);
+ ceph_decode_need(p, end, info->dir_dname_len[i], bad);
+ info->dir_dname[i] = *p;
+ *p += info->dir_dname_len[i];
+ dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
+ info->dir_dname[i]);
+ info->dir_dlease[i] = *p;
+ *p += sizeof(struct ceph_mds_reply_lease);
+
+ /* inode */
+ err = parse_reply_info_in(p, end, &info->dir_in[i]);
+ if (err < 0)
+ goto out_bad;
+ i++;
+ num--;
+ }
+
+done:
+ if (*p != end)
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("problem parsing dir contents %d\n", err);
+ return err;
+}
+
+/*
+ * parse entire mds reply
+ */
+static int parse_reply_info(struct ceph_msg *msg,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ void *p, *end;
+ u32 len;
+ int err;
+
+ info->head = msg->front.iov_base;
+ p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
+ end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
+
+ /* trace */
+ ceph_decode_32_safe(&p, end, len, bad);
+ if (len > 0) {
+ err = parse_reply_info_trace(&p, p+len, info);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ /* dir content */
+ ceph_decode_32_safe(&p, end, len, bad);
+ if (len > 0) {
+ err = parse_reply_info_dir(&p, p+len, info);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ /* snap blob */
+ ceph_decode_32_safe(&p, end, len, bad);
+ info->snapblob_len = len;
+ info->snapblob = p;
+ p += len;
+
+ if (p != end)
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("mds parse_reply err %d\n", err);
+ return err;
+}
+
+static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
+{
+ kfree(info->dir_in);
+}
+
+
+/*
+ * sessions
+ */
+static const char *session_state_name(int s)
+{
+ switch (s) {
+ case CEPH_MDS_SESSION_NEW: return "new";
+ case CEPH_MDS_SESSION_OPENING: return "opening";
+ case CEPH_MDS_SESSION_OPEN: return "open";
+ case CEPH_MDS_SESSION_HUNG: return "hung";
+ case CEPH_MDS_SESSION_CLOSING: return "closing";
+ case CEPH_MDS_SESSION_RESTARTING: return "restarting";
+ case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
+ default: return "???";
+ }
+}
+
+static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+{
+ if (atomic_inc_not_zero(&s->s_ref)) {
+ dout("mdsc get_session %p %d -> %d\n", s,
+ atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
+ return s;
+ } else {
+ dout("mdsc get_session %p 0 -- FAIL", s);
+ return NULL;
+ }
+}
+
+void ceph_put_mds_session(struct ceph_mds_session *s)
+{
+ dout("mdsc put_session %p %d -> %d\n", s,
+ atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
+ if (atomic_dec_and_test(&s->s_ref)) {
+ if (s->s_authorizer)
+ s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
+ s->s_mdsc->client->monc.auth, s->s_authorizer);
+ kfree(s);
+ }
+}
+
+/*
+ * called under mdsc->mutex
+ */
+struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
+ int mds)
+{
+ struct ceph_mds_session *session;
+
+ if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
+ return NULL;
+ session = mdsc->sessions[mds];
+ dout("lookup_mds_session %p %d\n", session,
+ atomic_read(&session->s_ref));
+ get_session(session);
+ return session;
+}
+
+static bool __have_session(struct ceph_mds_client *mdsc, int mds)
+{
+ if (mds >= mdsc->max_sessions)
+ return false;
+ return mdsc->sessions[mds];
+}
+
+static int __verify_registered_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ if (s->s_mds >= mdsc->max_sessions ||
+ mdsc->sessions[s->s_mds] != s)
+ return -ENOENT;
+ return 0;
+}
+
+/*
+ * create+register a new session for given mds.
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
+ int mds)
+{
+ struct ceph_mds_session *s;
+
+ s = kzalloc(sizeof(*s), GFP_NOFS);
+ if (!s)
+ return ERR_PTR(-ENOMEM);
+ s->s_mdsc = mdsc;
+ s->s_mds = mds;
+ s->s_state = CEPH_MDS_SESSION_NEW;
+ s->s_ttl = 0;
+ s->s_seq = 0;
+ mutex_init(&s->s_mutex);
+
+ ceph_con_init(mdsc->client->msgr, &s->s_con);
+ s->s_con.private = s;
+ s->s_con.ops = &mds_con_ops;
+ s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
+ s->s_con.peer_name.num = cpu_to_le64(mds);
+
+ spin_lock_init(&s->s_cap_lock);
+ s->s_cap_gen = 0;
+ s->s_cap_ttl = 0;
+ s->s_renew_requested = 0;
+ s->s_renew_seq = 0;
+ INIT_LIST_HEAD(&s->s_caps);
+ s->s_nr_caps = 0;
+ s->s_trim_caps = 0;
+ atomic_set(&s->s_ref, 1);
+ INIT_LIST_HEAD(&s->s_waiting);
+ INIT_LIST_HEAD(&s->s_unsafe);
+ s->s_num_cap_releases = 0;
+ s->s_cap_iterator = NULL;
+ INIT_LIST_HEAD(&s->s_cap_releases);
+ INIT_LIST_HEAD(&s->s_cap_releases_done);
+ INIT_LIST_HEAD(&s->s_cap_flushing);
+ INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
+
+ dout("register_session mds%d\n", mds);
+ if (mds >= mdsc->max_sessions) {
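+		/* grow the array to the next power of two that fits mds */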
+ int newmax = 1 << get_count_order(mds+1);
+ struct ceph_mds_session **sa;
+
+ dout("register_session realloc to %d\n", newmax);
+ sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
+ if (sa == NULL)
+ goto fail_realloc;
+ if (mdsc->sessions) {
+ memcpy(sa, mdsc->sessions,
+ mdsc->max_sessions * sizeof(void *));
+ kfree(mdsc->sessions);
+ }
+ mdsc->sessions = sa;
+ mdsc->max_sessions = newmax;
+ }
+ mdsc->sessions[mds] = s;
+ atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
+
+ ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+ return s;
+
+fail_realloc:
+ kfree(s);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __unregister_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ dout("__unregister_session mds%d %p\n", s->s_mds, s);
+ BUG_ON(mdsc->sessions[s->s_mds] != s);
+ mdsc->sessions[s->s_mds] = NULL;
+ ceph_con_close(&s->s_con);
+ ceph_put_mds_session(s);
+}
+
+/*
+ * drop session refs in request.
+ *
+ * should be last request ref, or hold mdsc->mutex
+ */
+static void put_request_session(struct ceph_mds_request *req)
+{
+ if (req->r_session) {
+ ceph_put_mds_session(req->r_session);
+ req->r_session = NULL;
+ }
+}
+
+void ceph_mdsc_release_request(struct kref *kref)
+{
+ struct ceph_mds_request *req = container_of(kref,
+ struct ceph_mds_request,
+ r_kref);
+ if (req->r_request)
+ ceph_msg_put(req->r_request);
+ if (req->r_reply) {
+ ceph_msg_put(req->r_reply);
+ destroy_reply_info(&req->r_reply_info);
+ }
+ if (req->r_inode) {
+ ceph_put_cap_refs(ceph_inode(req->r_inode),
+ CEPH_CAP_PIN);
+ iput(req->r_inode);
+ }
+ if (req->r_locked_dir)
+ ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
+ CEPH_CAP_PIN);
+ if (req->r_target_inode)
+ iput(req->r_target_inode);
+ if (req->r_dentry)
+ dput(req->r_dentry);
+ if (req->r_old_dentry) {
+ ceph_put_cap_refs(
+ ceph_inode(req->r_old_dentry->d_parent->d_inode),
+ CEPH_CAP_PIN);
+ dput(req->r_old_dentry);
+ }
+ kfree(req->r_path1);
+ kfree(req->r_path2);
+ put_request_session(req);
+ ceph_unreserve_caps(&req->r_caps_reservation);
+ kfree(req);
+}
+
+/*
+ * lookup session, bump ref if found.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
+ u64 tid)
+{
+ struct ceph_mds_request *req;
+ struct rb_node *n = mdsc->request_tree.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_mds_request, r_node);
+ if (tid < req->r_tid)
+ n = n->rb_left;
+ else if (tid > req->r_tid)
+ n = n->rb_right;
+ else {
+ ceph_mdsc_get_request(req);
+ return req;
+ }
+ }
+ return NULL;
+}
+
+static void __insert_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *new)
+{
+ struct rb_node **p = &mdsc->request_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_mds_request *req = NULL;
+
+ while (*p) {
+ parent = *p;
+ req = rb_entry(parent, struct ceph_mds_request, r_node);
+ if (new->r_tid < req->r_tid)
+ p = &(*p)->rb_left;
+ else if (new->r_tid > req->r_tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, &mdsc->request_tree);
+}
+
+/*
+ * Register an in-flight request, and assign a tid. Link to the
+ * directory we are modifying (if any).
+ *
+ * Called under mdsc->mutex.
+ */
+static void __register_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ struct inode *dir)
+{
+ req->r_tid = ++mdsc->last_tid;
+ if (req->r_num_caps)
+ ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
+ dout("__register_request %p tid %lld\n", req, req->r_tid);
+ ceph_mdsc_get_request(req);
+ __insert_request(mdsc, req);
+
+ if (dir) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+
+ spin_lock(&ci->i_unsafe_lock);
+ req->r_unsafe_dir = dir;
+ list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+}
+
+static void __unregister_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ rb_erase(&req->r_node, &mdsc->request_tree);
+ RB_CLEAR_NODE(&req->r_node);
+
+ if (req->r_unsafe_dir) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
+
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_dir_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+
+ ceph_mdsc_put_request(req);
+}
+
+/*
+ * Choose mds to send request to next. If there is a hint set in the
+ * request (e.g., due to a prior forward hint from the mds), use that.
+ * Otherwise, consult frag tree and/or caps to identify the
+ * appropriate mds. If all else fails, choose randomly.
+ *
+ * Called under mdsc->mutex.
+ */
+static int __choose_mds(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+ int mode = req->r_direct_mode;
+ int mds = -1;
+ u32 hash = req->r_direct_hash;
+ bool is_hash = req->r_direct_is_hash;
+
+ /*
+ * is there a specific mds we should try? ignore hint if we have
+ * no session and the mds is not up (active or recovering).
+ */
+ if (req->r_resend_mds >= 0 &&
+ (__have_session(mdsc, req->r_resend_mds) ||
+ ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
+ dout("choose_mds using resend_mds mds%d\n",
+ req->r_resend_mds);
+ return req->r_resend_mds;
+ }
+
+ if (mode == USE_RANDOM_MDS)
+ goto random;
+
+ inode = NULL;
+ if (req->r_inode) {
+ inode = req->r_inode;
+ } else if (req->r_dentry) {
+ if (req->r_dentry->d_inode) {
+ inode = req->r_dentry->d_inode;
+ } else {
+ inode = req->r_dentry->d_parent->d_inode;
+ hash = req->r_dentry->d_name.hash;
+ is_hash = true;
+ }
+ }
+ dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
+ (int)hash, mode);
+ if (!inode)
+ goto random;
+ ci = ceph_inode(inode);
+
+ if (is_hash && S_ISDIR(inode->i_mode)) {
+ struct ceph_inode_frag frag;
+ int found;
+
+ ceph_choose_frag(ci, hash, &frag, &found);
+ if (found) {
+ if (mode == USE_ANY_MDS && frag.ndist > 0) {
+ u8 r;
+
+ /* choose a random replica */
+ get_random_bytes(&r, 1);
+ r %= frag.ndist;
+ mds = frag.dist[r];
+ dout("choose_mds %p %llx.%llx "
+ "frag %u mds%d (%d/%d)\n",
+ inode, ceph_vinop(inode),
+				     frag.frag, mds,
+ (int)r, frag.ndist);
+ return mds;
+ }
+
+			/* since this file/dir wasn't known to be
+			 * replicated, look for the authoritative mds. */
+ mode = USE_AUTH_MDS;
+ if (frag.mds >= 0) {
+ /* choose auth mds */
+ mds = frag.mds;
+ dout("choose_mds %p %llx.%llx "
+ "frag %u mds%d (auth)\n",
+ inode, ceph_vinop(inode), frag.frag, mds);
+ return mds;
+ }
+ }
+ }
+
+ spin_lock(&inode->i_lock);
+ cap = NULL;
+ if (mode == USE_AUTH_MDS)
+ cap = ci->i_auth_cap;
+ if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
+ cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
+ if (!cap) {
+ spin_unlock(&inode->i_lock);
+ goto random;
+ }
+ mds = cap->session->s_mds;
+ dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+ inode, ceph_vinop(inode), mds,
+ cap == ci->i_auth_cap ? "auth " : "", cap);
+ spin_unlock(&inode->i_lock);
+ return mds;
+
+random:
+ mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
+ dout("choose_mds chose random mds%d\n", mds);
+ return mds;
+}
+
+
+/*
+ * session messages
+ */
+static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_session_head *h;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
+ if (IS_ERR(msg)) {
+ pr_err("create_session_msg ENOMEM creating msg\n");
+ return ERR_PTR(PTR_ERR(msg));
+ }
+ h = msg->front.iov_base;
+ h->op = cpu_to_le32(op);
+ h->seq = cpu_to_le64(seq);
+ return msg;
+}
+
+/*
+ * send session open request.
+ *
+ * called under mdsc->mutex
+ */
+static int __open_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int mstate;
+ int mds = session->s_mds;
+ int err = 0;
+
+ /* wait for mds to go active? */
+ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
+ dout("open_session to mds%d (%s)\n", mds,
+ ceph_mds_state_name(mstate));
+ session->s_state = CEPH_MDS_SESSION_OPENING;
+ session->s_renew_requested = jiffies;
+
+ /* send connect message */
+ msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
+ if (IS_ERR(msg)) {
+ err = PTR_ERR(msg);
+ goto out;
+ }
+ ceph_con_send(&session->s_con, msg);
+
+out:
+	return err;
+}
+
+/*
+ * session caps
+ */
+
+/*
+ * Free preallocated cap messages assigned to this session
+ */
+static void cleanup_cap_releases(struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+
+ spin_lock(&session->s_cap_lock);
+ while (!list_empty(&session->s_cap_releases)) {
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+ }
+ while (!list_empty(&session->s_cap_releases_done)) {
+ msg = list_first_entry(&session->s_cap_releases_done,
+ struct ceph_msg, list_head);
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+ }
+ spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * Helper to safely iterate over all caps associated with a session.
+ *
+ * caller must hold session s_mutex
+ */
+static int iterate_session_caps(struct ceph_mds_session *session,
+ int (*cb)(struct inode *, struct ceph_cap *,
+ void *), void *arg)
+{
+ struct list_head *p;
+ struct ceph_cap *cap;
+ struct inode *inode, *last_inode = NULL;
+ struct ceph_cap *old_cap = NULL;
+ int ret;
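+	/* the iput()/ceph_put_cap() for the previous entry are deferred
+	 * until s_cap_lock has been dropped */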
+
+ dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+ spin_lock(&session->s_cap_lock);
+ p = session->s_caps.next;
+ while (p != &session->s_caps) {
+ cap = list_entry(p, struct ceph_cap, session_caps);
+ inode = igrab(&cap->ci->vfs_inode);
+ if (!inode) {
+ p = p->next;
+ continue;
+ }
+ session->s_cap_iterator = cap;
+ spin_unlock(&session->s_cap_lock);
+
+ if (last_inode) {
+ iput(last_inode);
+ last_inode = NULL;
+ }
+ if (old_cap) {
+ ceph_put_cap(old_cap);
+ old_cap = NULL;
+ }
+
+ ret = cb(inode, cap, arg);
+ last_inode = inode;
+
+ spin_lock(&session->s_cap_lock);
+ p = p->next;
+ if (cap->ci == NULL) {
+ dout("iterate_session_caps finishing cap %p removal\n",
+ cap);
+ BUG_ON(cap->session != session);
+ list_del_init(&cap->session_caps);
+ session->s_nr_caps--;
+ cap->session = NULL;
+ old_cap = cap; /* put_cap it w/o locks held */
+ }
+ if (ret < 0)
+ goto out;
+ }
+ ret = 0;
+out:
+ session->s_cap_iterator = NULL;
+ spin_unlock(&session->s_cap_lock);
+
+ if (last_inode)
+ iput(last_inode);
+ if (old_cap)
+ ceph_put_cap(old_cap);
+
+ return ret;
+}
+
+static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ dout("removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->vfs_inode);
+ ceph_remove_cap(cap);
+ return 0;
+}
+
+/*
+ * caller must hold session s_mutex
+ */
+static void remove_session_caps(struct ceph_mds_session *session)
+{
+ dout("remove_session_caps on %p\n", session);
+ iterate_session_caps(session, remove_session_caps_cb, NULL);
+ BUG_ON(session->s_nr_caps > 0);
+ cleanup_cap_releases(session);
+}
+
+/*
+ * wake up any threads waiting on this session's caps. if the cap is
+ * old (didn't get renewed on the client reconnect), remove it now.
+ *
+ * caller must hold s_mutex.
+ */
+static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ wake_up(&ci->i_cap_wq);
+ if (arg) {
+ spin_lock(&inode->i_lock);
+ ci->i_wanted_max_size = 0;
+ ci->i_requested_max_size = 0;
+ spin_unlock(&inode->i_lock);
+ }
+ return 0;
+}
+
+static void wake_up_session_caps(struct ceph_mds_session *session,
+ int reconnect)
+{
+ dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
+ iterate_session_caps(session, wake_up_session_cb,
+ (void *)(unsigned long)reconnect);
+}
+
+/*
+ * Send periodic message to MDS renewing all currently held caps. The
+ * ack will reset the expiration for all caps from this session.
+ *
+ * caller holds s_mutex
+ */
+static int send_renew_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int state;
+
+ if (time_after_eq(jiffies, session->s_cap_ttl) &&
+ time_after_eq(session->s_cap_ttl, session->s_renew_requested))
+ pr_info("mds%d caps stale\n", session->s_mds);
+ session->s_renew_requested = jiffies;
+
+ /* do not try to renew caps until a recovering mds has reconnected
+ * with its clients. */
+ state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
+ if (state < CEPH_MDS_STATE_RECONNECT) {
+ dout("send_renew_caps ignoring mds%d (%s)\n",
+ session->s_mds, ceph_mds_state_name(state));
+ return 0;
+ }
+
+ dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
+ msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ ++session->s_renew_seq);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+/*
+ * Note new cap ttl, and any transition from stale -> not stale (fresh?).
+ *
+ * Called under session->s_mutex
+ */
+static void renewed_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session, int is_renew)
+{
+ int was_stale;
+ int wake = 0;
+
+ spin_lock(&session->s_cap_lock);
+ was_stale = is_renew && (session->s_cap_ttl == 0 ||
+ time_after_eq(jiffies, session->s_cap_ttl));
+
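+	/* measure the ttl from when we sent the renew request; this is
+	 * the conservative choice */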
+ session->s_cap_ttl = session->s_renew_requested +
+ mdsc->mdsmap->m_session_timeout*HZ;
+
+ if (was_stale) {
+ if (time_before(jiffies, session->s_cap_ttl)) {
+ pr_info("mds%d caps renewed\n", session->s_mds);
+ wake = 1;
+ } else {
+ pr_info("mds%d caps still stale\n", session->s_mds);
+ }
+ }
+ dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
+ session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
+	     time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
+ spin_unlock(&session->s_cap_lock);
+
+ if (wake)
+ wake_up_session_caps(session, 0);
+}
+
+/*
+ * send a session close request
+ */
+static int request_close_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int err = 0;
+
+ dout("request_close_session mds%d state %s seq %lld\n",
+ session->s_mds, session_state_name(session->s_state),
+ session->s_seq);
+ msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+ if (IS_ERR(msg))
+ err = PTR_ERR(msg);
+ else
+ ceph_con_send(&session->s_con, msg);
+ return err;
+}
+
+/*
+ * Called with s_mutex held.
+ */
+static int __close_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
+ return 0;
+ session->s_state = CEPH_MDS_SESSION_CLOSING;
+ return request_close_session(mdsc, session);
+}
+
+/*
+ * Trim old(er) caps.
+ *
+ * Because we can't cache an inode without one or more caps, we do
+ * this indirectly: if a cap is unused, we prune its aliases, at which
+ * point the inode will hopefully get dropped too.
+ *
+ * Yes, this is a bit sloppy. Our only real goal here is to respond to
+ * memory pressure from the MDS, though, so it needn't be perfect.
+ */
+static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+{
+ struct ceph_mds_session *session = arg;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int used, oissued, mine;
+
+ if (session->s_trim_caps <= 0)
+ return -1;
+
+ spin_lock(&inode->i_lock);
+ mine = cap->issued | cap->implemented;
+ used = __ceph_caps_used(ci);
+ oissued = __ceph_caps_issued_other(ci, cap);
+
+ dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
+ inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
+ ceph_cap_string(used));
+ if (ci->i_dirty_caps)
+ goto out; /* dirty caps */
+ if ((used & ~oissued) & mine)
+ goto out; /* we need these caps */
+
+ session->s_trim_caps--;
+ if (oissued) {
+ /* we aren't the only cap.. just remove us */
+ __ceph_remove_cap(cap);
+ } else {
+ /* try to drop referring dentries */
+ spin_unlock(&inode->i_lock);
+ d_prune_aliases(inode);
+ dout("trim_caps_cb %p cap %p pruned, count now %d\n",
+ inode, cap, atomic_read(&inode->i_count));
+ return 0;
+ }
+
+out:
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
+/*
+ * Trim session cap count down to some max number.
+ */
+static int trim_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int max_caps)
+{
+ int trim_caps = session->s_nr_caps - max_caps;
+
+ dout("trim_caps mds%d start: %d / %d, trim %d\n",
+ session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+ if (trim_caps > 0) {
+ session->s_trim_caps = trim_caps;
+ iterate_session_caps(session, trim_caps_cb, session);
+ dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
+ session->s_mds, session->s_nr_caps, max_caps,
+ trim_caps - session->s_trim_caps);
+ session->s_trim_caps = 0;
+ }
+ return 0;
+}
+
+/*
+ * Allocate cap_release messages. If there is a partially full message
+ * in the queue, try to allocate enough to cover its remainder, so that
+ * we can send it immediately.
+ *
+ * Called under s_mutex.
+ */
+static int add_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int extra)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_cap_release *head;
+ int err = -ENOMEM;
+
+ if (extra < 0)
+ extra = mdsc->client->mount_args->cap_release_safety;
+
+ spin_lock(&session->s_cap_lock);
+
+ if (!list_empty(&session->s_cap_releases)) {
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg,
+ list_head);
+ head = msg->front.iov_base;
+ extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+ }
+
+ while (session->s_num_cap_releases < session->s_nr_caps + extra) {
+ spin_unlock(&session->s_cap_lock);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
+ 0, 0, NULL);
+ if (!msg)
+ goto out_unlocked;
+ dout("add_cap_releases %p msg %p now %d\n", session, msg,
+ (int)msg->front.iov_len);
+ head = msg->front.iov_base;
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+ spin_lock(&session->s_cap_lock);
+ list_add(&msg->list_head, &session->s_cap_releases);
+ session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
+ }
+
+ if (!list_empty(&session->s_cap_releases)) {
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg,
+ list_head);
+ head = msg->front.iov_base;
+ if (head->num) {
+ dout(" queueing non-full %p (%d)\n", msg,
+ le32_to_cpu(head->num));
+ list_move_tail(&msg->list_head,
+ &session->s_cap_releases_done);
+ session->s_num_cap_releases -=
+ CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+ }
+ }
+ err = 0;
+ spin_unlock(&session->s_cap_lock);
+out_unlocked:
+ return err;
+}
+
+/*
+ * flush all dirty inode data to disk.
+ *
+ * returns true if we've flushed through want_flush_seq
+ */
+static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+{
+ int mds, ret = 1;
+
+ dout("check_cap_flush want %lld\n", want_flush_seq);
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+ struct ceph_mds_session *session = mdsc->sessions[mds];
+
+ if (!session)
+ continue;
+ get_session(session);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+ if (!list_empty(&session->s_cap_flushing)) {
+ struct ceph_inode_info *ci =
+ list_entry(session->s_cap_flushing.next,
+ struct ceph_inode_info,
+ i_flushing_item);
+ struct inode *inode = &ci->vfs_inode;
+
+ spin_lock(&inode->i_lock);
+ if (ci->i_cap_flush_seq <= want_flush_seq) {
+ dout("check_cap_flush still flushing %p "
+ "seq %lld <= %lld to mds%d\n", inode,
+ ci->i_cap_flush_seq, want_flush_seq,
+ session->s_mds);
+ ret = 0;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+
+ if (!ret)
+ return ret;
+ mutex_lock(&mdsc->mutex);
+ }
+
+ mutex_unlock(&mdsc->mutex);
+ dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+ return ret;
+}
+
+/*
+ * called under s_mutex
+ */
+static void send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+
+ dout("send_cap_releases mds%d\n", session->s_mds);
+ while (1) {
+ spin_lock(&session->s_cap_lock);
+ if (list_empty(&session->s_cap_releases_done))
+ break;
+ msg = list_first_entry(&session->s_cap_releases_done,
+ struct ceph_msg, list_head);
+ list_del_init(&msg->list_head);
+ spin_unlock(&session->s_cap_lock);
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ ceph_con_send(&session->s_con, msg);
+ }
+ spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * requests
+ */
+
+/*
+ * Create an mds request.
+ */
+struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
+{
+ struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ req->r_started = jiffies;
+ req->r_resend_mds = -1;
+ INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+ req->r_fmode = -1;
+ kref_init(&req->r_kref);
+ INIT_LIST_HEAD(&req->r_wait);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+
+ req->r_op = op;
+ req->r_direct_mode = mode;
+ return req;
+}
+
+/*
+ * return the oldest (lowest tid) request in the request tree, or NULL
+ * if none; __get_oldest_tid returns that request's tid, or 0.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
+{
+ if (RB_EMPTY_ROOT(&mdsc->request_tree))
+ return NULL;
+ return rb_entry(rb_first(&mdsc->request_tree),
+ struct ceph_mds_request, r_node);
+}
+
+static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_request *req = __get_oldest_req(mdsc);
+
+ if (req)
+ return req->r_tid;
+ return 0;
+}
+
+/*
+ * Build a dentry's path. Allocate on heap; caller must kfree. Based
+ * on build_path_from_dentry in fs/cifs/dir.c.
+ *
+ * If @stop_on_nosnap, generate path relative to the first non-snapped
+ * inode.
+ *
+ * Encode hidden .snap dirs as a double /, i.e.
+ * foo/.snap/bar -> foo//bar
+ */
+char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+ int stop_on_nosnap)
+{
+ struct dentry *temp;
+ char *path;
+ int len, pos;
+
+ if (dentry == NULL)
+ return ERR_PTR(-EINVAL);
+
+retry:
+ len = 0;
+ for (temp = dentry; !IS_ROOT(temp);) {
+ struct inode *inode = temp->d_inode;
+ if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
+ len++; /* slash only */
+ else if (stop_on_nosnap && inode &&
+ ceph_snap(inode) == CEPH_NOSNAP)
+ break;
+ else
+ len += 1 + temp->d_name.len;
+ temp = temp->d_parent;
+ if (temp == NULL) {
+ pr_err("build_path_dentry corrupt dentry %p\n", dentry);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+ if (len)
+ len--; /* no leading '/' */
+
+ path = kmalloc(len+1, GFP_NOFS);
+ if (path == NULL)
+ return ERR_PTR(-ENOMEM);
+ pos = len;
+ path[pos] = 0; /* trailing null */
+ for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
+ struct inode *inode = temp->d_inode;
+
+ if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
+ dout("build_path_dentry path+%d: %p SNAPDIR\n",
+ pos, temp);
+ } else if (stop_on_nosnap && inode &&
+ ceph_snap(inode) == CEPH_NOSNAP) {
+ break;
+ } else {
+ pos -= temp->d_name.len;
+ if (pos < 0)
+ break;
+ strncpy(path + pos, temp->d_name.name,
+ temp->d_name.len);
+ dout("build_path_dentry path+%d: %p '%.*s'\n",
+ pos, temp, temp->d_name.len, path + pos);
+ }
+ if (pos)
+ path[--pos] = '/';
+ temp = temp->d_parent;
+ if (temp == NULL) {
+ pr_err("build_path_dentry corrupt dentry\n");
+ kfree(path);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+ if (pos != 0) {
+ pr_err("build_path_dentry did not end path lookup where "
+ "expected, namelen is %d, pos is %d\n", len, pos);
+ /* presumably this is only possible if racing with a
+		   rename of one of the parent directories (we cannot
+ lock the dentries above us to prevent this, but
+ retrying should be harmless) */
+ kfree(path);
+ goto retry;
+ }
+
+ *base = ceph_ino(temp->d_inode);
+ *plen = len;
+ dout("build_path_dentry on %p %d built %llx '%.*s'\n",
+ dentry, atomic_read(&dentry->d_count), *base, len, path);
+ return path;
+}
+
+static int build_dentry_path(struct dentry *dentry,
+ const char **ppath, int *ppathlen, u64 *pino,
+ int *pfreepath)
+{
+ char *path;
+
+ if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
+ *pino = ceph_ino(dentry->d_parent->d_inode);
+ *ppath = dentry->d_name.name;
+ *ppathlen = dentry->d_name.len;
+ return 0;
+ }
+ path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ *ppath = path;
+ *pfreepath = 1;
+ return 0;
+}
+
+static int build_inode_path(struct inode *inode,
+ const char **ppath, int *ppathlen, u64 *pino,
+ int *pfreepath)
+{
+ struct dentry *dentry;
+ char *path;
+
+ if (ceph_snap(inode) == CEPH_NOSNAP) {
+ *pino = ceph_ino(inode);
+ *ppathlen = 0;
+ return 0;
+ }
+ dentry = d_find_alias(inode);
+ path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
+ dput(dentry);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ *ppath = path;
+ *pfreepath = 1;
+ return 0;
+}
+
+/*
+ * request arguments may be specified via an inode *, a dentry *, or
+ * an explicit ino+path.
+ */
+static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
+ const char *rpath, u64 rino,
+ const char **ppath, int *pathlen,
+ u64 *ino, int *freepath)
+{
+ int r = 0;
+
+ if (rinode) {
+ r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+ dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
+ ceph_snap(rinode));
+ } else if (rdentry) {
+ r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
+ dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
+ *ppath);
+ } else if (rpath) {
+ *ino = rino;
+ *ppath = rpath;
+ *pathlen = strlen(rpath);
+ dout(" path %.*s\n", *pathlen, rpath);
+ }
+
+ return r;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ int mds)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_request_head *head;
+ const char *path1 = NULL;
+ const char *path2 = NULL;
+ u64 ino1 = 0, ino2 = 0;
+ int pathlen1 = 0, pathlen2 = 0;
+ int freepath1 = 0, freepath2 = 0;
+ int len;
+ u16 releases;
+ void *p, *end;
+ int ret;
+
+ ret = set_request_path_attr(req->r_inode, req->r_dentry,
+ req->r_path1, req->r_ino1.ino,
+ &path1, &pathlen1, &ino1, &freepath1);
+ if (ret < 0) {
+ msg = ERR_PTR(ret);
+ goto out;
+ }
+
+ ret = set_request_path_attr(NULL, req->r_old_dentry,
+ req->r_path2, req->r_ino2.ino,
+ &path2, &pathlen2, &ino2, &freepath2);
+ if (ret < 0) {
+ msg = ERR_PTR(ret);
+ goto out_free1;
+ }
+
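+	/* the 2*(1 + sizeof(u32) + sizeof(u64)) term leaves room for the
+	 * fixed-size part of the two filepaths that
+	 * ceph_encode_filepath() writes below */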
+ len = sizeof(*head) +
+ pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
+
+ /* calculate (max) length for cap releases */
+ len += sizeof(struct ceph_mds_request_release) *
+ (!!req->r_inode_drop + !!req->r_dentry_drop +
+ !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
+ if (req->r_dentry_drop)
+ len += req->r_dentry->d_name.len;
+ if (req->r_old_dentry_drop)
+ len += req->r_old_dentry->d_name.len;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
+ if (IS_ERR(msg))
+ goto out_free2;
+
+ msg->hdr.tid = cpu_to_le64(req->r_tid);
+
+ head = msg->front.iov_base;
+ p = msg->front.iov_base + sizeof(*head);
+ end = msg->front.iov_base + msg->front.iov_len;
+
+ head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+ head->op = cpu_to_le32(req->r_op);
+ head->caller_uid = cpu_to_le32(current_fsuid());
+ head->caller_gid = cpu_to_le32(current_fsgid());
+ head->args = req->r_args;
+
+ ceph_encode_filepath(&p, end, ino1, path1);
+ ceph_encode_filepath(&p, end, ino2, path2);
+
+ /* cap releases */
+ releases = 0;
+ if (req->r_inode_drop)
+ releases += ceph_encode_inode_release(&p,
+ req->r_inode ? req->r_inode : req->r_dentry->d_inode,
+ mds, req->r_inode_drop, req->r_inode_unless, 0);
+ if (req->r_dentry_drop)
+ releases += ceph_encode_dentry_release(&p, req->r_dentry,
+ mds, req->r_dentry_drop, req->r_dentry_unless);
+ if (req->r_old_dentry_drop)
+ releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
+ mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
+ if (req->r_old_inode_drop)
+ releases += ceph_encode_inode_release(&p,
+ req->r_old_dentry->d_inode,
+ mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
+ head->num_releases = cpu_to_le16(releases);
+
+ BUG_ON(p > end);
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+
+ msg->pages = req->r_pages;
+ msg->nr_pages = req->r_num_pages;
+ msg->hdr.data_len = cpu_to_le32(req->r_data_len);
+ msg->hdr.data_off = cpu_to_le16(0);
+
+out_free2:
+ if (freepath2)
+ kfree((char *)path2);
+out_free1:
+ if (freepath1)
+ kfree((char *)path1);
+out:
+ return msg;
+}
+
+/*
+ * called under mdsc->mutex if error, under no mutex if
+ * success.
+ */
+static void complete_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ if (req->r_callback)
+ req->r_callback(mdsc, req);
+ else
+ complete(&req->r_completion);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static int __prepare_send_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ int mds)
+{
+ struct ceph_mds_request_head *rhead;
+ struct ceph_msg *msg;
+ int flags = 0;
+
+ req->r_mds = mds;
+ req->r_attempts++;
+ dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
+ req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
+
+ if (req->r_request) {
+ ceph_msg_put(req->r_request);
+ req->r_request = NULL;
+ }
+ msg = create_request_message(mdsc, req, mds);
+ if (IS_ERR(msg)) {
+ req->r_reply = ERR_PTR(PTR_ERR(msg));
+ complete_request(mdsc, req);
+		return PTR_ERR(msg);
+ }
+ req->r_request = msg;
+
+ rhead = msg->front.iov_base;
+ rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+ if (req->r_got_unsafe)
+ flags |= CEPH_MDS_FLAG_REPLAY;
+ if (req->r_locked_dir)
+ flags |= CEPH_MDS_FLAG_WANT_DENTRY;
+ rhead->flags = cpu_to_le32(flags);
+ rhead->num_fwd = req->r_num_fwd;
+ rhead->num_retry = req->r_attempts - 1;
+
+ dout(" r_locked_dir = %p\n", req->r_locked_dir);
+
+ if (req->r_target_inode && req->r_got_unsafe)
+ rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+ else
+ rhead->ino = 0;
+ return 0;
+}
+
+/*
+ * send request, or put it on the appropriate wait list.
+ */
+static int __do_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct ceph_mds_session *session = NULL;
+ int mds = -1;
+ int err = -EAGAIN;
+
+ if (req->r_reply)
+ goto out;
+
+ if (req->r_timeout &&
+ time_after_eq(jiffies, req->r_started + req->r_timeout)) {
+ dout("do_request timed out\n");
+ err = -EIO;
+ goto finish;
+ }
+
+ mds = __choose_mds(mdsc, req);
+ if (mds < 0 ||
+ ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ dout("do_request no mds or not active, waiting for map\n");
+ list_add(&req->r_wait, &mdsc->waiting_for_map);
+ goto out;
+ }
+
+ /* get, open session */
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ if (!session) {
+ session = register_session(mdsc, mds);
+ if (IS_ERR(session)) {
+ err = PTR_ERR(session);
+ goto finish;
+ }
+ }
+ dout("do_request mds%d session %p state %s\n", mds, session,
+ session_state_name(session->s_state));
+ if (session->s_state != CEPH_MDS_SESSION_OPEN &&
+ session->s_state != CEPH_MDS_SESSION_HUNG) {
+ if (session->s_state == CEPH_MDS_SESSION_NEW ||
+ session->s_state == CEPH_MDS_SESSION_CLOSING)
+ __open_session(mdsc, session);
+ list_add(&req->r_wait, &session->s_waiting);
+ goto out_session;
+ }
+
+ /* send request */
+ req->r_session = get_session(session);
+ req->r_resend_mds = -1; /* forget any previous mds hint */
+
+ if (req->r_request_started == 0) /* note request start time */
+ req->r_request_started = jiffies;
+
+ err = __prepare_send_request(mdsc, req, mds);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+
+out_session:
+ ceph_put_mds_session(session);
+out:
+ return err;
+
+finish:
+ req->r_reply = ERR_PTR(err);
+ complete_request(mdsc, req);
+ goto out;
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __wake_requests(struct ceph_mds_client *mdsc,
+ struct list_head *head)
+{
+ struct ceph_mds_request *req, *nreq;
+
+ list_for_each_entry_safe(req, nreq, head, r_wait) {
+ list_del_init(&req->r_wait);
+ __do_request(mdsc, req);
+ }
+}
+
+/*
+ * Wake up threads with requests pending for @mds, so that they can
+ * resubmit their requests to a possibly different mds. If @all is set,
+ * also wake up those whose requests have been forwarded to @mds.
+ */
+static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
+{
+ struct ceph_mds_request *req;
+ struct rb_node *p;
+
+ dout("kick_requests mds%d\n", mds);
+ for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_mds_request, r_node);
+ if (req->r_got_unsafe)
+ continue;
+ if (req->r_session &&
+ req->r_session->s_mds == mds) {
+ dout(" kicking tid %llu\n", req->r_tid);
+ put_request_session(req);
+ __do_request(mdsc, req);
+ }
+ }
+}
+
+void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ dout("submit_request on %p\n", req);
+ mutex_lock(&mdsc->mutex);
+ __register_request(mdsc, req, NULL);
+ __do_request(mdsc, req);
+ mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Synchronously perform an mds request, taking care of all of the
+ * session setup, forwarding, and retry details.
+ */
+int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+ struct inode *dir,
+ struct ceph_mds_request *req)
+{
+ int err;
+
+ dout("do_request on %p\n", req);
+
+ /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
+ if (req->r_inode)
+ ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
+ if (req->r_locked_dir)
+ ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
+ if (req->r_old_dentry)
+ ceph_get_cap_refs(
+ ceph_inode(req->r_old_dentry->d_parent->d_inode),
+ CEPH_CAP_PIN);
+
+ /* issue */
+ mutex_lock(&mdsc->mutex);
+ __register_request(mdsc, req, dir);
+ __do_request(mdsc, req);
+
+ /* wait */
+ if (!req->r_reply) {
+ mutex_unlock(&mdsc->mutex);
+ if (req->r_timeout) {
+ err = (long)wait_for_completion_interruptible_timeout(
+ &req->r_completion, req->r_timeout);
+ if (err == 0)
+ req->r_reply = ERR_PTR(-EIO);
+ else if (err < 0)
+ req->r_reply = ERR_PTR(err);
+ } else {
+ err = wait_for_completion_interruptible(
+ &req->r_completion);
+ if (err)
+ req->r_reply = ERR_PTR(err);
+ }
+ mutex_lock(&mdsc->mutex);
+ }
+
+ if (IS_ERR(req->r_reply)) {
+ err = PTR_ERR(req->r_reply);
+ req->r_reply = NULL;
+
+ if (err == -ERESTARTSYS) {
+ /* aborted */
+ req->r_aborted = true;
+
+ if (req->r_locked_dir &&
+ (req->r_op & CEPH_MDS_OP_WRITE)) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_locked_dir);
+
+ dout("aborted, clearing I_COMPLETE on %p\n",
+ req->r_locked_dir);
+ spin_lock(&req->r_locked_dir->i_lock);
+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ ci->i_release_count++;
+ spin_unlock(&req->r_locked_dir->i_lock);
+ }
+ } else {
+ /* clean up this request */
+ __unregister_request(mdsc, req);
+ if (!list_empty(&req->r_unsafe_item))
+ list_del_init(&req->r_unsafe_item);
+ complete(&req->r_safe_completion);
+ }
+ } else if (req->r_err) {
+ err = req->r_err;
+ } else {
+ err = le32_to_cpu(req->r_reply_info.head->result);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ dout("do_request %p done, result %d\n", req, err);
+ return err;
+}
+
+/*
+ * Handle mds reply.
+ *
+ * We take the session mutex and parse and process the reply immediately.
+ * This preserves the logical ordering of replies, capabilities, etc., sent
+ * by the MDS as they are applied to our local cache.
+ */
+static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
+{
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_mds_reply_head *head = msg->front.iov_base;
+ struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
+ u64 tid;
+ int err, result;
+ int mds = session->s_mds;
+
+ if (msg->front.iov_len < sizeof(*head)) {
+ pr_err("mdsc_handle_reply got corrupt (short) reply\n");
+ ceph_msg_dump(msg);
+ return;
+ }
+
+ /* get request, session */
+ tid = le64_to_cpu(msg->hdr.tid);
+ mutex_lock(&mdsc->mutex);
+ req = __lookup_request(mdsc, tid);
+ if (!req) {
+ dout("handle_reply on unknown tid %llu\n", tid);
+ mutex_unlock(&mdsc->mutex);
+ return;
+ }
+ dout("handle_reply %p\n", req);
+
+ /* correct session? */
+ if (req->r_session != session) {
+ pr_err("mdsc_handle_reply got %llu on session mds%d"
+ " not mds%d\n", tid, session->s_mds,
+ req->r_session ? req->r_session->s_mds : -1);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+
+ /* dup? */
+ if ((req->r_got_unsafe && !head->safe) ||
+ (req->r_got_safe && head->safe)) {
+ pr_warning("got a dup %s reply on %llu from mds%d\n",
+ head->safe ? "safe" : "unsafe", tid, mds);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+
+ result = le32_to_cpu(head->result);
+
+ /*
+ * Tolerate 2 consecutive ESTALEs from the same mds.
+ * FIXME: we should be looking at the cap migrate_seq.
+ */
+ if (result == -ESTALE) {
+ req->r_direct_mode = USE_AUTH_MDS;
+ req->r_num_stale++;
+ if (req->r_num_stale <= 2) {
+ __do_request(mdsc, req);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+ } else {
+ req->r_num_stale = 0;
+ }
+
+ if (head->safe) {
+ req->r_got_safe = true;
+ __unregister_request(mdsc, req);
+ complete(&req->r_safe_completion);
+
+ if (req->r_got_unsafe) {
+ /*
+ * We already handled the unsafe response, now do the
+ * cleanup. No need to examine the response; the MDS
+ * doesn't include any result info in the safe
+ * response. And even if it did, there is nothing
+ * useful we could do with a revised return value.
+ */
+ dout("got safe reply %llu, mds%d\n", tid, mds);
+ list_del_init(&req->r_unsafe_item);
+
+ /* last unsafe request during umount? */
+ if (mdsc->stopping && !__get_oldest_req(mdsc))
+ complete(&mdsc->safe_umount_waiters);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+ }
+
+ BUG_ON(req->r_reply);
+
+ if (!head->safe) {
+ req->r_got_unsafe = true;
+ list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+ }
+
+ dout("handle_reply tid %lld result %d\n", tid, result);
+ rinfo = &req->r_reply_info;
+ err = parse_reply_info(msg, rinfo);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+ if (err < 0) {
+ pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+ ceph_msg_dump(msg);
+ goto out_err;
+ }
+
+ /* snap trace */
+ if (rinfo->snapblob_len) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, rinfo->snapblob,
+ rinfo->snapblob + rinfo->snapblob_len,
+ le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
+ downgrade_write(&mdsc->snap_rwsem);
+ } else {
+ down_read(&mdsc->snap_rwsem);
+ }
+
+ /* insert trace into our cache */
+ err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
+ if (err == 0) {
+ if (result == 0 && rinfo->dir_nr)
+ ceph_readdir_prepopulate(req, req->r_session);
+ ceph_unreserve_caps(&req->r_caps_reservation);
+ }
+
+ up_read(&mdsc->snap_rwsem);
+out_err:
+ if (err) {
+ req->r_err = err;
+ } else {
+ req->r_reply = msg;
+ ceph_msg_get(msg);
+ }
+
+ add_cap_releases(mdsc, req->r_session, -1);
+ mutex_unlock(&session->s_mutex);
+
+ /* kick calling process */
+ complete_request(mdsc, req);
+out:
+ ceph_mdsc_put_request(req);
+ return;
+}
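+
+/*
+ * Note on the reply flow above (a sketch, not a wire spec): for
+ * mutating ops the MDS may send two replies --
+ *
+ *   client                 mds
+ *     |----- request ------>|
+ *     |<---- unsafe reply --|  applied in mds memory; caller unblocks
+ *     |<---- safe reply ----|  committed durably; r_safe_completion
+ *                              fires and the request is unregistered
+ *
+ * which is why handle_reply() tracks r_got_unsafe/r_got_safe and keeps
+ * unsafe requests on the session's s_unsafe list until the safe reply
+ * arrives.
+ */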
+
+
+
+/*
+ * handle mds notification that our request has been forwarded.
+ */
+static void handle_forward(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct ceph_mds_request *req;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
+ u32 next_mds;
+ u32 fwd_seq;
+ int err = -EINVAL;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ next_mds = ceph_decode_32(&p);
+ fwd_seq = ceph_decode_32(&p);
+
+ mutex_lock(&mdsc->mutex);
+ req = __lookup_request(mdsc, tid);
+ if (!req) {
+ dout("forward %llu to mds%d - req dne\n", tid, next_mds);
+ goto out; /* dup reply? */
+ }
+
+ if (fwd_seq <= req->r_num_fwd) {
+		dout("forward %llu to mds%d - old seq %d <= %d\n",
+		     tid, next_mds, fwd_seq, req->r_num_fwd);
+ } else {
+ /* resend. forward race not possible; mds would drop */
+ dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
+ req->r_num_fwd = fwd_seq;
+ req->r_resend_mds = next_mds;
+ put_request_session(req);
+ __do_request(mdsc, req);
+ }
+ ceph_mdsc_put_request(req);
+out:
+ mutex_unlock(&mdsc->mutex);
+ return;
+
+bad:
+ pr_err("mdsc_handle_forward decode error err=%d\n", err);
+}
+
+/*
+ * handle a mds session control message
+ */
+static void handle_session(struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ u32 op;
+ u64 seq;
+ int mds = session->s_mds;
+ struct ceph_mds_session_head *h = msg->front.iov_base;
+ int wake = 0;
+
+ /* decode */
+ if (msg->front.iov_len != sizeof(*h))
+ goto bad;
+ op = le32_to_cpu(h->op);
+ seq = le64_to_cpu(h->seq);
+
+ mutex_lock(&mdsc->mutex);
+ if (op == CEPH_SESSION_CLOSE)
+ __unregister_session(mdsc, session);
+ /* FIXME: this ttl calculation is generous */
+ session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+
+ dout("handle_session mds%d %s %p state %s seq %llu\n",
+ mds, ceph_session_op_name(op), session,
+ session_state_name(session->s_state), seq);
+
+ if (session->s_state == CEPH_MDS_SESSION_HUNG) {
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ pr_info("mds%d came back\n", session->s_mds);
+ }
+
+ switch (op) {
+ case CEPH_SESSION_OPEN:
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ renewed_caps(mdsc, session, 0);
+ wake = 1;
+ if (mdsc->stopping)
+ __close_session(mdsc, session);
+ break;
+
+ case CEPH_SESSION_RENEWCAPS:
+ if (session->s_renew_seq == seq)
+ renewed_caps(mdsc, session, 1);
+ break;
+
+ case CEPH_SESSION_CLOSE:
+ remove_session_caps(session);
+ wake = 1; /* for good measure */
+ complete(&mdsc->session_close_waiters);
+ kick_requests(mdsc, mds, 0); /* cur only */
+ break;
+
+ case CEPH_SESSION_STALE:
+ pr_info("mds%d caps went stale, renewing\n",
+ session->s_mds);
+ spin_lock(&session->s_cap_lock);
+ session->s_cap_gen++;
+ session->s_cap_ttl = 0;
+ spin_unlock(&session->s_cap_lock);
+ send_renew_caps(mdsc, session);
+ break;
+
+ case CEPH_SESSION_RECALL_STATE:
+ trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
+ break;
+
+ default:
+ pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
+ WARN_ON(1);
+ }
+
+ mutex_unlock(&session->s_mutex);
+ if (wake) {
+ mutex_lock(&mdsc->mutex);
+ __wake_requests(mdsc, &session->s_waiting);
+ mutex_unlock(&mdsc->mutex);
+ }
+ return;
+
+bad:
+ pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
+ (int)msg->front.iov_len);
+ ceph_msg_dump(msg);
+ return;
+}
+
+
+/*
+ * called under session->s_mutex.
+ */
+static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_request *req, *nreq;
+ int err;
+
+ dout("replay_unsafe_requests mds%d\n", session->s_mds);
+
+ mutex_lock(&mdsc->mutex);
+ list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
+ err = __prepare_send_request(mdsc, req, session->s_mds);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+/*
+ * Encode information about a cap for a reconnect with the MDS.
+ */
+static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ struct ceph_mds_cap_reconnect rec;
+ struct ceph_inode_info *ci;
+ struct ceph_pagelist *pagelist = arg;
+ char *path;
+ int pathlen, err;
+ u64 pathbase;
+ struct dentry *dentry;
+
+ ci = cap->ci;
+
+ dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
+ inode, ceph_vinop(inode), cap, cap->cap_id,
+ ceph_cap_string(cap->issued));
+ err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
+ if (err)
+ return err;
+
+ dentry = d_find_alias(inode);
+ if (dentry) {
+ path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ BUG_ON(err);
+ }
+ } else {
+ path = NULL;
+ pathlen = 0;
+ }
+ err = ceph_pagelist_encode_string(pagelist, path, pathlen);
+ if (err)
+ goto out;
+
+ spin_lock(&inode->i_lock);
+ cap->seq = 0; /* reset cap seq */
+ cap->issue_seq = 0; /* and issue_seq */
+ rec.cap_id = cpu_to_le64(cap->cap_id);
+ rec.pathbase = cpu_to_le64(pathbase);
+ rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+ rec.issued = cpu_to_le32(cap->issued);
+ rec.size = cpu_to_le64(inode->i_size);
+ ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
+ ceph_encode_timespec(&rec.atime, &inode->i_atime);
+ rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+ spin_unlock(&inode->i_lock);
+
+ err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
+
+out:
+ kfree(path);
+ dput(dentry);
+ return err;
+}
+
+
+/*
+ * If an MDS fails and recovers, clients need to reconnect in order to
+ * reestablish shared state. This includes all caps issued through
+ * this session _and_ the snap_realm hierarchy. Because it's not
+ * clear which snap realms the mds cares about, we send everything we
+ * know about; that ensures we'll then get any new info the
+ * recovering MDS might have.
+ *
+ * This is a relatively heavyweight operation, but it's rare.
+ *
+ * called with mdsc->mutex held.
+ */
+static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
+{
+ struct ceph_mds_session *session = NULL;
+ struct ceph_msg *reply;
+ struct rb_node *p;
+ int err;
+ struct ceph_pagelist *pagelist;
+
+ pr_info("reconnect to recovering mds%d\n", mds);
+
+ pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ if (!pagelist)
+ goto fail_nopagelist;
+ ceph_pagelist_init(pagelist);
+
+ reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
+ if (IS_ERR(reply)) {
+ err = PTR_ERR(reply);
+ goto fail_nomsg;
+ }
+
+ /* find session */
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex); /* drop lock for duration */
+
+ if (session) {
+ mutex_lock(&session->s_mutex);
+
+ session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+ session->s_seq = 0;
+
+ ceph_con_open(&session->s_con,
+ ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+ /* replay unsafe requests */
+ replay_unsafe_requests(mdsc, session);
+ } else {
+ dout("no session for mds%d, will send short reconnect\n",
+ mds);
+ }
+
+ down_read(&mdsc->snap_rwsem);
+
+ if (!session)
+ goto send;
+ dout("session %p state %s\n", session,
+ session_state_name(session->s_state));
+
+ /* traverse this session's caps */
+ err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
+ if (err)
+ goto fail;
+	err = iterate_session_caps(session, encode_caps_cb, pagelist);
+	if (err < 0)
+		goto fail;	/* don't leak the reply msg or pagelist */
+
+ /*
+ * snaprealms. we provide mds with the ino, seq (version), and
+ * parent for all of our realms. If the mds has any newer info,
+ * it will tell us.
+ */
+ for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
+ struct ceph_snap_realm *realm =
+ rb_entry(p, struct ceph_snap_realm, node);
+ struct ceph_mds_snaprealm_reconnect sr_rec;
+
+ dout(" adding snap realm %llx seq %lld parent %llx\n",
+ realm->ino, realm->seq, realm->parent_ino);
+ sr_rec.ino = cpu_to_le64(realm->ino);
+ sr_rec.seq = cpu_to_le64(realm->seq);
+ sr_rec.parent = cpu_to_le64(realm->parent_ino);
+ err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
+ if (err)
+ goto fail;
+ }
+
+send:
+ reply->pagelist = pagelist;
+ reply->hdr.data_len = cpu_to_le32(pagelist->length);
+ reply->nr_pages = calc_pages_for(0, pagelist->length);
+ ceph_con_send(&session->s_con, reply);
+
+ if (session) {
+ session->s_state = CEPH_MDS_SESSION_OPEN;
+ __wake_requests(mdsc, &session->s_waiting);
+ }
+
+out:
+ up_read(&mdsc->snap_rwsem);
+ if (session) {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ mutex_lock(&mdsc->mutex);
+ return;
+
+fail:
+ ceph_msg_put(reply);
+fail_nomsg:
+ ceph_pagelist_release(pagelist);
+ kfree(pagelist);
+fail_nopagelist:
+ pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
+ goto out;
+}
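+
+/*
+ * The reconnect payload built above is, roughly (a sketch of the
+ * encoding done via the pagelist helpers, not an authoritative wire
+ * spec):
+ *
+ *	le32  cap count
+ *	for each cap:   le64 ino, string path (le32 len + bytes),
+ *	                struct ceph_mds_cap_reconnect
+ *	for each realm: struct ceph_mds_snaprealm_reconnect
+ */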
+
+
+/*
+ * compare old and new mdsmaps, kicking requests
+ * and closing out old connections as necessary
+ *
+ * called under mdsc->mutex.
+ */
+static void check_new_map(struct ceph_mds_client *mdsc,
+ struct ceph_mdsmap *newmap,
+ struct ceph_mdsmap *oldmap)
+{
+ int i;
+ int oldstate, newstate;
+ struct ceph_mds_session *s;
+
+ dout("check_new_map new %u old %u\n",
+ newmap->m_epoch, oldmap->m_epoch);
+
+ for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
+ if (mdsc->sessions[i] == NULL)
+ continue;
+ s = mdsc->sessions[i];
+ oldstate = ceph_mdsmap_get_state(oldmap, i);
+ newstate = ceph_mdsmap_get_state(newmap, i);
+
+ dout("check_new_map mds%d state %s -> %s (session %s)\n",
+ i, ceph_mds_state_name(oldstate),
+ ceph_mds_state_name(newstate),
+ session_state_name(s->s_state));
+
+ if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
+ ceph_mdsmap_get_addr(newmap, i),
+ sizeof(struct ceph_entity_addr))) {
+ if (s->s_state == CEPH_MDS_SESSION_OPENING) {
+ /* the session never opened, just close it
+ * out now */
+ __wake_requests(mdsc, &s->s_waiting);
+ __unregister_session(mdsc, s);
+ } else {
+ /* just close it */
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_lock(&mdsc->mutex);
+ ceph_con_close(&s->s_con);
+ mutex_unlock(&s->s_mutex);
+ s->s_state = CEPH_MDS_SESSION_RESTARTING;
+ }
+
+ /* kick any requests waiting on the recovering mds */
+ kick_requests(mdsc, i, 1);
+ } else if (oldstate == newstate) {
+ continue; /* nothing new with this mds */
+ }
+
+ /*
+ * send reconnect?
+ */
+ if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
+ newstate >= CEPH_MDS_STATE_RECONNECT)
+ send_mds_reconnect(mdsc, i);
+
+ /*
+ * kick requests on any mds that has gone active.
+ *
+ * kick requests on cur or forwarder: we may have sent
+ * the request to mds1, mds1 told us it forwarded it
+ * to mds2, but then we learn mds1 failed and can't be
+ * sure it successfully forwarded our request before
+ * it died.
+ */
+ if (oldstate < CEPH_MDS_STATE_ACTIVE &&
+ newstate >= CEPH_MDS_STATE_ACTIVE) {
+ pr_info("mds%d reconnect completed\n", s->s_mds);
+ kick_requests(mdsc, i, 1);
+ ceph_kick_flushing_caps(mdsc, s);
+ wake_up_session_caps(s, 1);
+ }
+ }
+}
+
+
+
+/*
+ * leases
+ */
+
+/*
+ * caller must hold session s_mutex, dentry->d_lock
+ */
+void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ ceph_put_mds_session(di->lease_session);
+ di->lease_session = NULL;
+}
+
+static void handle_lease(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct super_block *sb = mdsc->client->sb;
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct dentry *parent, *dentry;
+ struct ceph_dentry_info *di;
+ int mds = session->s_mds;
+ struct ceph_mds_lease *h = msg->front.iov_base;
+ struct ceph_vino vino;
+ int mask;
+ struct qstr dname;
+ int release = 0;
+
+ dout("handle_lease from mds%d\n", mds);
+
+ /* decode */
+ if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
+ goto bad;
+ vino.ino = le64_to_cpu(h->ino);
+ vino.snap = CEPH_NOSNAP;
+ mask = le16_to_cpu(h->mask);
+ dname.name = (void *)h + sizeof(*h) + sizeof(u32);
+ dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
+ if (dname.len != get_unaligned_le32(h+1))
+ goto bad;
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+
+ /* lookup inode */
+ inode = ceph_find_inode(sb, vino);
+ dout("handle_lease '%s', mask %d, ino %llx %p\n",
+ ceph_lease_op_name(h->action), mask, vino.ino, inode);
+ if (inode == NULL) {
+ dout("handle_lease no inode %llx\n", vino.ino);
+ goto release;
+ }
+ ci = ceph_inode(inode);
+
+ /* dentry */
+ parent = d_find_alias(inode);
+ if (!parent) {
+ dout("no parent dentry on inode %p\n", inode);
+ WARN_ON(1);
+ goto release; /* hrm... */
+ }
+ dname.hash = full_name_hash(dname.name, dname.len);
+ dentry = d_lookup(parent, &dname);
+ dput(parent);
+ if (!dentry)
+ goto release;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ switch (h->action) {
+ case CEPH_MDS_LEASE_REVOKE:
+ if (di && di->lease_session == session) {
+ h->seq = cpu_to_le32(di->lease_seq);
+ __ceph_mdsc_drop_dentry_lease(dentry);
+ }
+ release = 1;
+ break;
+
+ case CEPH_MDS_LEASE_RENEW:
+ if (di && di->lease_session == session &&
+ di->lease_gen == session->s_cap_gen &&
+ di->lease_renew_from &&
+ di->lease_renew_after == 0) {
+ unsigned long duration =
+ le32_to_cpu(h->duration_ms) * HZ / 1000;
+
+ di->lease_seq = le32_to_cpu(h->seq);
+ dentry->d_time = di->lease_renew_from + duration;
+ di->lease_renew_after = di->lease_renew_from +
+ (duration >> 1);
+ di->lease_renew_from = 0;
+ }
+ break;
+ }
+ spin_unlock(&dentry->d_lock);
+ dput(dentry);
+
+ if (!release)
+ goto out;
+
+release:
+ /* let's just reuse the same message */
+ h->action = CEPH_MDS_LEASE_REVOKE_ACK;
+ ceph_msg_get(msg);
+ ceph_con_send(&session->s_con, msg);
+
+out:
+ iput(inode);
+ mutex_unlock(&session->s_mutex);
+ return;
+
+bad:
+ pr_err("corrupt lease message\n");
+ ceph_msg_dump(msg);
+}
+
+void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+ struct inode *inode,
+ struct dentry *dentry, char action,
+ u32 seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_lease *lease;
+ int len = sizeof(*lease) + sizeof(u32);
+ int dnamelen = 0;
+
+ dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
+ inode, dentry, ceph_lease_op_name(action), session->s_mds);
+ dnamelen = dentry->d_name.len;
+ len += dnamelen;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
+ if (IS_ERR(msg))
+ return;
+ lease = msg->front.iov_base;
+ lease->action = action;
+ lease->mask = cpu_to_le16(CEPH_LOCK_DN);
+ lease->ino = cpu_to_le64(ceph_vino(inode).ino);
+ lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
+ lease->seq = cpu_to_le32(seq);
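+	/* the lease struct is followed by a le32 name length, then the name */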
+ put_unaligned_le32(dnamelen, lease + 1);
+ memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
+
+ /*
+ * if this is a preemptive lease RELEASE, no need to
+ * flush request stream, since the actual request will
+ * soon follow.
+ */
+ msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
+
+ ceph_con_send(&session->s_con, msg);
+}
+
+/*
+ * Preemptively release a lease we expect to invalidate anyway.
+ * Both @inode and @dentry are required; only CEPH_LOCK_DN is supported.
+ */
+void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
+ struct dentry *dentry, int mask)
+{
+ struct ceph_dentry_info *di;
+ struct ceph_mds_session *session;
+ u32 seq;
+
+ BUG_ON(inode == NULL);
+ BUG_ON(dentry == NULL);
+ BUG_ON(mask != CEPH_LOCK_DN);
+
+ /* is dentry lease valid? */
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (!di || !di->lease_session ||
+ di->lease_session->s_mds < 0 ||
+ di->lease_gen != di->lease_session->s_cap_gen ||
+ !time_before(jiffies, dentry->d_time)) {
+ dout("lease_release inode %p dentry %p -- "
+ "no lease on %d\n",
+ inode, dentry, mask);
+ spin_unlock(&dentry->d_lock);
+ return;
+ }
+
+ /* we do have a lease on this dentry; note mds and seq */
+ session = ceph_get_mds_session(di->lease_session);
+ seq = di->lease_seq;
+ __ceph_mdsc_drop_dentry_lease(dentry);
+ spin_unlock(&dentry->d_lock);
+
+ dout("lease_release inode %p dentry %p mask %d to mds%d\n",
+ inode, dentry, mask, session->s_mds);
+ ceph_mdsc_lease_send_msg(session, inode, dentry,
+ CEPH_MDS_LEASE_RELEASE, seq);
+ ceph_put_mds_session(session);
+}
+
+/*
+ * drop all leases (and dentry refs) in preparation for umount
+ */
+static void drop_leases(struct ceph_mds_client *mdsc)
+{
+ int i;
+
+ dout("drop_leases\n");
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+ if (!s)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_unlock(&s->s_mutex);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
+
+
+/*
+ * delayed work -- periodically trim expired leases, renew caps with mds
+ */
+static void schedule_delayed(struct ceph_mds_client *mdsc)
+{
+	int delay = 5;						/* seconds */
+	unsigned long hz = round_jiffies_relative(HZ * delay);	/* jiffies */
+	schedule_delayed_work(&mdsc->delayed_work, hz);
+}
+
+static void delayed_work(struct work_struct *work)
+{
+ int i;
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, delayed_work.work);
+ int renew_interval;
+ int renew_caps;
+
+ dout("mdsc delayed_work\n");
+ ceph_check_delayed_caps(mdsc);
+
+ mutex_lock(&mdsc->mutex);
+ renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
+ renew_caps = time_after_eq(jiffies, HZ*renew_interval +
+ mdsc->last_renew_caps);
+ if (renew_caps)
+ mdsc->last_renew_caps = jiffies;
+
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
+ if (s == NULL)
+ continue;
+ if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+ dout("resending session close request for mds%d\n",
+ s->s_mds);
+ request_close_session(mdsc, s);
+ ceph_put_mds_session(s);
+ continue;
+ }
+ if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+ if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+ s->s_state = CEPH_MDS_SESSION_HUNG;
+ pr_info("mds%d hung\n", s->s_mds);
+ }
+ }
+ if (s->s_state < CEPH_MDS_SESSION_OPEN) {
+ /* this mds is failed or recovering, just wait */
+ ceph_put_mds_session(s);
+ continue;
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ if (renew_caps)
+ send_renew_caps(mdsc, s);
+ else
+ ceph_con_keepalive(&s->s_con);
+ add_cap_releases(mdsc, s, -1);
+ send_cap_releases(mdsc, s);
+ mutex_unlock(&s->s_mutex);
+ ceph_put_mds_session(s);
+
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ schedule_delayed(mdsc);
+}
+
+
+int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
+{
+ mdsc->client = client;
+ mutex_init(&mdsc->mutex);
+	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+	if (mdsc->mdsmap == NULL)
+		return -ENOMEM;
+ init_completion(&mdsc->safe_umount_waiters);
+ init_completion(&mdsc->session_close_waiters);
+ INIT_LIST_HEAD(&mdsc->waiting_for_map);
+ mdsc->sessions = NULL;
+ mdsc->max_sessions = 0;
+ mdsc->stopping = 0;
+ init_rwsem(&mdsc->snap_rwsem);
+ mdsc->snap_realms = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->snap_empty);
+ spin_lock_init(&mdsc->snap_empty_lock);
+ mdsc->last_tid = 0;
+ mdsc->request_tree = RB_ROOT;
+ INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
+ mdsc->last_renew_caps = jiffies;
+ INIT_LIST_HEAD(&mdsc->cap_delay_list);
+ spin_lock_init(&mdsc->cap_delay_lock);
+ INIT_LIST_HEAD(&mdsc->snap_flush_list);
+ spin_lock_init(&mdsc->snap_flush_lock);
+ mdsc->cap_flush_seq = 0;
+ INIT_LIST_HEAD(&mdsc->cap_dirty);
+ mdsc->num_cap_flushing = 0;
+ spin_lock_init(&mdsc->cap_dirty_lock);
+ init_waitqueue_head(&mdsc->cap_flushing_wq);
+ spin_lock_init(&mdsc->dentry_lru_lock);
+ INIT_LIST_HEAD(&mdsc->dentry_lru);
+ return 0;
+}
+
+/*
+ * Wait for safe replies on open mds requests. If we time out, drop
+ * all requests from the tree to avoid dangling dentry refs.
+ */
+static void wait_requests(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_request *req;
+ struct ceph_client *client = mdsc->client;
+
+ mutex_lock(&mdsc->mutex);
+ if (__get_oldest_req(mdsc)) {
+ mutex_unlock(&mdsc->mutex);
+
+ dout("wait_requests waiting for requests\n");
+ wait_for_completion_timeout(&mdsc->safe_umount_waiters,
+ client->mount_args->mount_timeout * HZ);
+
+ /* tear down remaining requests */
+ mutex_lock(&mdsc->mutex);
+ while ((req = __get_oldest_req(mdsc))) {
+ dout("wait_requests timed out on tid %llu\n",
+ req->r_tid);
+ __unregister_request(mdsc, req);
+ }
+ }
+ mutex_unlock(&mdsc->mutex);
+ dout("wait_requests done\n");
+}
+
+/*
+ * called before mount is ro, and before dentries are torn down.
+ * (hmm, does this still race with new lookups?)
+ */
+void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
+{
+ dout("pre_umount\n");
+ mdsc->stopping = 1;
+
+ drop_leases(mdsc);
+ ceph_flush_dirty_caps(mdsc);
+ wait_requests(mdsc);
+}
+
+/*
+ * wait for all write mds requests to flush.
+ */
+static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
+{
+ struct ceph_mds_request *req = NULL, *nextreq;
+ struct rb_node *n;
+
+ mutex_lock(&mdsc->mutex);
+ dout("wait_unsafe_requests want %lld\n", want_tid);
+restart:
+ req = __get_oldest_req(mdsc);
+ while (req && req->r_tid <= want_tid) {
+ /* find next request */
+ n = rb_next(&req->r_node);
+ if (n)
+ nextreq = rb_entry(n, struct ceph_mds_request, r_node);
+ else
+ nextreq = NULL;
+ if ((req->r_op & CEPH_MDS_OP_WRITE)) {
+ /* write op */
+ ceph_mdsc_get_request(req);
+ if (nextreq)
+ ceph_mdsc_get_request(nextreq);
+ mutex_unlock(&mdsc->mutex);
+ dout("wait_unsafe_requests wait on %llu (want %llu)\n",
+ req->r_tid, want_tid);
+ wait_for_completion(&req->r_safe_completion);
+ mutex_lock(&mdsc->mutex);
+ ceph_mdsc_put_request(req);
+ if (!nextreq)
+ break; /* next dne before, so we're done! */
+ if (RB_EMPTY_NODE(&nextreq->r_node)) {
+ /* next request was removed from tree */
+ ceph_mdsc_put_request(nextreq);
+ goto restart;
+ }
+ ceph_mdsc_put_request(nextreq); /* won't go away */
+ }
+ req = nextreq;
+ }
+ mutex_unlock(&mdsc->mutex);
+ dout("wait_unsafe_requests done\n");
+}
+
+void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
+{
+ u64 want_tid, want_flush;
+
+ dout("sync\n");
+ mutex_lock(&mdsc->mutex);
+ want_tid = mdsc->last_tid;
+ want_flush = mdsc->cap_flush_seq;
+ mutex_unlock(&mdsc->mutex);
+ dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
+
+ ceph_flush_dirty_caps(mdsc);
+
+ wait_unsafe_requests(mdsc, want_tid);
+ wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
+}
+
+
+/*
+ * called after sb is ro.
+ */
+void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *session;
+ int i;
+ int n;
+ struct ceph_client *client = mdsc->client;
+ unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
+
+ dout("close_sessions\n");
+
+ mutex_lock(&mdsc->mutex);
+
+ /* close sessions */
+ started = jiffies;
+ while (time_before(jiffies, started + timeout)) {
+ dout("closing sessions\n");
+ n = 0;
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ session = __ceph_lookup_mds_session(mdsc, i);
+ if (!session)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ __close_session(mdsc, session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ n++;
+ }
+ if (n == 0)
+ break;
+
+ if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
+ break;
+
+ dout("waiting for sessions to close\n");
+ mutex_unlock(&mdsc->mutex);
+ wait_for_completion_timeout(&mdsc->session_close_waiters,
+ timeout);
+ mutex_lock(&mdsc->mutex);
+ }
+
+ /* tear down remaining sessions */
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ if (mdsc->sessions[i]) {
+ session = get_session(mdsc->sessions[i]);
+ __unregister_session(mdsc, session);
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ remove_session_caps(session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ }
+ }
+
+ WARN_ON(!list_empty(&mdsc->cap_delay_list));
+
+ mutex_unlock(&mdsc->mutex);
+
+ ceph_cleanup_empty_realms(mdsc);
+
+ cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+
+ dout("stopped\n");
+}
+
+void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
+{
+ dout("stop\n");
+ cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+ if (mdsc->mdsmap)
+ ceph_mdsmap_destroy(mdsc->mdsmap);
+ kfree(mdsc->sessions);
+}
+
+
+/*
+ * handle mds map update.
+ */
+void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+ u32 epoch;
+ u32 maplen;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ struct ceph_mdsmap *newmap, *oldmap;
+ struct ceph_fsid fsid;
+ int err = -EINVAL;
+
+ ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ if (ceph_check_fsid(mdsc->client, &fsid) < 0)
+ return;
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
+
+ /* do we need it? */
+ ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
+ mutex_lock(&mdsc->mutex);
+ if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
+ dout("handle_map epoch %u <= our %u\n",
+ epoch, mdsc->mdsmap->m_epoch);
+ mutex_unlock(&mdsc->mutex);
+ return;
+ }
+
+ newmap = ceph_mdsmap_decode(&p, end);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad_unlock;
+ }
+
+ /* swap into place */
+ if (mdsc->mdsmap) {
+ oldmap = mdsc->mdsmap;
+ mdsc->mdsmap = newmap;
+ check_new_map(mdsc, newmap, oldmap);
+ ceph_mdsmap_destroy(oldmap);
+ } else {
+ mdsc->mdsmap = newmap; /* first mds map */
+ }
+ mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
+
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+
+ mutex_unlock(&mdsc->mutex);
+ schedule_delayed(mdsc);
+ return;
+
+bad_unlock:
+ mutex_unlock(&mdsc->mutex);
+bad:
+ pr_err("error decoding mdsmap %d\n", err);
+ return;
+}
+
+static struct ceph_connection *con_get(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+
+ if (get_session(s)) {
+ dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
+ return con;
+ }
+ dout("mdsc con_get %p FAIL\n", s);
+ return NULL;
+}
+
+static void con_put(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+
+ ceph_put_mds_session(s);
+ dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
+}
+
+/*
+ * if the client is unresponsive for long enough, the mds will kill
+ * the session entirely.
+ */
+static void peer_reset(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+
+ pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
+ s->s_mds);
+}
+
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ mutex_lock(&mdsc->mutex);
+ if (__verify_registered_session(mdsc, s) < 0) {
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ switch (type) {
+ case CEPH_MSG_MDS_MAP:
+ ceph_mdsc_handle_map(mdsc, msg);
+ break;
+ case CEPH_MSG_CLIENT_SESSION:
+ handle_session(s, msg);
+ break;
+ case CEPH_MSG_CLIENT_REPLY:
+ handle_reply(s, msg);
+ break;
+ case CEPH_MSG_CLIENT_REQUEST_FORWARD:
+ handle_forward(mdsc, s, msg);
+ break;
+ case CEPH_MSG_CLIENT_CAPS:
+ ceph_handle_caps(s, msg);
+ break;
+ case CEPH_MSG_CLIENT_SNAP:
+ ceph_handle_snap(mdsc, s, msg);
+ break;
+ case CEPH_MSG_CLIENT_LEASE:
+ handle_lease(mdsc, s, msg);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+out:
+ ceph_msg_put(msg);
+}
+
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+ void **buf, int *len, int *proto,
+ void **reply_buf, int *reply_len, int force_new)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_auth_client *ac = mdsc->client->monc.auth;
+ int ret = 0;
+
+ if (force_new && s->s_authorizer) {
+ ac->ops->destroy_authorizer(ac, s->s_authorizer);
+ s->s_authorizer = NULL;
+ }
+ if (s->s_authorizer == NULL) {
+ if (ac->ops->create_authorizer) {
+ ret = ac->ops->create_authorizer(
+ ac, CEPH_ENTITY_TYPE_MDS,
+ &s->s_authorizer,
+ &s->s_authorizer_buf,
+ &s->s_authorizer_buf_len,
+ &s->s_authorizer_reply_buf,
+ &s->s_authorizer_reply_buf_len);
+ if (ret)
+ return ret;
+ }
+ }
+
+ *proto = ac->protocol;
+ *buf = s->s_authorizer_buf;
+ *len = s->s_authorizer_buf_len;
+ *reply_buf = s->s_authorizer_reply_buf;
+ *reply_len = s->s_authorizer_reply_buf_len;
+ return 0;
+}
+
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_auth_client *ac = mdsc->client->monc.auth;
+
+ return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_auth_client *ac = mdsc->client->monc.auth;
+
+ if (ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
+
+ return ceph_monc_validate_auth(&mdsc->client->monc);
+}
+
+static const struct ceph_connection_operations mds_con_ops = {
+ .get = con_get,
+ .put = con_put,
+ .dispatch = dispatch,
+ .get_authorizer = get_authorizer,
+ .verify_authorizer_reply = verify_authorizer_reply,
+ .invalidate_authorizer = invalidate_authorizer,
+ .peer_reset = peer_reset,
+};
+
+
+
+
+/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 00000000000..961cc6f6587
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
+#ifndef _FS_CEPH_MDS_CLIENT_H
+#define _FS_CEPH_MDS_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "mdsmap.h"
+
+/*
+ * Some lock dependencies:
+ *
+ * session->s_mutex
+ * mdsc->mutex
+ *
+ * mdsc->snap_rwsem
+ *
+ * inode->i_lock
+ * mdsc->snap_flush_lock
+ * mdsc->cap_delay_lock
+ *
+ */
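+
+/*
+ * For example (a sketch, not an exhaustive rule): code that needs both
+ * of the first two locks must take them in the order listed above,
+ *
+ *	mutex_lock(&session->s_mutex);
+ *	mutex_lock(&mdsc->mutex);
+ *	...
+ *	mutex_unlock(&mdsc->mutex);
+ *	mutex_unlock(&session->s_mutex);
+ *
+ * which is why e.g. check_new_map() drops mdsc->mutex before taking a
+ * session's s_mutex.
+ */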
+
+struct ceph_client;
+struct ceph_cap;
+
+/*
+ * parsed info about a single inode. pointers are into the encoded
+ * on-wire structures within the mds reply message payload.
+ */
+struct ceph_mds_reply_info_in {
+ struct ceph_mds_reply_inode *in;
+ u32 symlink_len;
+ char *symlink;
+ u32 xattr_len;
+ char *xattr_data;
+};
+
+/*
+ * parsed info about an mds reply, including information about the
+ * target inode and/or its parent directory and dentry, and directory
+ * contents (for readdir results).
+ */
+struct ceph_mds_reply_info_parsed {
+ struct ceph_mds_reply_head *head;
+
+ struct ceph_mds_reply_info_in diri, targeti;
+ struct ceph_mds_reply_dirfrag *dirfrag;
+ char *dname;
+ u32 dname_len;
+ struct ceph_mds_reply_lease *dlease;
+
+ struct ceph_mds_reply_dirfrag *dir_dir;
+ int dir_nr;
+ char **dir_dname;
+ u32 *dir_dname_len;
+ struct ceph_mds_reply_lease **dir_dlease;
+ struct ceph_mds_reply_info_in *dir_in;
+ u8 dir_complete, dir_end;
+
+ /* encoded blob describing snapshot contexts for certain
+ operations (e.g., open) */
+ void *snapblob;
+ int snapblob_len;
+};
+
+
+/*
+ * cap releases are batched and sent to the MDS en masse.
+ */
+#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
+ sizeof(struct ceph_mds_cap_release)) / \
+ sizeof(struct ceph_mds_cap_item))
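+
+/*
+ * E.g., with 4 KB pages this works out to a batch on the order of a
+ * hundred or so cap items per release message (rough arithmetic from
+ * the structure sizes, not a wire constant).
+ */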
+
+
+/*
+ * state associated with each MDS<->client session
+ */
+enum {
+ CEPH_MDS_SESSION_NEW = 1,
+ CEPH_MDS_SESSION_OPENING = 2,
+ CEPH_MDS_SESSION_OPEN = 3,
+ CEPH_MDS_SESSION_HUNG = 4,
+ CEPH_MDS_SESSION_CLOSING = 5,
+ CEPH_MDS_SESSION_RESTARTING = 6,
+ CEPH_MDS_SESSION_RECONNECTING = 7,
+};
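+
+/*
+ * Typical transitions (a sketch; see handle_session() and
+ * check_new_map() in mds_client.c, not exhaustive):
+ *
+ *	NEW -> OPENING -> OPEN -> CLOSING
+ *	OPEN -> HUNG -> OPEN			(mds stops/resumes responding)
+ *	OPEN -> RESTARTING -> RECONNECTING -> OPEN	(mds failover)
+ */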
+
+struct ceph_mds_session {
+ struct ceph_mds_client *s_mdsc;
+ int s_mds;
+ int s_state;
+ unsigned long s_ttl; /* time until mds kills us */
+ u64 s_seq; /* incoming msg seq # */
+ struct mutex s_mutex; /* serialize session messages */
+
+ struct ceph_connection s_con;
+
+ struct ceph_authorizer *s_authorizer;
+ void *s_authorizer_buf, *s_authorizer_reply_buf;
+ size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
+
+ /* protected by s_cap_lock */
+ spinlock_t s_cap_lock;
+ u32 s_cap_gen; /* inc each time we get mds stale msg */
+ unsigned long s_cap_ttl; /* when session caps expire */
+ struct list_head s_caps; /* all caps issued by this session */
+ int s_nr_caps, s_trim_caps;
+ int s_num_cap_releases;
+ struct list_head s_cap_releases; /* waiting cap_release messages */
+ struct list_head s_cap_releases_done; /* ready to send */
+ struct ceph_cap *s_cap_iterator;
+
+ /* protected by mutex */
+ struct list_head s_cap_flushing; /* inodes w/ flushing caps */
+ struct list_head s_cap_snaps_flushing;
+ unsigned long s_renew_requested; /* last time we sent a renew req */
+ u64 s_renew_seq;
+
+ atomic_t s_ref;
+ struct list_head s_waiting; /* waiting requests */
+ struct list_head s_unsafe; /* unsafe requests */
+};
+
+/*
+ * modes of choosing which MDS to send a request to
+ */
+enum {
+ USE_ANY_MDS,
+ USE_RANDOM_MDS,
+ USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
+};
+
+struct ceph_mds_request;
+struct ceph_mds_client;
+
+/*
+ * request completion callback
+ */
+typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
+
+/*
+ * an in-flight mds request
+ */
+struct ceph_mds_request {
+ u64 r_tid; /* transaction id */
+ struct rb_node r_node;
+
+ int r_op; /* mds op code */
+ int r_mds;
+
+ /* operation on what? */
+ struct inode *r_inode; /* arg1 */
+ struct dentry *r_dentry; /* arg1 */
+ struct dentry *r_old_dentry; /* arg2: rename from or link from */
+ char *r_path1, *r_path2;
+ struct ceph_vino r_ino1, r_ino2;
+
+ struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
+ struct inode *r_target_inode; /* resulting inode */
+
+ union ceph_mds_request_args r_args;
+ int r_fmode; /* file mode, if expecting cap */
+
+ /* for choosing which mds to send this request to */
+ int r_direct_mode;
+ u32 r_direct_hash; /* choose dir frag based on this dentry hash */
+ bool r_direct_is_hash; /* true if r_direct_hash is valid */
+
+ /* data payload is used for xattr ops */
+ struct page **r_pages;
+ int r_num_pages;
+ int r_data_len;
+
+ /* what caps shall we drop? */
+ int r_inode_drop, r_inode_unless;
+ int r_dentry_drop, r_dentry_unless;
+ int r_old_dentry_drop, r_old_dentry_unless;
+ struct inode *r_old_inode;
+ int r_old_inode_drop, r_old_inode_unless;
+
+ struct ceph_msg *r_request; /* original request */
+ struct ceph_msg *r_reply;
+ struct ceph_mds_reply_info_parsed r_reply_info;
+ int r_err;
+ bool r_aborted;
+
+ unsigned long r_timeout; /* optional. jiffies */
+ unsigned long r_started; /* start time to measure timeout against */
+ unsigned long r_request_started; /* start time for mds request only,
+ used to measure lease durations */
+
+ /* link unsafe requests to parent directory, for fsync */
+ struct inode *r_unsafe_dir;
+ struct list_head r_unsafe_dir_item;
+
+ struct ceph_mds_session *r_session;
+
+ int r_attempts; /* resend attempts */
+ int r_num_fwd; /* number of forward attempts */
+ int r_num_stale;
+ int r_resend_mds; /* mds to resend to next, if any*/
+
+ struct kref r_kref;
+ struct list_head r_wait;
+ struct completion r_completion;
+ struct completion r_safe_completion;
+ ceph_mds_request_callback_t r_callback;
+ struct list_head r_unsafe_item; /* per-session unsafe list item */
+ bool r_got_unsafe, r_got_safe;
+
+ bool r_did_prepopulate;
+ u32 r_readdir_offset;
+
+ struct ceph_cap_reservation r_caps_reservation;
+ int r_num_caps;
+};
+
+/*
+ * mds client state
+ */
+struct ceph_mds_client {
+ struct ceph_client *client;
+ struct mutex mutex; /* all nested structures */
+
+ struct ceph_mdsmap *mdsmap;
+ struct completion safe_umount_waiters, session_close_waiters;
+ struct list_head waiting_for_map;
+
+ struct ceph_mds_session **sessions; /* NULL for mds if no session */
+	int                     max_sessions;    /* len of sessions array */
+ int stopping; /* true if shutting down */
+
+ /*
+ * snap_rwsem will cover cap linkage into snaprealms, and
+ * realm snap contexts. (later, we can do per-realm snap
+ * context locks.) the empty list contains realms with no
+ * references (implying they contain no inodes with caps) that
+ * should be destroyed.
+ */
+ struct rw_semaphore snap_rwsem;
+ struct rb_root snap_realms;
+ struct list_head snap_empty;
+ spinlock_t snap_empty_lock; /* protect snap_empty */
+
+ u64 last_tid; /* most recent mds request */
+ struct rb_root request_tree; /* pending mds requests */
+ struct delayed_work delayed_work; /* delayed work */
+ unsigned long last_renew_caps; /* last time we renewed our caps */
+ struct list_head cap_delay_list; /* caps with delayed release */
+ spinlock_t cap_delay_lock; /* protects cap_delay_list */
+ struct list_head snap_flush_list; /* cap_snaps ready to flush */
+ spinlock_t snap_flush_lock;
+
+ u64 cap_flush_seq;
+ struct list_head cap_dirty; /* inodes with dirty caps */
+ int num_cap_flushing; /* # caps we are flushing */
+ spinlock_t cap_dirty_lock; /* protects above items */
+ wait_queue_head_t cap_flushing_wq;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+
+ spinlock_t dentry_lru_lock;
+ struct list_head dentry_lru;
+ int num_dentry;
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+extern struct ceph_mds_session *
+__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
+
+static inline struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s)
+{
+ atomic_inc(&s->s_ref);
+ return s;
+}
+
+extern void ceph_put_mds_session(struct ceph_mds_session *s);
+
+extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg, int mds);
+
+extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
+ struct ceph_client *client);
+extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
+
+extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
+
+extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
+ struct inode *inode,
+ struct dentry *dn, int mask);
+
+extern struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
+extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req);
+extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
+ struct inode *dir,
+ struct ceph_mds_request *req);
+static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
+{
+ kref_get(&req->r_kref);
+}
+extern void ceph_mdsc_release_request(struct kref *kref);
+static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
+{
+ kref_put(&req->r_kref, ceph_mdsc_release_request);
+}
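+
+/*
+ * Typical caller pattern (a sketch):
+ *
+ *	req = ceph_mdsc_create_request(mdsc, op, mode);	<- holds one ref
+ *	fill in r_inode / r_dentry / r_args as needed;
+ *	err = ceph_mdsc_do_request(mdsc, dir, req);
+ *	examine req->r_reply_info;
+ *	ceph_mdsc_put_request(req);			<- drop our ref
+ */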
+
+extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
+
+extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
+ int stop_on_nosnap);
+
+extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
+extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
+ struct inode *inode,
+ struct dentry *dentry, char action,
+ u32 seq);
+
+extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+
+#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 00000000000..c4c498e6dfe
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
+#include "ceph_debug.h"
+
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "mdsmap.h"
+#include "messenger.h"
+#include "decode.h"
+
+#include "super.h"
+
+
+/*
+ * choose a random mds that is "up" (i.e. has state > 0), or -1 if none are.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+	int n = 0;
+	int i;
+	u8 r;
+
+	/* count up mds's */
+	for (i = 0; i < m->m_max_mds; i++)
+		if (m->m_info[i].state > 0)
+			n++;
+	if (n == 0)
+		return -1;
+
+	/* pick the nth up mds; u8 keeps the modulo result non-negative */
+	get_random_bytes(&r, 1);
+	n = r % n;
+	for (i = 0; ; i++) {
+		/* advance i past any down entries so it lands on an up mds */
+		while (m->m_info[i].state <= 0)
+			i++;
+		if (n-- == 0)
+			break;
+	}
+
+	return i;
+}
+
+/*
+ * Decode an MDS map
+ *
+ * Ignore any fields we don't care about (there are quite a few of
+ * them).
+ */
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+{
+ struct ceph_mdsmap *m;
+ const void *start = *p;
+ int i, j, n;
+ int err = -EINVAL;
+ u16 version;
+
+ m = kzalloc(sizeof(*m), GFP_NOFS);
+ if (m == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ ceph_decode_16_safe(p, end, version, bad);
+
+ ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
+ m->m_epoch = ceph_decode_32(p);
+ m->m_client_epoch = ceph_decode_32(p);
+ m->m_last_failure = ceph_decode_32(p);
+ m->m_root = ceph_decode_32(p);
+ m->m_session_timeout = ceph_decode_32(p);
+ m->m_session_autoclose = ceph_decode_32(p);
+ m->m_max_file_size = ceph_decode_64(p);
+ m->m_max_mds = ceph_decode_32(p);
+
+ m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
+ if (m->m_info == NULL)
+ goto badmem;
+
+ /* pick out active nodes from mds_info (state > 0) */
+ n = ceph_decode_32(p);
+ for (i = 0; i < n; i++) {
+ u64 global_id;
+ u32 namelen;
+ s32 mds, inc, state;
+ u64 state_seq;
+ u8 infoversion;
+ struct ceph_entity_addr addr;
+ u32 num_export_targets;
+ void *pexport_targets = NULL;
+
+ ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
+ global_id = ceph_decode_64(p);
+ infoversion = ceph_decode_8(p);
+ *p += sizeof(u64);
+ namelen = ceph_decode_32(p); /* skip mds name */
+ *p += namelen;
+
+ ceph_decode_need(p, end,
+ 4*sizeof(u32) + sizeof(u64) +
+ sizeof(addr) + sizeof(struct ceph_timespec),
+ bad);
+ mds = ceph_decode_32(p);
+ inc = ceph_decode_32(p);
+ state = ceph_decode_32(p);
+ state_seq = ceph_decode_64(p);
+ ceph_decode_copy(p, &addr, sizeof(addr));
+ ceph_decode_addr(&addr);
+ *p += sizeof(struct ceph_timespec);
+ *p += sizeof(u32);
+ ceph_decode_32_safe(p, end, namelen, bad);
+ *p += namelen;
+ if (infoversion >= 2) {
+ ceph_decode_32_safe(p, end, num_export_targets, bad);
+ pexport_targets = *p;
+ *p += num_export_targets * sizeof(u32);
+ } else {
+ num_export_targets = 0;
+ }
+
+ dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
+ i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
+ ceph_mds_state_name(state));
+ if (mds >= 0 && mds < m->m_max_mds && state > 0) {
+ m->m_info[mds].global_id = global_id;
+ m->m_info[mds].state = state;
+ m->m_info[mds].addr = addr;
+ m->m_info[mds].num_export_targets = num_export_targets;
+			if (num_export_targets) {
+				m->m_info[mds].export_targets =
+					kcalloc(num_export_targets, sizeof(u32),
+						GFP_NOFS);
+				if (!m->m_info[mds].export_targets)
+					goto badmem;
+				for (j = 0; j < num_export_targets; j++)
+					m->m_info[mds].export_targets[j] =
+					       ceph_decode_32(&pexport_targets);
+ } else {
+ m->m_info[mds].export_targets = NULL;
+ }
+ }
+ }
+
+ /* pg_pools */
+ ceph_decode_32_safe(p, end, n, bad);
+ m->m_num_data_pg_pools = n;
+ m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
+ if (!m->m_data_pg_pools)
+ goto badmem;
+ ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
+ for (i = 0; i < n; i++)
+ m->m_data_pg_pools[i] = ceph_decode_32(p);
+ m->m_cas_pg_pool = ceph_decode_32(p);
+
+ /* ok, we don't care about the rest. */
+ dout("mdsmap_decode success epoch %u\n", m->m_epoch);
+ return m;
+
+badmem:
+ err = -ENOMEM;
+bad:
+ pr_err("corrupt mdsmap\n");
+ print_hex_dump(KERN_DEBUG, "mdsmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
+ ceph_mdsmap_destroy(m);
+ return ERR_PTR(-EINVAL);
+}
+
+void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
+{
+	int i;
+
+	/* m_info may be NULL if decode failed partway through */
+	if (m->m_info) {
+		for (i = 0; i < m->m_max_mds; i++)
+			kfree(m->m_info[i].export_targets);
+		kfree(m->m_info);
+	}
+	kfree(m->m_data_pg_pools);
+	kfree(m);
+}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 00000000000..eacc131aa5c
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
+#ifndef _FS_CEPH_MDSMAP_H
+#define _FS_CEPH_MDSMAP_H
+
+#include "types.h"
+
+/*
+ * mds map - describe servers in the mds cluster.
+ *
+ * we limit fields to those the client actually cares about
+ */
+struct ceph_mds_info {
+ u64 global_id;
+ struct ceph_entity_addr addr;
+ s32 state;
+ int num_export_targets;
+ u32 *export_targets;
+};
+
+struct ceph_mdsmap {
+ u32 m_epoch, m_client_epoch, m_last_failure;
+ u32 m_root;
+ u32 m_session_timeout; /* seconds */
+ u32 m_session_autoclose; /* seconds */
+ u64 m_max_file_size;
+ u32 m_max_mds; /* size of m_addr, m_state arrays */
+ struct ceph_mds_info *m_info;
+
+ /* which object pools file data can be stored in */
+ int m_num_data_pg_pools;
+ u32 *m_data_pg_pools;
+ u32 m_cas_pg_pool;
+};
+
+static inline struct ceph_entity_addr *
+ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
+{
+ if (w >= m->m_max_mds)
+ return NULL;
+ return &m->m_info[w].addr;
+}
+
+static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
+{
+ BUG_ON(w < 0);
+ if (w >= m->m_max_mds)
+ return CEPH_MDS_STATE_DNE;
+ return m->m_info[w].state;
+}
+
+extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
+extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
+extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
+
+#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 00000000000..a32f0f896d9
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2239 @@
+#include "ceph_debug.h"
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <net/tcp.h>
+
+#include "super.h"
+#include "messenger.h"
+#include "decode.h"
+#include "pagelist.h"
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system. The messenger provides ordered and reliable
+ * delivery. We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error). Acks allow sent messages to be discarded by
+ * the sender.
+ */
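+
+/*
+ * Backoff sketch (the actual retry logic lives in ceph_fault(), below;
+ * the constants here are illustrative): on a fault the connection's
+ * delay roughly doubles per attempt, up to some maximum, e.g.
+ *
+ *	if (con->delay == 0)
+ *		con->delay = BASE_DELAY_INTERVAL;
+ *	else if (con->delay < MAX_DELAY_INTERVAL)
+ *		con->delay *= 2;
+ */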
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+
+static void queue_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void ceph_fault(struct ceph_connection *con);
+
+const char *ceph_name_type_str(int t)
+{
+ switch (t) {
+ case CEPH_ENTITY_TYPE_MON: return "mon";
+ case CEPH_ENTITY_TYPE_MDS: return "mds";
+ case CEPH_ENTITY_TYPE_OSD: return "osd";
+ case CEPH_ENTITY_TYPE_CLIENT: return "client";
+ case CEPH_ENTITY_TYPE_ADMIN: return "admin";
+ default: return "???";
+ }
+}
+
+/*
+ * nicely render a sockaddr as a string.
+ */
+#define MAX_ADDR_STR 20
+static char addr_str[MAX_ADDR_STR][40];
+static DEFINE_SPINLOCK(addr_str_lock);
+static int last_addr_str;
+
+const char *pr_addr(const struct sockaddr_storage *ss)
+{
+ int i;
+ char *s;
+ struct sockaddr_in *in4 = (void *)ss;
+ unsigned char *quad = (void *)&in4->sin_addr.s_addr;
+ struct sockaddr_in6 *in6 = (void *)ss;
+
+ spin_lock(&addr_str_lock);
+ i = last_addr_str++;
+ if (last_addr_str == MAX_ADDR_STR)
+ last_addr_str = 0;
+ spin_unlock(&addr_str_lock);
+ s = addr_str[i];
+
+ switch (ss->ss_family) {
+ case AF_INET:
+ sprintf(s, "%u.%u.%u.%u:%u",
+ (unsigned int)quad[0],
+ (unsigned int)quad[1],
+ (unsigned int)quad[2],
+ (unsigned int)quad[3],
+ (unsigned int)ntohs(in4->sin_port));
+ break;
+
+ case AF_INET6:
+ sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
+ in6->sin6_addr.s6_addr16[0],
+ in6->sin6_addr.s6_addr16[1],
+ in6->sin6_addr.s6_addr16[2],
+ in6->sin6_addr.s6_addr16[3],
+ in6->sin6_addr.s6_addr16[4],
+ in6->sin6_addr.s6_addr16[5],
+ in6->sin6_addr.s6_addr16[6],
+ in6->sin6_addr.s6_addr16[7],
+ (unsigned int)ntohs(in6->sin6_port));
+ break;
+
+ default:
+ sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
+ }
+
+ return s;
+}
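+
+/*
+ * Note: pr_addr() hands back one of MAX_ADDR_STR rotating static
+ * buffers, so the returned string is only valid briefly (fine as a
+ * printk argument); don't hold the pointer across many pr_addr()
+ * calls.
+ */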
+
+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+ memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+ ceph_encode_addr(&msgr->my_enc_addr);
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+struct workqueue_struct *ceph_msgr_wq;
+
+int __init ceph_msgr_init(void)
+{
+	ceph_msgr_wq = create_workqueue("ceph-msgr");
+	if (!ceph_msgr_wq) {
+		/* create_workqueue() returns NULL, not ERR_PTR, on failure */
+		pr_err("msgr_init failed to create workqueue\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void ceph_msgr_exit(void)
+{
+ destroy_workqueue(ceph_msgr_wq);
+}
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_data_ready(struct sock *sk, int count_unused)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+ if (sk->sk_state != TCP_CLOSE_WAIT) {
+ dout("ceph_data_ready on %p state = %lu, queueing work\n",
+ con, con->state);
+ queue_con(con);
+ }
+}
+
+/* socket has buffer space for writing */
+static void ceph_write_space(struct sock *sk)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+
+ /* only queue to workqueue if there is data we want to write. */
+ if (test_bit(WRITE_PENDING, &con->state)) {
+ dout("ceph_write_space %p queueing write work\n", con);
+ queue_con(con);
+ } else {
+ dout("ceph_write_space %p nothing to write\n", con);
+ }
+
+ /* since we have our own write_space, clear the SOCK_NOSPACE flag */
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
+
+/* socket's state has changed */
+static void ceph_state_change(struct sock *sk)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+
+ dout("ceph_state_change %p state = %lu sk_state = %u\n",
+ con, con->state, sk->sk_state);
+
+ if (test_bit(CLOSED, &con->state))
+ return;
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ dout("ceph_state_change TCP_CLOSE\n");
+ case TCP_CLOSE_WAIT:
+ dout("ceph_state_change TCP_CLOSE_WAIT\n");
+ if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+ if (test_bit(CONNECTING, &con->state))
+ con->error_msg = "connection failed";
+ else
+ con->error_msg = "socket closed";
+ queue_con(con);
+ }
+ break;
+ case TCP_ESTABLISHED:
+ dout("ceph_state_change TCP_ESTABLISHED\n");
+ queue_con(con);
+ break;
+ }
+}
+
+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,
+ struct ceph_connection *con)
+{
+ struct sock *sk = sock->sk;
+ sk->sk_user_data = (void *)con;
+ sk->sk_data_ready = ceph_data_ready;
+ sk->sk_write_space = ceph_write_space;
+ sk->sk_state_change = ceph_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ */
+static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+{
+ struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
+ struct socket *sock;
+ int ret;
+
+ BUG_ON(con->sock);
+ ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ret)
+ return ERR_PTR(ret);
+ con->sock = sock;
+ sock->sk->sk_allocation = GFP_NOFS;
+
+ set_sock_callbacks(sock, con);
+
+ dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
+
+ ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
+ if (ret == -EINPROGRESS) {
+ dout("connect %s EINPROGRESS sk_state = %u\n",
+ pr_addr(&con->peer_addr.in_addr),
+ sock->sk->sk_state);
+ ret = 0;
+ }
+ if (ret < 0) {
+ pr_err("connect %s error %d\n",
+ pr_addr(&con->peer_addr.in_addr), ret);
+ sock_release(sock);
+ con->sock = NULL;
+ con->error_msg = "connect error";
+ }
+
+ if (ret < 0)
+ return ERR_PTR(ret);
+ return sock;
+}
+
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+ struct kvec iov = {buf, len};
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+ return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+}
+
+/*
+ * write something. @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+ size_t kvlen, size_t len, int more)
+{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+ if (more)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+
+ return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+}
+
+
+/*
+ * Shutdown/close the socket for the given connection.
+ */
+static int con_close_socket(struct ceph_connection *con)
+{
+ int rc;
+
+ dout("con_close_socket on %p sock %p\n", con, con->sock);
+ if (!con->sock)
+ return 0;
+ set_bit(SOCK_CLOSED, &con->state);
+ rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+ sock_release(con->sock);
+ con->sock = NULL;
+ clear_bit(SOCK_CLOSED, &con->state);
+ return rc;
+}
+
+/*
+ * Reset a connection. Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+}
+static void ceph_msg_remove_list(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+ list_head);
+ ceph_msg_remove(msg);
+ }
+}
+
+static void reset_connection(struct ceph_connection *con)
+{
+	/* discard existing out_queue and reset msg_seq/connect_seq state */
+ ceph_msg_remove_list(&con->out_queue);
+ ceph_msg_remove_list(&con->out_sent);
+
+ if (con->in_msg) {
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+
+ con->connect_seq = 0;
+ con->out_seq = 0;
+ if (con->out_msg) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+ con->in_seq = 0;
+}
+
+/*
+ * mark a peer down. drop any open connections.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+ dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
+ set_bit(CLOSED, &con->state); /* in case there's queued work */
+ clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
+ clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
+ clear_bit(KEEPALIVE_PENDING, &con->state);
+ clear_bit(WRITE_PENDING, &con->state);
+ mutex_lock(&con->mutex);
+ reset_connection(con);
+ cancel_delayed_work(&con->work);
+ mutex_unlock(&con->mutex);
+ queue_con(con);
+}
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ */
+void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+{
+ dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
+ set_bit(OPENING, &con->state);
+ clear_bit(CLOSED, &con->state);
+ memcpy(&con->peer_addr, addr, sizeof(*addr));
+ con->delay = 0; /* reset backoff memory */
+ queue_con(con);
+}
+
+/*
+ * return true if this connection ever successfully opened
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+ return con->connect_seq > 0;
+}
+
+/*
+ * generic get/put
+ */
+struct ceph_connection *ceph_con_get(struct ceph_connection *con)
+{
+ dout("con_get %p nref = %d -> %d\n", con,
+ atomic_read(&con->nref), atomic_read(&con->nref) + 1);
+ if (atomic_inc_not_zero(&con->nref))
+ return con;
+ return NULL;
+}
+
+void ceph_con_put(struct ceph_connection *con)
+{
+ dout("con_put %p nref = %d -> %d\n", con,
+ atomic_read(&con->nref), atomic_read(&con->nref) - 1);
+ BUG_ON(atomic_read(&con->nref) == 0);
+ if (atomic_dec_and_test(&con->nref)) {
+ BUG_ON(con->sock);
+ kfree(con);
+ }
+}
+
+/*
+ * initialize a new connection.
+ */
+void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+{
+ dout("con_init %p\n", con);
+ memset(con, 0, sizeof(*con));
+ atomic_set(&con->nref, 1);
+ con->msgr = msgr;
+ mutex_init(&con->mutex);
+ INIT_LIST_HEAD(&con->out_queue);
+ INIT_LIST_HEAD(&con->out_sent);
+ INIT_DELAYED_WORK(&con->work, con_work);
+}
+
+
+/*
+ * We maintain a global counter to order connection attempts. Get
+ * a unique seq greater than @gt.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+ u32 ret;
+
+ spin_lock(&msgr->global_seq_lock);
+ if (msgr->global_seq < gt)
+ msgr->global_seq = gt;
+ ret = ++msgr->global_seq;
+ spin_unlock(&msgr->global_seq_lock);
+ return ret;
+}
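+
+/*
+ * Illustrative sketch (comment only, not compiled): successive calls
+ * return strictly increasing values, and passing a peer's seq as @gt
+ * jumps past it:
+ *
+ *   u32 a = get_global_seq(msgr, 0);    // say 1
+ *   u32 b = get_global_seq(msgr, 0);    // 2; always > a
+ *   u32 c = get_global_seq(msgr, 100);  // 101; greater than the peer's 100
+ */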
+
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off.  Assumes out_kvec* are already valid; we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con, int v)
+{
+ struct ceph_msg *m = con->out_msg;
+
+ dout("prepare_write_message_footer %p\n", con);
+ con->out_kvec_is_msg = true;
+ con->out_kvec[v].iov_base = &m->footer;
+ con->out_kvec[v].iov_len = sizeof(m->footer);
+ con->out_kvec_bytes += sizeof(m->footer);
+ con->out_kvec_left++;
+ con->out_more = m->more_to_follow;
+ con->out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ int v = 0;
+
+ con->out_kvec_bytes = 0;
+ con->out_kvec_is_msg = true;
+ con->out_msg_done = false;
+
+ /* Sneak an ack in there first? If we can get it into the same
+ * TCP packet that's a good thing. */
+ if (con->in_seq > con->in_seq_acked) {
+ con->in_seq_acked = con->in_seq;
+ con->out_kvec[v].iov_base = &tag_ack;
+ con->out_kvec[v++].iov_len = 1;
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con->out_kvec[v].iov_base = &con->out_temp_ack;
+ con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
+ con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+ }
+
+ m = list_first_entry(&con->out_queue,
+ struct ceph_msg, list_head);
+ con->out_msg = m;
+ if (test_bit(LOSSYTX, &con->state)) {
+ list_del_init(&m->list_head);
+ } else {
+ /* put message on sent list */
+ ceph_msg_get(m);
+ list_move_tail(&m->list_head, &con->out_sent);
+ }
+
+ m->hdr.seq = cpu_to_le64(++con->out_seq);
+
+ dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+ m, con->out_seq, le16_to_cpu(m->hdr.type),
+ le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+ le32_to_cpu(m->hdr.data_len),
+ m->nr_pages);
+ BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+ /* tag + hdr + front + middle */
+ con->out_kvec[v].iov_base = &tag_msg;
+ con->out_kvec[v++].iov_len = 1;
+ con->out_kvec[v].iov_base = &m->hdr;
+ con->out_kvec[v++].iov_len = sizeof(m->hdr);
+ con->out_kvec[v++] = m->front;
+ if (m->middle)
+ con->out_kvec[v++] = m->middle->vec;
+ con->out_kvec_left = v;
+ con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
+ (m->middle ? m->middle->vec.iov_len : 0);
+ con->out_kvec_cur = con->out_kvec;
+
+ /* fill in crc (except data pages), footer */
+ con->out_msg->hdr.crc =
+ cpu_to_le32(crc32c(0, (void *)&m->hdr,
+ sizeof(m->hdr) - sizeof(m->hdr.crc)));
+ con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+ con->out_msg->footer.front_crc =
+ cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
+ if (m->middle)
+ con->out_msg->footer.middle_crc =
+ cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
+ m->middle->vec.iov_len));
+ else
+ con->out_msg->footer.middle_crc = 0;
+ con->out_msg->footer.data_crc = 0;
+ dout("prepare_write_message front_crc %u data_crc %u\n",
+ le32_to_cpu(con->out_msg->footer.front_crc),
+ le32_to_cpu(con->out_msg->footer.middle_crc));
+
+ /* is there a data payload? */
+ if (le32_to_cpu(m->hdr.data_len) > 0) {
+ /* initialize page iterator */
+ con->out_msg_pos.page = 0;
+ con->out_msg_pos.page_pos =
+ le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+ con->out_msg_pos.data_pos = 0;
+ con->out_msg_pos.did_page_crc = 0;
+ con->out_more = 1; /* data + footer will follow */
+ } else {
+ /* no, queue up footer too and be done */
+ prepare_write_message_footer(con, v);
+ }
+
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+ dout("prepare_write_ack %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con->out_kvec[0].iov_base = &tag_ack;
+ con->out_kvec[0].iov_len = 1;
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con->out_kvec[1].iov_base = &con->out_temp_ack;
+ con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
+ con->out_kvec_left = 2;
+ con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 1; /* more will follow.. eventually.. */
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+ dout("prepare_write_keepalive %p\n", con);
+ con->out_kvec[0].iov_base = &tag_keepalive;
+ con->out_kvec[0].iov_len = 1;
+ con->out_kvec_left = 1;
+ con->out_kvec_bytes = 1;
+ con->out_kvec_cur = con->out_kvec;
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static void prepare_connect_authorizer(struct ceph_connection *con)
+{
+ void *auth_buf = NULL;
+ int auth_len = 0;
+ int auth_protocol = 0;
+
+ mutex_unlock(&con->mutex);
+ if (con->ops->get_authorizer)
+ con->ops->get_authorizer(con, &auth_buf, &auth_len,
+ &auth_protocol, &con->auth_reply_buf,
+ &con->auth_reply_buf_len,
+ con->auth_retry);
+ mutex_lock(&con->mutex);
+
+ con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
+ con->out_connect.authorizer_len = cpu_to_le32(auth_len);
+
+ con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
+ con->out_kvec[con->out_kvec_left].iov_len = auth_len;
+ con->out_kvec_left++;
+ con->out_kvec_bytes += auth_len;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_messenger *msgr,
+ struct ceph_connection *con)
+{
+ int len = strlen(CEPH_BANNER);
+
+ con->out_kvec[0].iov_base = CEPH_BANNER;
+ con->out_kvec[0].iov_len = len;
+ con->out_kvec[1].iov_base = &msgr->my_enc_addr;
+ con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
+ con->out_kvec_left = 2;
+ con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 0;
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+static void prepare_write_connect(struct ceph_messenger *msgr,
+ struct ceph_connection *con,
+ int after_banner)
+{
+ unsigned global_seq = get_global_seq(con->msgr, 0);
+ int proto;
+
+ switch (con->peer_name.type) {
+ case CEPH_ENTITY_TYPE_MON:
+ proto = CEPH_MONC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_OSD:
+ proto = CEPH_OSDC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_MDS:
+ proto = CEPH_MDSC_PROTOCOL;
+ break;
+ default:
+ BUG();
+ }
+
+ dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+ con->connect_seq, global_seq, proto);
+
+ con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
+ con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+ con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+ con->out_connect.global_seq = cpu_to_le32(global_seq);
+ con->out_connect.protocol_version = cpu_to_le32(proto);
+ con->out_connect.flags = 0;
+
+ if (!after_banner) {
+ con->out_kvec_left = 0;
+ con->out_kvec_bytes = 0;
+ }
+ con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
+ con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
+ con->out_kvec_left++;
+ con->out_kvec_bytes += sizeof(con->out_connect);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 0;
+ set_bit(WRITE_PENDING, &con->state);
+
+ prepare_connect_authorizer(con);
+}
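+
+/*
+ * Hedged summary of the outgoing handshake assembled above (comment
+ * only): on a fresh socket, try_write() queues, in order,
+ *
+ *   banner:   CEPH_BANNER + my_enc_addr        (prepare_write_banner)
+ *   connect:  ceph_msg_connect [+ authorizer]  (prepare_write_connect)
+ *
+ * and then expects the peer's banner, both addresses, and a
+ * ceph_msg_connect_reply, whose tag is handled in process_connect().
+ */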
+
+
+/*
+ * Write as much of the pending kvecs to the socket as we can.
+ * 1 -> done
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+ while (con->out_kvec_bytes > 0) {
+ ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+ con->out_kvec_left, con->out_kvec_bytes,
+ con->out_more);
+ if (ret <= 0)
+ goto out;
+ con->out_kvec_bytes -= ret;
+ if (con->out_kvec_bytes == 0)
+ break; /* done */
+ while (ret > 0) {
+ if (ret >= con->out_kvec_cur->iov_len) {
+ ret -= con->out_kvec_cur->iov_len;
+ con->out_kvec_cur++;
+ con->out_kvec_left--;
+ } else {
+ con->out_kvec_cur->iov_len -= ret;
+ con->out_kvec_cur->iov_base += ret;
+ ret = 0;
+ break;
+ }
+ }
+ }
+ con->out_kvec_left = 0;
+ con->out_kvec_is_msg = false;
+ ret = 1;
+out:
+ dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+ con->out_kvec_bytes, con->out_kvec_left, ret);
+ return ret; /* done! */
+}
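+
+/*
+ * Worked example for the loop above (comment only): with two kvecs of
+ * 8 and 4 bytes queued and a ceph_tcp_sendmsg() that accepts 10 bytes,
+ * the first kvec is consumed whole (ret 10 -> 2, out_kvec_cur++,
+ * out_kvec_left--) and the second has iov_base advanced by 2 bytes,
+ * leaving 2 bytes for the next pass.
+ */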
+
+/*
+ * Write as much message data payload as we can. If we finish, queue
+ * up the footer.
+ * 1 -> done, footer is now queued in out_kvec[].
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_msg_pages(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+ unsigned data_len = le32_to_cpu(msg->hdr.data_len);
+ size_t len;
+ int crc = !con->msgr->nocrc;
+ int ret;
+
+ dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
+ con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+ con->out_msg_pos.page_pos);
+
+ while (con->out_msg_pos.page < con->out_msg->nr_pages) {
+ struct page *page = NULL;
+ void *kaddr = NULL;
+
+ /*
+ * if we are calculating the data crc (the default), we need
+ * to map the page. if our pages[] has been revoked, use the
+ * zero page.
+ */
+ if (msg->pages) {
+ page = msg->pages[con->out_msg_pos.page];
+ if (crc)
+ kaddr = kmap(page);
+ } else if (msg->pagelist) {
+ page = list_first_entry(&msg->pagelist->head,
+ struct page, lru);
+ if (crc)
+ kaddr = kmap(page);
+ } else {
+ page = con->msgr->zero_page;
+ if (crc)
+ kaddr = page_address(con->msgr->zero_page);
+ }
+ len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
+ (int)(data_len - con->out_msg_pos.data_pos));
+ if (crc && !con->out_msg_pos.did_page_crc) {
+ void *base = kaddr + con->out_msg_pos.page_pos;
+ u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+
+ BUG_ON(kaddr == NULL);
+ con->out_msg->footer.data_crc =
+ cpu_to_le32(crc32c(tmpcrc, base, len));
+ con->out_msg_pos.did_page_crc = 1;
+ }
+
+ ret = kernel_sendpage(con->sock, page,
+ con->out_msg_pos.page_pos, len,
+ MSG_DONTWAIT | MSG_NOSIGNAL |
+ MSG_MORE);
+
+ if (crc && (msg->pages || msg->pagelist))
+ kunmap(page);
+
+ if (ret <= 0)
+ goto out;
+
+ con->out_msg_pos.data_pos += ret;
+ con->out_msg_pos.page_pos += ret;
+ if (ret == len) {
+ con->out_msg_pos.page_pos = 0;
+ con->out_msg_pos.page++;
+ con->out_msg_pos.did_page_crc = 0;
+ if (msg->pagelist)
+ list_move_tail(&page->lru,
+ &msg->pagelist->head);
+ }
+ }
+
+ dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+
+ /* prepare and queue up footer, too */
+ if (!crc)
+ con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+ con->out_kvec_bytes = 0;
+ con->out_kvec_left = 0;
+ con->out_kvec_cur = con->out_kvec;
+ prepare_write_message_footer(con, 0);
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+ int ret;
+
+ while (con->out_skip > 0) {
+ struct kvec iov = {
+ .iov_base = page_address(con->msgr->zero_page),
+ .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
+ };
+
+ ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
+ if (ret <= 0)
+ goto out;
+ con->out_skip -= ret;
+ }
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+ dout("prepare_read_banner %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+ dout("prepare_read_connect %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_ack %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+ dout("prepare_read_tag %p\n", con);
+ con->in_base_pos = 0;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+ dout("prepare_read_message %p\n", con);
+ BUG_ON(con->in_msg != NULL);
+ con->in_base_pos = 0;
+ con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+ return 0;
+}
+
+
+static int read_partial(struct ceph_connection *con,
+ int *to, int size, void *object)
+{
+ *to += size;
+ while (con->in_base_pos < *to) {
+ int left = *to - con->in_base_pos;
+ int have = size - left;
+ int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ }
+ return 1;
+}
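+
+/*
+ * Note on read_partial() (added comment): @to accumulates across
+ * consecutive calls, mirroring con->in_base_pos, so a caller can read
+ * several back-to-back objects and transparently resume mid-object
+ * after a short read; read_partial_banner() below is the canonical
+ * user.
+ */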
+
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+ int ret, to = 0;
+
+ dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+ /* peer's banner */
+ ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
+ &con->actual_peer_addr);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
+ &con->peer_addr_for_me);
+ if (ret <= 0)
+ goto out;
+out:
+ return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+ int ret, to = 0;
+
+ dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+ ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
+ con->auth_reply_buf);
+ if (ret <= 0)
+ goto out;
+
+ dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+ con, (int)con->in_reply.tag,
+ le32_to_cpu(con->in_reply.connect_seq),
+ le32_to_cpu(con->in_reply.global_seq));
+out:
+ return ret;
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+ if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+ pr_err("connect to %s got bad banner\n",
+ pr_addr(&con->peer_addr.in_addr));
+ con->error_msg = "protocol error, bad banner";
+ return -1;
+ }
+ return 0;
+}
+
+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+ case AF_INET6:
+ return
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
+ }
+ return false;
+}
+
+static int addr_port(struct sockaddr_storage *ss)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ return ntohs(((struct sockaddr_in *)ss)->sin_port);
+ case AF_INET6:
+ return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+ }
+ return 0;
+}
+
+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)ss)->sin_port = htons(p);
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+ break;
+ }
+}
+
+/*
+ * Parse an ip[:port] list into an addr array. Use the default
+ * monitor port if a port isn't specified.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count)
+{
+ int i;
+ const char *p = c;
+
+ dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+ for (i = 0; i < max_count; i++) {
+ const char *ipend;
+ struct sockaddr_storage *ss = &addr[i].in_addr;
+ struct sockaddr_in *in4 = (void *)ss;
+ struct sockaddr_in6 *in6 = (void *)ss;
+ int port;
+
+ memset(ss, 0, sizeof(*ss));
+ if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
+ ',', &ipend)) {
+ ss->ss_family = AF_INET;
+ } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+ ',', &ipend)) {
+ ss->ss_family = AF_INET6;
+ } else {
+ goto bad;
+ }
+ p = ipend;
+
+ /* port? */
+ if (p < end && *p == ':') {
+ port = 0;
+ p++;
+ while (p < end && *p >= '0' && *p <= '9') {
+ port = (port * 10) + (*p - '0');
+ p++;
+ }
+ if (port > 65535 || port == 0)
+ goto bad;
+ } else {
+ port = CEPH_MON_PORT;
+ }
+
+ addr_set_port(ss, port);
+
+ dout("parse_ips got %s\n", pr_addr(ss));
+
+ if (p == end)
+ break;
+ if (*p != ',')
+ goto bad;
+ p++;
+ }
+
+ if (p != end)
+ goto bad;
+
+ if (count)
+ *count = i + 1;
+ return 0;
+
+bad:
+ pr_err("parse_ips bad ip '%s'\n", c);
+ return -EINVAL;
+}
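+
+/*
+ * Illustrative use (comment only; the addresses are made up):
+ *
+ *   struct ceph_entity_addr addr[3];
+ *   int n;
+ *   const char *s = "1.2.3.4:6789,10.0.0.1";
+ *
+ *   ceph_parse_ips(s, s + strlen(s), addr, 3, &n);
+ *
+ * yields n == 2, with the second entry defaulting to CEPH_MON_PORT.
+ */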
+
+static int process_banner(struct ceph_connection *con)
+{
+ dout("process_banner on %p\n", con);
+
+ if (verify_hello(con) < 0)
+ return -1;
+
+ ceph_decode_addr(&con->actual_peer_addr);
+ ceph_decode_addr(&con->peer_addr_for_me);
+
+ /*
+ * Make sure the other end is who we wanted.  Note that the other
+ * end may not yet know their ip address, so if it's 0.0.0.0, give
+ * them the benefit of the doubt.
+ */
+ if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+ sizeof(con->peer_addr)) != 0 &&
+ !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+ con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+ pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
+ pr_addr(&con->peer_addr.in_addr),
+ le64_to_cpu(con->peer_addr.nonce),
+ pr_addr(&con->actual_peer_addr.in_addr),
+ le64_to_cpu(con->actual_peer_addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -1;
+ }
+
+ /*
+ * did we learn our address?
+ */
+ if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+ int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+ memcpy(&con->msgr->inst.addr.in_addr,
+ &con->peer_addr_for_me.in_addr,
+ sizeof(con->peer_addr_for_me.in_addr));
+ addr_set_port(&con->msgr->inst.addr.in_addr, port);
+ encode_my_addr(con->msgr);
+ dout("process_banner learned my addr is %s\n",
+ pr_addr(&con->msgr->inst.addr.in_addr));
+ }
+
+ set_bit(NEGOTIATING, &con->state);
+ prepare_read_connect(con);
+ return 0;
+}
+
+static void fail_protocol(struct ceph_connection *con)
+{
+ reset_connection(con);
+ set_bit(CLOSED, &con->state); /* in case there's queued work */
+
+ mutex_unlock(&con->mutex);
+ if (con->ops->bad_proto)
+ con->ops->bad_proto(con);
+ mutex_lock(&con->mutex);
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+ u64 sup_feat = CEPH_FEATURE_SUPPORTED;
+ u64 req_feat = CEPH_FEATURE_REQUIRED;
+ u64 server_feat = le64_to_cpu(con->in_reply.features);
+
+ dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+ switch (con->in_reply.tag) {
+ case CEPH_MSGR_TAG_FEATURES:
+ pr_err("%s%lld %s feature set mismatch,"
+ " my %llx < server's %llx, missing %llx\n",
+ ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr),
+ sup_feat, server_feat, server_feat & ~sup_feat);
+ con->error_msg = "missing required protocol features";
+ fail_protocol(con);
+ return -1;
+
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ pr_err("%s%lld %s protocol version mismatch,"
+ " my %d != server's %d\n",
+ ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr),
+ le32_to_cpu(con->out_connect.protocol_version),
+ le32_to_cpu(con->in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+ fail_protocol(con);
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ con->auth_retry++;
+ dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+ con->auth_retry);
+ if (con->auth_retry == 2) {
+ con->error_msg = "connect authorization failure";
+ reset_connection(con);
+ set_bit(CLOSED, &con->state);
+ return -1;
+ }
+ con->auth_retry = 1;
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RESETSESSION:
+ /*
+ * If we connected with a large connect_seq but the peer
+ * has no record of a session with us (no connection, or
+ * connect_seq == 0), they will send RESETSESSION to indicate
+ * that they must have reset their session, and may have
+ * dropped messages.
+ */
+ dout("process_connect got RESET peer seq %u\n",
+ le32_to_cpu(con->in_connect.connect_seq));
+ pr_err("%s%lld %s connection reset\n",
+ ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr));
+ reset_connection(con);
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+
+ /* Tell ceph about it. */
+ mutex_unlock(&con->mutex);
+ pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_SESSION:
+ /*
+ * If we sent a smaller connect_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+ le32_to_cpu(con->out_connect.connect_seq),
+ le32_to_cpu(con->in_connect.connect_seq));
+ con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_GLOBAL:
+ /*
+ * If we sent a smaller global_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+ con->peer_global_seq,
+ le32_to_cpu(con->in_connect.global_seq));
+ get_global_seq(con->msgr,
+ le32_to_cpu(con->in_connect.global_seq));
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_READY:
+ if (req_feat & ~server_feat) {
+ pr_err("%s%lld %s protocol feature mismatch,"
+ " my required %llx > server's %llx, need %llx\n",
+ ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr),
+ req_feat, server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ fail_protocol(con);
+ return -1;
+ }
+ clear_bit(CONNECTING, &con->state);
+ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+ con->connect_seq++;
+ dout("process_connect got READY gseq %d cseq %d (%d)\n",
+ con->peer_global_seq,
+ le32_to_cpu(con->in_reply.connect_seq),
+ con->connect_seq);
+ WARN_ON(con->connect_seq !=
+ le32_to_cpu(con->in_reply.connect_seq));
+
+ if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+ set_bit(LOSSYTX, &con->state);
+
+ prepare_read_tag(con);
+ break;
+
+ case CEPH_MSGR_TAG_WAIT:
+ /*
+ * If there is a connection race (we are opening
+ * connections to each other), one of us may just have
+ * to WAIT. This shouldn't happen if we are the
+ * client.
+ */
+ pr_err("process_connect peer connecting WAIT\n");
+
+ default:
+ pr_err("connect protocol error, will retry\n");
+ con->error_msg = "protocol error, garbage tag during connect";
+ return -1;
+ }
+ return 0;
+}
+
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+ int to = 0;
+
+ return read_partial(con, &to, sizeof(con->in_temp_ack),
+ &con->in_temp_ack);
+}
+
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ u64 ack = le64_to_cpu(con->in_temp_ack);
+ u64 seq;
+
+ while (!list_empty(&con->out_sent)) {
+ m = list_first_entry(&con->out_sent, struct ceph_msg,
+ list_head);
+ seq = le64_to_cpu(m->hdr.seq);
+ if (seq > ack)
+ break;
+ dout("got ack for seq %llu type %d at %p\n", seq,
+ le16_to_cpu(m->hdr.type), m);
+ ceph_msg_remove(m);
+ }
+ prepare_read_tag(con);
+}
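+
+/*
+ * Example (comment only): acks are cumulative.  If out_sent holds
+ * seqs 3, 4 and 5 and the peer acks 4, the loop above drops 3 and 4
+ * and leaves 5 queued for a later ack.
+ */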
+
+
+static int read_partial_message_section(struct ceph_connection *con,
+ struct kvec *section, unsigned int sec_len,
+ u32 *crc)
+{
+ int left;
+ int ret;
+
+ BUG_ON(!section);
+
+ while (section->iov_len < sec_len) {
+ BUG_ON(section->iov_base == NULL);
+ left = sec_len - section->iov_len;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+ section->iov_len, left);
+ if (ret <= 0)
+ return ret;
+ section->iov_len += ret;
+ if (section->iov_len == sec_len)
+ *crc = crc32c(0, section->iov_base,
+ section->iov_len);
+ }
+
+ return 1;
+}
+
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip);
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->in_msg;
+ void *p;
+ int ret;
+ int to, left;
+ unsigned front_len, middle_len, data_len, data_off;
+ int datacrc = !con->msgr->nocrc;
+ int skip;
+
+ dout("read_partial_message con %p msg %p\n", con, m);
+
+ /* header */
+ while (con->in_base_pos < sizeof(con->in_hdr)) {
+ left = sizeof(con->in_hdr) - con->in_base_pos;
+ ret = ceph_tcp_recvmsg(con->sock,
+ (char *)&con->in_hdr + con->in_base_pos,
+ left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ if (con->in_base_pos == sizeof(con->in_hdr)) {
+ u32 crc = crc32c(0, (void *)&con->in_hdr,
+ sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
+ if (crc != le32_to_cpu(con->in_hdr.crc)) {
+ pr_err("read_partial_message bad hdr "
+ " crc %u != expected %u\n",
+ crc, con->in_hdr.crc);
+ return -EBADMSG;
+ }
+ }
+ }
+ front_len = le32_to_cpu(con->in_hdr.front_len);
+ if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+ return -EIO;
+ middle_len = le32_to_cpu(con->in_hdr.middle_len);
+ if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+ data_len = le32_to_cpu(con->in_hdr.data_len);
+ if (data_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+ data_off = le16_to_cpu(con->in_hdr.data_off);
+
+ /* allocate message? */
+ if (!con->in_msg) {
+ dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+ con->in_hdr.front_len, con->in_hdr.data_len);
+ con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
+ if (skip) {
+ /* skip this message */
+ dout("alloc_msg returned NULL, skipping message\n");
+ con->in_base_pos = -front_len - middle_len - data_len -
+ sizeof(m->footer);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ return 0;
+ }
+ if (IS_ERR(con->in_msg)) {
+ ret = PTR_ERR(con->in_msg);
+ con->in_msg = NULL;
+ con->error_msg =
+ "error allocating memory for incoming message";
+ return ret;
+ }
+ m = con->in_msg;
+ m->front.iov_len = 0; /* haven't read it yet */
+ if (m->middle)
+ m->middle->vec.iov_len = 0;
+
+ con->in_msg_pos.page = 0;
+ con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+ con->in_msg_pos.data_pos = 0;
+ }
+
+ /* front */
+ ret = read_partial_message_section(con, &m->front, front_len,
+ &con->in_front_crc);
+ if (ret <= 0)
+ return ret;
+
+ /* middle */
+ if (m->middle) {
+ ret = read_partial_message_section(con, &m->middle->vec, middle_len,
+ &con->in_middle_crc);
+ if (ret <= 0)
+ return ret;
+ }
+
+ /* (page) data */
+ while (con->in_msg_pos.data_pos < data_len) {
+ left = min((int)(data_len - con->in_msg_pos.data_pos),
+ (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+ BUG_ON(m->pages == NULL);
+ p = kmap(m->pages[con->in_msg_pos.page]);
+ ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+ left);
+ if (ret > 0 && datacrc)
+ con->in_data_crc =
+ crc32c(con->in_data_crc,
+ p + con->in_msg_pos.page_pos, ret);
+ kunmap(m->pages[con->in_msg_pos.page]);
+ if (ret <= 0)
+ return ret;
+ con->in_msg_pos.data_pos += ret;
+ con->in_msg_pos.page_pos += ret;
+ if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+ con->in_msg_pos.page_pos = 0;
+ con->in_msg_pos.page++;
+ }
+ }
+
+ /* footer */
+ to = sizeof(m->hdr) + sizeof(m->footer);
+ while (con->in_base_pos < to) {
+ left = to - con->in_base_pos;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
+ (con->in_base_pos - sizeof(m->hdr)),
+ left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ }
+ dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+ m, front_len, m->footer.front_crc, middle_len,
+ m->footer.middle_crc, data_len, m->footer.data_crc);
+
+ /* crc ok? */
+ if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+ pr_err("read_partial_message %p front crc %u != exp. %u\n",
+ m, con->in_front_crc, m->footer.front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+ pr_err("read_partial_message %p middle crc %u != exp %u\n",
+ m, con->in_middle_crc, m->footer.middle_crc);
+ return -EBADMSG;
+ }
+ if (datacrc &&
+ (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+ con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+ pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+ con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+ return -EBADMSG;
+ }
+
+ return 1; /* done! */
+}
+
+/*
+ * Process message. This happens in the worker thread. The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ */
+static void process_message(struct ceph_connection *con)
+{
+ struct ceph_msg *msg;
+
+ msg = con->in_msg;
+ con->in_msg = NULL;
+
+ /* if first message, set peer_name */
+ if (con->peer_name.type == 0)
+ con->peer_name = msg->hdr.src.name;
+
+ con->in_seq++;
+ mutex_unlock(&con->mutex);
+
+ dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+ msg, le64_to_cpu(msg->hdr.seq),
+ ENTITY_NAME(msg->hdr.src.name),
+ le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.data_len),
+ con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+ con->ops->dispatch(con, msg);
+
+ mutex_lock(&con->mutex);
+ prepare_read_tag(con);
+}
+
+
+/*
+ * Write something to the socket. Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+static int try_write(struct ceph_connection *con)
+{
+ struct ceph_messenger *msgr = con->msgr;
+ int ret = 1;
+
+ dout("try_write start %p state %lu nref %d\n", con, con->state,
+ atomic_read(&con->nref));
+
+ mutex_lock(&con->mutex);
+more:
+ dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+ /* open the socket first? */
+ if (con->sock == NULL) {
+ /*
+ * if we were STANDBY and are reconnecting _this_
+ * connection, bump connect_seq now. Always bump
+ * global_seq.
+ */
+ if (test_and_clear_bit(STANDBY, &con->state))
+ con->connect_seq++;
+
+ prepare_write_banner(msgr, con);
+ prepare_write_connect(msgr, con, 1);
+ prepare_read_banner(con);
+ set_bit(CONNECTING, &con->state);
+ clear_bit(NEGOTIATING, &con->state);
+
+ BUG_ON(con->in_msg);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ dout("try_write initiating connect on %p new state %lu\n",
+ con, con->state);
+ con->sock = ceph_tcp_connect(con);
+ if (IS_ERR(con->sock)) {
+ con->sock = NULL;
+ con->error_msg = "connect error";
+ ret = -1;
+ goto out;
+ }
+ }
+
+more_kvec:
+ /* kvec data queued? */
+ if (con->out_skip) {
+ ret = write_partial_skip(con);
+ if (ret == 0)
+ goto done;
+ if (ret < 0) {
+ dout("try_write write_partial_skip err %d\n", ret);
+ goto done;
+ }
+ }
+ if (con->out_kvec_left) {
+ ret = write_partial_kvec(con);
+ if (ret <= 0)
+ goto done;
+ }
+
+ /* msg pages? */
+ if (con->out_msg) {
+ if (con->out_msg_done) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL; /* we're done with this one */
+ goto do_next;
+ }
+
+ ret = write_partial_msg_pages(con);
+ if (ret == 1)
+ goto more_kvec; /* we need to send the footer, too! */
+ if (ret == 0)
+ goto done;
+ if (ret < 0) {
+ dout("try_write write_partial_msg_pages err %d\n",
+ ret);
+ goto done;
+ }
+ }
+
+do_next:
+ if (!test_bit(CONNECTING, &con->state)) {
+ /* is anything else pending? */
+ if (!list_empty(&con->out_queue)) {
+ prepare_write_message(con);
+ goto more;
+ }
+ if (con->in_seq > con->in_seq_acked) {
+ prepare_write_ack(con);
+ goto more;
+ }
+ if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+ prepare_write_keepalive(con);
+ goto more;
+ }
+ }
+
+ /* Nothing to do! */
+ clear_bit(WRITE_PENDING, &con->state);
+ dout("try_write nothing else to write.\n");
+done:
+ ret = 0;
+out:
+ mutex_unlock(&con->mutex);
+ dout("try_write done on %p\n", con);
+ return ret;
+}
+
+
+
+/*
+ * Read what we can from the socket.
+ */
+static int try_read(struct ceph_connection *con)
+{
+ struct ceph_messenger *msgr;
+ int ret = -1;
+
+ if (!con->sock)
+ return 0;
+
+ if (test_bit(STANDBY, &con->state))
+ return 0;
+
+ dout("try_read start on %p\n", con);
+ msgr = con->msgr;
+
+ mutex_lock(&con->mutex);
+
+more:
+ dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+ con->in_base_pos);
+ if (test_bit(CONNECTING, &con->state)) {
+ if (!test_bit(NEGOTIATING, &con->state)) {
+ dout("try_read connecting\n");
+ ret = read_partial_banner(con);
+ if (ret <= 0)
+ goto done;
+ if (process_banner(con) < 0) {
+ ret = -1;
+ goto out;
+ }
+ }
+ ret = read_partial_connect(con);
+ if (ret <= 0)
+ goto done;
+ if (process_connect(con) < 0) {
+ ret = -1;
+ goto out;
+ }
+ goto more;
+ }
+
+ if (con->in_base_pos < 0) {
+ /*
+ * skipping + discarding content.
+ *
+ * FIXME: there must be a better way to do this!
+ */
+ static char buf[1024];
+ int skip = min(1024, -con->in_base_pos);
+ dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+ ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+ if (ret <= 0)
+ goto done;
+ con->in_base_pos += ret;
+ if (con->in_base_pos)
+ goto more;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_READY) {
+ /*
+ * what's next?
+ */
+ ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+ if (ret <= 0)
+ goto done;
+ dout("try_read got tag %d\n", (int)con->in_tag);
+ switch (con->in_tag) {
+ case CEPH_MSGR_TAG_MSG:
+ prepare_read_message(con);
+ break;
+ case CEPH_MSGR_TAG_ACK:
+ prepare_read_ack(con);
+ break;
+ case CEPH_MSGR_TAG_CLOSE:
+ set_bit(CLOSED, &con->state); /* fixme */
+ goto done;
+ default:
+ goto bad_tag;
+ }
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+ ret = read_partial_message(con);
+ if (ret <= 0) {
+ switch (ret) {
+ case -EBADMSG:
+ con->error_msg = "bad crc";
+ ret = -EIO;
+ goto out;
+ case -EIO:
+ con->error_msg = "io error";
+ goto out;
+ default:
+ goto done;
+ }
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_READY)
+ goto more;
+ process_message(con);
+ goto more;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+ ret = read_partial_ack(con);
+ if (ret <= 0)
+ goto done;
+ process_ack(con);
+ goto more;
+ }
+
+done:
+ ret = 0;
+out:
+ mutex_unlock(&con->mutex);
+ dout("try_read done on %p\n", con);
+ return ret;
+
+bad_tag:
+ pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+ con->error_msg = "protocol error, garbage tag";
+ ret = -1;
+ goto out;
+}
+
+
+/*
+ * Atomically queue work on a connection. Bump @con reference to
+ * avoid races with connection teardown.
+ *
+ * There is some trickery going on with QUEUED and BUSY because we
+ * only want a _single_ thread operating on each connection at any
+ * point in time, but we want to use all available CPUs.
+ *
+ * The worker thread only proceeds if it can atomically set BUSY. It
+ * clears QUEUED and does its thing.  When it thinks it's done, it
+ * clears BUSY, then rechecks QUEUED; if it's set again, it loops
+ * (tries again to set BUSY).
+ *
+ * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
+ * try to queue work. If that fails (work is already queued, or BUSY)
+ * we give up (the work is already being done or queued) but leave QUEUED
+ * set so that the worker thread will loop if necessary.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+ if (test_bit(DEAD, &con->state)) {
+ dout("queue_con %p ignoring: DEAD\n",
+ con);
+ return;
+ }
+
+ if (!con->ops->get(con)) {
+ dout("queue_con %p ref count 0\n", con);
+ return;
+ }
+
+ set_bit(QUEUED, &con->state);
+ if (test_bit(BUSY, &con->state)) {
+ dout("queue_con %p - already BUSY\n", con);
+ con->ops->put(con);
+ } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
+ dout("queue_con %p - already queued\n", con);
+ con->ops->put(con);
+ } else {
+ dout("queue_con %p\n", con);
+ }
+}
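+
+/*
+ * Race walk-through for the QUEUED/BUSY scheme above (comment only):
+ *
+ *   queue_con():  set QUEUED; BUSY already set -> just drop our ref
+ *   con_work():   clear BUSY; recheck QUEUED -> still set -> loop
+ *
+ * so a wakeup arriving while the worker is finishing is never lost,
+ * and at most one thread works on a connection at a time.
+ */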
+
+/*
+ * Do some work on a connection. Drop a connection ref when we're done.
+ */
+static void con_work(struct work_struct *work)
+{
+ struct ceph_connection *con = container_of(work, struct ceph_connection,
+ work.work);
+ int backoff = 0;
+
+more:
+ if (test_and_set_bit(BUSY, &con->state) != 0) {
+ dout("con_work %p BUSY already set\n", con);
+ goto out;
+ }
+ dout("con_work %p start, clearing QUEUED\n", con);
+ clear_bit(QUEUED, &con->state);
+
+ if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
+ dout("con_work CLOSED\n");
+ con_close_socket(con);
+ goto done;
+ }
+ if (test_and_clear_bit(OPENING, &con->state)) {
+ /* reopen w/ new peer */
+ dout("con_work OPENING\n");
+ con_close_socket(con);
+ }
+
+ if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
+ try_read(con) < 0 ||
+ try_write(con) < 0) {
+ backoff = 1;
+ ceph_fault(con); /* error/fault path */
+ }
+
+done:
+ clear_bit(BUSY, &con->state);
+ dout("con->state=%lu\n", con->state);
+ if (test_bit(QUEUED, &con->state)) {
+ if (!backoff || test_bit(OPENING, &con->state)) {
+ dout("con_work %p QUEUED reset, looping\n", con);
+ goto more;
+ }
+ dout("con_work %p QUEUED reset, but just faulted\n", con);
+ clear_bit(QUEUED, &con->state);
+ }
+ dout("con_work %p done\n", con);
+
+out:
+ con->ops->put(con);
+}
+
+
+/*
+ * Generic error/fault handler. A retry mechanism is used with
+ * exponential backoff
+ */
+static void ceph_fault(struct ceph_connection *con)
+{
+ pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr), con->error_msg);
+ dout("fault %p state %lu to peer %s\n",
+ con, con->state, pr_addr(&con->peer_addr.in_addr));
+
+ if (test_bit(LOSSYTX, &con->state)) {
+ dout("fault on LOSSYTX channel\n");
+ goto out;
+ }
+
+ mutex_lock(&con->mutex);
+ if (test_bit(CLOSED, &con->state))
+ goto out_unlock;
+
+ con_close_socket(con);
+
+ if (con->in_msg) {
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+
+ /* Requeue anything that hasn't been acked */
+ list_splice_init(&con->out_sent, &con->out_queue);
+
+ /* If there are no messages in the queue, place the connection
+ * in a STANDBY state (i.e., don't try to reconnect just yet). */
+ if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
+ dout("fault setting STANDBY\n");
+ set_bit(STANDBY, &con->state);
+ } else {
+ /* retry after a delay. */
+ if (con->delay == 0)
+ con->delay = BASE_DELAY_INTERVAL;
+ else if (con->delay < MAX_DELAY_INTERVAL)
+ con->delay *= 2;
+ dout("fault queueing %p delay %lu\n", con, con->delay);
+ con->ops->get(con);
+ if (queue_delayed_work(ceph_msgr_wq, &con->work,
+ round_jiffies_relative(con->delay)) == 0)
+ con->ops->put(con);
+ }
+
+out_unlock:
+ mutex_unlock(&con->mutex);
+out:
+ /*
+ * in case we faulted due to authentication, invalidate our
+ * current tickets so that we can get new ones.
+ */
+ if (con->auth_retry && con->ops->invalidate_authorizer) {
+ dout("calling invalidate_authorizer()\n");
+ con->ops->invalidate_authorizer(con);
+ }
+
+ if (con->ops->fault)
+ con->ops->fault(con);
+}
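+
+/*
+ * Backoff example (comment only): with HZ == 250, con->delay grows
+ * 125 -> 250 -> 500 -> ... jiffies (0.5s, 1s, 2s, ...), doubling on
+ * each successive fault until it reaches MAX_DELAY_INTERVAL (5 min).
+ */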
+
+
+
+/*
+ * create a new messenger instance
+ */
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
+{
+ struct ceph_messenger *msgr;
+
+ msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
+ if (msgr == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&msgr->global_seq_lock);
+
+ /* the zero page is needed if a request is "canceled" while the message
+ * is being written over the socket */
+ msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!msgr->zero_page) {
+ kfree(msgr);
+ return ERR_PTR(-ENOMEM);
+ }
+ kmap(msgr->zero_page);
+
+ if (myaddr)
+ msgr->inst.addr = *myaddr;
+
+ /* select a random nonce */
+ msgr->inst.addr.type = 0;
+ get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
+ encode_my_addr(msgr);
+
+ dout("messenger_create %p\n", msgr);
+ return msgr;
+}
+
+void ceph_messenger_destroy(struct ceph_messenger *msgr)
+{
+ dout("destroy %p\n", msgr);
+ kunmap(msgr->zero_page);
+ __free_page(msgr->zero_page);
+ kfree(msgr);
+ dout("destroyed messenger %p\n", msgr);
+}
+
+/*
+ * Queue up an outgoing message on the given connection.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ if (test_bit(CLOSED, &con->state)) {
+ dout("con_send %p closed, dropping %p\n", con, msg);
+ ceph_msg_put(msg);
+ return;
+ }
+
+ /* set src+dst */
+ msg->hdr.src.name = con->msgr->inst.name;
+ msg->hdr.src.addr = con->msgr->my_enc_addr;
+ msg->hdr.orig_src = msg->hdr.src;
+
+ BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+
+ /* queue */
+ mutex_lock(&con->mutex);
+ BUG_ON(!list_empty(&msg->list_head));
+ list_add_tail(&msg->list_head, &con->out_queue);
+ dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+ ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len),
+ le32_to_cpu(msg->hdr.data_len));
+ mutex_unlock(&con->mutex);
+
+ /* if there wasn't anything waiting to send before, queue
+ * new work */
+ if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+ queue_con(con);
+}
+
+/*
+ * Revoke a message that was previously queued for send
+ */
+void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ mutex_lock(&con->mutex);
+ if (!list_empty(&msg->list_head)) {
+ dout("con_revoke %p msg %p\n", con, msg);
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+ msg->hdr.seq = 0;
+ if (con->out_msg == msg) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+ if (con->out_kvec_is_msg) {
+ con->out_skip = con->out_kvec_bytes;
+ con->out_kvec_is_msg = false;
+ }
+ } else {
+ dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
+ }
+ mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may be reading data into
+ */
+void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ mutex_lock(&con->mutex);
+ if (con->in_msg && con->in_msg == msg) {
+ unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
+ unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
+ unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
+
+ /* skip rest of message */
+ dout("con_revoke_pages %p msg %p revoked\n", con, msg);
+ con->in_base_pos = con->in_base_pos -
+ sizeof(struct ceph_msg_header) -
+ front_len -
+ middle_len -
+ data_len -
+ sizeof(struct ceph_msg_footer);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ } else {
+ dout("con_revoke_pages %p msg %p pages %p no-op\n",
+ con, con->in_msg, msg);
+ }
+ mutex_unlock(&con->mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+ if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
+ test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+ queue_con(con);
+}
+
+
+/*
+ * Construct a new message with the given type and size.
+ * The new msg has a ref count of 1.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len,
+ int page_len, int page_off, struct page **pages)
+{
+ struct ceph_msg *m;
+
+ m = kmalloc(sizeof(*m), GFP_NOFS);
+ if (m == NULL)
+ goto out;
+ kref_init(&m->kref);
+ INIT_LIST_HEAD(&m->list_head);
+
+ m->hdr.type = cpu_to_le16(type);
+ m->hdr.front_len = cpu_to_le32(front_len);
+ m->hdr.middle_len = 0;
+ m->hdr.data_len = cpu_to_le32(page_len);
+ m->hdr.data_off = cpu_to_le16(page_off);
+ m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+ m->footer.front_crc = 0;
+ m->footer.middle_crc = 0;
+ m->footer.data_crc = 0;
+ m->front_max = front_len;
+ m->front_is_vmalloc = false;
+ m->more_to_follow = false;
+ m->pool = NULL;
+
+ /* front */
+ if (front_len) {
+ if (front_len > PAGE_CACHE_SIZE) {
+ m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
+ PAGE_KERNEL);
+ m->front_is_vmalloc = true;
+ } else {
+ m->front.iov_base = kmalloc(front_len, GFP_NOFS);
+ }
+ if (m->front.iov_base == NULL) {
+ pr_err("msg_new can't allocate %d bytes\n",
+ front_len);
+ goto out2;
+ }
+ } else {
+ m->front.iov_base = NULL;
+ }
+ m->front.iov_len = front_len;
+
+ /* middle */
+ m->middle = NULL;
+
+ /* data */
+ m->nr_pages = calc_pages_for(page_off, page_len);
+ m->pages = pages;
+ m->pagelist = NULL;
+
+ dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
+ m->nr_pages);
+ return m;
+
+out2:
+ ceph_msg_put(m);
+out:
+ pr_err("msg_new can't create type %d len %d\n", type, front_len);
+ return ERR_PTR(-ENOMEM);
+}
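+
+/*
+ * Illustrative use (comment only; the message type and front size are
+ * just examples):
+ *
+ *   struct ceph_msg *m;
+ *
+ *   m = ceph_msg_new(CEPH_MSG_STATFS, 128, 0, 0, NULL);
+ *   if (!IS_ERR(m))
+ *       ceph_con_send(con, m);
+ *
+ * ceph_con_send() takes over the reference; a caller that wants to
+ * keep the message must take its own via ceph_msg_get() first.
+ */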
+
+/*
+ * Allocate "middle" portion of a message, if it is needed and wasn't
+ * allocated by alloc_msg. This allows us to read a small fixed-size
+ * per-type header in the front and then gracefully fail (i.e.,
+ * propagate the error to the caller based on info in the front) when
+ * the middle is too large.
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ int type = le16_to_cpu(msg->hdr.type);
+ int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+ dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+ ceph_msg_type_name(type), middle_len);
+ BUG_ON(!middle_len);
+ BUG_ON(msg->middle);
+
+ msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+ if (!msg->middle)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Generic message allocator, for incoming messages.
+ */
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ int type = le16_to_cpu(hdr->type);
+ int front_len = le32_to_cpu(hdr->front_len);
+ int middle_len = le32_to_cpu(hdr->middle_len);
+ struct ceph_msg *msg = NULL;
+ int ret;
+
+ if (con->ops->alloc_msg) {
+ mutex_unlock(&con->mutex);
+ msg = con->ops->alloc_msg(con, hdr, skip);
+ mutex_lock(&con->mutex);
+ if (IS_ERR(msg))
+ return msg;
+
+ if (*skip)
+ return NULL;
+ }
+ if (!msg) {
+ *skip = 0;
+ msg = ceph_msg_new(type, front_len, 0, 0, NULL);
+ if (IS_ERR(msg)) {
+ pr_err("unable to allocate msg type %d len %d\n",
+ type, front_len);
+ return msg;
+ }
+ }
+ memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+ if (middle_len) {
+ ret = ceph_alloc_middle(con, msg);
+
+ if (ret < 0) {
+ ceph_msg_put(msg);
+ return ERR_PTR(ret);
+ }
+ }
+
+ return msg;
+}
+
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+void ceph_msg_kfree(struct ceph_msg *m)
+{
+ dout("msg_kfree %p\n", m);
+ if (m->front_is_vmalloc)
+ vfree(m->front.iov_base);
+ else
+ kfree(m->front.iov_base);
+ kfree(m);
+}
+
+/*
+ * Drop a msg ref. Destroy as needed.
+ */
+void ceph_msg_last_put(struct kref *kref)
+{
+ struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+
+ dout("ceph_msg_put last one on %p\n", m);
+ WARN_ON(!list_empty(&m->list_head));
+
+ /* drop middle, data, if any */
+ if (m->middle) {
+ ceph_buffer_put(m->middle);
+ m->middle = NULL;
+ }
+ m->nr_pages = 0;
+ m->pages = NULL;
+
+ if (m->pagelist) {
+ ceph_pagelist_release(m->pagelist);
+ kfree(m->pagelist);
+ m->pagelist = NULL;
+ }
+
+ if (m->pool)
+ ceph_msgpool_put(m->pool, m);
+ else
+ ceph_msg_kfree(m);
+}
+
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+ pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
+ msg->front_max, msg->nr_pages);
+ print_hex_dump(KERN_DEBUG, "header: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ &msg->hdr, sizeof(msg->hdr), true);
+ print_hex_dump(KERN_DEBUG, " front: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ msg->front.iov_base, msg->front.iov_len, true);
+ if (msg->middle)
+ print_hex_dump(KERN_DEBUG, "middle: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ msg->middle->vec.iov_base,
+ msg->middle->vec.iov_len, true);
+ print_hex_dump(KERN_DEBUG, "footer: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ &msg->footer, sizeof(msg->footer), true);
+}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 00000000000..a343dae73cd
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,255 @@
+#ifndef __FS_CEPH_MESSENGER_H
+#define __FS_CEPH_MESSENGER_H
+
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/radix-tree.h>
+#include <linux/uio.h>
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+#include "types.h"
+#include "buffer.h"
+
+struct ceph_msg;
+struct ceph_connection;
+
+extern struct workqueue_struct *ceph_msgr_wq; /* messenger work queue */
+
+/*
+ * Ceph defines these callbacks for handling connection events.
+ */
+struct ceph_connection_operations {
+ struct ceph_connection *(*get)(struct ceph_connection *);
+ void (*put)(struct ceph_connection *);
+
+ /* handle an incoming message. */
+ void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
+
+ /* authorize an outgoing connection */
+ int (*get_authorizer) (struct ceph_connection *con,
+ void **buf, int *len, int *proto,
+ void **reply_buf, int *reply_len, int force_new);
+ int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
+ int (*invalidate_authorizer)(struct ceph_connection *con);
+
+ /* protocol version mismatch */
+ void (*bad_proto) (struct ceph_connection *con);
+
+ /* there was some error on the socket (disconnect, whatever) */
+ void (*fault) (struct ceph_connection *con);
+
+ /* a remote host has terminated a message exchange session, and messages
+ * we sent (or they tried to send us) may be lost. */
+ void (*peer_reset) (struct ceph_connection *con);
+
+ struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip);
+};
+
+extern const char *ceph_name_type_str(int t);
+
+/* use format string %s%lld */
+#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
+
+struct ceph_messenger {
+ struct ceph_entity_inst inst; /* my name+address */
+ struct ceph_entity_addr my_enc_addr;
+ struct page *zero_page; /* used in certain error cases */
+
+ bool nocrc;
+
+ /*
+ * the global_seq counts connections we (attempt to) initiate
+ * in order to disambiguate certain connect race conditions.
+ */
+ u32 global_seq;
+ spinlock_t global_seq_lock;
+};
+
+/*
+ * a single message. it contains a header (src, dest, message type, etc.),
+ * footer (crc values, mainly), a "front" message body, and possibly a
+ * data payload (stored in some number of pages).
+ */
+struct ceph_msg {
+ struct ceph_msg_header hdr; /* header */
+ struct ceph_msg_footer footer; /* footer */
+ struct kvec front; /* unaligned blobs of message */
+ struct ceph_buffer *middle;
+ struct page **pages; /* data payload. NOT OWNER. */
+ unsigned nr_pages; /* size of page array */
+ struct ceph_pagelist *pagelist; /* instead of pages */
+ struct list_head list_head;
+ struct kref kref;
+ bool front_is_vmalloc;
+ bool more_to_follow;
+ int front_max;
+
+ struct ceph_msgpool *pool;
+};
+
+struct ceph_msg_pos {
+ int page, page_pos; /* which page; offset in page */
+ int data_pos; /* offset in data payload */
+ int did_page_crc; /* true if we've calculated crc for current page */
+};
+
+/* ceph connection fault delay defaults, for exponential backoff */
+#define BASE_DELAY_INTERVAL (HZ/2)
+#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
+
+/*
+ * ceph_connection state bit flags
+ *
+ * QUEUED and BUSY are used together to ensure that only a single
+ * thread is currently opening, reading or writing data to the socket.
+ */
+#define LOSSYTX 0 /* we can close channel or drop messages on errors */
+#define CONNECTING 1
+#define NEGOTIATING 2
+#define KEEPALIVE_PENDING 3
+#define WRITE_PENDING 4 /* we have data ready to send */
+#define QUEUED 5 /* there is work queued on this connection */
+#define BUSY 6 /* work is being done */
+#define STANDBY 8 /* no outgoing messages, socket closed. we keep
+ * the ceph_connection around to maintain shared
+ * state with the peer. */
+#define CLOSED 10 /* we've closed the connection */
+#define SOCK_CLOSED 11 /* socket state changed to closed */
+#define OPENING 13 /* open connection w/ (possibly new) peer */
+#define DEAD 14 /* dead, about to kfree */
+
+/*
+ * A single connection with another host.
+ *
+ * We maintain a queue of outgoing messages, and some session state to
+ * ensure that we can preserve the lossless, ordered delivery of
+ * messages in the case of a TCP disconnect.
+ */
+struct ceph_connection {
+ void *private;
+ atomic_t nref;
+
+ const struct ceph_connection_operations *ops;
+
+ struct ceph_messenger *msgr;
+ struct socket *sock;
+ unsigned long state; /* connection state (see flags above) */
+ const char *error_msg; /* error message, if any */
+
+ struct ceph_entity_addr peer_addr; /* peer address */
+ struct ceph_entity_name peer_name; /* peer name */
+ struct ceph_entity_addr peer_addr_for_me;
+ u32 connect_seq; /* identifies the most recent connection
+ attempt for this session (client side) */
+ u32 peer_global_seq; /* peer's global seq for this connection */
+
+ int auth_retry; /* true if we need a newer authorizer */
+ void *auth_reply_buf; /* where to put the authorizer reply */
+ int auth_reply_buf_len;
+
+ struct mutex mutex;
+
+ /* out queue */
+ struct list_head out_queue;
+ struct list_head out_sent; /* sending or sent but unacked */
+ u64 out_seq; /* last message queued for send */
+ u64 out_seq_sent; /* last message sent */
+ bool out_keepalive_pending;
+
+ u64 in_seq, in_seq_acked; /* last message received, acked */
+
+ /* connection negotiation temps */
+ char in_banner[CEPH_BANNER_MAX_LEN];
+ union {
+ struct { /* outgoing connection */
+ struct ceph_msg_connect out_connect;
+ struct ceph_msg_connect_reply in_reply;
+ };
+ struct { /* incoming */
+ struct ceph_msg_connect in_connect;
+ struct ceph_msg_connect_reply out_reply;
+ };
+ };
+ struct ceph_entity_addr actual_peer_addr;
+
+ /* message out temps */
+ struct ceph_msg *out_msg; /* sending message (== tail of
+ out_sent) */
+ bool out_msg_done;
+ struct ceph_msg_pos out_msg_pos;
+
+ struct kvec out_kvec[8], /* sending header/footer data */
+ *out_kvec_cur;
+ int out_kvec_left; /* kvecs left in out_kvec */
+ int out_skip; /* skip this many bytes */
+ int out_kvec_bytes; /* total bytes left */
+ bool out_kvec_is_msg; /* kvec refers to out_msg */
+ int out_more; /* there is more data after the kvecs */
+ __le64 out_temp_ack; /* for writing an ack */
+
+ /* message in temps */
+ struct ceph_msg_header in_hdr;
+ struct ceph_msg *in_msg;
+ struct ceph_msg_pos in_msg_pos;
+ u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
+
+ char in_tag; /* protocol control byte */
+ int in_base_pos; /* bytes read */
+ __le64 in_temp_ack; /* for reading an ack */
+
+ struct delayed_work work; /* send|recv work */
+ unsigned long delay; /* current delay interval */
+};
+
+
+extern const char *pr_addr(const struct sockaddr_storage *ss);
+extern int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count);
+
+
+extern int ceph_msgr_init(void);
+extern void ceph_msgr_exit(void);
+
+extern struct ceph_messenger *ceph_messenger_create(
+ struct ceph_entity_addr *myaddr);
+extern void ceph_messenger_destroy(struct ceph_messenger *);
+
+extern void ceph_con_init(struct ceph_messenger *msgr,
+ struct ceph_connection *con);
+extern void ceph_con_open(struct ceph_connection *con,
+ struct ceph_entity_addr *addr);
+extern bool ceph_con_opened(struct ceph_connection *con);
+extern void ceph_con_close(struct ceph_connection *con);
+extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke_message(struct ceph_connection *con,
+ struct ceph_msg *msg);
+extern void ceph_con_keepalive(struct ceph_connection *con);
+extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
+extern void ceph_con_put(struct ceph_connection *con);
+
+extern struct ceph_msg *ceph_msg_new(int type, int front_len,
+ int page_len, int page_off,
+ struct page **pages);
+extern void ceph_msg_kfree(struct ceph_msg *m);
+
+
+static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+ kref_get(&msg->kref);
+ return msg;
+}
+extern void ceph_msg_last_put(struct kref *kref);
+static inline void ceph_msg_put(struct ceph_msg *msg)
+{
+ kref_put(&msg->kref, ceph_msg_last_put);
+}
+
+extern void ceph_msg_dump(struct ceph_msg *msg);
+
+#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 00000000000..890597c09d4
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,834 @@
+#include "ceph_debug.h"
+
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include "mon_client.h"
+#include "super.h"
+#include "auth.h"
+#include "decode.h"
+
+/*
+ * Interact with Ceph monitor cluster. Handle requests for new map
+ * versions, and periodically resend as needed. Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" is responsible for managing critical
+ * cluster configuration and state information. An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates. We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure. If the connection does break, we
+ * randomly hunt for a new monitor. Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+static const struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ */
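+/*
+ * On-wire layout, as read below: u32 byte length, u16 encoding
+ * version, fsid, u32 epoch, u32 num_mon, and then num_mon packed
+ * ceph_entity_inst entries.
+ */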
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+ struct ceph_monmap *m = NULL;
+ int i, err = -EINVAL;
+ struct ceph_fsid fsid;
+ u32 epoch, num_mon;
+ u16 version;
+ u32 len;
+
+ ceph_decode_32_safe(&p, end, len, bad);
+ ceph_decode_need(&p, end, len, bad);
+
+ dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+ ceph_decode_16_safe(&p, end, version, bad);
+
+ ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(&p);
+
+ num_mon = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+ if (num_mon >= CEPH_MAX_MON)
+ goto bad;
+ m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+ if (m == NULL)
+ return ERR_PTR(-ENOMEM);
+ m->fsid = fsid;
+ m->epoch = epoch;
+ m->num_mon = num_mon;
+ ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+ for (i = 0; i < num_mon; i++)
+ ceph_decode_addr(&m->mon_inst[i].addr);
+
+ dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+ m->num_mon);
+ for (i = 0; i < m->num_mon; i++)
+ dout("monmap_decode mon%d is %s\n", i,
+ pr_addr(&m->mon_inst[i].addr.in_addr));
+ return m;
+
+bad:
+ dout("monmap_decode failed with %d\n", err);
+ kfree(m);
+ return ERR_PTR(err);
+}
+
+/*
+ * return true if *addr is included in the monmap.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+ int i;
+
+ for (i = 0; i < m->num_mon; i++)
+ if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+ return 1;
+ return 0;
+}
+
+/*
+ * Send an auth request.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+ monc->pending_auth = 1;
+ monc->m_auth->front.iov_len = len;
+ monc->m_auth->hdr.front_len = cpu_to_le32(len);
+ ceph_msg_get(monc->m_auth); /* keep our ref */
+ ceph_con_send(monc->con, monc->m_auth);
+}
+
+/*
+ * Close monitor session, if any.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+ if (monc->con) {
+ dout("__close_session closing mon%d\n", monc->cur_mon);
+ ceph_con_revoke(monc->con, monc->m_auth);
+ ceph_con_close(monc->con);
+ monc->cur_mon = -1;
+ monc->pending_auth = 0;
+ ceph_auth_reset(monc->auth);
+ }
+}
+
+/*
+ * Open a session with a (new) monitor.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+	unsigned char r;	/* unsigned: keep the modulo below non-negative */
+ int ret;
+
+ if (monc->cur_mon < 0) {
+ get_random_bytes(&r, 1);
+ monc->cur_mon = r % monc->monmap->num_mon;
+ dout("open_session num=%d r=%d -> mon%d\n",
+ monc->monmap->num_mon, r, monc->cur_mon);
+ monc->sub_sent = 0;
+ monc->sub_renew_after = jiffies; /* i.e., expired */
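+		/* collapse want+asked (2) back to want (1); we must ask again */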
+ monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+ dout("open_session mon%d opening\n", monc->cur_mon);
+ monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
+ monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
+ ceph_con_open(monc->con,
+ &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+		/* initiate authentication handshake */
+ ret = ceph_auth_build_hello(monc->auth,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);
+ __send_prepared_auth_request(monc, ret);
+ } else {
+ dout("open_session mon%d already open\n", monc->cur_mon);
+ }
+ return 0;
+}
+
+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+ return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * Reschedule delayed work timer.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+ unsigned delay;
+
+ if (monc->cur_mon < 0 || __sub_expired(monc))
+ delay = 10 * HZ;
+ else
+ delay = 20 * HZ;
+ dout("__schedule_delayed after %u\n", delay);
+ schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ */
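+/*
+ * The request body encoded below is a u32 count of map names followed
+ * by, for each map, a length-prefixed string and a
+ * ceph_mon_subscribe_item { have, onetime }.  onetime=1 requests a
+ * single update (used for the next osdmap); onetime=0 a standing
+ * subscription.
+ */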
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+ dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+ (unsigned)monc->sub_sent, __sub_expired(monc),
+ monc->want_next_osdmap);
+ if ((__sub_expired(monc) && !monc->sub_sent) ||
+ monc->want_next_osdmap == 1) {
+ struct ceph_msg *msg;
+ struct ceph_mon_subscribe_item *i;
+ void *p, *end;
+
+ msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
+		if (IS_ERR(msg))
+ return;
+
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ dout("__send_subscribe to 'mdsmap' %u+\n",
+ (unsigned)monc->have_mdsmap);
+ if (monc->want_next_osdmap) {
+ dout("__send_subscribe to 'osdmap' %u\n",
+ (unsigned)monc->have_osdmap);
+ ceph_encode_32(&p, 3);
+ ceph_encode_string(&p, end, "osdmap", 6);
+ i = p;
+ i->have = cpu_to_le64(monc->have_osdmap);
+ i->onetime = 1;
+ p += sizeof(*i);
+ monc->want_next_osdmap = 2; /* requested */
+ } else {
+ ceph_encode_32(&p, 2);
+ }
+ ceph_encode_string(&p, end, "mdsmap", 6);
+ i = p;
+ i->have = cpu_to_le64(monc->have_mdsmap);
+ i->onetime = 0;
+ p += sizeof(*i);
+ ceph_encode_string(&p, end, "monmap", 6);
+ i = p;
+ i->have = 0;
+ i->onetime = 0;
+ p += sizeof(*i);
+
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ ceph_con_send(monc->con, msg);
+
+ monc->sub_sent = jiffies | 1; /* never 0 */
+ }
+}
+
+static void handle_subscribe_ack(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ unsigned seconds;
+ struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ seconds = le32_to_cpu(h->duration);
+
+ mutex_lock(&monc->mutex);
+ if (monc->hunting) {
+ pr_info("mon%d %s session established\n",
+ monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
+ monc->hunting = false;
+ }
+ dout("handle_subscribe_ack after %d seconds\n", seconds);
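+	/* renew roughly halfway through the granted interval */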
+ monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
+ monc->sub_sent = 0;
+ mutex_unlock(&monc->mutex);
+ return;
+bad:
+ pr_err("got corrupt subscribe-ack msg\n");
+ ceph_msg_dump(msg);
+}
+
+/*
+ * Keep track of which maps we have
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+ mutex_lock(&monc->mutex);
+ monc->have_mdsmap = got;
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+ mutex_lock(&monc->mutex);
+ monc->have_osdmap = got;
+ monc->want_next_osdmap = 0;
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+/*
+ * Register interest in the next osdmap
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+ dout("request_next_osdmap have %u\n", monc->have_osdmap);
+ mutex_lock(&monc->mutex);
+ if (!monc->want_next_osdmap)
+ monc->want_next_osdmap = 1;
+ if (monc->want_next_osdmap < 2)
+ __send_subscribe(monc);
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ * Open a session with the monitor cluster, allocating the connection
+ * on first use.
+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+ if (!monc->con) {
+ monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
+ if (!monc->con)
+ return -ENOMEM;
+ ceph_con_init(monc->client->msgr, monc->con);
+ monc->con->private = monc;
+ monc->con->ops = &mon_con_ops;
+ }
+
+ mutex_lock(&monc->mutex);
+ __open_session(monc);
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);
+ return 0;
+}
+
+/*
+ * Handle an updated monmap: decode it, verify the fsid, and swap it
+ * in for the old map.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_client *client = monc->client;
+ struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+ void *p, *end;
+
+ mutex_lock(&monc->mutex);
+
+ dout("handle_monmap\n");
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ monmap = ceph_monmap_decode(p, end);
+ if (IS_ERR(monmap)) {
+ pr_err("problem decoding monmap, %d\n",
+ (int)PTR_ERR(monmap));
+ goto out;
+ }
+
+ if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+ kfree(monmap);
+ goto out;
+ }
+
+ client->monc.monmap = monmap;
+ kfree(old);
+
+out:
+ mutex_unlock(&monc->mutex);
+ wake_up(&client->auth_wq);
+}
+
+/*
+ * statfs
+ */
+static struct ceph_mon_statfs_request *__lookup_statfs(
+ struct ceph_mon_client *monc, u64 tid)
+{
+ struct ceph_mon_statfs_request *req;
+ struct rb_node *n = monc->statfs_request_tree.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_mon_statfs_request, node);
+ if (tid < req->tid)
+ n = n->rb_left;
+ else if (tid > req->tid)
+ n = n->rb_right;
+ else
+ return req;
+ }
+ return NULL;
+}
+
+static void __insert_statfs(struct ceph_mon_client *monc,
+ struct ceph_mon_statfs_request *new)
+{
+ struct rb_node **p = &monc->statfs_request_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_mon_statfs_request *req = NULL;
+
+ while (*p) {
+ parent = *p;
+ req = rb_entry(parent, struct ceph_mon_statfs_request, node);
+ if (new->tid < req->tid)
+ p = &(*p)->rb_left;
+ else if (new->tid > req->tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, &monc->statfs_request_tree);
+}
+
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_mon_statfs_request *req;
+ struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+ u64 tid;
+
+ if (msg->front.iov_len != sizeof(*reply))
+ goto bad;
+ tid = le64_to_cpu(msg->hdr.tid);
+ dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+ mutex_lock(&monc->mutex);
+ req = __lookup_statfs(monc, tid);
+ if (req) {
+ *req->buf = reply->st;
+ req->result = 0;
+ }
+ mutex_unlock(&monc->mutex);
+ if (req)
+ complete(&req->completion);
+ return;
+
+bad:
+ pr_err("corrupt statfs reply, no tid\n");
+ ceph_msg_dump(msg);
+}
+
+/*
+ * (re)send a statfs request
+ */
+static int send_statfs(struct ceph_mon_client *monc,
+ struct ceph_mon_statfs_request *req)
+{
+ struct ceph_msg *msg;
+ struct ceph_mon_statfs *h;
+
+ dout("send_statfs tid %llu\n", req->tid);
+ msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ req->request = msg;
+ msg->hdr.tid = cpu_to_le64(req->tid);
+ h = msg->front.iov_base;
+ h->monhdr.have_version = 0;
+ h->monhdr.session_mon = cpu_to_le16(-1);
+ h->monhdr.session_mon_tid = 0;
+ h->fsid = monc->monmap->fsid;
+ ceph_con_send(monc->con, msg);
+ return 0;
+}
+
+/*
+ * Do a synchronous statfs().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+ struct ceph_mon_statfs_request req;
+ int err;
+
+ req.buf = buf;
+ init_completion(&req.completion);
+
+ /* allocate memory for reply */
+ err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
+ if (err)
+ return err;
+
+ /* register request */
+ mutex_lock(&monc->mutex);
+ req.tid = ++monc->last_tid;
+ req.last_attempt = jiffies;
+ req.delay = BASE_DELAY_INTERVAL;
+ __insert_statfs(monc, &req);
+ monc->num_statfs_requests++;
+ mutex_unlock(&monc->mutex);
+
+ /* send request and wait */
+ err = send_statfs(monc, &req);
+ if (!err)
+ err = wait_for_completion_interruptible(&req.completion);
+
+ mutex_lock(&monc->mutex);
+ rb_erase(&req.node, &monc->statfs_request_tree);
+ monc->num_statfs_requests--;
+ ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
+ mutex_unlock(&monc->mutex);
+
+ if (!err)
+ err = req.result;
+ return err;
+}
+
+/*
+ * Resend pending statfs requests.
+ */
+static void __resend_statfs(struct ceph_mon_client *monc)
+{
+ struct ceph_mon_statfs_request *req;
+ struct rb_node *p;
+
+ for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_mon_statfs_request, node);
+ send_statfs(monc, req);
+ }
+}
+
+/*
+ * Delayed work. If we haven't mounted yet, retry. Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM). And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+ struct ceph_mon_client *monc =
+ container_of(work, struct ceph_mon_client, delayed_work.work);
+
+ dout("monc delayed_work\n");
+ mutex_lock(&monc->mutex);
+ if (monc->hunting) {
+ __close_session(monc);
+ __open_session(monc); /* continue hunting */
+ } else {
+ ceph_con_keepalive(monc->con);
+
+ __validate_auth(monc);
+
+ if (monc->auth->ops->is_authenticated(monc->auth))
+ __send_subscribe(monc);
+ }
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+ struct ceph_mount_args *args = monc->client->mount_args;
+ struct ceph_entity_addr *mon_addr = args->mon_addr;
+ int num_mon = args->num_mon;
+ int i;
+
+ /* build initial monmap */
+ monc->monmap = kzalloc(sizeof(*monc->monmap) +
+ num_mon*sizeof(monc->monmap->mon_inst[0]),
+ GFP_KERNEL);
+ if (!monc->monmap)
+ return -ENOMEM;
+ for (i = 0; i < num_mon; i++) {
+ monc->monmap->mon_inst[i].addr = mon_addr[i];
+ monc->monmap->mon_inst[i].addr.nonce = 0;
+ monc->monmap->mon_inst[i].name.type =
+ CEPH_ENTITY_TYPE_MON;
+ monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+ }
+ monc->monmap->num_mon = num_mon;
+ monc->have_fsid = false;
+
+ /* release addr memory */
+ kfree(args->mon_addr);
+ args->mon_addr = NULL;
+ args->num_mon = 0;
+ return 0;
+}
+
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+ int err = 0;
+
+ dout("init\n");
+ memset(monc, 0, sizeof(*monc));
+ monc->client = cl;
+ monc->monmap = NULL;
+ mutex_init(&monc->mutex);
+
+ err = build_initial_monmap(monc);
+ if (err)
+ goto out;
+
+ monc->con = NULL;
+
+ /* authentication */
+ monc->auth = ceph_auth_init(cl->mount_args->name,
+ cl->mount_args->secret);
+ if (IS_ERR(monc->auth))
+ return PTR_ERR(monc->auth);
+ monc->auth->want_keys =
+ CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+ CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+ /* msg pools */
+ err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
+ sizeof(struct ceph_mon_subscribe_ack), 1, false);
+ if (err < 0)
+ goto out_monmap;
+ err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
+ sizeof(struct ceph_mon_statfs_reply), 0, false);
+ if (err < 0)
+ goto out_pool1;
+ err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
+ if (err < 0)
+ goto out_pool2;
+
+ monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
+ monc->pending_auth = 0;
+ if (IS_ERR(monc->m_auth)) {
+ err = PTR_ERR(monc->m_auth);
+ monc->m_auth = NULL;
+ goto out_pool3;
+ }
+
+ monc->cur_mon = -1;
+ monc->hunting = true;
+ monc->sub_renew_after = jiffies;
+ monc->sub_sent = 0;
+
+ INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+ monc->statfs_request_tree = RB_ROOT;
+ monc->num_statfs_requests = 0;
+ monc->last_tid = 0;
+
+ monc->have_mdsmap = 0;
+ monc->have_osdmap = 0;
+ monc->want_next_osdmap = 1;
+ return 0;
+
+out_pool3:
+ ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+out_pool2:
+ ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+out_pool1:
+ ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+out_monmap:
+ kfree(monc->monmap);
+out:
+ return err;
+}
+
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+ dout("stop\n");
+ cancel_delayed_work_sync(&monc->delayed_work);
+
+ mutex_lock(&monc->mutex);
+ __close_session(monc);
+ if (monc->con) {
+ monc->con->private = NULL;
+ monc->con->ops->put(monc->con);
+ monc->con = NULL;
+ }
+ mutex_unlock(&monc->mutex);
+
+ ceph_auth_destroy(monc->auth);
+
+ ceph_msg_put(monc->m_auth);
+ ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+ ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+ ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+
+ kfree(monc->monmap);
+}
+
+static void handle_auth_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ monc->pending_auth = 0;
+ ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+ msg->front.iov_len,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);
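+	/*
+	 * ret < 0: fatal error; ret > 0: another request to send;
+	 * ret == 0: done for now (check is_authenticated below)
+	 */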
+ if (ret < 0) {
+ monc->client->auth_err = ret;
+ wake_up(&monc->client->auth_wq);
+ } else if (ret > 0) {
+ __send_prepared_auth_request(monc, ret);
+ } else if (monc->auth->ops->is_authenticated(monc->auth)) {
+ dout("authenticated, starting session\n");
+
+ monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+		monc->client->msgr->inst.name.num =
+			cpu_to_le64(monc->auth->global_id);
+
+ __send_subscribe(monc);
+ __resend_statfs(monc);
+ }
+ mutex_unlock(&monc->mutex);
+}
+
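+/*
+ * Build and send another auth request if the auth layer has more to
+ * send (e.g., a ticket needs renewal); a no-op if a request is
+ * already pending or nothing is needed.
+ */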
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ if (monc->pending_auth)
+ return 0;
+
+ ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);
+ if (ret <= 0)
+ return ret; /* either an error, or no need to authenticate */
+ __send_prepared_auth_request(monc, ret);
+ return 0;
+}
+
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = __validate_auth(monc);
+ mutex_unlock(&monc->mutex);
+ return ret;
+}
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mon_client *monc = con->private;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ if (!monc)
+ return;
+
+ switch (type) {
+ case CEPH_MSG_AUTH_REPLY:
+ handle_auth_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ handle_subscribe_ack(monc, msg);
+ break;
+
+ case CEPH_MSG_STATFS_REPLY:
+ handle_statfs_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_MAP:
+ ceph_monc_handle_map(monc, msg);
+ break;
+
+ case CEPH_MSG_MDS_MAP:
+ ceph_mdsc_handle_map(&monc->client->mdsc, msg);
+ break;
+
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(&monc->client->osdc, msg);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+ ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_mon_client *monc = con->private;
+ int type = le16_to_cpu(hdr->type);
+ int front_len = le32_to_cpu(hdr->front_len);
+ struct ceph_msg *m = NULL;
+
+ *skip = 0;
+
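+	/*
+	 * returning NULL with *skip set tells the messenger to drop
+	 * the incoming message instead of allocating for it
+	 */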
+ switch (type) {
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
+ break;
+ case CEPH_MSG_STATFS_REPLY:
+ m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
+ break;
+ case CEPH_MSG_AUTH_REPLY:
+ m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
+ break;
+ case CEPH_MSG_MON_MAP:
+ case CEPH_MSG_MDS_MAP:
+ case CEPH_MSG_OSD_MAP:
+ m = ceph_msg_new(type, front_len, 0, 0, NULL);
+ break;
+ }
+
+ if (!m) {
+ pr_info("alloc_msg unknown type %d\n", type);
+ *skip = 1;
+ }
+ return m;
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+ struct ceph_mon_client *monc = con->private;
+
+ if (!monc)
+ return;
+
+ dout("mon_fault\n");
+ mutex_lock(&monc->mutex);
+ if (!con->private)
+ goto out;
+
+ if (monc->con && !monc->hunting)
+ pr_info("mon%d %s session lost, "
+ "hunting for new mon\n", monc->cur_mon,
+ pr_addr(&monc->con->peer_addr.in_addr));
+
+ __close_session(monc);
+ if (!monc->hunting) {
+ /* start hunting */
+ monc->hunting = true;
+ __open_session(monc);
+ } else {
+ /* already hunting, let's wait a bit */
+ __schedule_delayed(monc);
+ }
+out:
+ mutex_unlock(&monc->mutex);
+}
+
+static const struct ceph_connection_operations mon_con_ops = {
+ .get = ceph_con_get,
+ .put = ceph_con_put,
+ .dispatch = dispatch,
+ .fault = mon_fault,
+ .alloc_msg = mon_alloc_msg,
+};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 00000000000..b958ad5afa0
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
+#ifndef _FS_CEPH_MON_CLIENT_H
+#define _FS_CEPH_MON_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/rbtree.h>
+
+#include "messenger.h"
+#include "msgpool.h"
+
+struct ceph_client;
+struct ceph_mount_args;
+struct ceph_auth_client;
+
+/*
+ * The monitor map enumerates the set of all monitors.
+ */
+struct ceph_monmap {
+ struct ceph_fsid fsid;
+ u32 epoch;
+ u32 num_mon;
+ struct ceph_entity_inst mon_inst[0];
+};
+
+struct ceph_mon_client;
+struct ceph_mon_statfs_request;
+
+
+/*
+ * Generic mechanism for resending monitor requests.
+ */
+typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
+ int newmon);
+
+/* a pending monitor request */
+struct ceph_mon_request {
+ struct ceph_mon_client *monc;
+ struct delayed_work delayed_work;
+ unsigned long delay;
+ ceph_monc_request_func_t do_request;
+};
+
+/*
+ * statfs() is done a bit differently because we need to get data back
+ * to the caller
+ */
+struct ceph_mon_statfs_request {
+ u64 tid;
+ struct rb_node node;
+ int result;
+ struct ceph_statfs *buf;
+ struct completion completion;
+ unsigned long last_attempt, delay; /* jiffies */
+ struct ceph_msg *request; /* original request */
+};
+
+struct ceph_mon_client {
+ struct ceph_client *client;
+ struct ceph_monmap *monmap;
+
+ struct mutex mutex;
+ struct delayed_work delayed_work;
+
+ struct ceph_auth_client *auth;
+ struct ceph_msg *m_auth;
+ int pending_auth;
+
+ bool hunting;
+ int cur_mon; /* last monitor i contacted */
+ unsigned long sub_sent, sub_renew_after;
+ struct ceph_connection *con;
+ bool have_fsid;
+
+ /* msg pools */
+ struct ceph_msgpool msgpool_subscribe_ack;
+ struct ceph_msgpool msgpool_statfs_reply;
+ struct ceph_msgpool msgpool_auth_reply;
+
+ /* pending statfs requests */
+ struct rb_root statfs_request_tree;
+ int num_statfs_requests;
+ u64 last_tid;
+
+ /* mds/osd map */
+ int want_next_osdmap; /* 1 = want, 2 = want+asked */
+ u32 have_osdmap, have_mdsmap;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+};
+
+extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
+extern int ceph_monmap_contains(struct ceph_monmap *m,
+ struct ceph_entity_addr *addr);
+
+extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
+extern void ceph_monc_stop(struct ceph_mon_client *monc);
+
+/*
+ * The model here is to indicate that we need a new map of at least
+ * epoch @want, and also call in when we receive a map. We will
+ * periodically rerequest the map from the monitor cluster until we
+ * get what we want.
+ */
+extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
+extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
+
+extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
+
+extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
+ struct ceph_statfs *buf);
+
+extern int ceph_monc_open_session(struct ceph_mon_client *monc);
+
+extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
+
+
+
+#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 00000000000..ca3b44a89f2
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include "msgpool.h"
+
+/*
+ * We use msg pools to preallocate memory for messages we expect to
+ * receive over the wire, to avoid getting ourselves into OOM
+ * conditions at unexpected times.  We use a few different
+ * strategies:
+ *
+ * - for request/response type interactions, we preallocate the
+ * memory needed for the response when we generate the request.
+ *
+ * - for messages we can receive at any time from the MDS, we preallocate
+ * a pool of messages we can re-use.
+ *
+ * - for writeback, we preallocate some number of messages to use for
+ * requests and their replies, so that we always make forward
+ * progress.
+ *
+ * The msgpool behaves like a mempool_t, but keeps preallocated
+ * ceph_msgs strung together on a list_head instead of using a pointer
+ * vector. This avoids vector reallocation when we adjust the number
+ * of preallocated items (which happens frequently).
+ */
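+/*
+ * Typical request/response usage, sketched from ceph_monc_do_statfs:
+ *
+ *	ceph_msgpool_resv(&pool, 1);	reserve room for one reply
+ *	  ... send request; the reply msg is taken via ceph_msgpool_get ...
+ *	ceph_msgpool_resv(&pool, -1);	drop the reservation again
+ */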
+
+
+/*
+ * Allocate or release as necessary to meet our target pool size.
+ */
+static int __fill_msgpool(struct ceph_msgpool *pool)
+{
+ struct ceph_msg *msg;
+
+ while (pool->num < pool->min) {
+ dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
+ pool->min);
+ spin_unlock(&pool->lock);
+ msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+ spin_lock(&pool->lock);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ msg->pool = pool;
+ list_add(&msg->list_head, &pool->msgs);
+ pool->num++;
+ }
+ while (pool->num > pool->min) {
+ msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
+ dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
+ pool->min, msg);
+ list_del_init(&msg->list_head);
+ pool->num--;
+ ceph_msg_kfree(msg);
+ }
+ return 0;
+}
+
+int ceph_msgpool_init(struct ceph_msgpool *pool,
+ int front_len, int min, bool blocking)
+{
+ int ret;
+
+ dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
+ spin_lock_init(&pool->lock);
+ pool->front_len = front_len;
+ INIT_LIST_HEAD(&pool->msgs);
+ pool->num = 0;
+ pool->min = min;
+ pool->blocking = blocking;
+ init_waitqueue_head(&pool->wait);
+
+ spin_lock(&pool->lock);
+ ret = __fill_msgpool(pool);
+ spin_unlock(&pool->lock);
+ return ret;
+}
+
+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+ dout("msgpool_destroy %p\n", pool);
+ spin_lock(&pool->lock);
+ pool->min = 0;
+ __fill_msgpool(pool);
+ spin_unlock(&pool->lock);
+}
+
+int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
+{
+ int ret;
+
+ spin_lock(&pool->lock);
+ dout("msgpool_resv %p delta %d\n", pool, delta);
+ pool->min += delta;
+ ret = __fill_msgpool(pool);
+ spin_unlock(&pool->lock);
+ return ret;
+}
+
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
+{
+ wait_queue_t wait;
+ struct ceph_msg *msg;
+
+ if (front_len && front_len > pool->front_len) {
+ pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
+ pool, front_len, pool->front_len);
+ WARN_ON(1);
+
+ /* try to alloc a fresh message */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))
+ return msg;
+ }
+
+ if (!front_len)
+ front_len = pool->front_len;
+
+ if (pool->blocking) {
+ /* mempool_t behavior; first try to alloc */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))
+ return msg;
+ }
+
+ while (1) {
+ spin_lock(&pool->lock);
+ if (likely(pool->num)) {
+ msg = list_entry(pool->msgs.next, struct ceph_msg,
+ list_head);
+ list_del_init(&msg->list_head);
+ pool->num--;
+ dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ return msg;
+ }
+ pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
+ pool->min, pool->blocking ? "waiting" : "may fail");
+ spin_unlock(&pool->lock);
+
+ if (!pool->blocking) {
+ WARN_ON(1);
+
+ /* maybe we can allocate it now? */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))
+ return msg;
+
+ pr_err("msgpool_get %p empty + alloc failed\n", pool);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ init_wait(&wait);
+ prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+ schedule();
+ finish_wait(&pool->wait, &wait);
+ }
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+ spin_lock(&pool->lock);
+ if (pool->num < pool->min) {
+ /* reset msg front_len; user may have changed it */
+ msg->front.iov_len = pool->front_len;
+ msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+ kref_set(&msg->kref, 1); /* retake a single ref */
+ list_add(&msg->list_head, &pool->msgs);
+ pool->num++;
+ dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ wake_up(&pool->wait);
+ } else {
+ dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ ceph_msg_kfree(msg);
+ }
+}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 00000000000..bc834bfcd72
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
+#ifndef _FS_CEPH_MSGPOOL
+#define _FS_CEPH_MSGPOOL
+
+#include "messenger.h"
+
+/*
+ * we use memory pools for preallocating messages we may receive, to
+ * avoid unexpected OOM conditions.
+ */
+struct ceph_msgpool {
+ spinlock_t lock;
+ int front_len; /* preallocated payload size */
+ struct list_head msgs; /* msgs in the pool; each has 1 ref */
+ int num, min; /* cur, min # msgs in the pool */
+ bool blocking;
+ wait_queue_head_t wait;
+};
+
+extern int ceph_msgpool_init(struct ceph_msgpool *pool,
+			     int front_len, int min, bool blocking);
+extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
+extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
+extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
+ int front_len);
+extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
+
+#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 00000000000..8aaab414f3f
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
+#ifndef __MSGR_H
+#define __MSGR_H
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT 6789 /* default monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST 6789
+#define CEPH_PORT_START 6800 /* non-monitors start here */
+#define CEPH_PORT_LAST 6900
+
+/*
+ * tcp connection banner. include a protocol version. and adjust
+ * whenever the wire protocol changes. try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+#define CEPH_BANNER_MAX_LEN 30
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns a negative, zero, or positive value.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+ return (__s32)a - (__s32)b;
+}
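+/*
+ * e.g., ceph_seq_cmp(1, 0xffffffff) == 2 > 0, so seq 1 correctly
+ * compares as newer than seq 0xffffffff across the 32-bit rollover.
+ */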
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+ __u8 type; /* CEPH_ENTITY_TYPE_* */
+ __le64 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON 0x01
+#define CEPH_ENTITY_TYPE_MDS 0x02
+#define CEPH_ENTITY_TYPE_OSD 0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_ADMIN 0x10
+#define CEPH_ENTITY_TYPE_AUTH 0x20
+
+#define CEPH_ENTITY_TYPE_ANY 0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+ __le32 type;
+ __le32 nonce; /* unique id for process (e.g. pid) */
+ struct sockaddr_storage in_addr;
+} __attribute__ ((packed));
+
+struct ceph_entity_inst {
+ struct ceph_entity_name name;
+ struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
+ incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
+ with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
+ with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
+#define CEPH_MSGR_TAG_MSG 7 /* message */
+#define CEPH_MSGR_TAG_ACK 8 /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
+
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+ __le64 features; /* supported feature bits */
+ __le32 host_type; /* CEPH_ENTITY_TYPE_* */
+ __le32 global_seq; /* count connections initiated by this host */
+ __le32 connect_seq; /* count connections initiated in this session */
+ __le32 protocol_version;
+ __le32 authorizer_protocol;
+ __le32 authorizer_len;
+ __u8 flags; /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+ __u8 tag;
+ __le64 features; /* feature bits for this session */
+ __le32 global_seq;
+ __le32 connect_seq;
+ __le32 protocol_version;
+ __le32 authorizer_len;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 middle_len;/* bytes in middle payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ struct ceph_entity_inst src, orig_src;
+ __le32 reserved;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW 64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH 196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ */
+struct ceph_msg_footer {
+ __le32 front_crc, middle_crc, data_crc;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
+
+
+#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 00000000000..c7b4dedaace
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1550 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "super.h"
+#include "osd_client.h"
+#include "messenger.h"
+#include "decode.h"
+#include "auth.h"
+
+#define OSD_OP_FRONT_LEN 4096
+#define OSD_OPREPLY_FRONT_LEN 512
+
+static const struct ceph_connection_operations osd_con_ops;
+static int __kick_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *kickosd);
+
+static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices." (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * calculate the mapping of a file extent onto an object, and fill out the
+ * request accordingly. shorten extent as necessary if it crosses an
+ * object boundary.
+ *
+ * fill osd op in request message.
+ */
+static void calc_layout(struct ceph_osd_client *osdc,
+ struct ceph_vino vino, struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ struct ceph_osd_request *req)
+{
+ struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+ struct ceph_osd_op *op = (void *)(reqhead + 1);
+ u64 orig_len = *plen;
+ u64 objoff, objlen; /* extent in object */
+ u64 bno;
+
+ reqhead->snapid = cpu_to_le64(vino.snap);
+
+ /* object extent? */
+ ceph_calc_file_object_mapping(layout, off, plen, &bno,
+ &objoff, &objlen);
+ if (*plen < orig_len)
+ dout(" skipping last %llu, final file extent %llu~%llu\n",
+ orig_len - *plen, off, *plen);
+
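+	/*
+	 * object names are "<ino hex>.<block hex>", e.g. ino
+	 * 0x10000000001 block 3 -> "10000000001.00000003"
+	 */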
+ sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
+ req->r_oid_len = strlen(req->r_oid);
+
+ op->extent.offset = cpu_to_le64(objoff);
+ op->extent.length = cpu_to_le64(objlen);
+ req->r_num_pages = calc_pages_for(off, *plen);
+
+ dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
+ req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
+}
+
+/*
+ * requests
+ */
+void ceph_osdc_release_request(struct kref *kref)
+{
+ struct ceph_osd_request *req = container_of(kref,
+ struct ceph_osd_request,
+ r_kref);
+
+ if (req->r_request)
+ ceph_msg_put(req->r_request);
+ if (req->r_reply)
+ ceph_msg_put(req->r_reply);
+ if (req->r_con_filling_msg) {
+ dout("release_request revoking pages %p from con %p\n",
+ req->r_pages, req->r_con_filling_msg);
+ ceph_con_revoke_message(req->r_con_filling_msg,
+ req->r_reply);
+ ceph_con_put(req->r_con_filling_msg);
+ }
+ if (req->r_own_pages)
+ ceph_release_page_vector(req->r_pages,
+ req->r_num_pages);
+ ceph_put_snap_context(req->r_snapc);
+ if (req->r_mempool)
+ mempool_free(req, req->r_osdc->req_mempool);
+ else
+ kfree(req);
+}
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately. (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 off, u64 *plen,
+ int opcode, int flags,
+ struct ceph_snap_context *snapc,
+ int do_sync,
+ u32 truncate_seq,
+ u64 truncate_size,
+ struct timespec *mtime,
+ bool use_mempool, int num_reply)
+{
+ struct ceph_osd_request *req;
+ struct ceph_msg *msg;
+ struct ceph_osd_request_head *head;
+ struct ceph_osd_op *op;
+ void *p;
+ int num_op = 1 + do_sync;
+ size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+ int i;
+
+ if (use_mempool) {
+ req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
+ memset(req, 0, sizeof(*req));
+ } else {
+ req = kzalloc(sizeof(*req), GFP_NOFS);
+ }
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ req->r_osdc = osdc;
+ req->r_mempool = use_mempool;
+ kref_init(&req->r_kref);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+ req->r_flags = flags;
+
+ WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
+ /* create reply message */
+ if (use_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
+ OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
+ if (IS_ERR(msg)) {
+ ceph_osdc_put_request(req);
+		return ERR_CAST(msg);
+ }
+ req->r_reply = msg;
+
+ /* create request message; allow space for oid */
+ msg_size += 40;
+ if (snapc)
+ msg_size += sizeof(u64) * snapc->num_snaps;
+ if (use_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
+ if (IS_ERR(msg)) {
+ ceph_osdc_put_request(req);
+		return ERR_CAST(msg);
+ }
+ msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
+ memset(msg->front.iov_base, 0, msg->front.iov_len);
+ head = msg->front.iov_base;
+ op = (void *)(head + 1);
+ p = (void *)(op + num_op);
+
+ req->r_request = msg;
+ req->r_snapc = ceph_get_snap_context(snapc);
+
+ head->client_inc = cpu_to_le32(1); /* always, for now. */
+ head->flags = cpu_to_le32(flags);
+ if (flags & CEPH_OSD_FLAG_WRITE)
+ ceph_encode_timespec(&head->mtime, mtime);
+ head->num_ops = cpu_to_le16(num_op);
+ op->op = cpu_to_le16(opcode);
+
+ /* calculate max write size */
+ calc_layout(osdc, vino, layout, off, plen, req);
+ req->r_file_layout = *layout; /* keep a copy */
+
+ if (flags & CEPH_OSD_FLAG_WRITE) {
+ req->r_request->hdr.data_off = cpu_to_le16(off);
+ req->r_request->hdr.data_len = cpu_to_le32(*plen);
+ op->payload_len = cpu_to_le32(*plen);
+ }
+ op->extent.truncate_size = cpu_to_le64(truncate_size);
+ op->extent.truncate_seq = cpu_to_le32(truncate_seq);
+
+ /* fill in oid */
+ head->object_len = cpu_to_le32(req->r_oid_len);
+ memcpy(p, req->r_oid, req->r_oid_len);
+ p += req->r_oid_len;
+
+ if (do_sync) {
+ op++;
+ op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
+ }
+ if (snapc) {
+ head->snap_seq = cpu_to_le64(snapc->seq);
+ head->num_snaps = cpu_to_le32(snapc->num_snaps);
+ for (i = 0; i < snapc->num_snaps; i++) {
+ put_unaligned_le64(snapc->snaps[i], p);
+ p += sizeof(u64);
+ }
+ }
+
+ BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+ msg_size = p - msg->front.iov_base;
+ msg->front.iov_len = msg_size;
+ msg->hdr.front_len = cpu_to_le32(msg_size);
+ return req;
+}
+
+/*
+ * We keep osd requests in an rbtree, sorted by ->r_tid.
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *new)
+{
+ struct rb_node **p = &osdc->requests.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd_request *req = NULL;
+
+ while (*p) {
+ parent = *p;
+ req = rb_entry(parent, struct ceph_osd_request, r_node);
+ if (new->r_tid < req->r_tid)
+ p = &(*p)->rb_left;
+ else if (new->r_tid > req->r_tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+ u64 tid)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *n = osdc->requests.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_osd_request, r_node);
+ if (tid < req->r_tid)
+ n = n->rb_left;
+ else if (tid > req->r_tid)
+ n = n->rb_right;
+ else
+ return req;
+ }
+ return NULL;
+}
+
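+/*
+ * Return the request with the smallest tid that is >= the given tid,
+ * or NULL if there is none.
+ */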
+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+ u64 tid)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *n = osdc->requests.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_osd_request, r_node);
+ if (tid < req->r_tid) {
+ if (!n->rb_left)
+ return req;
+ n = n->rb_left;
+ } else if (tid > req->r_tid) {
+ n = n->rb_right;
+ } else {
+ return req;
+ }
+ }
+ return NULL;
+}
+
+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+
+ if (!osd)
+ return;
+ dout("osd_reset osd%d\n", osd->o_osd);
+ osdc = osd->o_osdc;
+ down_read(&osdc->map_sem);
+ kick_requests(osdc, osd);
+ up_read(&osdc->map_sem);
+}
+
+/*
+ * Track open sessions with osds.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd *osd;
+
+ osd = kzalloc(sizeof(*osd), GFP_NOFS);
+ if (!osd)
+ return NULL;
+
+ atomic_set(&osd->o_ref, 1);
+ osd->o_osdc = osdc;
+ INIT_LIST_HEAD(&osd->o_requests);
+ INIT_LIST_HEAD(&osd->o_osd_lru);
+ osd->o_incarnation = 1;
+
+ ceph_con_init(osdc->client->msgr, &osd->o_con);
+ osd->o_con.private = osd;
+ osd->o_con.ops = &osd_con_ops;
+ osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+
+ INIT_LIST_HEAD(&osd->o_keepalive_item);
+ return osd;
+}
+
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+ if (atomic_inc_not_zero(&osd->o_ref)) {
+ dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+ atomic_read(&osd->o_ref));
+ return osd;
+ } else {
+ dout("get_osd %p FAIL\n", osd);
+ return NULL;
+ }
+}
+
+static void put_osd(struct ceph_osd *osd)
+{
+ dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+ atomic_read(&osd->o_ref) - 1);
+ if (atomic_dec_and_test(&osd->o_ref))
+ kfree(osd);
+}
+
+/*
+ * remove an osd from our map
+ */
+static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+ dout("__remove_osd %p\n", osd);
+ BUG_ON(!list_empty(&osd->o_requests));
+ rb_erase(&osd->o_node, &osdc->osds);
+ list_del_init(&osd->o_osd_lru);
+ ceph_con_close(&osd->o_con);
+ put_osd(osd);
+}
+
+static void __move_osd_to_lru(struct ceph_osd_client *osdc,
+ struct ceph_osd *osd)
+{
+ dout("__move_osd_to_lru %p\n", osd);
+ BUG_ON(!list_empty(&osd->o_osd_lru));
+ list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+ osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
+}
+
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+ dout("__remove_osd_from_lru %p\n", osd);
+ if (!list_empty(&osd->o_osd_lru))
+ list_del_init(&osd->o_osd_lru);
+}
+
+static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
+{
+ struct ceph_osd *osd, *nosd;
+
+	dout("remove_old_osds %p\n", osdc);
+ mutex_lock(&osdc->request_mutex);
+ list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+ if (!remove_all && time_before(jiffies, osd->lru_ttl))
+ break;
+ __remove_osd(osdc, osd);
+ }
+ mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * Reset an osd's connection: remove the osd if it is idle, let the
+ * messenger retry if the address is unchanged and the connection was
+ * never opened, and otherwise close and reopen the connection.
+ */
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+ struct ceph_osd_request *req;
+ int ret = 0;
+
+ dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
+ if (list_empty(&osd->o_requests)) {
+ __remove_osd(osdc, osd);
+ } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+ &osd->o_con.peer_addr,
+ sizeof(osd->o_con.peer_addr)) == 0 &&
+ !ceph_con_opened(&osd->o_con)) {
+ dout(" osd addr hasn't changed and connection never opened,"
+		     " letting msgr retry\n");
+		/* touch each r_stamp for handle_timeout()'s benefit */
+ list_for_each_entry(req, &osd->o_requests, r_osd_item)
+ req->r_stamp = jiffies;
+ ret = -EAGAIN;
+ } else {
+ ceph_con_close(&osd->o_con);
+ ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+ osd->o_incarnation++;
+ }
+ return ret;
+}
+
+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+ struct rb_node **p = &osdc->osds.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd *osd = NULL;
+
+ while (*p) {
+ parent = *p;
+ osd = rb_entry(parent, struct ceph_osd, o_node);
+ if (new->o_osd < osd->o_osd)
+ p = &(*p)->rb_left;
+ else if (new->o_osd > osd->o_osd)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->o_node, parent, p);
+ rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+ struct ceph_osd *osd;
+ struct rb_node *n = osdc->osds.rb_node;
+
+ while (n) {
+ osd = rb_entry(n, struct ceph_osd, o_node);
+ if (o < osd->o_osd)
+ n = n->rb_left;
+ else if (o > osd->o_osd)
+ n = n->rb_right;
+ else
+ return osd;
+ }
+ return NULL;
+}
+
+static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+{
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->mount_args->osd_keepalive_timeout * HZ);
+}
+
+static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+{
+ cancel_delayed_work(&osdc->timeout_work);
+}
+
+/*
+ * Register request, assign tid. If this is the first request, set up
+ * the timeout event.
+ */
+static void register_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ mutex_lock(&osdc->request_mutex);
+ req->r_tid = ++osdc->last_tid;
+ req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+ INIT_LIST_HEAD(&req->r_req_lru_item);
+
+ dout("register_request %p tid %lld\n", req, req->r_tid);
+ __insert_request(osdc, req);
+ ceph_osdc_get_request(req);
+ osdc->num_requests++;
+
+ if (osdc->num_requests == 1) {
+ dout(" first request, scheduling timeout\n");
+ __schedule_osd_timeout(osdc);
+ }
+ mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ rb_erase(&req->r_node, &osdc->requests);
+ osdc->num_requests--;
+
+ if (req->r_osd) {
+ /* make sure the original request isn't in flight. */
+ ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+
+ list_del_init(&req->r_osd_item);
+ if (list_empty(&req->r_osd->o_requests))
+ __move_osd_to_lru(osdc, req->r_osd);
+ req->r_osd = NULL;
+ }
+
+ ceph_osdc_put_request(req);
+
+ list_del_init(&req->r_req_lru_item);
+ if (osdc->num_requests == 0) {
+ dout(" no requests, canceling timeout\n");
+ __cancel_osd_timeout(osdc);
+ }
+}
+
+/*
+ * Cancel a previously queued request message
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+ if (req->r_sent) {
+ ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+ req->r_sent = 0;
+ }
+ list_del_init(&req->r_req_lru_item);
+}
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately. If there is
+ * no up osd, set r_osd to NULL.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_osds(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+ struct ceph_pg pgid;
+ int o = -1;
+ int err;
+
+ dout("map_osds %p tid %lld\n", req, req->r_tid);
+ err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+ &req->r_file_layout, osdc->osdmap);
+ if (err)
+ return err;
+ pgid = reqhead->layout.ol_pgid;
+ req->r_pgid = pgid;
+
+ o = ceph_calc_pg_primary(osdc->osdmap, pgid);
+
+ if ((req->r_osd && req->r_osd->o_osd == o &&
+ req->r_sent >= req->r_osd->o_incarnation) ||
+ (req->r_osd == NULL && o == -1))
+ return 0; /* no change */
+
+ dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
+ req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+ req->r_osd ? req->r_osd->o_osd : -1);
+
+ if (req->r_osd) {
+ __cancel_request(req);
+ list_del_init(&req->r_osd_item);
+ req->r_osd = NULL;
+ }
+
+ req->r_osd = __lookup_osd(osdc, o);
+ if (!req->r_osd && o >= 0) {
+ err = -ENOMEM;
+ req->r_osd = create_osd(osdc);
+ if (!req->r_osd)
+ goto out;
+
+ dout("map_osds osd %p is osd%d\n", req->r_osd, o);
+ req->r_osd->o_osd = o;
+ req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
+ __insert_osd(osdc, req->r_osd);
+
+ ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+ }
+
+ if (req->r_osd) {
+ __remove_osd_from_lru(req->r_osd);
+ list_add(&req->r_osd_item, &req->r_osd->o_requests);
+ }
+ err = 1; /* osd changed */
+
+out:
+ return err;
+}
+
+/*
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static int __send_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ struct ceph_osd_request_head *reqhead;
+ int err;
+
+ err = __map_osds(osdc, req);
+ if (err < 0)
+ return err;
+ if (req->r_osd == NULL) {
+ dout("send_request %p no up osds in pg\n", req);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ return 0;
+ }
+
+ dout("send_request %p tid %llu to osd%d flags %d\n",
+ req, req->r_tid, req->r_osd->o_osd, req->r_flags);
+
+ reqhead = req->r_request->front.iov_base;
+ reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
+ reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
+ reqhead->reassert_version = req->r_reassert_version;
+
+ req->r_stamp = jiffies;
+	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
+
+ ceph_msg_get(req->r_request); /* send consumes a ref */
+ ceph_con_send(&req->r_osd->o_con, req->r_request);
+ req->r_sent = req->r_osd->o_incarnation;
+ return 0;
+}
+
+/*
+ * Timeout callback, called every N seconds while one or more osd
+ * requests have been active for more than N seconds.  When this
+ * happens, we ping the OSDs whose requests have timed out so that
+ * any communications channel reset is detected, pushing the request
+ * timeouts another N seconds into the future as we go.  The timeout
+ * event itself is rescheduled another N seconds out (unless there
+ * are no open requests).
+ */
+static void handle_timeout(struct work_struct *work)
+{
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client, timeout_work.work);
+ struct ceph_osd_request *req, *last_req = NULL;
+ struct ceph_osd *osd;
+ unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
+ unsigned long keepalive =
+ osdc->client->mount_args->osd_keepalive_timeout * HZ;
+ unsigned long last_stamp = 0;
+ struct rb_node *p;
+ struct list_head slow_osds;
+
+ dout("timeout\n");
+ down_read(&osdc->map_sem);
+
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+
+ mutex_lock(&osdc->request_mutex);
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (req->r_resend) {
+ int err;
+
+ dout("osdc resending prev failed %lld\n", req->r_tid);
+ err = __send_request(osdc, req);
+ if (err)
+ dout("osdc failed again on %lld\n", req->r_tid);
+ else
+ req->r_resend = false;
+ continue;
+ }
+ }
+
+ /*
+ * reset osds that appear to be _really_ unresponsive. this
+	 * is a failsafe measure; we really shouldn't be getting to
+ * this point if the system is working properly. the monitors
+ * should mark the osd as failed and we should find out about
+ * it from an updated osd map.
+ */
+ while (!list_empty(&osdc->req_lru)) {
+ req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
+ r_req_lru_item);
+
+ if (time_before(jiffies, req->r_stamp + timeout))
+ break;
+
+ BUG_ON(req == last_req && req->r_stamp == last_stamp);
+ last_req = req;
+ last_stamp = req->r_stamp;
+
+ osd = req->r_osd;
+ BUG_ON(!osd);
+ pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
+ req->r_tid, osd->o_osd);
+ __kick_requests(osdc, osd);
+ }
+
+ /*
+ * ping osds that are a bit slow. this ensures that if there
+ * is a break in the TCP connection we will notice, and reopen
+ * a connection with that osd (from the fault callback).
+ */
+ INIT_LIST_HEAD(&slow_osds);
+ list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
+ if (time_before(jiffies, req->r_stamp + keepalive))
+ break;
+
+ osd = req->r_osd;
+ BUG_ON(!osd);
+ dout(" tid %llu is slow, will send keepalive on osd%d\n",
+ req->r_tid, osd->o_osd);
+ list_move_tail(&osd->o_keepalive_item, &slow_osds);
+ }
+ while (!list_empty(&slow_osds)) {
+ osd = list_entry(slow_osds.next, struct ceph_osd,
+ o_keepalive_item);
+ list_del_init(&osd->o_keepalive_item);
+ ceph_con_keepalive(&osd->o_con);
+ }
+
+ __schedule_osd_timeout(osdc);
+ mutex_unlock(&osdc->request_mutex);
+
+ up_read(&osdc->map_sem);
+}
+
+static void handle_osds_timeout(struct work_struct *work)
+{
+ struct ceph_osd_client *osdc =
+ container_of(work, struct ceph_osd_client,
+ osds_timeout_work.work);
+ unsigned long delay =
+ osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
+
+ dout("osds timeout\n");
+ down_read(&osdc->map_sem);
+ remove_old_osds(osdc, 0);
+ up_read(&osdc->map_sem);
+
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(delay));
+}
+
+/*
+ * handle osd op reply. either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
+ struct ceph_connection *con)
+{
+ struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+ struct ceph_osd_request *req;
+ u64 tid;
+ int numops, object_len, flags;
+
+ tid = le64_to_cpu(msg->hdr.tid);
+ if (msg->front.iov_len < sizeof(*rhead))
+ goto bad;
+ numops = le32_to_cpu(rhead->num_ops);
+ object_len = le32_to_cpu(rhead->object_len);
+ if (msg->front.iov_len != sizeof(*rhead) + object_len +
+ numops * sizeof(struct ceph_osd_op))
+ goto bad;
+ dout("handle_reply %p tid %llu\n", msg, tid);
+
+ /* lookup */
+ mutex_lock(&osdc->request_mutex);
+ req = __lookup_request(osdc, tid);
+ if (req == NULL) {
+ dout("handle_reply tid %llu dne\n", tid);
+ mutex_unlock(&osdc->request_mutex);
+ return;
+ }
+ ceph_osdc_get_request(req);
+ flags = le32_to_cpu(rhead->flags);
+
+ /*
+ * if this connection filled our message, drop our reference now, to
+ * avoid a (safe but slower) revoke later.
+ */
+ if (req->r_con_filling_msg == con && req->r_reply == msg) {
+ dout(" dropping con_filling_msg ref %p\n", con);
+ req->r_con_filling_msg = NULL;
+ ceph_con_put(con);
+ }
+
+ if (!req->r_got_reply) {
+ unsigned bytes;
+
+ req->r_result = le32_to_cpu(rhead->result);
+ bytes = le32_to_cpu(msg->hdr.data_len);
+ dout("handle_reply result %d bytes %d\n", req->r_result,
+ bytes);
+ if (req->r_result == 0)
+ req->r_result = bytes;
+
+ /* in case this is a write and we need to replay, */
+ req->r_reassert_version = rhead->reassert_version;
+
+ req->r_got_reply = 1;
+ } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+ dout("handle_reply tid %llu dup ack\n", tid);
+ mutex_unlock(&osdc->request_mutex);
+ goto done;
+ }
+
+ dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+ /* either this is a read, or we got the safe response */
+ if ((flags & CEPH_OSD_FLAG_ONDISK) ||
+ ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+ __unregister_request(osdc, req);
+
+ mutex_unlock(&osdc->request_mutex);
+
+ if (req->r_callback)
+ req->r_callback(req, msg);
+ else
+ complete(&req->r_completion);
+
+ if (flags & CEPH_OSD_FLAG_ONDISK) {
+ if (req->r_safe_callback)
+ req->r_safe_callback(req, msg);
+ complete(&req->r_safe_completion); /* fsync waiter */
+ }
+
+done:
+ ceph_osdc_put_request(req);
+ return;
+
+bad:
+ pr_err("corrupt osd_op_reply got %d %d expected %d\n",
+ (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
+ (int)sizeof(*rhead));
+ ceph_msg_dump(msg);
+}
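+
+/*
+ * Note on the two completion paths above: a caller that set
+ * req->r_callback is notified directly from handle_reply();
+ * otherwise handle_reply() completes r_completion and a thread
+ * blocked in ceph_osdc_wait_request() (below) picks up r_result.
+ * Writes get a second notification (r_safe_callback and
+ * r_safe_completion) once the osd reports the data is on disk
+ * (CEPH_OSD_FLAG_ONDISK).
+ */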
+
+static int __kick_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *kickosd)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *p, *n;
+ int needmap = 0;
+ int err;
+
+ dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
+ if (kickosd) {
+ err = __reset_osd(osdc, kickosd);
+ if (err == -EAGAIN)
+ return 1;
+ } else {
+ for (p = rb_first(&osdc->osds); p; p = n) {
+ struct ceph_osd *osd =
+ rb_entry(p, struct ceph_osd, o_node);
+
+ n = rb_next(p);
+ if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+ memcmp(&osd->o_con.peer_addr,
+ ceph_osd_addr(osdc->osdmap,
+ osd->o_osd),
+ sizeof(struct ceph_entity_addr)) != 0)
+ __reset_osd(osdc, osd);
+ }
+ }
+
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ if (req->r_resend) {
+ dout(" r_resend set on tid %llu\n", req->r_tid);
+ __cancel_request(req);
+ goto kick;
+ }
+ if (req->r_osd && kickosd == req->r_osd) {
+ __cancel_request(req);
+ goto kick;
+ }
+
+ err = __map_osds(osdc, req);
+ if (err == 0)
+ continue; /* no change */
+ if (err < 0) {
+ /*
+ * FIXME: really, we should set the request
+ * error and fail if this isn't a 'nofail'
+ * request, but that's a fair bit more
+ * complicated to do. So retry!
+ */
+ dout(" setting r_resend on %llu\n", req->r_tid);
+ req->r_resend = true;
+ continue;
+ }
+ if (req->r_osd == NULL) {
+ dout("tid %llu maps to no valid osd\n", req->r_tid);
+ needmap++; /* request a newer map */
+ continue;
+ }
+
+kick:
+ dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1);
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ err = __send_request(osdc, req);
+ if (err) {
+ dout(" setting r_resend on %llu\n", req->r_tid);
+ req->r_resend = true;
+ }
+ }
+
+ return needmap;
+}
+
+/*
+ * Resubmit osd requests whose osd or osd address has changed. Request
+ * a new osd map if osds are down, or we are otherwise unable to determine
+ * how to direct a request.
+ *
+ * Close connections to down osds.
+ *
+ * If @who is specified, resubmit requests for that specific osd.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static void kick_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *kickosd)
+{
+ int needmap;
+
+ mutex_lock(&osdc->request_mutex);
+ needmap = __kick_requests(osdc, kickosd);
+ mutex_unlock(&osdc->request_mutex);
+
+ if (needmap) {
+ dout("%d requests for down osds, need new map\n", needmap);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ }
+}
+
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster. Kick requests
+ * off to different OSDs as needed.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+ void *p, *end, *next;
+ u32 nr_maps, maplen;
+ u32 epoch;
+ struct ceph_osdmap *newmap = NULL, *oldmap;
+ int err;
+ struct ceph_fsid fsid;
+
+ dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ /* verify fsid */
+ ceph_decode_need(&p, end, sizeof(fsid), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ if (ceph_check_fsid(osdc->client, &fsid) < 0)
+ return;
+
+ down_write(&osdc->map_sem);
+
+ /* incremental maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d inc maps\n", nr_maps);
+ while (nr_maps > 0) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ next = p + maplen;
+ if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+ dout("applying incremental map %u len %d\n",
+ epoch, maplen);
+ newmap = osdmap_apply_incremental(&p, next,
+ osdc->osdmap,
+ osdc->client->msgr);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad;
+ }
+ BUG_ON(!newmap);
+ if (newmap != osdc->osdmap) {
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = newmap;
+ }
+ } else {
+ dout("ignoring incremental map %u len %d\n",
+ epoch, maplen);
+ }
+ p = next;
+ nr_maps--;
+ }
+ if (newmap)
+ goto done;
+
+ /* full maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d full maps\n", nr_maps);
+ while (nr_maps) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ if (nr_maps > 1) {
+ dout("skipping non-latest full map %u len %d\n",
+ epoch, maplen);
+ } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+ dout("skipping full map %u len %d, "
+ "older than our %u\n", epoch, maplen,
+ osdc->osdmap->epoch);
+ } else {
+ dout("taking full map %u len %d\n", epoch, maplen);
+ newmap = osdmap_decode(&p, p+maplen);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad;
+ }
+ BUG_ON(!newmap);
+ oldmap = osdc->osdmap;
+ osdc->osdmap = newmap;
+ if (oldmap)
+ ceph_osdmap_destroy(oldmap);
+ }
+ p += maplen;
+ nr_maps--;
+ }
+
+done:
+ downgrade_write(&osdc->map_sem);
+ ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+ if (newmap)
+ kick_requests(osdc, NULL);
+ up_read(&osdc->map_sem);
+ return;
+
+bad:
+ pr_err("osdc handle_map corrupt msg\n");
+ ceph_msg_dump(msg);
+ up_write(&osdc->map_sem);
+ return;
+}
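+
+/*
+ * For reference, the OSDMAP message decoded above is laid out as
+ * (all integers little-endian, per the ceph_decode_* helpers):
+ *
+ *   fsid (16 bytes)
+ *   u32 nr_inc_maps, then per map: u32 epoch, u32 len, payload
+ *   u32 nr_full_maps, then per map: u32 epoch, u32 len, payload
+ *
+ * Only an incremental that matches our epoch+1, or the newest of
+ * the full maps, is actually applied.
+ */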
+
+/*
+ * A read request prepares specific pages that data is to be read into.
+ * When a message is being read off the wire, we call __prepare_pages()
+ * to find those pages.
+ * Returns 0 on success, or -1 on failure.
+ */
+static int __prepare_pages(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ struct ceph_osd_request *req,
+ u64 tid,
+ struct ceph_msg *m)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+ int ret = -1;
+ int data_len = le32_to_cpu(hdr->data_len);
+ unsigned data_off = le16_to_cpu(hdr->data_off);
+
+ int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+ if (!osd)
+ return -1;
+
+ osdc = osd->o_osdc;
+
+ dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
+ tid, req->r_num_pages, want);
+ if (unlikely(req->r_num_pages < want))
+ goto out;
+ m->pages = req->r_pages;
+ m->nr_pages = req->r_num_pages;
+ ret = 0; /* success */
+out:
+ BUG_ON(ret < 0 || m->nr_pages < want);
+
+ return ret;
+}
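+
+/*
+ * Illustrative example, assuming calc_pages_for(off, len) returns
+ * the number of pages a [off, off+len) byte span touches: with 4k
+ * pages, data_off = 0x1200 and data_len = 0x2000 give an in-page
+ * offset of 0x200, so the span 0x200..0x2200 touches 3 pages.
+ */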
+
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail)
+{
+ int rc = 0;
+
+ req->r_request->pages = req->r_pages;
+ req->r_request->nr_pages = req->r_num_pages;
+
+ register_request(osdc, req);
+
+ down_read(&osdc->map_sem);
+ mutex_lock(&osdc->request_mutex);
+ /*
+ * a racing kick_requests() may have sent the message for us
+ * while we dropped request_mutex above, so only send now if
+	 * the request still hasn't been touched yet.
+ */
+ if (req->r_sent == 0) {
+ rc = __send_request(osdc, req);
+ if (rc) {
+ if (nofail) {
+ dout("osdc_start_request failed send, "
+ " marking %lld\n", req->r_tid);
+ req->r_resend = true;
+ rc = 0;
+ } else {
+ __unregister_request(osdc, req);
+ }
+ }
+ }
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+ return rc;
+}
+
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ int rc;
+
+ rc = wait_for_completion_interruptible(&req->r_completion);
+ if (rc < 0) {
+ mutex_lock(&osdc->request_mutex);
+ __cancel_request(req);
+ __unregister_request(osdc, req);
+ mutex_unlock(&osdc->request_mutex);
+ dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
+ return rc;
+ }
+
+ dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+ return req->r_result;
+}
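+
+/*
+ * Typical synchronous usage (sketch, error handling elided):
+ *
+ *   req = ceph_osdc_new_request(osdc, ...);
+ *   rc = ceph_osdc_start_request(osdc, req, false);
+ *   if (!rc)
+ *           rc = ceph_osdc_wait_request(osdc, req);
+ *   ceph_osdc_put_request(req);
+ *
+ * This is exactly the pattern ceph_osdc_readpages() below uses.
+ */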
+
+/*
+ * sync - wait for all in-flight write requests to flush. avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd_request *req;
+ u64 last_tid, next_tid = 0;
+
+ mutex_lock(&osdc->request_mutex);
+ last_tid = osdc->last_tid;
+ while (1) {
+ req = __lookup_request_ge(osdc, next_tid);
+ if (!req)
+ break;
+ if (req->r_tid > last_tid)
+ break;
+
+ next_tid = req->r_tid + 1;
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
+ continue;
+
+ ceph_osdc_get_request(req);
+ mutex_unlock(&osdc->request_mutex);
+ dout("sync waiting on tid %llu (last is %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ mutex_lock(&osdc->request_mutex);
+ ceph_osdc_put_request(req);
+ }
+ mutex_unlock(&osdc->request_mutex);
+ dout("sync done (thru tid %llu)\n", last_tid);
+}
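+
+/*
+ * Starvation is avoided above by sampling last_tid once under
+ * request_mutex: writes submitted after the sync began have larger
+ * tids and are deliberately not waited for.
+ */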
+
+/*
+ * init, shutdown
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+ int err;
+
+ dout("init\n");
+ osdc->client = client;
+ osdc->osdmap = NULL;
+ init_rwsem(&osdc->map_sem);
+ init_completion(&osdc->map_waiters);
+ osdc->last_requested_map = 0;
+ mutex_init(&osdc->request_mutex);
+ osdc->last_tid = 0;
+ osdc->osds = RB_ROOT;
+ INIT_LIST_HEAD(&osdc->osd_lru);
+ osdc->requests = RB_ROOT;
+ INIT_LIST_HEAD(&osdc->req_lru);
+ osdc->num_requests = 0;
+ INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+ INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
+
+ err = -ENOMEM;
+ osdc->req_mempool = mempool_create_kmalloc_pool(10,
+ sizeof(struct ceph_osd_request));
+ if (!osdc->req_mempool)
+ goto out;
+
+ err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
+ if (err < 0)
+ goto out_mempool;
+ err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+ OSD_OPREPLY_FRONT_LEN, 10, true);
+ if (err < 0)
+ goto out_msgpool;
+ return 0;
+
+out_msgpool:
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+ mempool_destroy(osdc->req_mempool);
+out:
+ return err;
+}
+
+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+ cancel_delayed_work_sync(&osdc->timeout_work);
+ cancel_delayed_work_sync(&osdc->osds_timeout_work);
+ if (osdc->osdmap) {
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = NULL;
+ }
+ remove_old_osds(osdc, 1);
+ mempool_destroy(osdc->req_mempool);
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+ ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+
+/*
+ * Read some contiguous pages. If we cross a stripe boundary, shorten
+ * *plen. Return number of bytes read, or error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino, struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u32 truncate_seq, u64 truncate_size,
+ struct page **pages, int num_pages)
+{
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+ vino.snap, off, *plen);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, 0, truncate_seq, truncate_size, NULL,
+ false, 1);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short read due to an object boundary */
+ req->r_pages = pages;
+ num_pages = calc_pages_for(off, *plen);
+ req->r_num_pages = num_pages;
+
+ dout("readpages final extent is %llu~%llu (%d pages)\n",
+ off, *plen, req->r_num_pages);
+
+ rc = ceph_osdc_start_request(osdc, req, false);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ dout("readpages result %d\n", rc);
+ return rc;
+}
+
+/*
+ * do a synchronous write on N pages
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *snapc,
+ u64 off, u64 len,
+ u32 truncate_seq, u64 truncate_size,
+ struct timespec *mtime,
+ struct page **pages, int num_pages,
+ int flags, int do_sync, bool nofail)
+{
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ BUG_ON(vino.snap != CEPH_NOSNAP);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+ CEPH_OSD_OP_WRITE,
+ flags | CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE,
+ snapc, do_sync,
+ truncate_seq, truncate_size, mtime,
+ nofail, 1);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* it may be a short write due to an object boundary */
+ req->r_pages = pages;
+ req->r_num_pages = calc_pages_for(off, len);
+ dout("writepages %llu~%llu (%d pages)\n", off, len,
+ req->r_num_pages);
+
+ rc = ceph_osdc_start_request(osdc, req, nofail);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ if (rc == 0)
+ rc = len;
+ dout("writepages result %d\n", rc);
+ return rc;
+}
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ if (!osd)
+ return;
+ osdc = osd->o_osdc;
+
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(osdc, msg);
+ break;
+ case CEPH_MSG_OSD_OPREPLY:
+ handle_reply(osdc, msg, con);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+ ceph_msg_put(msg);
+}
+
+/*
+ * lookup and return message for incoming reply
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct ceph_msg *m;
+ struct ceph_osd_request *req;
+ int front = le32_to_cpu(hdr->front_len);
+ int data_len = le32_to_cpu(hdr->data_len);
+ u64 tid;
+ int err;
+
+ tid = le64_to_cpu(hdr->tid);
+ mutex_lock(&osdc->request_mutex);
+ req = __lookup_request(osdc, tid);
+ if (!req) {
+ *skip = 1;
+ m = NULL;
+ pr_info("get_reply unknown tid %llu from osd%d\n", tid,
+ osd->o_osd);
+ goto out;
+ }
+
+ if (req->r_con_filling_msg) {
+ dout("get_reply revoking msg %p from old con %p\n",
+ req->r_reply, req->r_con_filling_msg);
+ ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+ ceph_con_put(req->r_con_filling_msg);
+ }
+
+ if (front > req->r_reply->front.iov_len) {
+ pr_warning("get_reply front %d > preallocated %d\n",
+ front, (int)req->r_reply->front.iov_len);
+ m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
+ if (IS_ERR(m))
+ goto out;
+ ceph_msg_put(req->r_reply);
+ req->r_reply = m;
+ }
+ m = ceph_msg_get(req->r_reply);
+
+	if (data_len > 0) {
+		err = __prepare_pages(con, hdr, req, tid, m);
+		if (err < 0) {
+			*skip = 1;
+			ceph_msg_put(m);
+			m = ERR_PTR(err);
+			goto out;
+		}
+	}
+	*skip = 0;
+	req->r_con_filling_msg = ceph_con_get(con);
+ dout("get_reply tid %lld %p\n", tid, m);
+
+out:
+ mutex_unlock(&osdc->request_mutex);
+ return m;
+}
+
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_osd *osd = con->private;
+ int type = le16_to_cpu(hdr->type);
+ int front = le32_to_cpu(hdr->front_len);
+
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ return ceph_msg_new(type, front, 0, 0, NULL);
+ case CEPH_MSG_OSD_OPREPLY:
+ return get_reply(con, hdr, skip);
+ default:
+ pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
+ osd->o_osd);
+ *skip = 1;
+ return NULL;
+ }
+}
+
+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ if (get_osd(osd))
+ return con;
+ return NULL;
+}
+
+static void put_osd_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ put_osd(osd);
+}
+
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+ void **buf, int *len, int *proto,
+ void **reply_buf, int *reply_len, int force_new)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+ int ret = 0;
+
+ if (force_new && o->o_authorizer) {
+ ac->ops->destroy_authorizer(ac, o->o_authorizer);
+ o->o_authorizer = NULL;
+ }
+ if (o->o_authorizer == NULL) {
+ ret = ac->ops->create_authorizer(
+ ac, CEPH_ENTITY_TYPE_OSD,
+ &o->o_authorizer,
+ &o->o_authorizer_buf,
+ &o->o_authorizer_buf_len,
+ &o->o_authorizer_reply_buf,
+ &o->o_authorizer_reply_buf_len);
+ if (ret)
+ return ret;
+ }
+
+ *proto = ac->protocol;
+ *buf = o->o_authorizer_buf;
+ *len = o->o_authorizer_buf_len;
+ *reply_buf = o->o_authorizer_reply_buf;
+ *reply_len = o->o_authorizer_reply_buf_len;
+ return 0;
+}
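+
+/*
+ * Authorizer lifecycle, as implemented above: the authorizer is
+ * built lazily on first use and cached on the ceph_osd; when the
+ * messenger asks for a fresh one (force_new, e.g. after the peer
+ * rejected it), the cached one is destroyed and rebuilt.
+ */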
+
+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ if (ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
+
+ return ceph_monc_validate_auth(&osdc->client->monc);
+}
+
+static const struct ceph_connection_operations osd_con_ops = {
+ .get = get_osd_con,
+ .put = put_osd_con,
+ .dispatch = dispatch,
+ .get_authorizer = get_authorizer,
+ .verify_authorizer_reply = verify_authorizer_reply,
+ .invalidate_authorizer = invalidate_authorizer,
+ .alloc_msg = alloc_msg,
+ .fault = osd_reset,
+};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 00000000000..b0759911e7c
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
+#ifndef _FS_CEPH_OSD_CLIENT_H
+#define _FS_CEPH_OSD_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/mempool.h>
+#include <linux/rbtree.h>
+
+#include "types.h"
+#include "osdmap.h"
+#include "messenger.h"
+
+struct ceph_msg;
+struct ceph_snap_context;
+struct ceph_osd_request;
+struct ceph_osd_client;
+struct ceph_authorizer;
+
+/*
+ * completion callback for async writepages
+ */
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
+ struct ceph_msg *);
+
+/* a given osd we're communicating with */
+struct ceph_osd {
+ atomic_t o_ref;
+ struct ceph_osd_client *o_osdc;
+ int o_osd;
+ int o_incarnation;
+ struct rb_node o_node;
+ struct ceph_connection o_con;
+ struct list_head o_requests;
+ struct list_head o_osd_lru;
+ struct ceph_authorizer *o_authorizer;
+ void *o_authorizer_buf, *o_authorizer_reply_buf;
+ size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
+ unsigned long lru_ttl;
+ int o_marked_for_keepalive;
+ struct list_head o_keepalive_item;
+};
+
+/* an in-flight request */
+struct ceph_osd_request {
+ u64 r_tid; /* unique for this client */
+ struct rb_node r_node;
+ struct list_head r_req_lru_item;
+ struct list_head r_osd_item;
+ struct ceph_osd *r_osd;
+ struct ceph_pg r_pgid;
+
+ struct ceph_connection *r_con_filling_msg;
+
+ struct ceph_msg *r_request, *r_reply;
+ int r_result;
+ int r_flags; /* any additional flags for the osd */
+ u32 r_sent; /* >0 if r_request is sending/sent */
+ int r_got_reply;
+
+ struct ceph_osd_client *r_osdc;
+ struct kref r_kref;
+ bool r_mempool;
+ struct completion r_completion, r_safe_completion;
+ ceph_osdc_callback_t r_callback, r_safe_callback;
+ struct ceph_eversion r_reassert_version;
+ struct list_head r_unsafe_item;
+
+ struct inode *r_inode; /* for use by callbacks */
+ struct writeback_control *r_wbc; /* ditto */
+
+ char r_oid[40]; /* object name */
+ int r_oid_len;
+ unsigned long r_stamp; /* send OR check time */
+ bool r_resend; /* msg send failed, needs retry */
+
+ struct ceph_file_layout r_file_layout;
+ struct ceph_snap_context *r_snapc; /* snap context for writes */
+ unsigned r_num_pages; /* size of page array (follows) */
+ struct page **r_pages; /* pages for data payload */
+ int r_pages_from_pool;
+	int               r_own_pages;    /* if true, we own the page list */
+};
+
+struct ceph_osd_client {
+ struct ceph_client *client;
+
+ struct ceph_osdmap *osdmap; /* current map */
+ struct rw_semaphore map_sem;
+ struct completion map_waiters;
+ u64 last_requested_map;
+
+ struct mutex request_mutex;
+ struct rb_root osds; /* osds */
+ struct list_head osd_lru; /* idle osds */
+ u64 timeout_tid; /* tid of timeout triggering rq */
+ u64 last_tid; /* tid of last request */
+ struct rb_root requests; /* pending requests */
+ struct list_head req_lru; /* pending requests lru */
+ int num_requests;
+ struct delayed_work timeout_work;
+ struct delayed_work osds_timeout_work;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+
+ mempool_t *req_mempool;
+
+ struct ceph_msgpool msgpool_op;
+ struct ceph_msgpool msgpool_op_reply;
+};
+
+extern int ceph_osdc_init(struct ceph_osd_client *osdc,
+ struct ceph_client *client);
+extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg);
+extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg);
+
+extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 offset, u64 *len, int op, int flags,
+ struct ceph_snap_context *snapc,
+ int do_sync, u32 truncate_seq,
+ u64 truncate_size,
+ struct timespec *mtime,
+ bool use_mempool, int num_reply);
+
+static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+ kref_get(&req->r_kref);
+}
+extern void ceph_osdc_release_request(struct kref *kref);
+static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+ kref_put(&req->r_kref, ceph_osdc_release_request);
+}
+
+extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail);
+extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+
+extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u32 truncate_seq, u64 truncate_size,
+ struct page **pages, int nr_pages);
+
+extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *sc,
+ u64 off, u64 len,
+ u32 truncate_seq, u64 truncate_size,
+ struct timespec *mtime,
+ struct page **pages, int nr_pages,
+ int flags, int do_sync, bool nofail);
+
+#endif
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 00000000000..d82fe87c2a6
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1022 @@
+
+#include <asm/div64.h>
+
+#include "super.h"
+#include "osdmap.h"
+#include "crush/hash.h"
+#include "crush/mapper.h"
+#include "decode.h"
+#include "ceph_debug.h"
+
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+	if (!len)
+		goto done;
+
+	/* note: passing str as both the destination and a source
+	 * argument of snprintf() is undefined, so build each string
+	 * in a single call */
+	*str = '\0';
+	if (state) {
+		if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
+			snprintf(str, len, "exists, up");
+		else if (state & CEPH_OSD_EXISTS)
+			snprintf(str, len, "exists");
+		else if (state & CEPH_OSD_UP)
+			snprintf(str, len, "up");
+	} else {
+		snprintf(str, len, "doesn't exist");
+	}
+done:
+	return str;
+}
+
+/* maps */
+
+static int calc_bits_of(unsigned t)
+{
+ int b = 0;
+ while (t) {
+ t = t >> 1;
+ b++;
+ }
+ return b;
+}
+
+/*
+ * foo_mask is 2^n-1, where 2^n is the smallest power of two >= foo.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+ pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
+ pi->pgp_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
+ pi->lpg_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
+ pi->lpgp_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+}
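+
+/*
+ * Worked example: pg_num = 12 gives calc_bits_of(11) = 4, so
+ * pg_num_mask = (1 << 4) - 1 = 15; i.e. the mask is 2^n-1 where
+ * 2^n is the smallest power of two >= pg_num.
+ */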
+
+/*
+ * decode crush map
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+ struct crush_bucket_uniform *b)
+{
+ dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+ ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+ b->item_weight = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_list_bucket(void **p, void *end,
+ struct crush_bucket_list *b)
+{
+ int j;
+ dout("crush_decode_list_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->sum_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->sum_weights[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_tree_bucket(void **p, void *end,
+ struct crush_bucket_tree *b)
+{
+ int j;
+ dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+ ceph_decode_32_safe(p, end, b->num_nodes, bad);
+ b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+ if (b->node_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+ for (j = 0; j < b->num_nodes; j++)
+ b->node_weights[j] = ceph_decode_32(p);
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static int crush_decode_straw_bucket(void **p, void *end,
+ struct crush_bucket_straw *b)
+{
+ int j;
+ dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->straws == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->straws[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:
+ return -EINVAL;
+}
+
+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+ struct crush_map *c;
+ int err = -EINVAL;
+ int i, j;
+ void **p = &pbyval;
+ void *start = pbyval;
+ u32 magic;
+
+ dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+ c = kzalloc(sizeof(*c), GFP_NOFS);
+ if (c == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ magic = ceph_decode_32(p);
+ if (magic != CRUSH_MAGIC) {
+ pr_err("crush_decode magic %x != current %x\n",
+ (unsigned)magic, (unsigned)CRUSH_MAGIC);
+ goto bad;
+ }
+ c->max_buckets = ceph_decode_32(p);
+ c->max_rules = ceph_decode_32(p);
+ c->max_devices = ceph_decode_32(p);
+
+ c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
+ if (c->device_parents == NULL)
+ goto badmem;
+ c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
+ if (c->bucket_parents == NULL)
+ goto badmem;
+
+ c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+ if (c->buckets == NULL)
+ goto badmem;
+ c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+ if (c->rules == NULL)
+ goto badmem;
+
+ /* buckets */
+ for (i = 0; i < c->max_buckets; i++) {
+ int size = 0;
+ u32 alg;
+ struct crush_bucket *b;
+
+ ceph_decode_32_safe(p, end, alg, bad);
+ if (alg == 0) {
+ c->buckets[i] = NULL;
+ continue;
+ }
+ dout("crush_decode bucket %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ size = sizeof(struct crush_bucket_uniform);
+ break;
+ case CRUSH_BUCKET_LIST:
+ size = sizeof(struct crush_bucket_list);
+ break;
+ case CRUSH_BUCKET_TREE:
+ size = sizeof(struct crush_bucket_tree);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ size = sizeof(struct crush_bucket_straw);
+ break;
+ default:
+ err = -EINVAL;
+ goto bad;
+ }
+ BUG_ON(size == 0);
+ b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+ if (b == NULL)
+ goto badmem;
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ b->id = ceph_decode_32(p);
+ b->type = ceph_decode_16(p);
+ b->alg = ceph_decode_8(p);
+ b->hash = ceph_decode_8(p);
+ b->weight = ceph_decode_32(p);
+ b->size = ceph_decode_32(p);
+
+ dout("crush_decode bucket size %d off %x %p to %p\n",
+ b->size, (int)(*p-start), *p, end);
+
+ b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+ if (b->items == NULL)
+ goto badmem;
+ b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+ if (b->perm == NULL)
+ goto badmem;
+ b->perm_n = 0;
+
+ ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+ for (j = 0; j < b->size; j++)
+ b->items[j] = ceph_decode_32(p);
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ err = crush_decode_uniform_bucket(p, end,
+ (struct crush_bucket_uniform *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_LIST:
+ err = crush_decode_list_bucket(p, end,
+ (struct crush_bucket_list *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_TREE:
+ err = crush_decode_tree_bucket(p, end,
+ (struct crush_bucket_tree *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_STRAW:
+ err = crush_decode_straw_bucket(p, end,
+ (struct crush_bucket_straw *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ }
+ }
+
+ /* rules */
+ dout("rule vec is %p\n", c->rules);
+ for (i = 0; i < c->max_rules; i++) {
+ u32 yes;
+ struct crush_rule *r;
+
+ ceph_decode_32_safe(p, end, yes, bad);
+ if (!yes) {
+ dout("crush_decode NO rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+ c->rules[i] = NULL;
+ continue;
+ }
+
+ dout("crush_decode rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ /* len */
+ ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+ err = -EINVAL;
+ if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+ goto bad;
+#endif
+ r = c->rules[i] = kmalloc(sizeof(*r) +
+ yes*sizeof(struct crush_rule_step),
+ GFP_NOFS);
+ if (r == NULL)
+ goto badmem;
+ dout(" rule %d is at %p\n", i, r);
+ r->len = yes;
+ ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+ for (j = 0; j < r->len; j++) {
+ r->steps[j].op = ceph_decode_32(p);
+ r->steps[j].arg1 = ceph_decode_32(p);
+ r->steps[j].arg2 = ceph_decode_32(p);
+ }
+ }
+
+ /* ignore trailing name maps. */
+
+ dout("crush_decode success\n");
+ return c;
+
+badmem:
+ err = -ENOMEM;
+bad:
+ dout("crush_decode fail %d\n", err);
+ crush_destroy(c);
+ return ERR_PTR(err);
+}
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+ dout("osdmap_destroy %p\n", map);
+ if (map->crush)
+ crush_destroy(map->crush);
+ while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_temp),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->pg_temp);
+ kfree(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(rb_first(&map->pg_pools),
+ struct ceph_pg_pool_info, node);
+ rb_erase(&pi->node, &map->pg_pools);
+ kfree(pi);
+ }
+ kfree(map->osd_state);
+ kfree(map->osd_weight);
+ kfree(map->osd_addr);
+ kfree(map);
+}
+
+/*
+ * adjust max osd value. reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+ u8 *state;
+ struct ceph_entity_addr *addr;
+ u32 *weight;
+
+ state = kcalloc(max, sizeof(*state), GFP_NOFS);
+ addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+ weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+ if (state == NULL || addr == NULL || weight == NULL) {
+ kfree(state);
+ kfree(addr);
+ kfree(weight);
+ return -ENOMEM;
+ }
+
+ /* copy old? */
+ if (map->osd_state) {
+ memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+ memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+ memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+ kfree(map->osd_state);
+ kfree(map->osd_addr);
+ kfree(map->osd_weight);
+ }
+
+ map->osd_state = state;
+ map->osd_weight = weight;
+ map->osd_addr = addr;
+ map->max_osd = max;
+ return 0;
+}
+
+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+ u64 a = *(u64 *)&l;
+ u64 b = *(u64 *)&r;
+
+ if (a < b)
+ return -1;
+ if (a > b)
+ return 1;
+ return 0;
+}
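+
+/*
+ * Note: this compares the raw 8-byte encoding of struct ceph_pg as
+ * a host u64, so the resulting order is arbitrary but consistent,
+ * which is all the rbtree code below requires.
+ */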
+
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+ struct rb_root *root)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_mapping *pg = NULL;
+ int c;
+
+ while (*p) {
+ parent = *p;
+ pg = rb_entry(parent, struct ceph_pg_mapping, node);
+ c = pgid_cmp(new->pgid, pg->pgid);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+ return 0;
+}
+
+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+ struct ceph_pg pgid)
+{
+ struct rb_node *n = root->rb_node;
+ struct ceph_pg_mapping *pg;
+ int c;
+
+ while (n) {
+ pg = rb_entry(n, struct ceph_pg_mapping, node);
+ c = pgid_cmp(pgid, pg->pgid);
+ if (c < 0)
+ n = n->rb_left;
+ else if (c > 0)
+ n = n->rb_right;
+ else
+ return pg;
+ }
+ return NULL;
+}
+
+/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_pool_info *pi = NULL;
+
+ while (*p) {
+ parent = *p;
+ pi = rb_entry(parent, struct ceph_pg_pool_info, node);
+ if (new->id < pi->id)
+ p = &(*p)->rb_left;
+ else if (new->id > pi->id)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+ return 0;
+}
+
+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+ struct ceph_pg_pool_info *pi;
+ struct rb_node *n = root->rb_node;
+
+ while (n) {
+ pi = rb_entry(n, struct ceph_pg_pool_info, node);
+ if (id < pi->id)
+ n = n->rb_left;
+ else if (id > pi->id)
+ n = n->rb_right;
+ else
+ return pi;
+ }
+ return NULL;
+}
+
+static void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
+{
+ ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+ calc_pg_masks(pi);
+ *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+ *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+}
+
+/*
+ * decode a full map.
+ */
+struct ceph_osdmap *osdmap_decode(void **p, void *end)
+{
+ struct ceph_osdmap *map;
+ u16 version;
+ u32 len, max, i;
+ u8 ev;
+ int err = -EINVAL;
+ void *start = *p;
+ struct ceph_pg_pool_info *pi;
+
+ dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+ map = kzalloc(sizeof(*map), GFP_NOFS);
+ if (map == NULL)
+ return ERR_PTR(-ENOMEM);
+ map->pg_temp = RB_ROOT;
+
+ ceph_decode_16_safe(p, end, version, bad);
+ if (version > CEPH_OSDMAP_VERSION) {
+ pr_warning("got unknown v %d > %d of osdmap\n", version,
+ CEPH_OSDMAP_VERSION);
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+ ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+ map->epoch = ceph_decode_32(p);
+ ceph_decode_copy(p, &map->created, sizeof(map->created));
+ ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+ ceph_decode_32_safe(p, end, max, bad);
+ while (max--) {
+ ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+ pi = kmalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi)
+ goto bad;
+ pi->id = ceph_decode_32(p);
+ ev = ceph_decode_8(p); /* encoding version */
+ if (ev > CEPH_PG_POOL_VERSION) {
+ pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+ ev, CEPH_PG_POOL_VERSION);
+ goto bad;
+ }
+ __decode_pool(p, pi);
+ __insert_pg_pool(&map->pg_pools, pi);
+ }
+ ceph_decode_32_safe(p, end, map->pool_max, bad);
+
+ ceph_decode_32_safe(p, end, map->flags, bad);
+
+	ceph_decode_32_safe(p, end, max, bad);
+
+ /* (re)alloc osd arrays */
+ err = osdmap_set_max_osd(map, max);
+ if (err < 0)
+ goto bad;
+ dout("osdmap_decode max_osd = %d\n", map->max_osd);
+
+ /* osds */
+ err = -EINVAL;
+ ceph_decode_need(p, end, 3*sizeof(u32) +
+ map->max_osd*(1 + sizeof(*map->osd_weight) +
+ sizeof(*map->osd_addr)), bad);
+ *p += 4; /* skip length field (should match max) */
+ ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+ *p += 4; /* skip length field (should match max) */
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_weight[i] = ceph_decode_32(p);
+
+ *p += 4; /* skip length field (should match max) */
+ ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+ for (i = 0; i < map->max_osd; i++)
+ ceph_decode_addr(&map->osd_addr[i]);
+
+ /* pg_temp */
+ ceph_decode_32_safe(p, end, len, bad);
+ for (i = 0; i < len; i++) {
+ int n, j;
+ struct ceph_pg pgid;
+ struct ceph_pg_mapping *pg;
+
+ ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
+ ceph_decode_copy(p, &pgid, sizeof(pgid));
+ n = ceph_decode_32(p);
+ ceph_decode_need(p, end, n * sizeof(u32), bad);
+ err = -ENOMEM;
+ pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
+ if (!pg)
+ goto bad;
+ pg->pgid = pgid;
+ pg->len = n;
+ for (j = 0; j < n; j++)
+ pg->osds[j] = ceph_decode_32(p);
+
+ err = __insert_pg_mapping(pg, &map->pg_temp);
+ if (err)
+ goto bad;
+ dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
+ }
+
+ /* crush */
+ ceph_decode_32_safe(p, end, len, bad);
+ dout("osdmap_decode crush len %d from off 0x%x\n", len,
+ (int)(*p - start));
+ ceph_decode_need(p, end, len, bad);
+ map->crush = crush_decode(*p, end);
+ *p += len;
+ if (IS_ERR(map->crush)) {
+ err = PTR_ERR(map->crush);
+ map->crush = NULL;
+ goto bad;
+ }
+
+ /* ignore the rest of the map */
+ *p = end;
+
+ dout("osdmap_decode done %p %p\n", *p, end);
+ return map;
+
+bad:
+ dout("osdmap_decode fail\n");
+ ceph_osdmap_destroy(map);
+ return ERR_PTR(err);
+}
+
+/*
+ * decode and apply an incremental map update.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+ struct ceph_osdmap *map,
+ struct ceph_messenger *msgr)
+{
+ struct crush_map *newcrush = NULL;
+ struct ceph_fsid fsid;
+ u32 epoch = 0;
+ struct ceph_timespec modified;
+ u32 len, pool;
+ __s32 new_pool_max, new_flags, max;
+ void *start = *p;
+ int err = -EINVAL;
+ u16 version;
+ struct rb_node *rbp;
+
+ ceph_decode_16_safe(p, end, version, bad);
+ if (version > CEPH_OSDMAP_INC_VERSION) {
+ pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+ CEPH_OSDMAP_INC_VERSION);
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
+ bad);
+ ceph_decode_copy(p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(p);
+ BUG_ON(epoch != map->epoch+1);
+ ceph_decode_copy(p, &modified, sizeof(modified));
+ new_pool_max = ceph_decode_32(p);
+ new_flags = ceph_decode_32(p);
+
+ /* full map? */
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len > 0) {
+ dout("apply_incremental full map len %d, %p to %p\n",
+ len, *p, end);
+ return osdmap_decode(p, min(*p+len, end));
+ }
+
+ /* new crush? */
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len > 0) {
+ dout("apply_incremental new crush map len %d, %p to %p\n",
+ len, *p, end);
+ newcrush = crush_decode(*p, min(*p+len, end));
+ if (IS_ERR(newcrush))
+ return ERR_PTR(PTR_ERR(newcrush));
+ }
+
+ /* new flags? */
+ if (new_flags >= 0)
+ map->flags = new_flags;
+ if (new_pool_max >= 0)
+ map->pool_max = new_pool_max;
+
+ ceph_decode_need(p, end, 5*sizeof(u32), bad);
+
+ /* new max? */
+ max = ceph_decode_32(p);
+ if (max >= 0) {
+ err = osdmap_set_max_osd(map, max);
+ if (err < 0)
+ goto bad;
+ }
+
+ map->epoch++;
+	map->modified = modified;
+ if (newcrush) {
+ if (map->crush)
+ crush_destroy(map->crush);
+ map->crush = newcrush;
+ newcrush = NULL;
+ }
+
+ /* new_pool */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ __u8 ev;
+ struct ceph_pg_pool_info *pi;
+
+ ceph_decode_32_safe(p, end, pool, bad);
+ ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
+ ev = ceph_decode_8(p); /* encoding version */
+ if (ev > CEPH_PG_POOL_VERSION) {
+ pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+ ev, CEPH_PG_POOL_VERSION);
+ goto bad;
+ }
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (!pi) {
+ pi = kmalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ pi->id = pool;
+ __insert_pg_pool(&map->pg_pools, pi);
+ }
+ __decode_pool(p, pi);
+ }
+
+ /* old_pool */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ struct ceph_pg_pool_info *pi;
+
+ ceph_decode_32_safe(p, end, pool, bad);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (pi) {
+ rb_erase(&pi->node, &map->pg_pools);
+ kfree(pi);
+ }
+ }
+
+ /* new_up */
+ err = -EINVAL;
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd;
+ struct ceph_entity_addr addr;
+ ceph_decode_32_safe(p, end, osd, bad);
+ ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+ ceph_decode_addr(&addr);
+ pr_info("osd%d up\n", osd);
+ BUG_ON(osd >= map->max_osd);
+ map->osd_state[osd] |= CEPH_OSD_UP;
+ map->osd_addr[osd] = addr;
+ }
+
+ /* new_down */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd;
+ ceph_decode_32_safe(p, end, osd, bad);
+ (*p)++; /* clean flag */
+ pr_info("osd%d down\n", osd);
+ if (osd < map->max_osd)
+ map->osd_state[osd] &= ~CEPH_OSD_UP;
+ }
+
+ /* new_weight */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd, off;
+ ceph_decode_need(p, end, sizeof(u32)*2, bad);
+ osd = ceph_decode_32(p);
+ off = ceph_decode_32(p);
+ pr_info("osd%d weight 0x%x %s\n", osd, off,
+ off == CEPH_OSD_IN ? "(in)" :
+ (off == CEPH_OSD_OUT ? "(out)" : ""));
+ if (osd < map->max_osd)
+ map->osd_weight[osd] = off;
+ }
+
+ /* new_pg_temp */
+ rbp = rb_first(&map->pg_temp);
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ struct ceph_pg_mapping *pg;
+ int j;
+ struct ceph_pg pgid;
+ u32 pglen;
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+ ceph_decode_copy(p, &pgid, sizeof(pgid));
+ pglen = ceph_decode_32(p);
+
+ /* remove any? */
+ while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
+ node)->pgid, pgid) <= 0) {
+ struct rb_node *cur = rbp;
+ rbp = rb_next(rbp);
+ dout(" removed pg_temp %llx\n",
+ *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+ node)->pgid);
+ rb_erase(cur, &map->pg_temp);
+ }
+
+ if (pglen) {
+ /* insert */
+ ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+ pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+ if (!pg) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ pg->pgid = pgid;
+ pg->len = pglen;
+ for (j = 0; j < pglen; j++)
+ pg->osds[j] = ceph_decode_32(p);
+ err = __insert_pg_mapping(pg, &map->pg_temp);
+ if (err)
+ goto bad;
+ dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
+ pglen);
+ }
+ }
+ while (rbp) {
+ struct rb_node *cur = rbp;
+ rbp = rb_next(rbp);
+ dout(" removed pg_temp %llx\n",
+ *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+ node)->pgid);
+ rb_erase(cur, &map->pg_temp);
+ }
+
+ /* ignore the rest */
+ *p = end;
+ return map;
+
+bad:
+ pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
+ epoch, (int)(*p - start), *p, start, end);
+ print_hex_dump(KERN_DEBUG, "osdmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
+ if (newcrush)
+ crush_destroy(newcrush);
+ return ERR_PTR(err);
+}
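+
+/*
+ * Summary of the incremental map sections decoded above, in order:
+ * fsid, epoch, modified, new_pool_max, new_flags, optional full
+ * map, optional new crush map, new max_osd, new_pool, old_pool,
+ * new_up, new_down, new_weight, new_pg_temp; anything after that
+ * is ignored.
+ */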
+
+/*
+ * calculate file layout from given offset, length.
+ * fill in correct oid, logical length, and object extent
+ * offset, length.
+ *
+ * for now, we write only a single su, until we can
+ * pass a stride back to the caller.
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u64 *ono,
+ u64 *oxoff, u64 *oxlen)
+{
+ u32 osize = le32_to_cpu(layout->fl_object_size);
+ u32 su = le32_to_cpu(layout->fl_stripe_unit);
+ u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ u32 bl, stripeno, stripepos, objsetno;
+ u32 su_per_object;
+ u64 t, su_offset;
+
+ dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
+ osize, su);
+ su_per_object = osize / su;
+ dout("osize %u / su %u = su_per_object %u\n", osize, su,
+ su_per_object);
+
+ BUG_ON((su & ~PAGE_MASK) != 0);
+ /* bl = *off / su; */
+ t = off;
+ do_div(t, su);
+ bl = t;
+ dout("off %llu / su %u = bl %u\n", off, su, bl);
+
+ stripeno = bl / sc;
+ stripepos = bl % sc;
+ objsetno = stripeno / su_per_object;
+
+ *ono = objsetno * sc + stripepos;
+ dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
+
+ /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
+ t = off;
+ su_offset = do_div(t, su);
+ *oxoff = su_offset + (stripeno % su_per_object) * su;
+
+ /*
+ * Calculate the length of the extent being written to the selected
+ * object. This is the minimum of the full length requested (plen) or
+ * the remainder of the current stripe being written to.
+ */
+ *oxlen = min_t(u64, *plen, su - su_offset);
+ *plen = *oxlen;
+
+ dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+}
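+
+/*
+ * Illustrative numbers: with su = 64k, stripe_count = 3 and
+ * object_size = 256k (so su_per_object = 4), off = 1000000 gives
+ * bl = 15, stripeno = 5, stripepos = 0 and objsetno = 1, hence
+ * *ono = 1*3 + 0 = 3, *oxoff = 16960 + (5 % 4)*65536 = 82496, and
+ * *oxlen is capped at su - 16960 = 48576 bytes.
+ */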
+
+/*
+ * calculate an object layout (i.e. pgid) from an oid,
+ * file_layout, and osdmap
+ */
+int ceph_calc_object_layout(struct ceph_object_layout *ol,
+ const char *oid,
+ struct ceph_file_layout *fl,
+ struct ceph_osdmap *osdmap)
+{
+ unsigned num, num_mask;
+ struct ceph_pg pgid;
+ s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+ int poolid = le32_to_cpu(fl->fl_pg_pool);
+ struct ceph_pg_pool_info *pool;
+ unsigned ps;
+
+ BUG_ON(!osdmap);
+
+ pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+ if (!pool)
+ return -EIO;
+ ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+ if (preferred >= 0) {
+ ps += preferred;
+ num = le32_to_cpu(pool->v.lpg_num);
+ num_mask = pool->lpg_num_mask;
+ } else {
+ num = le32_to_cpu(pool->v.pg_num);
+ num_mask = pool->pg_num_mask;
+ }
+
+ pgid.ps = cpu_to_le16(ps);
+ pgid.preferred = cpu_to_le16(preferred);
+ pgid.pool = fl->fl_pg_pool;
+ if (preferred >= 0)
+ dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
+ (int)preferred);
+ else
+ dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+
+ ol->ol_pgid = pgid;
+ ol->ol_stripe_unit = fl->fl_object_stripe_unit;
+ return 0;
+}
+
+/*
+ * Calculate raw osd vector for the given pgid. Return pointer to osd
+ * array, or NULL on failure.
+ */
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+ int *osds, int *num)
+{
+ struct ceph_pg_mapping *pg;
+ struct ceph_pg_pool_info *pool;
+ int ruleno;
+ unsigned poolid, ps, pps;
+ int preferred;
+
+ /* pg_temp? */
+ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+ if (pg) {
+ *num = pg->len;
+ return pg->osds;
+ }
+
+ /* crush */
+ poolid = le32_to_cpu(pgid.pool);
+ ps = le16_to_cpu(pgid.ps);
+ preferred = (s16)le16_to_cpu(pgid.preferred);
+
+ /* don't forcefeed bad device ids to crush */
+ if (preferred >= osdmap->max_osd ||
+ preferred >= osdmap->crush->max_devices)
+ preferred = -1;
+
+ pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+ if (!pool)
+ return NULL;
+ ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
+ pool->v.type, pool->v.size);
+ if (ruleno < 0) {
+ pr_err("no crush rule pool %d type %d size %d\n",
+ poolid, pool->v.type, pool->v.size);
+ return NULL;
+ }
+
+ if (preferred >= 0)
+ pps = ceph_stable_mod(ps,
+ le32_to_cpu(pool->v.lpgp_num),
+ pool->lpgp_num_mask);
+ else
+ pps = ceph_stable_mod(ps,
+ le32_to_cpu(pool->v.pgp_num),
+ pool->pgp_num_mask);
+ pps += poolid;
+ *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+ min_t(int, pool->v.size, *num),
+ preferred, osdmap->osd_weight);
+ return osds;
+}
+
+/*
+ * Return primary osd for given pgid, or -1 if none.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+{
+ int rawosds[10], *osds;
+ int i, num = ARRAY_SIZE(rawosds);
+
+ osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+ if (!osds)
+ return -1;
+
+ /* primary is first up osd */
+	for (i = 0; i < num; i++)
+		if (ceph_osd_is_up(osdmap, osds[i]))
+			return osds[i];
+ return -1;
+}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 00000000000..1fb55afb264
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,125 @@
+#ifndef _FS_CEPH_OSDMAP_H
+#define _FS_CEPH_OSDMAP_H
+
+#include <linux/rbtree.h>
+#include "types.h"
+#include "ceph_fs.h"
+#include "crush/crush.h"
+
+/*
+ * The osd map describes the current membership of the osd cluster and
+ * specifies the mapping of objects to placement groups and placement
+ * groups to (sets of) osds. That is, it completely specifies the
+ * (desired) distribution of all data objects in the system at some
+ * point in time.
+ *
+ * Each map version is identified by an epoch, which increases monotonically.
+ *
+ * The map can be updated either via an incremental map (diff) describing
+ * the change between two successive epochs, or as a fully encoded map.
+ */
+struct ceph_pg_pool_info {
+ struct rb_node node;
+ int id;
+ struct ceph_pg_pool v;
+ int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+};
+
+struct ceph_pg_mapping {
+ struct rb_node node;
+ struct ceph_pg pgid;
+ int len;
+ int osds[];
+};
+
+struct ceph_osdmap {
+ struct ceph_fsid fsid;
+ u32 epoch;
+ u32 mkfs_epoch;
+ struct ceph_timespec created, modified;
+
+ u32 flags; /* CEPH_OSDMAP_* */
+
+ u32 max_osd; /* size of osd_state, _offload, _addr arrays */
+ u8 *osd_state; /* CEPH_OSD_* */
+ u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
+ struct ceph_entity_addr *osd_addr;
+
+ struct rb_root pg_temp;
+ struct rb_root pg_pools;
+ u32 pool_max;
+
+ /* the CRUSH map specifies the mapping of placement groups to
+ * the list of osds that store+replicate them. */
+ struct crush_map *crush;
+};
+
+/*
+ * file layout helpers
+ */
+#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
+#define ceph_file_layout_stripe_count(l) \
+ ((__s32)le32_to_cpu((l).fl_stripe_count))
+#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
+#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
+#define ceph_file_layout_object_su(l) \
+ ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
+#define ceph_file_layout_pg_preferred(l) \
+ ((__s32)le32_to_cpu((l).fl_pg_preferred))
+#define ceph_file_layout_pg_pool(l) \
+ ((__s32)le32_to_cpu((l).fl_pg_pool))
+
+static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
+{
+ return le32_to_cpu(l->fl_stripe_unit) *
+ le32_to_cpu(l->fl_stripe_count);
+}
+
+/* "period" == bytes before i start on a new set of objects */
+static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
+{
+ return le32_to_cpu(l->fl_object_size) *
+ le32_to_cpu(l->fl_stripe_count);
+}
+
+
+static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+{
+ return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
+}
+
+static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
+{
+ return map && (map->flags & flag);
+}
+
+extern char *ceph_osdmap_state_str(char *str, int len, int state);
+
+static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
+ int osd)
+{
+ if (osd >= map->max_osd)
+ return NULL;
+ return &map->osd_addr[osd];
+}
+
+extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
+extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+ struct ceph_osdmap *map,
+ struct ceph_messenger *msgr);
+extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+
+/* calculate mapping of a file extent to an object */
+extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u64 *bno, u64 *oxoff, u64 *oxlen);
+
+/* calculate mapping of object to a placement group */
+extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+ const char *oid,
+ struct ceph_file_layout *fl,
+ struct ceph_osdmap *osdmap);
+extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
+ struct ceph_pg pgid);
+
+#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 00000000000..370e9369547
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,54 @@
+
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+
+#include "pagelist.h"
+
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+ if (pl->mapped_tail)
+ kunmap(pl->mapped_tail);
+ while (!list_empty(&pl->head)) {
+ struct page *page = list_first_entry(&pl->head, struct page,
+ lru);
+ list_del(&page->lru);
+ __free_page(page);
+ }
+ return 0;
+}
+
+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+ struct page *page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ pl->room += PAGE_SIZE;
+ list_add_tail(&page->lru, &pl->head);
+ if (pl->mapped_tail)
+ kunmap(pl->mapped_tail);
+ pl->mapped_tail = kmap(page);
+ return 0;
+}
+
+int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
+{
+ while (pl->room < len) {
+ size_t bit = pl->room;
+ int ret;
+
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+ buf, bit);
+ pl->length += bit;
+ pl->room -= bit;
+ buf += bit;
+ len -= bit;
+ ret = ceph_pagelist_addpage(pl);
+ if (ret)
+ return ret;
+ }
+
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+ pl->length += len;
+ pl->room -= len;
+ return 0;
+}
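+
+/*
+ * Typical usage (sketch, error handling elided; the encode helpers
+ * live in pagelist.h):
+ *
+ *   struct ceph_pagelist pl;
+ *
+ *   ceph_pagelist_init(&pl);
+ *   ceph_pagelist_encode_32(&pl, len);
+ *   ceph_pagelist_append(&pl, buf, len);
+ *   ...
+ *   ceph_pagelist_release(&pl);
+ */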
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 00000000000..e8a4187e108
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
+#ifndef __FS_CEPH_PAGELIST_H
+#define __FS_CEPH_PAGELIST_H
+
+#include <linux/list.h>
+
+struct ceph_pagelist {
+ struct list_head head;
+ void *mapped_tail;
+ size_t length;
+ size_t room;
+};
+
+static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
+{
+ INIT_LIST_HEAD(&pl->head);
+ pl->mapped_tail = NULL;
+ pl->length = 0;
+ pl->room = 0;
+}
+extern int ceph_pagelist_release(struct ceph_pagelist *pl);
+
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
+
+static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
+{
+ __le64 ev = cpu_to_le64(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
+{
+ __le32 ev = cpu_to_le32(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
+{
+ __le16 ev = cpu_to_le16(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
+{
+ return ceph_pagelist_append(pl, &v, 1);
+}
+static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
+ char *s, size_t len)
+{
+ int ret = ceph_pagelist_encode_32(pl, len);
+ if (ret)
+ return ret;
+ if (len)
+ return ceph_pagelist_append(pl, s, len);
+ return 0;
+}
+
+#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 00000000000..26ac8b89a67
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,374 @@
+#ifndef __RADOS_H
+#define __RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include "msgr.h"
+
+/*
+ * osdmap encoding versions
+ */
+#define CEPH_OSDMAP_INC_VERSION 4
+#define CEPH_OSDMAP_VERSION 4
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+ unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+ const struct ceph_fsid *b)
+{
+ return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
+
+struct ceph_timespec {
+ __le32 tv_sec;
+ __le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH 1
+#define CEPH_OBJECT_LAYOUT_LINEAR 2
+#define CEPH_OBJECT_LAYOUT_HASHINO 3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH 0
+#define CEPH_PG_LAYOUT_HASH 1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+ __le16 preferred; /* preferred primary osd */
+ __le16 ps; /* placement seed */
+ __le32 pool; /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ * pg_num -- base number of pseudorandomly placed pgs
+ *
+ * pgp_num -- effective number when calculating pg placement. this
+ * is used for pg_num increases. new pgs result in data being "split"
+ * into new pgs. for this to proceed smoothly, new pgs are initially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split. only _then_ are the new
+ * pgs placed independently.
+ *
+ * lpg_num -- localized pg count (per device). replicas are randomly
+ * selected.
+ *
+ * lpgp_num -- as above.
+ */
+#define CEPH_PG_TYPE_REP 1
+#define CEPH_PG_TYPE_RAID4 2
+#define CEPH_PG_POOL_VERSION 2
+struct ceph_pg_pool {
+ __u8 type; /* CEPH_PG_TYPE_* */
+ __u8 size; /* number of osds in each pg */
+ __u8 crush_ruleset; /* crush placement rule */
+ __u8 object_hash; /* hash mapping object name to ps */
+ __le32 pg_num, pgp_num; /* number of pg's */
+ __le32 lpg_num, lpgp_num; /* number of localized pg's */
+ __le32 last_change; /* most recent epoch changed */
+ __le64 snap_seq; /* seq for per-pool snapshot */
+ __le32 snap_epoch; /* epoch of last snap */
+ __le32 num_snaps;
+ __le32 num_removed_snap_intervals;
+ __le64 uid;
+} __attribute__ ((packed));
+
+/*
+ * stable_mod func is used to control the number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time. b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+ if ((x & bmask) < b)
+ return x & bmask;
+ else
+ return x & (bmask >> 1);
+}
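
A worked example of the stability property, with the function restated for userspace (b=12, bmask=15, as in the comment above): raw bins 12..15 fold back into 4..7 via the smaller mask, so raising b toward 16 only re-homes items from the folded bins:

#include <stdio.h>

static int stable_mod(int x, int b, int bmask)
{
	if ((x & bmask) < b)
		return x & bmask;
	else
		return x & (bmask >> 1);
}

int main(void)
{
	int x;

	/* b=12, bmask=15: e.g. x=13 -> 13&15 = 13 >= 12, folds to 13&7 = 5 */
	for (x = 10; x <= 16; x++)
		printf("x=%2d -> bin %d\n", x, stable_mod(x, 12, 15));
	return 0;
}
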
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+ struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
+ __le32 ol_stripe_unit; /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+ __le32 epoch;
+ __le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS 1
+#define CEPH_OSD_UP 2
+
+/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN 0x10000
+#define CEPH_OSD_OUT 0
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+
+/*
+ * osd ops
+ */
+#define CEPH_OSD_OP_MODE 0xf000
+#define CEPH_OSD_OP_MODE_RD 0x1000
+#define CEPH_OSD_OP_MODE_WR 0x2000
+#define CEPH_OSD_OP_MODE_RMW 0x3000
+#define CEPH_OSD_OP_MODE_SUB 0x4000
+
+#define CEPH_OSD_OP_TYPE 0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK 0x0100
+#define CEPH_OSD_OP_TYPE_DATA 0x0200
+#define CEPH_OSD_OP_TYPE_ATTR 0x0300
+#define CEPH_OSD_OP_TYPE_EXEC 0x0400
+#define CEPH_OSD_OP_TYPE_PG 0x0500
+
+enum {
+ /** data **/
+ /* read */
+ CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
+ CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
+
+ /* fancy read */
+ CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
+
+ /* write */
+ CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
+ CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
+ CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
+ CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
+ CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
+
+ /* fancy write */
+ CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
+ CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
+ CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
+ CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
+
+ CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
+ CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
+ CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
+
+ CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
+
+ /** attrs **/
+ /* read */
+ CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
+ CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+
+ /* write */
+ CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
+ CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
+ CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 3,
+ CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
+
+ /** subop **/
+ CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
+ CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
+ CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
+ CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
+ CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
+
+ /** lock **/
+ CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
+ CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
+ CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
+ CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
+ CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
+ CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
+
+ /** exec **/
+ CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
+
+ /** pg **/
+ CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+};
+
+static inline int ceph_osd_op_type_lock(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+}
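
To make the mask layout concrete, a small userspace restatement of the defines above: CEPH_OSD_OP_WRITE is MODE_WR (0x2000) | TYPE_DATA (0x0200) | 1 = 0x2201, and the predicates reduce to simple mask compares:

#include <stdio.h>

#define OP_MODE      0xf000
#define OP_MODE_WR   0x2000
#define OP_TYPE      0x0f00
#define OP_TYPE_DATA 0x0200

int main(void)
{
	int op = OP_MODE_WR | OP_TYPE_DATA | 1;   /* CEPH_OSD_OP_WRITE = 0x2201 */

	printf("op=0x%04x mode_modify=%d type_data=%d\n", op,
	       (op & OP_MODE) == OP_MODE_WR,
	       (op & OP_TYPE) == OP_TYPE_DATA);
	return 0;
}
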
+
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_RM 'r'
+
+extern const char *ceph_osd_op_name(int op);
+
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+ CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
+ CEPH_OSD_FLAG_READ = 16, /* op may read */
+ CEPH_OSD_FLAG_WRITE = 32, /* op may write */
+ CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 256,
+ CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
+ CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
+ CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
+};
+
+enum {
+ CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
+};
+
+#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+/*
+ * an individual object operation. each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+ __le16 op; /* CEPH_OSD_OP_* */
+ __le32 flags; /* CEPH_OSD_FLAG_* */
+ union {
+ struct {
+ __le64 offset, length;
+ __le64 truncate_size;
+ __le32 truncate_seq;
+ } __attribute__ ((packed)) extent;
+ struct {
+ __le32 name_len;
+ __le32 value_len;
+ } __attribute__ ((packed)) xattr;
+ struct {
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
+ __le32 indata_len;
+ } __attribute__ ((packed)) cls;
+ struct {
+ __le64 cookie, count;
+ } __attribute__ ((packed)) pgls;
+ };
+ __le32 payload_len;
+} __attribute__ ((packed));
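
A sketch of how the extent arm of the union might be filled for a single read op. This is a hypothetical userspace mock with host-endian fixed-width types; the real structure uses __le16/__le64 fields and cpu_to_le*() conversions:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* hypothetical host-endian mock of the op header plus its extent arm */
struct osd_op_mock {
	uint16_t op;
	uint32_t flags;
	uint64_t offset, length;
};

int main(void)
{
	struct osd_op_mock m;

	memset(&m, 0, sizeof(m));
	m.op = 0x1201;      /* CEPH_OSD_OP_READ = MODE_RD | TYPE_DATA | 1 */
	m.offset = 4096;    /* byte range within the object */
	m.length = 8192;

	printf("read op 0x%04x: off=%llu len=%llu\n", m.op,
	       (unsigned long long)m.offset, (unsigned long long)m.length);
	return 0;
}
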
+
+/*
+ * osd request message header. each request may include multiple
+ * ceph_osd_op object operations.
+ */
+struct ceph_osd_request_head {
+ __le32 client_inc; /* client incarnation */
+ struct ceph_object_layout layout; /* pgid */
+ __le32 osdmap_epoch; /* client's osdmap epoch */
+
+ __le32 flags;
+
+ struct ceph_timespec mtime; /* for mutations only */
+ struct ceph_eversion reassert_version; /* if we are replaying op */
+
+ __le32 object_len; /* length of object name */
+
+ __le64 snapid; /* snapid to read */
+ __le64 snap_seq; /* writer's snap context */
+ __le32 num_snaps;
+
+ __le16 num_ops;
+ struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
+} __attribute__ ((packed));
+
+struct ceph_osd_reply_head {
+ __le32 client_inc; /* client incarnation */
+ __le32 flags;
+ struct ceph_object_layout layout;
+ __le32 osdmap_epoch;
+ struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+ __le32 result; /* result code */
+
+ __le32 object_len; /* length of object name */
+ __le32 num_ops;
+ struct ceph_osd_op ops[]; /* ops[], object */
+} __attribute__ ((packed));
+
+
+#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 00000000000..df04e210a05
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,906 @@
+#include "ceph_debug.h"
+
+#include <linux/sort.h>
+
+#include "super.h"
+#include "decode.h"
+
+/*
+ * Snapshots in ceph are driven in large part by cooperation from the
+ * client. In contrast to local file systems or file servers that
+ * implement snapshots at a single point in the system, ceph's
+ * distributed access to storage requires clients to help decide
+ * whether a write logically occurs before or after a recently created
+ * snapshot.
+ *
+ * This provides a perfect instantaneous client-wide snapshot. Between
+ * clients, however, snapshots may appear to be applied at slightly
+ * different points in time, depending on delays in delivering the
+ * snapshot notification.
+ *
+ * Snapshots are _not_ file system-wide. Instead, each snapshot
+ * applies to the subdirectory nested beneath some directory. This
+ * effectively divides the hierarchy into multiple "realms," where all
+ * of the files contained by each realm share the same set of
+ * snapshots. An individual realm's snap set contains snapshots
+ * explicitly created on that realm, as well as any snaps in its
+ * parent's snap set _after_ the point at which the parent became its
+ * parent (due to, say, a rename). Similarly, snaps from prior parents
+ * are included for the intervals during which they were the parent.
+ *
+ * The client is spared most of this detail, fortunately... it need only
+ * maintain a hierarchy of realms reflecting the current parent/child
+ * realm relationship, and for each realm has an explicit list of snaps
+ * inherited from prior parents.
+ *
+ * A snap_realm struct is maintained for every realm containing an inode
+ * with an open cap in the system. (The needed snap realm information is
+ * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
+ * version number is used to ensure that as realm parameters change (new
+ * snapshot, new parent, etc.) the client's realm hierarchy is updated.
+ *
+ * The realm hierarchy drives the generation of a 'snap context' for each
+ * realm, which simply lists the resulting set of snaps for the realm. This
+ * is attached to any writes sent to OSDs.
+ */
+/*
+ * Unfortunately error handling is a bit mixed here. If we get a snap
+ * update, but don't have enough memory to update our realm hierarchy,
+ * it's not clear what we can do about it (besides complaining to the
+ * console).
+ */
+
+
+/*
+ * increase ref count for the realm
+ *
+ * caller must hold snap_rwsem for write.
+ */
+void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("get_realm %p %d -> %d\n", realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+ /*
+ * since we _only_ increment realm refs or empty the empty
+ * list with snap_rwsem held, adjusting the empty list here is
+ * safe. we do need to protect against concurrent empty list
+ * additions, however.
+ */
+ if (atomic_read(&realm->nref) == 0) {
+ spin_lock(&mdsc->snap_empty_lock);
+ list_del_init(&realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
+ }
+
+ atomic_inc(&realm->nref);
+}
+
+static void __insert_snap_realm(struct rb_root *root,
+ struct ceph_snap_realm *new)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_snap_realm *r = NULL;
+
+ while (*p) {
+ parent = *p;
+ r = rb_entry(parent, struct ceph_snap_realm, node);
+ if (new->ino < r->ino)
+ p = &(*p)->rb_left;
+ else if (new->ino > r->ino)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);
+}
+
+/*
+ * create and get the realm rooted at @ino and bump its ref count.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static struct ceph_snap_realm *ceph_create_snap_realm(
+ struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct ceph_snap_realm *realm;
+
+ realm = kzalloc(sizeof(*realm), GFP_NOFS);
+ if (!realm)
+ return ERR_PTR(-ENOMEM);
+
+ atomic_set(&realm->nref, 0); /* tree does not take a ref */
+ realm->ino = ino;
+ INIT_LIST_HEAD(&realm->children);
+ INIT_LIST_HEAD(&realm->child_item);
+ INIT_LIST_HEAD(&realm->empty_item);
+ INIT_LIST_HEAD(&realm->inodes_with_caps);
+ spin_lock_init(&realm->inodes_with_caps_lock);
+ __insert_snap_realm(&mdsc->snap_realms, realm);
+ dout("create_snap_realm %llx %p\n", realm->ino, realm);
+ return realm;
+}
+
+/*
+ * lookup the realm rooted at @ino.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct rb_node *n = mdsc->snap_realms.rb_node;
+ struct ceph_snap_realm *r;
+
+ while (n) {
+ r = rb_entry(n, struct ceph_snap_realm, node);
+ if (ino < r->ino)
+ n = n->rb_left;
+ else if (ino > r->ino)
+ n = n->rb_right;
+ else {
+ dout("lookup_snap_realm %llx %p\n", r->ino, r);
+ return r;
+ }
+ }
+ return NULL;
+}
+
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+
+/*
+ * called with snap_rwsem (write)
+ */
+static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+
+ rb_erase(&realm->node, &mdsc->snap_realms);
+
+ if (realm->parent) {
+ list_del_init(&realm->child_item);
+ __put_snap_realm(mdsc, realm->parent);
+ }
+
+ kfree(realm->prior_parent_snaps);
+ kfree(realm->snaps);
+ ceph_put_snap_context(realm->cached_context);
+ kfree(realm);
+}
+
+/*
+ * caller holds snap_rwsem (write)
+ */
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ if (atomic_dec_and_test(&realm->nref))
+ __destroy_snap_realm(mdsc, realm);
+}
+
+/*
+ * caller needn't hold any locks
+ */
+void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ if (!atomic_dec_and_test(&realm->nref))
+ return;
+
+ if (down_write_trylock(&mdsc->snap_rwsem)) {
+ __destroy_snap_realm(mdsc, realm);
+ up_write(&mdsc->snap_rwsem);
+ } else {
+ spin_lock(&mdsc->snap_empty_lock);
+ list_add(&realm->empty_item, &mdsc->snap_empty);
+ spin_unlock(&mdsc->snap_empty_lock);
+ }
+}
+
+/*
+ * Clean up any realms whose ref counts have dropped to zero. Note
+ * that this does not include realms that were created but not yet
+ * used.
+ *
+ * Called under snap_rwsem (write)
+ */
+static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snap_realm *realm;
+
+ spin_lock(&mdsc->snap_empty_lock);
+ while (!list_empty(&mdsc->snap_empty)) {
+ realm = list_first_entry(&mdsc->snap_empty,
+ struct ceph_snap_realm, empty_item);
+ list_del(&realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
+ __destroy_snap_realm(mdsc, realm);
+ spin_lock(&mdsc->snap_empty_lock);
+ }
+ spin_unlock(&mdsc->snap_empty_lock);
+}
+
+void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+ down_write(&mdsc->snap_rwsem);
+ __cleanup_empty_realms(mdsc);
+ up_write(&mdsc->snap_rwsem);
+}
+
+/*
+ * adjust the parent realm of a given @realm. adjust the child list,
+ * parent pointers, and ref counts appropriately.
+ *
+ * return 1 if the parent was changed, 0 if unchanged, <0 on error.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
+ u64 parentino)
+{
+ struct ceph_snap_realm *parent;
+
+ if (realm->parent_ino == parentino)
+ return 0;
+
+ parent = ceph_lookup_snap_realm(mdsc, parentino);
+ if (!parent) {
+ parent = ceph_create_snap_realm(mdsc, parentino);
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
+ }
+ dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
+ realm->ino, realm, realm->parent_ino, realm->parent,
+ parentino, parent);
+ if (realm->parent) {
+ list_del_init(&realm->child_item);
+ ceph_put_snap_realm(mdsc, realm->parent);
+ }
+ realm->parent_ino = parentino;
+ realm->parent = parent;
+ ceph_get_snap_realm(mdsc, parent);
+ list_add(&realm->child_item, &parent->children);
+ return 1;
+}
+
+
+static int cmpu64_rev(const void *a, const void *b)
+{
+ if (*(u64 *)a < *(u64 *)b)
+ return 1;
+ if (*(u64 *)a > *(u64 *)b)
+ return -1;
+ return 0;
+}
+
+/*
+ * build the snap context for a given realm.
+ */
+static int build_snap_context(struct ceph_snap_realm *realm)
+{
+ struct ceph_snap_realm *parent = realm->parent;
+ struct ceph_snap_context *snapc;
+ int err = 0;
+ int i;
+ int num = realm->num_prior_parent_snaps + realm->num_snaps;
+
+ /*
+ * build parent context, if it hasn't been built.
+ * conservatively estimate that all parent snaps might be
+ * included by us.
+ */
+ if (parent) {
+ if (!parent->cached_context) {
+ err = build_snap_context(parent);
+ if (err)
+ goto fail;
+ }
+ num += parent->cached_context->num_snaps;
+ }
+
+ /* do i actually need to update? not if my context seq
+ matches realm seq, and my parent's does too. (this works
+ because rebuild_snap_realms() works _downward_ in the
+ hierarchy after each update.) */
+ if (realm->cached_context &&
+ realm->cached_context->seq == realm->seq &&
+ (!parent ||
+ realm->cached_context->seq >= parent->cached_context->seq)) {
+ dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
+ " (unchanged)\n",
+ realm->ino, realm, realm->cached_context,
+ realm->cached_context->seq,
+ realm->cached_context->num_snaps);
+ return 0;
+ }
+
+ /* alloc new snap context */
+ err = -ENOMEM;
+ if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+ goto fail;
+ snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
+ if (!snapc)
+ goto fail;
+ atomic_set(&snapc->nref, 1);
+
+ /* build (reverse sorted) snap vector */
+ num = 0;
+ snapc->seq = realm->seq;
+ if (parent) {
+ /* include any of parent's snaps occurring _after_ my
+ parent became my parent */
+ for (i = 0; i < parent->cached_context->num_snaps; i++)
+ if (parent->cached_context->snaps[i] >=
+ realm->parent_since)
+ snapc->snaps[num++] =
+ parent->cached_context->snaps[i];
+ if (parent->cached_context->seq > snapc->seq)
+ snapc->seq = parent->cached_context->seq;
+ }
+ memcpy(snapc->snaps + num, realm->snaps,
+ sizeof(u64)*realm->num_snaps);
+ num += realm->num_snaps;
+ memcpy(snapc->snaps + num, realm->prior_parent_snaps,
+ sizeof(u64)*realm->num_prior_parent_snaps);
+ num += realm->num_prior_parent_snaps;
+
+ sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
+ snapc->num_snaps = num;
+ dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
+ realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
+
+ if (realm->cached_context)
+ ceph_put_snap_context(realm->cached_context);
+ realm->cached_context = snapc;
+ return 0;
+
+fail:
+ /*
+ * if we fail, clear old (incorrect) cached_context... hopefully
+ * we'll have better luck building it later
+ */
+ if (realm->cached_context) {
+ ceph_put_snap_context(realm->cached_context);
+ realm->cached_context = NULL;
+ }
+ pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
+ realm, err);
+ return err;
+}
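
A worked example of the merge above, restated for userspace: given a parent context {9, 7, 2}, parent_since = 6, own snaps {8}, and prior-parent snaps {3}, only parent snaps >= 6 are inherited, yielding {9, 8, 7, 3} after the reverse sort (all values are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* reverse-sort comparator, as in cmpu64_rev above */
static int cmpu64_rev(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;
	return x < y ? 1 : (x > y ? -1 : 0);
}

int main(void)
{
	uint64_t parent[] = { 9, 7, 2 };   /* parent's cached context */
	uint64_t own[] = { 8 };            /* realm->snaps */
	uint64_t prior[] = { 3 };          /* realm->prior_parent_snaps */
	uint64_t parent_since = 6;
	uint64_t out[8];
	int num = 0, i;

	for (i = 0; i < 3; i++)            /* parent snaps since we joined */
		if (parent[i] >= parent_since)
			out[num++] = parent[i];
	out[num++] = own[0];
	out[num++] = prior[0];
	qsort(out, num, sizeof(uint64_t), cmpu64_rev);

	for (i = 0; i < num; i++)          /* prints: 9 8 7 3 */
		printf("%llu ", (unsigned long long)out[i]);
	printf("\n");
	return 0;
}
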
+
+/*
+ * rebuild snap context for the given realm and all of its children.
+ */
+static void rebuild_snap_realms(struct ceph_snap_realm *realm)
+{
+ struct ceph_snap_realm *child;
+
+ dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
+ build_snap_context(realm);
+
+ list_for_each_entry(child, &realm->children, child_item)
+ rebuild_snap_realms(child);
+}
+
+
+/*
+ * helper to allocate and decode an array of snapids. free prior
+ * instance, if any.
+ */
+static int dup_array(u64 **dst, __le64 *src, int num)
+{
+ int i;
+
+ kfree(*dst);
+ if (num) {
+ *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
+ if (!*dst)
+ return -ENOMEM;
+ for (i = 0; i < num; i++)
+ (*dst)[i] = get_unaligned_le64(src + i);
+ } else {
+ *dst = NULL;
+ }
+ return 0;
+}
+
+
+/*
+ * When a snapshot is applied, the size/mtime inode metadata is queued
+ * in a ceph_cap_snap (one for each snapshot) until writeback
+ * completes and the metadata can be flushed back to the MDS.
+ *
+ * However, if a (sync) write is currently in-progress when we apply
+ * the snapshot, we have to wait until the write succeeds or fails
+ * (and a final size/mtime is known). In this case we set
+ * cap_snap->writing = 1, and the cap_snap is said to be "pending."
+ * When the write finishes, we call __ceph_finish_cap_snap().
+ *
+ * Caller must hold snap_rwsem for read (i.e., the realm topology won't
+ * change).
+ */
+void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_snap_context *snapc)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap_snap *capsnap;
+ int used;
+
+ capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
+ if (!capsnap) {
+ pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
+ return;
+ }
+
+ spin_lock(&inode->i_lock);
+ used = __ceph_caps_used(ci);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ /* there is no point in queuing multiple "pending" cap_snaps,
+ as no new writes are allowed to start when pending, so any
+ writes in progress now were started before the previous
+ cap_snap. lucky us. */
+ dout("queue_cap_snap %p snapc %p seq %llu used %d"
+ " already pending\n", inode, snapc, snapc->seq, used);
+ kfree(capsnap);
+ } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+ igrab(inode);
+
+ atomic_set(&capsnap->nref, 1);
+ capsnap->ci = ci;
+ INIT_LIST_HEAD(&capsnap->ci_item);
+ INIT_LIST_HEAD(&capsnap->flushing_item);
+
+ capsnap->follows = snapc->seq - 1;
+ capsnap->context = ceph_get_snap_context(snapc);
+ capsnap->issued = __ceph_caps_issued(ci, NULL);
+ capsnap->dirty = __ceph_caps_dirty(ci);
+
+ capsnap->mode = inode->i_mode;
+ capsnap->uid = inode->i_uid;
+ capsnap->gid = inode->i_gid;
+
+ /* fixme? */
+ capsnap->xattr_blob = NULL;
+ capsnap->xattr_len = 0;
+
+ /* dirty page count moved from _head to this cap_snap;
+ all subsequent page dirties occur _after_ this
+ snapshot. */
+ capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+ ci->i_wrbuffer_ref_head = 0;
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+
+ if (used & CEPH_CAP_FILE_WR) {
+ dout("queue_cap_snap %p cap_snap %p snapc %p"
+ " seq %llu used WR, now pending\n", inode,
+ capsnap, snapc, snapc->seq);
+ capsnap->writing = 1;
+ } else {
+ /* note mtime, size NOW. */
+ __ceph_finish_cap_snap(ci, capsnap);
+ }
+ } else {
+ dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+ kfree(capsnap);
+ }
+
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Finalize the size and mtime for a cap_snap; that is, settle on the final
+ * values to be used for the snapshot, to be flushed back to the mds.
+ *
+ * If capsnap can now be flushed, add to snap_flush list, and return 1.
+ *
+ * Caller must hold i_lock.
+ */
+int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+
+ BUG_ON(capsnap->writing);
+ capsnap->size = inode->i_size;
+ capsnap->mtime = inode->i_mtime;
+ capsnap->atime = inode->i_atime;
+ capsnap->ctime = inode->i_ctime;
+ capsnap->time_warp_seq = ci->i_time_warp_seq;
+ if (capsnap->dirty_pages) {
+ dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
+ "still has %d dirty pages\n", inode, capsnap,
+ capsnap->context, capsnap->context->seq,
+ capsnap->size, capsnap->dirty_pages);
+ return 0;
+ }
+ dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
+ inode, capsnap, capsnap->context,
+ capsnap->context->seq, capsnap->size);
+
+ spin_lock(&mdsc->snap_flush_lock);
+ list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ spin_unlock(&mdsc->snap_flush_lock);
+ return 1; /* caller may want to ceph_flush_snaps */
+}
+
+
+/*
+ * Parse and apply a snapblob "snap trace" from the MDS. This specifies
+ * the snap realm parameters from a given realm and all of its ancestors,
+ * up to the root.
+ *
+ * Caller must hold snap_rwsem for write.
+ */
+int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
+ void *p, void *e, bool deletion)
+{
+ struct ceph_mds_snap_realm *ri; /* encoded */
+ __le64 *snaps; /* encoded */
+ __le64 *prior_parent_snaps; /* encoded */
+ struct ceph_snap_realm *realm;
+ int invalidate = 0;
+ int err = -ENOMEM;
+
+ dout("update_snap_trace deletion=%d\n", deletion);
+more:
+ ceph_decode_need(&p, e, sizeof(*ri), bad);
+ ri = p;
+ p += sizeof(*ri);
+ ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
+ le32_to_cpu(ri->num_prior_parent_snaps)), bad);
+ snaps = p;
+ p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
+ prior_parent_snaps = p;
+ p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
+
+ realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
+ if (!realm) {
+ realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
+ if (IS_ERR(realm)) {
+ err = PTR_ERR(realm);
+ goto fail;
+ }
+ }
+
+ if (le64_to_cpu(ri->seq) > realm->seq) {
+ dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+ realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+ /*
+ * if the realm seq has changed, queue a cap_snap for every
+ * inode with open caps. we do this _before_ we update
+ * the realm info so that we prepare for writeback under the
+ * _previous_ snap context.
+ *
+ * ...unless it's a snap deletion!
+ */
+ if (!deletion) {
+ struct ceph_inode_info *ci;
+ struct inode *lastinode = NULL;
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_for_each_entry(ci, &realm->inodes_with_caps,
+ i_snap_realm_item) {
+ struct inode *inode = igrab(&ci->vfs_inode);
+ if (!inode)
+ continue;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (lastinode)
+ iput(lastinode);
+ lastinode = inode;
+ ceph_queue_cap_snap(ci, realm->cached_context);
+ spin_lock(&realm->inodes_with_caps_lock);
+ }
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (lastinode)
+ iput(lastinode);
+ dout("update_snap_trace cap_snaps queued\n");
+ }
+
+ } else {
+ dout("update_snap_trace %llx %p seq %lld unchanged\n",
+ realm->ino, realm, realm->seq);
+ }
+
+ /* ensure the parent is correct */
+ err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
+ if (err < 0)
+ goto fail;
+ invalidate += err;
+
+ if (le64_to_cpu(ri->seq) > realm->seq) {
+ /* update realm parameters, snap lists */
+ realm->seq = le64_to_cpu(ri->seq);
+ realm->created = le64_to_cpu(ri->created);
+ realm->parent_since = le64_to_cpu(ri->parent_since);
+
+ realm->num_snaps = le32_to_cpu(ri->num_snaps);
+ err = dup_array(&realm->snaps, snaps, realm->num_snaps);
+ if (err < 0)
+ goto fail;
+
+ realm->num_prior_parent_snaps =
+ le32_to_cpu(ri->num_prior_parent_snaps);
+ err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
+ realm->num_prior_parent_snaps);
+ if (err < 0)
+ goto fail;
+
+ invalidate = 1;
+ } else if (!realm->cached_context) {
+ invalidate = 1;
+ }
+
+ dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
+ realm, invalidate, p, e);
+
+ if (p < e)
+ goto more;
+
+ /* invalidate when we reach the _end_ (root) of the trace */
+ if (invalidate)
+ rebuild_snap_realms(realm);
+
+ __cleanup_empty_realms(mdsc);
+ return 0;
+
+bad:
+ err = -EINVAL;
+fail:
+ pr_err("update_snap_trace error %d\n", err);
+ return err;
+}
+
+
+/*
+ * Send any cap_snaps that are queued for flush. Try to carry
+ * s_mutex across multiple snap flushes to avoid locking overhead.
+ *
+ * Caller holds no locks.
+ */
+static void flush_snaps(struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci;
+ struct inode *inode;
+ struct ceph_mds_session *session = NULL;
+
+ dout("flush_snaps\n");
+ spin_lock(&mdsc->snap_flush_lock);
+ while (!list_empty(&mdsc->snap_flush_list)) {
+ ci = list_first_entry(&mdsc->snap_flush_list,
+ struct ceph_inode_info, i_snap_flush_item);
+ inode = &ci->vfs_inode;
+ igrab(inode);
+ spin_unlock(&mdsc->snap_flush_lock);
+ spin_lock(&inode->i_lock);
+ __ceph_flush_snaps(ci, &session);
+ spin_unlock(&inode->i_lock);
+ iput(inode);
+ spin_lock(&mdsc->snap_flush_lock);
+ }
+ spin_unlock(&mdsc->snap_flush_lock);
+
+ if (session) {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ dout("flush_snaps done\n");
+}
+
+
+/*
+ * Handle a snap notification from the MDS.
+ *
+ * This can take two basic forms: the simplest is just a snap creation
+ * or deletion notification on an existing realm. This should update the
+ * realm and its children.
+ *
+ * The more difficult case is realm creation, due to snap creation at a
+ * new point in the file hierarchy, or due to a rename that moves a file or
+ * directory into another realm.
+ */
+void ceph_handle_snap(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct super_block *sb = mdsc->client->sb;
+ int mds = session->s_mds;
+ u64 split;
+ int op;
+ int trace_len;
+ struct ceph_snap_realm *realm = NULL;
+ void *p = msg->front.iov_base;
+ void *e = p + msg->front.iov_len;
+ struct ceph_mds_snap_head *h;
+ int num_split_inos, num_split_realms;
+ __le64 *split_inos = NULL, *split_realms = NULL;
+ int i;
+ int locked_rwsem = 0;
+
+ /* decode */
+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ h = p;
+ op = le32_to_cpu(h->op);
+ split = le64_to_cpu(h->split); /* non-zero if we are splitting an
+ * existing realm */
+ num_split_inos = le32_to_cpu(h->num_split_inos);
+ num_split_realms = le32_to_cpu(h->num_split_realms);
+ trace_len = le32_to_cpu(h->trace_len);
+ p += sizeof(*h);
+
+ dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
+ ceph_snap_op_name(op), split, trace_len);
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+ mutex_unlock(&session->s_mutex);
+
+ down_write(&mdsc->snap_rwsem);
+ locked_rwsem = 1;
+
+ if (op == CEPH_SNAP_OP_SPLIT) {
+ struct ceph_mds_snap_realm *ri;
+
+ /*
+ * A "split" breaks part of an existing realm off into
+ * a new realm. The MDS provides a list of inodes
+ * (with caps) and child realms that belong to the new
+ * child.
+ */
+ split_inos = p;
+ p += sizeof(u64) * num_split_inos;
+ split_realms = p;
+ p += sizeof(u64) * num_split_realms;
+ ceph_decode_need(&p, e, sizeof(*ri), bad);
+ /* we will peek at realm info here, but will _not_
+ * advance p, as the realm update will occur below in
+ * ceph_update_snap_trace. */
+ ri = p;
+
+ realm = ceph_lookup_snap_realm(mdsc, split);
+ if (!realm) {
+ realm = ceph_create_snap_realm(mdsc, split);
+ if (IS_ERR(realm))
+ goto out;
+ }
+ ceph_get_snap_realm(mdsc, realm);
+
+ dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+ for (i = 0; i < num_split_inos; i++) {
+ struct ceph_vino vino = {
+ .ino = le64_to_cpu(split_inos[i]),
+ .snap = CEPH_NOSNAP,
+ };
+ struct inode *inode = ceph_find_inode(sb, vino);
+ struct ceph_inode_info *ci;
+
+ if (!inode)
+ continue;
+ ci = ceph_inode(inode);
+
+ spin_lock(&inode->i_lock);
+ if (!ci->i_snap_realm)
+ goto skip_inode;
+ /*
+ * If this inode belongs to a realm that was
+ * created after our new realm, we experienced
+ * a race (due to another split notification
+ * arriving from a different MDS). So skip
+ * this inode.
+ */
+ if (ci->i_snap_realm->created >
+ le64_to_cpu(ri->created)) {
+ dout(" leaving %p in newer realm %llx %p\n",
+ inode, ci->i_snap_realm->ino,
+ ci->i_snap_realm);
+ goto skip_inode;
+ }
+ dout(" will move %p to split realm %llx %p\n",
+ inode, realm->ino, realm);
+ /*
+ * Remove the inode from the realm's inode
+ * list, but don't add it to the new realm
+ * yet. We don't want the cap_snap to be
+ * queued (again) by ceph_update_snap_trace()
+ * below. Queue it _now_, under the old context.
+ */
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ spin_unlock(&realm->inodes_with_caps_lock);
+ spin_unlock(&inode->i_lock);
+
+ ceph_queue_cap_snap(ci,
+ ci->i_snap_realm->cached_context);
+
+ iput(inode);
+ continue;
+
+skip_inode:
+ spin_unlock(&inode->i_lock);
+ iput(inode);
+ }
+
+ /* we may have taken some of the old realm's children. */
+ for (i = 0; i < num_split_realms; i++) {
+ struct ceph_snap_realm *child =
+ ceph_lookup_snap_realm(mdsc,
+ le64_to_cpu(split_realms[i]));
+ if (!child)
+ continue;
+ adjust_snap_realm_parent(mdsc, child, realm->ino);
+ }
+ }
+
+ /*
+ * update using the provided snap trace. if we are deleting a
+ * snap, we can avoid queueing cap_snaps.
+ */
+ ceph_update_snap_trace(mdsc, p, e,
+ op == CEPH_SNAP_OP_DESTROY);
+
+ if (op == CEPH_SNAP_OP_SPLIT) {
+ /*
+ * ok, _now_ add the inodes into the new realm.
+ */
+ for (i = 0; i < num_split_inos; i++) {
+ struct ceph_vino vino = {
+ .ino = le64_to_cpu(split_inos[i]),
+ .snap = CEPH_NOSNAP,
+ };
+ struct inode *inode = ceph_find_inode(sb, vino);
+ struct ceph_inode_info *ci;
+
+ if (!inode)
+ continue;
+ ci = ceph_inode(inode);
+ spin_lock(&inode->i_lock);
+ if (!ci->i_snap_realm)
+ goto split_skip_inode;
+ ceph_put_snap_realm(mdsc, ci->i_snap_realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_add(&ci->i_snap_realm_item,
+ &realm->inodes_with_caps);
+ ci->i_snap_realm = realm;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_get_snap_realm(mdsc, realm);
+split_skip_inode:
+ spin_unlock(&inode->i_lock);
+ iput(inode);
+ }
+
+ /* we took a reference when we created the realm, above */
+ ceph_put_snap_realm(mdsc, realm);
+ }
+
+ __cleanup_empty_realms(mdsc);
+
+ up_write(&mdsc->snap_rwsem);
+
+ flush_snaps(mdsc);
+ return;
+
+bad:
+ pr_err("corrupt snap message from mds%d\n", mds);
+ ceph_msg_dump(msg);
+out:
+ if (locked_rwsem)
+ up_write(&mdsc->snap_rwsem);
+ return;
+}
+
+
+
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 00000000000..4290a6e860b
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1030 @@
+
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+
+#include "decode.h"
+#include "super.h"
+#include "mon_client.h"
+#include "auth.h"
+
+/*
+ * Ceph superblock operations
+ *
+ * Handle the basics of mounting, unmounting.
+ */
+
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+ const char *e = s + len;
+
+ while (e != s && *(e-1) != '/')
+ e--;
+ return e;
+}
+
+
+/*
+ * super ops
+ */
+static void ceph_put_super(struct super_block *s)
+{
+ struct ceph_client *cl = ceph_client(s);
+
+ dout("put_super\n");
+ ceph_mdsc_close_sessions(&cl->mdsc);
+ return;
+}
+
+static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
+ struct ceph_monmap *monmap = client->monc.monmap;
+ struct ceph_statfs st;
+ u64 fsid;
+ int err;
+
+ dout("statfs\n");
+ err = ceph_monc_do_statfs(&client->monc, &st);
+ if (err < 0)
+ return err;
+
+ /* fill in kstatfs */
+ buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
+
+ /*
+ * express utilization in terms of large blocks to avoid
+ * overflow on 32-bit machines.
+ */
+ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+ buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
+ buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
+ (CEPH_BLOCK_SHIFT-10);
+ buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+
+ buf->f_files = le64_to_cpu(st.num_objects);
+ buf->f_ffree = -1;
+ buf->f_namelen = PATH_MAX;
+ buf->f_frsize = PAGE_CACHE_SIZE;
+
+ /* leave fsid little-endian, regardless of host endianness */
+ fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
+ buf->f_fsid.val[0] = fsid & 0xffffffff;
+ buf->f_fsid.val[1] = fsid >> 32;
+
+ return 0;
+}
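
To make the shift arithmetic concrete: st.kb is reported in KiB, so assuming CEPH_BLOCK_SHIFT is 20 (1 MiB blocks; the define is not shown in this hunk), f_blocks = kb >> (20 - 10). A userspace check that the counts stay small:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t kb = 4ULL << 30;   /* 4 TiB of storage, reported in KiB */
	int shift = 20;             /* assumed CEPH_BLOCK_SHIFT (1 MiB blocks) */

	/* block counts stay comfortably within 32 bits */
	printf("f_bsize=%d f_blocks=%llu\n", 1 << shift,
	       (unsigned long long)(kb >> (shift - 10)));
	return 0;
}
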
+
+
+static int ceph_syncfs(struct super_block *sb, int wait)
+{
+ dout("sync_fs %d\n", wait);
+ ceph_osdc_sync(&ceph_client(sb)->osdc);
+ ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+ dout("sync_fs %d done\n", wait);
+ return 0;
+}
+
+
+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+ struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
+ struct ceph_mount_args *args = client->mount_args;
+
+ if (args->flags & CEPH_OPT_FSID)
+ seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
+ le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
+ le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
+ if (args->flags & CEPH_OPT_NOSHARE)
+ seq_puts(m, ",noshare");
+ if (args->flags & CEPH_OPT_DIRSTAT)
+ seq_puts(m, ",dirstat");
+ if ((args->flags & CEPH_OPT_RBYTES) == 0)
+ seq_puts(m, ",norbytes");
+ if (args->flags & CEPH_OPT_NOCRC)
+ seq_puts(m, ",nocrc");
+ if (args->flags & CEPH_OPT_NOASYNCREADDIR)
+ seq_puts(m, ",noasyncreaddir");
+ if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+ seq_printf(m, ",snapdirname=%s", args->snapdir_name);
+ if (args->name)
+ seq_printf(m, ",name=%s", args->name);
+ if (args->secret)
+ seq_puts(m, ",secret=<hidden>");
+ return 0;
+}
+
+/*
+ * caches
+ */
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
+{
+ struct ceph_inode_info *ci = foo;
+ inode_init_once(&ci->vfs_inode);
+}
+
+static int default_congestion_kb(void)
+{
+ int congestion_kb;
+
+ /*
+ * Copied from NFS
+ *
+ * congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ * Limit the default to 256M
+ */
+ congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ if (congestion_kb > 256*1024)
+ congestion_kb = 256*1024;
+
+ return congestion_kb;
+}
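
The table's rows follow directly from the formula: with 4 KiB pages (PAGE_SHIFT = 12, an assumption for this sketch), 1 GiB of RAM is 2^18 pages, sqrt gives 512, and 16 * 512 << 2 = 32768k, matching the 1GB row. A userspace re-check (link with -lm):

#include <stdio.h>
#include <math.h>

int main(void)
{
	long pages = 1L << 18;   /* 1 GiB of RAM at 4 KiB pages (PAGE_SHIFT=12) */
	long kb = (16 * (long)sqrt((double)pages)) << (12 - 10);

	printf("%ldk\n", kb);    /* prints 32768k, matching the table's 1GB row */
	return 0;
}
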
+
+static int __init init_caches(void)
+{
+ ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+ sizeof(struct ceph_inode_info),
+ __alignof__(struct ceph_inode_info),
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ ceph_inode_init_once);
+ if (ceph_inode_cachep == NULL)
+ return -ENOMEM;
+
+ ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_cap_cachep == NULL)
+ goto bad_cap;
+
+ ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_dentry_cachep == NULL)
+ goto bad_dentry;
+
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_file_cachep == NULL)
+ goto bad_file;
+
+ return 0;
+
+bad_file:
+ kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+ kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+ kmem_cache_destroy(ceph_inode_cachep);
+ return -ENOMEM;
+}
+
+static void destroy_caches(void)
+{
+ kmem_cache_destroy(ceph_inode_cachep);
+ kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_dentry_cachep);
+ kmem_cache_destroy(ceph_file_cachep);
+}
+
+
+/*
+ * ceph_umount_begin - initiate forced umount. Tear down the
+ * mount, skipping steps that may hang while waiting for server(s).
+ */
+static void ceph_umount_begin(struct super_block *sb)
+{
+ struct ceph_client *client = ceph_sb_to_client(sb);
+
+ dout("ceph_umount_begin - starting forced umount\n");
+ if (!client)
+ return;
+ client->mount_state = CEPH_MOUNT_SHUTDOWN;
+ return;
+}
+
+static const struct super_operations ceph_super_ops = {
+ .alloc_inode = ceph_alloc_inode,
+ .destroy_inode = ceph_destroy_inode,
+ .write_inode = ceph_write_inode,
+ .sync_fs = ceph_syncfs,
+ .put_super = ceph_put_super,
+ .show_options = ceph_show_options,
+ .statfs = ceph_statfs,
+ .umount_begin = ceph_umount_begin,
+};
+
+
+const char *ceph_msg_type_name(int type)
+{
+ switch (type) {
+ case CEPH_MSG_SHUTDOWN: return "shutdown";
+ case CEPH_MSG_PING: return "ping";
+ case CEPH_MSG_AUTH: return "auth";
+ case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+ case CEPH_MSG_MON_MAP: return "mon_map";
+ case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+ case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+ case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+ case CEPH_MSG_STATFS: return "statfs";
+ case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+ case CEPH_MSG_MDS_MAP: return "mds_map";
+ case CEPH_MSG_CLIENT_SESSION: return "client_session";
+ case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+ case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+ case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+ case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+ case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+ case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+ case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+ case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+ case CEPH_MSG_OSD_MAP: return "osd_map";
+ case CEPH_MSG_OSD_OP: return "osd_op";
+ case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
+ default: return "unknown";
+ }
+}
+
+
+/*
+ * mount options
+ */
+enum {
+ Opt_fsidmajor,
+ Opt_fsidminor,
+ Opt_monport,
+ Opt_wsize,
+ Opt_rsize,
+ Opt_osdtimeout,
+ Opt_osdkeepalivetimeout,
+ Opt_mount_timeout,
+ Opt_osd_idle_ttl,
+ Opt_caps_wanted_delay_min,
+ Opt_caps_wanted_delay_max,
+ Opt_readdir_max_entries,
+ Opt_congestion_kb,
+ Opt_last_int,
+ /* int args above */
+ Opt_snapdirname,
+ Opt_name,
+ Opt_secret,
+ Opt_last_string,
+ /* string args above */
+ Opt_ip,
+ Opt_noshare,
+ Opt_dirstat,
+ Opt_nodirstat,
+ Opt_rbytes,
+ Opt_norbytes,
+ Opt_nocrc,
+ Opt_noasyncreaddir,
+};
+
+static match_table_t arg_tokens = {
+ {Opt_fsidmajor, "fsidmajor=%ld"},
+ {Opt_fsidminor, "fsidminor=%ld"},
+ {Opt_monport, "monport=%d"},
+ {Opt_wsize, "wsize=%d"},
+ {Opt_rsize, "rsize=%d"},
+ {Opt_osdtimeout, "osdtimeout=%d"},
+ {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+ {Opt_mount_timeout, "mount_timeout=%d"},
+ {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+ {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
+ {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+ {Opt_readdir_max_entries, "readdir_max_entries=%d"},
+ {Opt_congestion_kb, "write_congestion_kb=%d"},
+ /* int args above */
+ {Opt_snapdirname, "snapdirname=%s"},
+ {Opt_name, "name=%s"},
+ {Opt_secret, "secret=%s"},
+ /* string args above */
+ {Opt_ip, "ip=%s"},
+ {Opt_noshare, "noshare"},
+ {Opt_dirstat, "dirstat"},
+ {Opt_nodirstat, "nodirstat"},
+ {Opt_rbytes, "rbytes"},
+ {Opt_norbytes, "norbytes"},
+ {Opt_nocrc, "nocrc"},
+ {Opt_noasyncreaddir, "noasyncreaddir"},
+ {-1, NULL}
+};
+
+
+static struct ceph_mount_args *parse_mount_args(int flags, char *options,
+ const char *dev_name,
+ const char **path)
+{
+ struct ceph_mount_args *args;
+ const char *c;
+ int err = -ENOMEM;
+ substring_t argstr[MAX_OPT_ARGS];
+
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args)
+ return ERR_PTR(-ENOMEM);
+ args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
+ GFP_KERNEL);
+ if (!args->mon_addr)
+ goto out;
+
+ dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
+
+ /* start with defaults */
+ args->sb_flags = flags;
+ args->flags = CEPH_OPT_DEFAULT;
+ args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+ args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+ args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+ args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
+ args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+ args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
+ args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+ args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
+ args->max_readdir = 1024;
+ args->congestion_kb = default_congestion_kb();
+
+ /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+ err = -EINVAL;
+ if (!dev_name)
+ goto out;
+ *path = strstr(dev_name, ":/");
+ if (*path == NULL) {
+ pr_err("device name is missing path (no :/ in %s)\n",
+ dev_name);
+ goto out;
+ }
+
+ /* get mon ip(s) */
+ err = ceph_parse_ips(dev_name, *path, args->mon_addr,
+ CEPH_MAX_MON, &args->num_mon);
+ if (err < 0)
+ goto out;
+
+ /* path on server */
+ *path += 2;
+ dout("server path '%s'\n", *path);
+
+ /* parse mount options */
+ while ((c = strsep(&options, ",")) != NULL) {
+ int token, intval, ret;
+ if (!*c)
+ continue;
+ err = -EINVAL;
+ token = match_token((char *)c, arg_tokens, argstr);
+ if (token < 0) {
+ pr_err("bad mount option at '%s'\n", c);
+ goto out;
+ }
+ if (token < Opt_last_int) {
+ ret = match_int(&argstr[0], &intval);
+ if (ret < 0) {
+ pr_err("bad mount option arg (not int) "
+ "at '%s'\n", c);
+ continue;
+ }
+ dout("got int token %d val %d\n", token, intval);
+ } else if (token > Opt_last_int && token < Opt_last_string) {
+ dout("got string token %d val %s\n", token,
+ argstr[0].from);
+ } else {
+ dout("got token %d\n", token);
+ }
+ switch (token) {
+ case Opt_fsidmajor:
+ *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
+ break;
+ case Opt_fsidminor:
+ *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
+ break;
+ case Opt_ip:
+ err = ceph_parse_ips(argstr[0].from,
+ argstr[0].to,
+ &args->my_addr,
+ 1, NULL);
+ if (err < 0)
+ goto out;
+ args->flags |= CEPH_OPT_MYIP;
+ break;
+
+ case Opt_snapdirname:
+ kfree(args->snapdir_name);
+ args->snapdir_name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+ case Opt_name:
+ args->name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+ case Opt_secret:
+ args->secret = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+
+ /* misc */
+ case Opt_wsize:
+ args->wsize = intval;
+ break;
+ case Opt_rsize:
+ args->rsize = intval;
+ break;
+ case Opt_osdtimeout:
+ args->osd_timeout = intval;
+ break;
+ case Opt_osdkeepalivetimeout:
+ args->osd_keepalive_timeout = intval;
+ break;
+ case Opt_mount_timeout:
+ args->mount_timeout = intval;
+ break;
+ case Opt_caps_wanted_delay_min:
+ args->caps_wanted_delay_min = intval;
+ break;
+ case Opt_caps_wanted_delay_max:
+ args->caps_wanted_delay_max = intval;
+ break;
+ case Opt_readdir_max_entries:
+ args->max_readdir = intval;
+ break;
+ case Opt_congestion_kb:
+ args->congestion_kb = intval;
+ break;
+
+ case Opt_noshare:
+ args->flags |= CEPH_OPT_NOSHARE;
+ break;
+
+ case Opt_dirstat:
+ args->flags |= CEPH_OPT_DIRSTAT;
+ break;
+ case Opt_nodirstat:
+ args->flags &= ~CEPH_OPT_DIRSTAT;
+ break;
+ case Opt_rbytes:
+ args->flags |= CEPH_OPT_RBYTES;
+ break;
+ case Opt_norbytes:
+ args->flags &= ~CEPH_OPT_RBYTES;
+ break;
+ case Opt_nocrc:
+ args->flags |= CEPH_OPT_NOCRC;
+ break;
+ case Opt_noasyncreaddir:
+ args->flags |= CEPH_OPT_NOASYNCREADDIR;
+ break;
+
+ default:
+ BUG_ON(token);
+ }
+ }
+ return args;
+
+out:
+ kfree(args->mon_addr);
+ kfree(args);
+ return ERR_PTR(err);
+}
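
For example (addresses, options, and path purely illustrative), a device string of 1.2.3.4:6789,1.2.3.5:/mydir would yield two monitor addresses and a server path of "/mydir", after which options such as rsize=524288 or snapdirname=.snapshot would be consumed by the token loop above.
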
+
+static void destroy_mount_args(struct ceph_mount_args *args)
+{
+ dout("destroy_mount_args %p\n", args);
+ kfree(args->snapdir_name);
+ args->snapdir_name = NULL;
+ kfree(args->name);
+ args->name = NULL;
+ kfree(args->secret);
+ args->secret = NULL;
+ kfree(args);
+}
+
+/*
+ * create a fresh client instance
+ */
+static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+{
+ struct ceph_client *client;
+ int err = -ENOMEM;
+
+ client = kzalloc(sizeof(*client), GFP_KERNEL);
+ if (client == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&client->mount_mutex);
+
+ init_waitqueue_head(&client->auth_wq);
+
+ client->sb = NULL;
+ client->mount_state = CEPH_MOUNT_MOUNTING;
+ client->mount_args = args;
+
+ client->msgr = NULL;
+
+ client->auth_err = 0;
+ atomic_long_set(&client->writeback_count, 0);
+
+ err = bdi_init(&client->backing_dev_info);
+ if (err < 0)
+ goto fail;
+
+ err = -ENOMEM;
+ client->wb_wq = create_workqueue("ceph-writeback");
+ if (client->wb_wq == NULL)
+ goto fail_bdi;
+ client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+ if (client->pg_inv_wq == NULL)
+ goto fail_wb_wq;
+ client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+ if (client->trunc_wq == NULL)
+ goto fail_pg_inv_wq;
+
+ /* set up mempools */
+ err = -ENOMEM;
+ client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+ client->mount_args->wsize >> PAGE_CACHE_SHIFT);
+ if (!client->wb_pagevec_pool)
+ goto fail_trunc_wq;
+
+ /* caps */
+ client->min_caps = args->max_readdir;
+ ceph_adjust_min_caps(client->min_caps);
+
+ /* subsystems */
+ err = ceph_monc_init(&client->monc, client);
+ if (err < 0)
+ goto fail_mempool;
+ err = ceph_osdc_init(&client->osdc, client);
+ if (err < 0)
+ goto fail_monc;
+ err = ceph_mdsc_init(&client->mdsc, client);
+ if (err < 0)
+ goto fail_osdc;
+ return client;
+
+fail_osdc:
+ ceph_osdc_stop(&client->osdc);
+fail_monc:
+ ceph_monc_stop(&client->monc);
+fail_mempool:
+ mempool_destroy(client->wb_pagevec_pool);
+fail_trunc_wq:
+ destroy_workqueue(client->trunc_wq);
+fail_pg_inv_wq:
+ destroy_workqueue(client->pg_inv_wq);
+fail_wb_wq:
+ destroy_workqueue(client->wb_wq);
+fail_bdi:
+ bdi_destroy(&client->backing_dev_info);
+fail:
+ kfree(client);
+ return ERR_PTR(err);
+}
+
+static void ceph_destroy_client(struct ceph_client *client)
+{
+ dout("destroy_client %p\n", client);
+
+ /* unmount */
+ ceph_mdsc_stop(&client->mdsc);
+ ceph_monc_stop(&client->monc);
+ ceph_osdc_stop(&client->osdc);
+
+ ceph_adjust_min_caps(-client->min_caps);
+
+ ceph_debugfs_client_cleanup(client);
+ destroy_workqueue(client->wb_wq);
+ destroy_workqueue(client->pg_inv_wq);
+ destroy_workqueue(client->trunc_wq);
+
+ if (client->msgr)
+ ceph_messenger_destroy(client->msgr);
+ mempool_destroy(client->wb_pagevec_pool);
+
+ destroy_mount_args(client->mount_args);
+
+ kfree(client);
+ dout("destroy_client %p done\n", client);
+}
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+ if (client->have_fsid) {
+ if (ceph_fsid_compare(&client->fsid, fsid)) {
+ pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
+ PR_FSID(&client->fsid), PR_FSID(fsid));
+ return -1;
+ }
+ } else {
+ pr_info("client%lld fsid " FSID_FORMAT "\n",
+ client->monc.auth->global_id, PR_FSID(fsid));
+ memcpy(&client->fsid, fsid, sizeof(*fsid));
+ ceph_debugfs_client_init(client);
+ client->have_fsid = true;
+ }
+ return 0;
+}
+
+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_map(struct ceph_client *client)
+{
+ return client->monc.monmap && client->monc.monmap->epoch;
+}
+
+/*
+ * Bootstrap mount by opening the root directory. Note the mount
+ * @started time from caller, and time out if this takes too long.
+ */
+static struct dentry *open_root_dentry(struct ceph_client *client,
+ const char *path,
+ unsigned long started)
+{
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req = NULL;
+ int err;
+ struct dentry *root;
+
+ /* open dir */
+ dout("open_root_inode opening '%s'\n", path);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+ req->r_path1 = kstrdup(path, GFP_NOFS);
+ req->r_ino1.ino = CEPH_INO_ROOT;
+ req->r_ino1.snap = CEPH_NOSNAP;
+ req->r_started = started;
+ req->r_timeout = client->mount_args->mount_timeout * HZ;
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+ req->r_num_caps = 2;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err == 0) {
+ dout("open_root_inode success\n");
+ if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
+ client->sb->s_root == NULL)
+ root = d_alloc_root(req->r_target_inode);
+ else
+ root = d_obtain_alias(req->r_target_inode);
+ req->r_target_inode = NULL;
+ dout("open_root_inode success, root dentry is %p\n", root);
+ } else {
+ root = ERR_PTR(err);
+ }
+ ceph_mdsc_put_request(req);
+ return root;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+ const char *path)
+{
+ struct ceph_entity_addr *myaddr = NULL;
+ int err;
+ unsigned long timeout = client->mount_args->mount_timeout * HZ;
+ unsigned long started = jiffies; /* note the start time */
+ struct dentry *root;
+
+ dout("mount start\n");
+ mutex_lock(&client->mount_mutex);
+
+ /* initialize the messenger */
+ if (client->msgr == NULL) {
+ if (ceph_test_opt(client, MYIP))
+ myaddr = &client->mount_args->my_addr;
+ client->msgr = ceph_messenger_create(myaddr);
+ if (IS_ERR(client->msgr)) {
+ err = PTR_ERR(client->msgr);
+ client->msgr = NULL;
+ goto out;
+ }
+ client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+ }
+
+ /* open session, and wait for mon, mds, and osd maps */
+ err = ceph_monc_open_session(&client->monc);
+ if (err < 0)
+ goto out;
+
+ while (!have_mon_map(client)) {
+ err = -EIO;
+ if (timeout && time_after_eq(jiffies, started + timeout))
+ goto out;
+
+ /* wait */
+ dout("mount waiting for mon_map\n");
+ err = wait_event_interruptible_timeout(client->auth_wq,
+ have_mon_map(client) || (client->auth_err < 0),
+ timeout);
+ if (err == -EINTR || err == -ERESTARTSYS)
+ goto out;
+ if (client->auth_err < 0) {
+ err = client->auth_err;
+ goto out;
+ }
+ }
+
+ dout("mount opening root\n");
+ root = open_root_dentry(client, "", started);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
+ if (client->sb->s_root)
+ dput(root);
+ else
+ client->sb->s_root = root;
+
+ if (path[0] == 0) {
+ dget(root);
+ } else {
+ dout("mount opening base mountpoint\n");
+ root = open_root_dentry(client, path, started);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ dput(client->sb->s_root);
+ client->sb->s_root = NULL;
+ goto out;
+ }
+ }
+
+ mnt->mnt_root = root;
+ mnt->mnt_sb = client->sb;
+
+ client->mount_state = CEPH_MOUNT_MOUNTED;
+ dout("mount success\n");
+ err = 0;
+
+out:
+ mutex_unlock(&client->mount_mutex);
+ return err;
+}
+
+static int ceph_set_super(struct super_block *s, void *data)
+{
+ struct ceph_client *client = data;
+ int ret;
+
+ dout("set_super %p data %p\n", s, data);
+
+ s->s_flags = client->mount_args->sb_flags;
+ s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
+
+ s->s_fs_info = client;
+ client->sb = s;
+
+ s->s_op = &ceph_super_ops;
+ s->s_export_op = &ceph_export_ops;
+
+ s->s_time_gran = 1000; /* 1000 ns == 1 us */
+
+ ret = set_anon_super(s, NULL); /* what is that second arg for? */
+ if (ret != 0)
+ goto fail;
+
+ return ret;
+
+fail:
+ s->s_fs_info = NULL;
+ client->sb = NULL;
+ return ret;
+}
+
+/*
+ * share superblock if same fs AND options
+ */
+static int ceph_compare_super(struct super_block *sb, void *data)
+{
+ struct ceph_client *new = data;
+ struct ceph_mount_args *args = new->mount_args;
+ struct ceph_client *other = ceph_sb_to_client(sb);
+ int i;
+
+ dout("ceph_compare_super %p\n", sb);
+ if (args->flags & CEPH_OPT_FSID) {
+ if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
+ dout("fsid doesn't match\n");
+ return 0;
+ }
+ } else {
+ /* do we share (a) monitor? */
+ for (i = 0; i < new->monc.monmap->num_mon; i++)
+ if (ceph_monmap_contains(other->monc.monmap,
+ &new->monc.monmap->mon_inst[i].addr))
+ break;
+ if (i == new->monc.monmap->num_mon) {
+ dout("mon ip not part of monmap\n");
+ return 0;
+ }
+ dout("mon ip matches existing sb %p\n", sb);
+ }
+ if (args->sb_flags != other->mount_args->sb_flags) {
+ dout("flags differ\n");
+ return 0;
+ }
+ return 1;
+}
+
+/*
+ * construct our own bdi so we can control readahead, etc.
+ */
+static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
+{
+ int err;
+
+ sb->s_bdi = &client->backing_dev_info;
+
+ /* set ra_pages based on rsize mount option? */
+ if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
+ client->backing_dev_info.ra_pages =
+ (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+ >> PAGE_CACHE_SHIFT;
+ err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
+ return err;
+}
+
+static int ceph_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data,
+ struct vfsmount *mnt)
+{
+ struct super_block *sb;
+ struct ceph_client *client;
+ int err;
+ int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
+ const char *path = NULL;
+ struct ceph_mount_args *args;
+
+ dout("ceph_get_sb\n");
+ args = parse_mount_args(flags, data, dev_name, &path);
+ if (IS_ERR(args)) {
+ err = PTR_ERR(args);
+ goto out_final;
+ }
+
+ /* create client (which we may/may not use) */
+ client = ceph_create_client(args);
+ if (IS_ERR(client)) {
+ err = PTR_ERR(client);
+ goto out_final;
+ }
+
+ if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+ compare_super = NULL;
+ sb = sget(fs_type, compare_super, ceph_set_super, client);
+ if (IS_ERR(sb)) {
+ err = PTR_ERR(sb);
+ goto out;
+ }
+
+ if (ceph_client(sb) != client) {
+ ceph_destroy_client(client);
+ client = ceph_client(sb);
+ dout("get_sb got existing client %p\n", client);
+ } else {
+ dout("get_sb using new client %p\n", client);
+ err = ceph_register_bdi(sb, client);
+ if (err < 0)
+ goto out_splat;
+ }
+
+ err = ceph_mount(client, mnt, path);
+ if (err < 0)
+ goto out_splat;
+ dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
+ mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
+ return 0;
+
+out_splat:
+ ceph_mdsc_close_sessions(&client->mdsc);
+ up_write(&sb->s_umount);
+ deactivate_super(sb);
+ goto out_final;
+
+out:
+ ceph_destroy_client(client);
+out_final:
+ dout("ceph_get_sb fail %d\n", err);
+ return err;
+}
+
+static void ceph_kill_sb(struct super_block *s)
+{
+ struct ceph_client *client = ceph_sb_to_client(s);
+ dout("kill_sb %p\n", s);
+ ceph_mdsc_pre_umount(&client->mdsc);
+ kill_anon_super(s); /* will call put_super after sb is r/o */
+ if (s->s_bdi == &client->backing_dev_info)
+ bdi_unregister(&client->backing_dev_info);
+ bdi_destroy(&client->backing_dev_info);
+ ceph_destroy_client(client);
+}
+
+static struct file_system_type ceph_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ceph",
+ .get_sb = ceph_get_sb,
+ .kill_sb = ceph_kill_sb,
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
+};
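
Once this type is registered, the device string handed to mount names the monitor address(es) and an optional path inside the filesystem, which parse_mount_args() splits apart. An illustrative invocation (address, name, and key are placeholders, not from this patch):

    mount -t ceph 192.168.0.1:/ /mnt/ceph -o name=admin,secret=<key>
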
+
+#define _STRINGIFY(x) #x
+#define STRINGIFY(x) _STRINGIFY(x)
+
+static int __init init_ceph(void)
+{
+ int ret = 0;
+
+ ret = ceph_debugfs_init();
+ if (ret < 0)
+ goto out;
+
+ ret = ceph_msgr_init();
+ if (ret < 0)
+ goto out_debugfs;
+
+ ret = init_caches();
+ if (ret)
+ goto out_msgr;
+
+ ceph_caps_init();
+
+ ret = register_filesystem(&ceph_fs_type);
+ if (ret)
+ goto out_icache;
+
+ pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
+ CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
+ CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
+ return 0;
+
+out_icache:
+ destroy_caches();
+out_msgr:
+ ceph_msgr_exit();
+out_debugfs:
+ ceph_debugfs_cleanup();
+out:
+ return ret;
+}
+
+static void __exit exit_ceph(void)
+{
+ dout("exit_ceph\n");
+ unregister_filesystem(&ceph_fs_type);
+ ceph_caps_finalize();
+ destroy_caches();
+ ceph_msgr_exit();
+ ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph);
+module_exit(exit_ceph);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 00000000000..65d12036b67
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,901 @@
+#ifndef _FS_CEPH_SUPER_H
+#define _FS_CEPH_SUPER_H
+
+#include "ceph_debug.h"
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+#include <linux/writeback.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "msgpool.h"
+#include "mon_client.h"
+#include "mds_client.h"
+#include "osd_client.h"
+#include "ceph_fs.h"
+
+/* f_type in struct statfs */
+#define CEPH_SUPER_MAGIC 0x00c36400
+
+/* large granularity for statfs utilization stats to facilitate
+ * large volume sizes on 32-bit machines. */
+#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
+#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
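
As a quick illustration of why (arithmetic is ours, not from the patch): with 1 MB blocks, a 1 PB (2^50-byte) volume reports 2^50 >> 20 = 2^30 blocks, well inside a 32-bit statfs field, whereas 4 KB units would need 2^38 and overflow it.
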
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID (1<<0)
+#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
+#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
+#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
+#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
+
+#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
+
+#define ceph_set_opt(client, opt) \
+ (client)->mount_args->flags |= CEPH_OPT_##opt;
+#define ceph_test_opt(client, opt) \
+ (!!((client)->mount_args->flags & CEPH_OPT_##opt))
+
+
+struct ceph_mount_args {
+ int sb_flags;
+ int num_mon;
+ struct ceph_entity_addr *mon_addr;
+ int flags;
+ int mount_timeout;
+ int osd_idle_ttl;
+ int caps_wanted_delay_min, caps_wanted_delay_max;
+ struct ceph_fsid fsid;
+ struct ceph_entity_addr my_addr;
+ int wsize;
+ int rsize; /* max readahead */
+ int max_readdir; /* max readdir size */
+ int congestion_kb; /* writeback congestion threshold, in KB */
+ int osd_timeout;
+ int osd_keepalive_timeout;
+ char *snapdir_name; /* default ".snap" */
+ char *name;
+ char *secret;
+ int cap_release_safety;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
+#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
+#define CEPH_OSD_KEEPALIVE_DEFAULT 5
+#define CEPH_OSD_IDLE_TTL_DEFAULT 60
+#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
+
+#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
+
+#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
+#define CEPH_AUTH_NAME_DEFAULT "guest"
+
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file. Delay a minimum amount of time, even if we send a cap
+ * message for some other reason. Otherwise, take the opportunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
+
+
+/* mount state */
+enum {
+ CEPH_MOUNT_MOUNTING,
+ CEPH_MOUNT_MOUNTED,
+ CEPH_MOUNT_UNMOUNTING,
+ CEPH_MOUNT_UNMOUNTED,
+ CEPH_MOUNT_SHUTDOWN,
+};
+
+/*
+ * subtract jiffies
+ */
+static inline unsigned long time_sub(unsigned long a, unsigned long b)
+{
+ BUG_ON(time_after(b, a));
+ return (long)a - (long)b;
+}
+
+/*
+ * per-filesystem client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+ struct ceph_fsid fsid;
+ bool have_fsid;
+
+ struct mutex mount_mutex; /* serialize mount attempts */
+ struct ceph_mount_args *mount_args;
+
+ struct super_block *sb;
+
+ unsigned long mount_state;
+ wait_queue_head_t auth_wq;
+
+ int auth_err;
+
+ int min_caps; /* min caps i added */
+
+ struct ceph_messenger *msgr; /* messenger instance */
+ struct ceph_mon_client monc;
+ struct ceph_mds_client mdsc;
+ struct ceph_osd_client osdc;
+
+ /* writeback */
+ mempool_t *wb_pagevec_pool;
+ struct workqueue_struct *wb_wq;
+ struct workqueue_struct *pg_inv_wq;
+ struct workqueue_struct *trunc_wq;
+ atomic_long_t writeback_count;
+
+ struct backing_dev_info backing_dev_info;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_monmap;
+ struct dentry *debugfs_mdsmap, *debugfs_osdmap;
+ struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+ struct dentry *debugfs_congestion_kb;
+ struct dentry *debugfs_bdi;
+#endif
+};
+
+static inline struct ceph_client *ceph_client(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+
+/*
+ * File i/o capability. This tracks shared state with the metadata
+ * server that allows us to cache or writeback attributes or to read
+ * and write data. For any given inode, we should have one or more
+ * capabilities, one issued by each metadata server, and our
+ * cumulative access is the OR of all issued capabilities.
+ *
+ * Each cap is referenced by the inode's i_caps rbtree and by per-mds
+ * session capability lists.
+ */
+struct ceph_cap {
+ struct ceph_inode_info *ci;
+ struct rb_node ci_node; /* per-ci cap tree */
+ struct ceph_mds_session *session;
+ struct list_head session_caps; /* per-session caplist */
+ int mds;
+ u64 cap_id; /* unique cap id (mds provided) */
+ int issued; /* latest, from the mds */
+ int implemented; /* implemented superset of issued (for revocation) */
+ int mds_wanted;
+ u32 seq, issue_seq, mseq;
+ u32 cap_gen; /* active/stale cycle */
+ unsigned long last_used;
+ struct list_head caps_item;
+};
+
+#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
+#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
+#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+
+/*
+ * Snapped cap state that is pending flush to mds. When a snapshot occurs,
+ * we first complete any in-process sync writes and writeback any dirty
+ * data before flushing the snapped state (tracked here) back to the MDS.
+ */
+struct ceph_cap_snap {
+ atomic_t nref;
+ struct ceph_inode_info *ci;
+ struct list_head ci_item, flushing_item;
+
+ u64 follows, flush_tid;
+ int issued, dirty;
+ struct ceph_snap_context *context;
+
+ mode_t mode;
+ uid_t uid;
+ gid_t gid;
+
+ void *xattr_blob;
+ int xattr_len;
+ u64 xattr_version;
+
+ u64 size;
+ struct timespec mtime, atime, ctime;
+ u64 time_warp_seq;
+ int writing; /* a sync write is still in progress */
+ int dirty_pages; /* dirty pages awaiting writeback */
+};
+
+static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
+{
+ if (atomic_dec_and_test(&capsnap->nref))
+ kfree(capsnap);
+}
+
+/*
+ * The frag tree describes how a directory is fragmented, potentially across
+ * multiple metadata servers. It is also used to indicate points where
+ * metadata authority is delegated, and whether/where metadata is replicated.
+ *
+ * A _leaf_ frag will be present in the i_fragtree IFF there is
+ * delegation info. That is, if mds >= 0 || ndist > 0.
+ */
+#define CEPH_MAX_DIRFRAG_REP 4
+
+struct ceph_inode_frag {
+ struct rb_node node;
+
+ /* fragtree state */
+ u32 frag;
+ int split_by; /* i.e. 2^(split_by) children */
+
+ /* delegation and replication info */
+ int mds; /* -1 if same authority as parent */
+ int ndist; /* >0 if replicated */
+ int dist[CEPH_MAX_DIRFRAG_REP];
+};
+
+/*
+ * We cache inode xattrs as an encoded blob until they are first used,
+ * at which point we parse them into an rbtree.
+ */
+struct ceph_inode_xattr {
+ struct rb_node node;
+
+ const char *name;
+ int name_len;
+ const char *val;
+ int val_len;
+ int dirty;
+
+ int should_free_name;
+ int should_free_val;
+};
+
+struct ceph_inode_xattrs_info {
+ /*
+ * (still encoded) xattr blob. we avoid the overhead of parsing
+ * this until someone actually calls getxattr, etc.
+ *
+ * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
+ * NULL means we don't know.
+ */
+ struct ceph_buffer *blob, *prealloc_blob;
+
+ struct rb_root index;
+ bool dirty;
+ int count;
+ int names_size;
+ int vals_size;
+ u64 version, index_version;
+};
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
+#define CEPH_I_NODELAY 4 /* do not delay cap release */
+#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
+
+struct ceph_inode_info {
+ struct ceph_vino i_vino; /* ceph ino + snap */
+
+ u64 i_version;
+ u32 i_time_warp_seq;
+
+ unsigned i_ceph_flags;
+ unsigned long i_release_count;
+
+ struct ceph_file_layout i_layout;
+ char *i_symlink;
+
+ /* for dirs */
+ struct timespec i_rctime;
+ u64 i_rbytes, i_rfiles, i_rsubdirs;
+ u64 i_files, i_subdirs;
+ u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
+
+ struct rb_root i_fragtree;
+ struct mutex i_fragtree_mutex;
+
+ struct ceph_inode_xattrs_info i_xattrs;
+
+ /* capabilities. protected _both_ by i_lock and cap->session's
+ * s_mutex. */
+ struct rb_root i_caps; /* cap list */
+ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
+ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
+ struct list_head i_dirty_item, i_flushing_item;
+ u64 i_cap_flush_seq;
+ /* we need to track cap writeback on a per-cap-bit basis, to allow
+ * overlapping, pipelined cap flushes to the mds. we can probably
+ * reduce the tid to 8 bits if we're concerned about inode size. */
+ u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+ wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
+ unsigned long i_hold_caps_min; /* jiffies */
+ unsigned long i_hold_caps_max; /* jiffies */
+ struct list_head i_cap_delay_list; /* for delayed cap release to mds */
+ int i_cap_exporting_mds; /* to handle cap migration between */
+ unsigned i_cap_exporting_mseq; /* mds's. */
+ unsigned i_cap_exporting_issued;
+ struct ceph_cap_reservation i_cap_migration_resv;
+ struct list_head i_cap_snaps; /* snapped state pending flush to mds */
+ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
+ unsigned i_snap_caps; /* cap bits for snapped files */
+
+ int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
+
+ u32 i_truncate_seq; /* last truncate to smaller size */
+ u64 i_truncate_size; /* and the size we last truncated down to */
+ int i_truncate_pending; /* still need to call vmtruncate */
+
+ u64 i_max_size; /* max file size authorized by mds */
+ u64 i_reported_size; /* (max_)size reported to or requested of mds */
+ u64 i_wanted_max_size; /* offset we'd like to write to */
+ u64 i_requested_max_size; /* max_size we've requested */
+
+ /* held references to caps */
+ int i_pin_ref;
+ int i_rd_ref, i_rdcache_ref, i_wr_ref;
+ int i_wrbuffer_ref, i_wrbuffer_ref_head;
+ u32 i_shared_gen; /* increment each time we get FILE_SHARED */
+ u32 i_rdcache_gen; /* we increment this each time we get
+ FILE_CACHE. If it's non-zero, we
+ _may_ have cached pages. */
+ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
+
+ struct list_head i_unsafe_writes; /* uncommitted sync writes */
+ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+ spinlock_t i_unsafe_lock;
+
+ struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ int i_snap_realm_counter; /* snap realm (if caps) */
+ struct list_head i_snap_realm_item;
+ struct list_head i_snap_flush_item;
+
+ struct work_struct i_wb_work; /* writeback work */
+ struct work_struct i_pg_inv_work; /* page invalidation work */
+
+ struct work_struct i_vmtruncate_work;
+
+ struct inode vfs_inode; /* at end */
+};
+
+static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
+{
+ return container_of(inode, struct ceph_inode_info, vfs_inode);
+}
+
+static inline void ceph_i_clear(struct inode *inode, unsigned mask)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&inode->i_lock);
+ ci->i_ceph_flags &= ~mask;
+ spin_unlock(&inode->i_lock);
+}
+
+static inline void ceph_i_set(struct inode *inode, unsigned mask)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&inode->i_lock);
+ ci->i_ceph_flags |= mask;
+ spin_unlock(&inode->i_lock);
+}
+
+static inline bool ceph_i_test(struct inode *inode, unsigned mask)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ bool r;
+
+ smp_mb();
+ r = (ci->i_ceph_flags & mask) == mask;
+ return r;
+}
+
+
+/* find a specific frag @f */
+extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
+ u32 f);
+
+/*
+ * choose fragment for value @v. copy frag content to pfrag, if leaf
+ * exists
+ */
+extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag,
+ int *found);
+
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+ struct ceph_mds_session *lease_session;
+ u32 lease_gen, lease_shared_gen;
+ u32 lease_seq;
+ unsigned long lease_renew_after, lease_renew_from;
+ struct list_head lru;
+ struct dentry *dentry;
+ u64 time;
+ u64 offset;
+};
+
+static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+{
+ return (struct ceph_dentry_info *)dentry->d_fsdata;
+}
+
+static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
+{
+ return ((loff_t)frag << 32) | (loff_t)off;
+}
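
A readdir file position therefore packs the fragment in the high 32 bits and the intra-fragment offset in the low 32 bits; a minimal sketch of decomposing one (values are made up):

    loff_t pos = ceph_make_fpos(0x0badf00d, 7);
    u32 frag = pos >> 32;         /* 0x0badf00d: which dir fragment */
    u32 off  = pos & 0xffffffff;  /* 7: entry offset inside that fragment */
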
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+ ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+ ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+ if (!ino)
+ ino = 1;
+#endif
+ return ino;
+}
+
+static inline int ceph_set_ino_cb(struct inode *inode, void *data)
+{
+ ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
+ inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+ return 0;
+}
+
+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+ struct ceph_vino *pvino = (struct ceph_vino *)data;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ return ci->i_vino.ino == pvino->ino &&
+ ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+ struct ceph_vino vino)
+{
+ ino_t t = ceph_vino_to_ino(vino);
+ return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * caps helpers
+ */
+static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
+{
+ return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
+extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
+extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
+ struct ceph_cap *cap);
+
+static inline int ceph_caps_issued(struct ceph_inode_info *ci)
+{
+ int issued;
+ spin_lock(&ci->vfs_inode.i_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ spin_unlock(&ci->vfs_inode.i_lock);
+ return issued;
+}
+
+static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
+ int touch)
+{
+ int r;
+ spin_lock(&ci->vfs_inode.i_lock);
+ r = __ceph_caps_issued_mask(ci, mask, touch);
+ spin_unlock(&ci->vfs_inode.i_lock);
+ return r;
+}
+
+static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
+{
+ return ci->i_dirty_caps | ci->i_flushing_caps;
+}
+extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+
+extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_used(struct ceph_inode_info *ci);
+
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+ if (w & CEPH_CAP_FILE_BUFFER)
+ w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
+ return w;
+}
+
+/* what the mds thinks we want */
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+
+extern void ceph_caps_init(void);
+extern void ceph_caps_finalize(void);
+extern void ceph_adjust_min_caps(int delta);
+extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
+extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
+extern void ceph_reservation_status(struct ceph_client *client,
+ int *total, int *avail, int *used,
+ int *reserved, int *min);
+
+static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+{
+ return (struct ceph_client *)inode->i_sb->s_fs_info;
+}
+
+static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+{
+ return (struct ceph_client *)sb->s_fs_info;
+}
+
+
+/*
+ * we keep buffered readdir results attached to file->private_data
+ */
+struct ceph_file_info {
+ int fmode; /* initialized on open */
+
+ /* readdir: position within the dir */
+ u32 frag;
+ struct ceph_mds_request *last_readdir;
+ int at_end;
+
+ /* readdir: position within a frag */
+ unsigned offset; /* offset of last chunk, adjusted for . and .. */
+ u64 next_offset; /* offset of next chunk (last_name's + 1) */
+ char *last_name; /* last entry in previous chunk */
+ struct dentry *dentry; /* next dentry (for dcache readdir) */
+ unsigned long dir_release_count;
+
+ /* holds the -o dirstat info returned by read() on a directory */
+ char *dir_info;
+ int dir_info_len;
+};
+
+
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data. It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged to when it
+ * was dirtied.
+ */
+struct ceph_snap_context {
+ atomic_t nref;
+ u64 seq;
+ int num_snaps;
+ u64 snaps[];
+};
+
+static inline struct ceph_snap_context *
+ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+ /*
+ printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+ atomic_read(&sc->nref)+1);
+ */
+ if (sc)
+ atomic_inc(&sc->nref);
+ return sc;
+}
+
+static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+ if (!sc)
+ return;
+ /*
+ printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+ atomic_read(&sc->nref)-1);
+ */
+ if (atomic_dec_and_test(&sc->nref)) {
+ /*printk(" deleting snap_context %p\n", sc);*/
+ kfree(sc);
+ }
+}
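
Since snaps[] is a flexible array member, a context for a given number of snapshot ids is allocated in one shot. A minimal sketch using standard kernel idiom (assumed, not taken from this patch):

    struct ceph_snap_context *sc;

    sc = kzalloc(sizeof(*sc) + num * sizeof(u64), GFP_NOFS);
    if (sc) {
            atomic_set(&sc->nref, 1);  /* caller owns the initial ref */
            sc->num_snaps = num;
            /* then fill sc->snaps[0..num-1] */
    }
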
+
+/*
+ * A "snap realm" describes a subset of the file hierarchy sharing
+ * the same set of snapshots that apply to it. The realms themselves
+ * are organized into a hierarchy, such that children inherit (some of)
+ * the snapshots of their parents.
+ *
+ * All inodes within the realm that have capabilities are linked into a
+ * per-realm list.
+ */
+struct ceph_snap_realm {
+ u64 ino;
+ atomic_t nref;
+ struct rb_node node;
+
+ u64 created, seq;
+ u64 parent_ino;
+ u64 parent_since; /* snapid when our current parent became so */
+
+ u64 *prior_parent_snaps; /* snaps inherited from any parents we */
+ int num_prior_parent_snaps; /* had prior to parent_since */
+ u64 *snaps; /* snaps specific to this realm */
+ int num_snaps;
+
+ struct ceph_snap_realm *parent;
+ struct list_head children; /* list of child realms */
+ struct list_head child_item;
+
+ struct list_head empty_item; /* on the empty list if nref == 0 */
+
+ /* the current set of snaps for this realm */
+ struct ceph_snap_context *cached_context;
+
+ struct list_head inodes_with_caps;
+ spinlock_t inodes_with_caps_lock;
+};
+
+
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+ return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
+ (off >> PAGE_CACHE_SHIFT);
+}
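
Worked example (illustrative): with 4 KB pages, off = 3000 and len = 5000 cover bytes 3000..7999, i.e. pages 0 and 1, and the formula agrees: ((3000 + 5000 + 4095) >> 12) - (3000 >> 12) = 2 - 0 = 2.
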
+
+
+
+/* snap.c */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino);
+extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+extern int ceph_update_snap_trace(struct ceph_mds_client *m,
+ void *p, void *e, bool deletion);
+extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg);
+extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_snap_context *snapc);
+extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap);
+extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+
+/*
+ * a cap_snap is "pending" if it is still awaiting an in-progress
+ * sync write (that may/may not still update size, mtime, etc.).
+ */
+static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
+{
+ return !list_empty(&ci->i_cap_snaps) &&
+ list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
+ ci_item)->writing;
+}
+
+
+/* super.c */
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+
+#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
+ "%02x%02x%02x%02x%02x%02x"
+#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
+ (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
+ (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
+ (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
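
A sketch of how the pair is meant to be used at a call site (hypothetical debug print):

    dout("fsid " FSID_FORMAT "\n", PR_FSID(&client->fsid));
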
+
+/* inode.c */
+extern const struct inode_operations ceph_file_iops;
+
+extern struct inode *ceph_alloc_inode(struct super_block *sb);
+extern void ceph_destroy_inode(struct inode *inode);
+
+extern struct inode *ceph_get_inode(struct super_block *sb,
+ struct ceph_vino vino);
+extern struct inode *ceph_get_snapdir(struct inode *parent);
+extern int ceph_fill_file_size(struct inode *inode, int issued,
+ u32 truncate_seq, u64 truncate_size, u64 size);
+extern void ceph_fill_file_time(struct inode *inode, int issued,
+ u64 time_warp_seq, struct timespec *ctime,
+ struct timespec *mtime, struct timespec *atime);
+extern int ceph_fill_trace(struct super_block *sb,
+ struct ceph_mds_request *req,
+ struct ceph_mds_session *session);
+extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+ struct ceph_mds_session *session);
+
+extern int ceph_inode_holds_cap(struct inode *inode, int mask);
+
+extern int ceph_inode_set_size(struct inode *inode, loff_t size);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
+extern void ceph_queue_vmtruncate(struct inode *inode);
+
+extern void ceph_queue_invalidate(struct inode *inode);
+extern void ceph_queue_writeback(struct inode *inode);
+
+extern int ceph_do_getattr(struct inode *inode, int mask);
+extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+
+/* xattr.c */
+extern int ceph_setxattr(struct dentry *, const char *, const void *,
+ size_t, int);
+extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
+extern int ceph_removexattr(struct dentry *, const char *);
+extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+
+/* caps.c */
+extern const char *ceph_cap_string(int c);
+extern void ceph_handle_caps(struct ceph_mds_session *session,
+ struct ceph_msg *msg);
+extern int ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned cap, unsigned seq, u64 realmino, int flags,
+ struct ceph_cap_reservation *caps_reservation);
+extern void __ceph_remove_cap(struct ceph_cap *cap);
+static inline void ceph_remove_cap(struct ceph_cap *cap)
+{
+ struct inode *inode = &cap->ci->vfs_inode;
+ spin_lock(&inode->i_lock);
+ __ceph_remove_cap(cap);
+ spin_unlock(&inode->i_lock);
+}
+extern void ceph_put_cap(struct ceph_cap *cap);
+
+extern void ceph_queue_caps_release(struct inode *inode);
+extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
+extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+extern int ceph_get_cap_mds(struct inode *inode);
+extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
+extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+ struct ceph_snap_context *snapc);
+extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+ struct ceph_mds_session *session);
+extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
+
+extern int ceph_encode_inode_release(void **p, struct inode *inode,
+ int mds, int drop, int unless, int force);
+extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+ int mds, int drop, int unless);
+
+extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+ int *got, loff_t endoff);
+
+/* for counting open files by mode */
+static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
+{
+ ci->i_nr_by_mode[mode]++;
+}
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+
+/* addr.c */
+extern const struct address_space_operations ceph_aops;
+extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+
+/* file.c */
+extern const struct file_operations ceph_file_fops;
+extern const struct address_space_operations ceph_aops;
+extern int ceph_open(struct inode *inode, struct file *file);
+extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd, int mode,
+ int locked_dir);
+extern int ceph_release(struct inode *inode, struct file *filp);
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+
+/* dir.c */
+extern const struct file_operations ceph_dir_fops;
+extern const struct inode_operations ceph_dir_iops;
+extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
+ ceph_snapdir_dentry_ops;
+
+extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
+extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+ struct dentry *dentry, int err);
+
+extern void ceph_dentry_lru_add(struct dentry *dn);
+extern void ceph_dentry_lru_touch(struct dentry *dn);
+extern void ceph_dentry_lru_del(struct dentry *dn);
+
+/*
+ * our d_ops vary depending on whether the inode is live,
+ * snapshotted (read-only), or a virtual ".snap" directory.
+ */
+int ceph_init_dentry(struct dentry *dentry);
+
+
+/* ioctl.c */
+extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/* export.c */
+extern const struct export_operations ceph_export_ops;
+
+/* debugfs.c */
+extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
+{
+ if (dentry && dentry->d_parent)
+ return dentry->d_parent->d_inode;
+
+ return NULL;
+}
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 00000000000..28b35a005ec
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_TYPES_H
+#define _FS_CEPH_TYPES_H
+
+/* needed before including ceph_fs.h */
+#include <linux/in.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+#include "ceph_hash.h"
+
+/*
+ * Identify inodes by both their ino AND snapshot id (a u64).
+ */
+struct ceph_vino {
+ u64 ino;
+ u64 snap;
+};
+
+
+/* context for the caps reservation mechanism */
+struct ceph_cap_reservation {
+ int count;
+};
+
+
+#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 00000000000..37d6ce64569
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,844 @@
+#include "ceph_debug.h"
+#include "super.h"
+#include "decode.h"
+
+#include <linux/xattr.h>
+
+static bool ceph_is_valid_xattr(const char *name)
+{
+ return !strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) ||
+ !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+ !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+}
+
+/*
+ * These define virtual xattrs exposing the recursive directory
+ * statistics and layout metadata.
+ */
+struct ceph_vxattr_cb {
+ bool readonly;
+ char *name;
+ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
+ size_t size);
+};
+
+/* directories */
+
+static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_files);
+}
+
+static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_subdirs);
+}
+
+static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rfiles);
+}
+
+static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rsubdirs);
+}
+
+static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%lld", ci->i_rbytes);
+}
+
+static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
+ (long)ci->i_rctime.tv_nsec);
+}
+
+static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
+ { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
+ { true, "user.ceph.dir.files", ceph_vxattrcb_files},
+ { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
+ { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
+ { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
+ { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
+ { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
+ { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
+ { true, NULL, NULL }
+};
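
From userspace these surface as ordinary extended attributes. A hypothetical sketch of reading one (the mount path is made up; getxattr(2) is the standard libc call):

    #include <stdio.h>
    #include <sys/xattr.h>

    int main(void)
    {
            char buf[64];
            ssize_t n;

            /* recursive byte count of everything under the directory */
            n = getxattr("/mnt/ceph/somedir", "user.ceph.dir.rbytes",
                         buf, sizeof(buf) - 1);
            if (n < 0)
                    return 1;
            buf[n] = '\0';
            printf("rbytes=%s\n", buf);
            return 0;
    }
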
+
+/* files */
+
+static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
+ size_t size)
+{
+ int ret;
+
+ ret = snprintf(val, size,
+ "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
+ (unsigned long long)ceph_file_layout_su(ci->i_layout),
+ (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ if (ceph_file_layout_pg_preferred(ci->i_layout))
+ ret += snprintf(val + ret, ret < size ? size - ret : 0,
+ "preferred_osd=%lld\n",
+ (unsigned long long)ceph_file_layout_pg_preferred(
+ ci->i_layout));
+ return ret;
+}
+
+static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+ { true, "user.ceph.layout", ceph_vxattrcb_layout},
+ { true, NULL, NULL }
+};
+
+static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
+{
+ if (S_ISDIR(inode->i_mode))
+ return ceph_dir_vxattrs;
+ else if (S_ISREG(inode->i_mode))
+ return ceph_file_vxattrs;
+ return NULL;
+}
+
+static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
+ const char *name)
+{
+ do {
+ if (strcmp(vxattr->name, name) == 0)
+ return vxattr;
+ vxattr++;
+ } while (vxattr->name);
+ return NULL;
+}
+
+static int __set_xattr(struct ceph_inode_info *ci,
+ const char *name, int name_len,
+ const char *val, int val_len,
+ int dirty,
+ int should_free_name, int should_free_val,
+ struct ceph_inode_xattr **newxattr)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct ceph_inode_xattr *xattr = NULL;
+ int c;
+ int new = 0;
+
+ p = &ci->i_xattrs.index.rb_node;
+ while (*p) {
+ parent = *p;
+ xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+ c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else {
+ if (name_len == xattr->name_len)
+ break;
+ else if (name_len < xattr->name_len)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+ xattr = NULL;
+ }
+
+ if (!xattr) {
+ new = 1;
+ xattr = *newxattr;
+ xattr->name = name;
+ xattr->name_len = name_len;
+ xattr->should_free_name = should_free_name;
+
+ ci->i_xattrs.count++;
+ dout("__set_xattr count=%d\n", ci->i_xattrs.count);
+ } else {
+ kfree(*newxattr);
+ *newxattr = NULL;
+ if (xattr->should_free_val)
+ kfree((void *)xattr->val);
+
+ if (should_free_name) {
+ kfree((void *)name);
+ name = xattr->name;
+ }
+ ci->i_xattrs.names_size -= xattr->name_len;
+ ci->i_xattrs.vals_size -= xattr->val_len;
+ }
+ if (!xattr) {
+ pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s\n",
+ &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name);
+ return -ENOMEM;
+ }
+ ci->i_xattrs.names_size += name_len;
+ ci->i_xattrs.vals_size += val_len;
+ if (val)
+ xattr->val = val;
+ else
+ xattr->val = "";
+
+ xattr->val_len = val_len;
+ xattr->dirty = dirty;
+ xattr->should_free_val = (val && should_free_val);
+
+ if (new) {
+ rb_link_node(&xattr->node, parent, p);
+ rb_insert_color(&xattr->node, &ci->i_xattrs.index);
+ dout("__set_xattr_val p=%p\n", p);
+ }
+
+ dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
+ ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
+
+ return 0;
+}
+
+static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
+ const char *name)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct ceph_inode_xattr *xattr = NULL;
+ int c;
+
+ p = &ci->i_xattrs.index.rb_node;
+ while (*p) {
+ parent = *p;
+ xattr = rb_entry(parent, struct ceph_inode_xattr, node);
+ c = strncmp(name, xattr->name, xattr->name_len);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else {
+ dout("__get_xattr %s: found %.*s\n", name,
+ xattr->val_len, xattr->val);
+ return xattr;
+ }
+ }
+
+ dout("__get_xattr %s: not found\n", name);
+
+ return NULL;
+}
+
+static void __free_xattr(struct ceph_inode_xattr *xattr)
+{
+ BUG_ON(!xattr);
+
+ if (xattr->should_free_name)
+ kfree((void *)xattr->name);
+ if (xattr->should_free_val)
+ kfree((void *)xattr->val);
+
+ kfree(xattr);
+}
+
+static int __remove_xattr(struct ceph_inode_info *ci,
+ struct ceph_inode_xattr *xattr)
+{
+ if (!xattr)
+ return -EOPNOTSUPP;
+
+ rb_erase(&xattr->node, &ci->i_xattrs.index);
+
+ if (xattr->should_free_name)
+ kfree((void *)xattr->name);
+ if (xattr->should_free_val)
+ kfree((void *)xattr->val);
+
+ ci->i_xattrs.names_size -= xattr->name_len;
+ ci->i_xattrs.vals_size -= xattr->val_len;
+ ci->i_xattrs.count--;
+ kfree(xattr);
+
+ return 0;
+}
+
+static int __remove_xattr_by_name(struct ceph_inode_info *ci,
+ const char *name)
+{
+ struct ceph_inode_xattr *xattr;
+ int err;
+
+ xattr = __get_xattr(ci, name);
+ err = __remove_xattr(ci, xattr);
+ return err;
+}
+
+static char *__copy_xattr_names(struct ceph_inode_info *ci,
+ char *dest)
+{
+ struct rb_node *p;
+ struct ceph_inode_xattr *xattr = NULL;
+
+ p = rb_first(&ci->i_xattrs.index);
+ dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
+
+ while (p) {
+ xattr = rb_entry(p, struct ceph_inode_xattr, node);
+ memcpy(dest, xattr->name, xattr->name_len);
+ dest[xattr->name_len] = '\0';
+
+ dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
+ xattr->name_len, ci->i_xattrs.names_size);
+
+ dest += xattr->name_len + 1;
+ p = rb_next(p);
+ }
+
+ return dest;
+}
+
+void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
+{
+ struct rb_node *p, *tmp;
+ struct ceph_inode_xattr *xattr = NULL;
+
+ p = rb_first(&ci->i_xattrs.index);
+
+ dout("__ceph_destroy_xattrs p=%p\n", p);
+
+ while (p) {
+ xattr = rb_entry(p, struct ceph_inode_xattr, node);
+ tmp = p;
+ p = rb_next(tmp);
+ dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
+ xattr->name_len, xattr->name);
+ rb_erase(tmp, &ci->i_xattrs.index);
+
+ __free_xattr(xattr);
+ }
+
+ ci->i_xattrs.names_size = 0;
+ ci->i_xattrs.vals_size = 0;
+ ci->i_xattrs.index_version = 0;
+ ci->i_xattrs.count = 0;
+ ci->i_xattrs.index = RB_ROOT;
+}
+
+static int __build_xattrs(struct inode *inode)
+{
+ u32 namelen;
+ u32 numattr = 0;
+ void *p, *end;
+ u32 len;
+ const char *name, *val;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int xattr_version;
+ struct ceph_inode_xattr **xattrs = NULL;
+ int err = 0;
+ int i;
+
+ dout("__build_xattrs() len=%d\n",
+ ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
+
+ if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
+ return 0; /* already built */
+
+ __ceph_destroy_xattrs(ci);
+
+start:
+ /* update the internal xattr rb tree */
+ if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
+ p = ci->i_xattrs.blob->vec.iov_base;
+ end = p + ci->i_xattrs.blob->vec.iov_len;
+ ceph_decode_32_safe(&p, end, numattr, bad);
+ xattr_version = ci->i_xattrs.version;
+ spin_unlock(&inode->i_lock);
+
+ xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
+ GFP_NOFS);
+ err = -ENOMEM;
+ if (!xattrs)
+ goto bad_lock;
+ for (i = 0; i < numattr; i++) {
+ xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
+ GFP_NOFS);
+ if (!xattrs[i])
+ goto bad_lock;
+ }
+
+ spin_lock(&inode->i_lock);
+ if (ci->i_xattrs.version != xattr_version) {
+ /* lost a race, retry */
+ for (i = 0; i < numattr; i++)
+ kfree(xattrs[i]);
+ kfree(xattrs);
+ goto start;
+ }
+ err = -EIO;
+ while (numattr--) {
+ ceph_decode_32_safe(&p, end, len, bad);
+ namelen = len;
+ name = p;
+ p += len;
+ ceph_decode_32_safe(&p, end, len, bad);
+ val = p;
+ p += len;
+
+ err = __set_xattr(ci, name, namelen, val, len,
+ 0, 0, 0, &xattrs[numattr]);
+
+ if (err < 0)
+ goto bad;
+ }
+ kfree(xattrs);
+ }
+ ci->i_xattrs.index_version = ci->i_xattrs.version;
+ ci->i_xattrs.dirty = false;
+
+ return err;
+bad_lock:
+ spin_lock(&inode->i_lock);
+bad:
+ if (xattrs) {
+ for (i = 0; i < numattr; i++)
+ kfree(xattrs[i]);
+ kfree(xattrs);
+ }
+ ci->i_xattrs.names_size = 0;
+ return err;
+}
+
+static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
+ int val_size)
+{
+ /*
+ * 4 bytes for the xattr count, plus a 4-byte length prefix before
+ * each xattr name and before each value
+ */
+ int size = 4 + ci->i_xattrs.count*(4 + 4) +
+ ci->i_xattrs.names_size +
+ ci->i_xattrs.vals_size;
+ dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
+ ci->i_xattrs.count, ci->i_xattrs.names_size,
+ ci->i_xattrs.vals_size);
+
+ if (name_size)
+ size += 4 + 4 + name_size + val_size;
+
+ return size;
+}
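
For instance (illustrative numbers): two attributes "user.a" = "xy" and "user.bb" = "z" give count = 2, names_size = 6 + 7 = 13, and vals_size = 2 + 1 = 3, so the blob needs 4 + 2*(4 + 4) + 13 + 3 = 36 bytes: a 4-byte count followed by length-prefixed name/value pairs. This is also why __build_xattrs() treats iov_len <= 4 as empty: a blob holding only a zero count is exactly 4 bytes.
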
+
+/*
+ * If there are dirty xattrs, reencode xattrs into the prealloc_blob
+ * and swap into place.
+ */
+void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
+{
+ struct rb_node *p;
+ struct ceph_inode_xattr *xattr = NULL;
+ void *dest;
+
+ dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+ if (ci->i_xattrs.dirty) {
+ int need = __get_required_blob_size(ci, 0, 0);
+
+ BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
+
+ p = rb_first(&ci->i_xattrs.index);
+ dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+ ceph_encode_32(&dest, ci->i_xattrs.count);
+ while (p) {
+ xattr = rb_entry(p, struct ceph_inode_xattr, node);
+
+ ceph_encode_32(&dest, xattr->name_len);
+ memcpy(dest, xattr->name, xattr->name_len);
+ dest += xattr->name_len;
+ ceph_encode_32(&dest, xattr->val_len);
+ memcpy(dest, xattr->val, xattr->val_len);
+ dest += xattr->val_len;
+
+ p = rb_next(p);
+ }
+
+ /* adjust buffer len; it may be larger than we need */
+ ci->i_xattrs.prealloc_blob->vec.iov_len =
+ dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
+
+ if (ci->i_xattrs.blob)
+ ceph_buffer_put(ci->i_xattrs.blob);
+ ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
+ ci->i_xattrs.prealloc_blob = NULL;
+ ci->i_xattrs.dirty = false;
+ }
+}
+
+ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ size_t size)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ int err;
+ struct ceph_inode_xattr *xattr;
+ struct ceph_vxattr_cb *vxattr = NULL;
+
+ if (!ceph_is_valid_xattr(name))
+ return -ENODATA;
+
+ /* let's see if a virtual xattr was requested */
+ if (vxattrs)
+ vxattr = ceph_match_vxattr(vxattrs, name);
+
+ spin_lock(&inode->i_lock);
+ dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
+ (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
+ goto get_xattr;
+ } else {
+ spin_unlock(&inode->i_lock);
+ /* get xattrs from mds (if we don't already have them) */
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+ if (err)
+ return err;
+ }
+
+ spin_lock(&inode->i_lock);
+
+ if (vxattr && vxattr->readonly) {
+ err = vxattr->getxattr_cb(ci, value, size);
+ goto out;
+ }
+
+ err = __build_xattrs(inode);
+ if (err < 0)
+ goto out;
+
+get_xattr:
+ err = -ENODATA; /* == ENOATTR */
+ xattr = __get_xattr(ci, name);
+ if (!xattr) {
+ if (vxattr)
+ err = vxattr->getxattr_cb(ci, value, size);
+ goto out;
+ }
+
+ err = -ERANGE;
+ if (size && size < xattr->val_len)
+ goto out;
+
+ err = xattr->val_len;
+ if (size == 0)
+ goto out;
+
+ memcpy(value, xattr->val, xattr->val_len);
+
+out:
+ spin_unlock(&inode->i_lock);
+ return err;
+}
+
+ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ u32 vir_namelen = 0;
+ u32 namelen;
+ int err;
+ u32 len;
+ int i;
+
+ spin_lock(&inode->i_lock);
+ dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
+ ci->i_xattrs.version, ci->i_xattrs.index_version);
+
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
+ (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
+ goto list_xattr;
+ } else {
+ spin_unlock(&inode->i_lock);
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
+ if (err)
+ return err;
+ }
+
+ spin_lock(&inode->i_lock);
+
+ err = __build_xattrs(inode);
+ if (err < 0)
+ goto out;
+
+list_xattr:
+ vir_namelen = 0;
+ /* include virtual dir xattrs */
+ if (vxattrs)
+ for (i = 0; vxattrs[i].name; i++)
+ vir_namelen += strlen(vxattrs[i].name) + 1;
+ /* add 1 byte per name for its null terminator */
+ namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
+ err = -ERANGE;
+ if (size && namelen > size)
+ goto out;
+
+ err = namelen;
+ if (size == 0)
+ goto out;
+
+ names = __copy_xattr_names(ci, names);
+
+ /* virtual xattr names, too */
+ if (vxattrs)
+ for (i = 0; vxattrs[i].name; i++) {
+ len = sprintf(names, "%s", vxattrs[i].name);
+ names += len + 1;
+ }
+
+out:
+ spin_unlock(&inode->i_lock);
+ return err;
+}
+
+static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
+ const char *value, size_t size, int flags)
+{
+ struct ceph_client *client = ceph_client(dentry->d_sb);
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct inode *parent_inode = dentry->d_parent->d_inode;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ int err;
+ int i, nr_pages;
+ struct page **pages = NULL;
+ void *kaddr;
+
+ /* copy value into some pages */
+ nr_pages = calc_pages_for(0, size);
+ if (nr_pages) {
+ pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ err = -ENOMEM;
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(GFP_NOFS);
+ if (!pages[i]) {
+ nr_pages = i;
+ goto out;
+ }
+ kaddr = kmap(pages[i]);
+ memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
+ min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
+ kunmap(pages[i]);
+ }
+ }
+
+ dout("setxattr value=%.*s\n", (int)size, value);
+
+ /* do request */
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
+ USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_inode = igrab(inode);
+ req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+ req->r_num_caps = 1;
+ req->r_args.setxattr.flags = cpu_to_le32(flags);
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+
+ req->r_pages = pages;
+ req->r_num_pages = nr_pages;
+ req->r_data_len = size;
+
+ dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ ceph_mdsc_put_request(req);
+ dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
+
+out:
+ if (pages) {
+ for (i = 0; i < nr_pages; i++)
+ __free_page(pages[i]);
+ kfree(pages);
+ }
+ return err;
+}
+
+int ceph_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ int err;
+ int name_len = strlen(name);
+ int val_len = size;
+ char *newname = NULL;
+ char *newval = NULL;
+ struct ceph_inode_xattr *xattr = NULL;
+ int issued;
+ int required_blob_size;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (!ceph_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+
+ if (vxattrs) {
+ struct ceph_vxattr_cb *vxattr =
+ ceph_match_vxattr(vxattrs, name);
+ if (vxattr && vxattr->readonly)
+ return -EOPNOTSUPP;
+ }
+
+ /* preallocate memory for xattr name, value, index node */
+ err = -ENOMEM;
+ newname = kmalloc(name_len + 1, GFP_NOFS);
+ if (!newname)
+ goto out;
+ memcpy(newname, name, name_len + 1);
+
+ if (val_len) {
+ newval = kmalloc(val_len + 1, GFP_NOFS);
+ if (!newval)
+ goto out;
+ memcpy(newval, value, val_len);
+ newval[val_len] = '\0';
+ }
+
+ xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
+ if (!xattr)
+ goto out;
+
+ spin_lock(&inode->i_lock);
+retry:
+ issued = __ceph_caps_issued(ci, NULL);
+ if (!(issued & CEPH_CAP_XATTR_EXCL))
+ goto do_sync;
+ __build_xattrs(inode);
+
+ required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+
+ if (!ci->i_xattrs.prealloc_blob ||
+ required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+ struct ceph_buffer *blob = NULL;
+
+ spin_unlock(&inode->i_lock);
+ dout(" preallocating new blob size=%d\n", required_blob_size);
+ blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+ if (!blob)
+ goto out;
+ spin_lock(&inode->i_lock);
+ if (ci->i_xattrs.prealloc_blob)
+ ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ ci->i_xattrs.prealloc_blob = blob;
+ goto retry;
+ }
+
+ dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
+ err = __set_xattr(ci, newname, name_len, newval,
+ val_len, 1, 1, 1, &xattr);
+ __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+ ci->i_xattrs.dirty = true;
+ inode->i_ctime = CURRENT_TIME;
+ spin_unlock(&inode->i_lock);
+
+ return err;
+
+do_sync:
+ spin_unlock(&inode->i_lock);
+ err = ceph_sync_setxattr(dentry, name, value, size, flags);
+out:
+ kfree(newname);
+ kfree(newval);
+ kfree(xattr);
+ return err;
+}
+
+static int ceph_send_removexattr(struct dentry *dentry, const char *name)
+{
+ struct ceph_client *client = ceph_client(dentry->d_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct inode *inode = dentry->d_inode;
+ struct inode *parent_inode = dentry->d_parent->d_inode;
+ struct ceph_mds_request *req;
+ int err;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
+ USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+ req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
+ req->r_num_caps = 1;
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ ceph_mdsc_put_request(req);
+ return err;
+}
+
+int ceph_removexattr(struct dentry *dentry, const char *name)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
+ int issued;
+ int err;
+
+ if (ceph_snap(inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (!ceph_is_valid_xattr(name))
+ return -EOPNOTSUPP;
+
+ if (vxattrs) {
+ struct ceph_vxattr_cb *vxattr =
+ ceph_match_vxattr(vxattrs, name);
+ if (vxattr && vxattr->readonly)
+ return -EOPNOTSUPP;
+ }
+
+ spin_lock(&inode->i_lock);
+ __build_xattrs(inode);
+ issued = __ceph_caps_issued(ci, NULL);
+ dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
+
+ if (!(issued & CEPH_CAP_XATTR_EXCL))
+ goto do_sync;
+
+ err = __remove_xattr_by_name(ceph_inode(inode), name);
+ __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+ ci->i_xattrs.dirty = true;
+ inode->i_ctime = CURRENT_TIME;
+
+ spin_unlock(&inode->i_lock);
+
+ return err;
+do_sync:
+ spin_unlock(&inode->i_lock);
+ err = ceph_send_removexattr(dentry, name);
+ return err;
+}
+
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c6a0362717..5183bc2a191 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -312,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb)
cifs_inode->clientCanCacheRead = false;
cifs_inode->clientCanCacheAll = false;
cifs_inode->delete_pending = false;
+ cifs_inode->invalid_mapping = false;
cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->server_eof = 0;
@@ -638,7 +639,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
setting the revalidate time to zero */
CIFS_I(file->f_path.dentry->d_inode)->time = 0;
- retval = cifs_revalidate(file->f_path.dentry);
+ retval = cifs_revalidate_file(file);
if (retval < 0)
return (loff_t)retval;
}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 78c1b86d55f..7aa57ecdc43 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
extern int cifs_rmdir(struct inode *, struct dentry *);
extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
struct dentry *);
-extern int cifs_revalidate(struct dentry *);
+extern int cifs_revalidate_file(struct file *filp);
+extern int cifs_revalidate_dentry(struct dentry *);
extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
extern int cifs_setattr(struct dentry *, struct iattr *);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a1c817eb291..63c89d1d70b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -389,6 +389,7 @@ struct cifsInodeInfo {
bool clientCanCacheRead:1; /* read oplock */
bool clientCanCacheAll:1; /* read and writebehind oplock */
bool delete_pending:1; /* DELETE_ON_CLOSE is set */
+ bool invalid_mapping:1; /* pagecache is invalid */
u64 server_eof; /* current file size on server */
u64 uniqueid; /* server inode number */
struct inode vfs_inode;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 88e2bc44ac5..39e47f46dea 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,10 +104,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
extern struct inode *cifs_iget(struct super_block *sb,
struct cifs_fattr *fattr);
+extern int cifs_get_file_info(struct file *filp);
extern int cifs_get_inode_info(struct inode **pinode,
const unsigned char *search_path,
FILE_ALL_INFO *pfile_info,
struct super_block *sb, int xid, const __u16 *pfid);
+extern int cifs_get_file_info_unix(struct file *filp);
extern int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *search_path,
struct super_block *sb, int xid);
@@ -142,6 +144,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
const __u16 search_handle);
+extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+ u16 netfid, FILE_ALL_INFO *pFindData);
extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName,
FILE_ALL_INFO *findData,
@@ -152,6 +156,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
FILE_ALL_INFO *findData,
const struct nls_table *nls_codepage, int remap);
+extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+ u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
extern int CIFSSMBUnixQPathInfo(const int xid,
struct cifsTconInfo *tcon,
const unsigned char *searchName,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 61183589984..7cc7f83e931 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -500,7 +500,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
} else if (pSMBr->hdr.WordCount == 13) {
cERROR(1, ("mount failed, cifs module not built "
"with CIFS_WEAK_PW_HASH support"));
- rc = -EOPNOTSUPP;
+ rc = -EOPNOTSUPP;
#endif /* WEAK_PW_HASH */
goto neg_err_exit;
} else if (pSMBr->hdr.WordCount != 17) {
@@ -3230,8 +3230,72 @@ QInfRetry:
return rc;
}
+int
+CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
+ u16 netfid, FILE_ALL_INFO *pFindData)
+{
+ struct smb_t2_qfi_req *pSMB = NULL;
+ struct smb_t2_qfi_rsp *pSMBr = NULL;
+ int rc = 0;
+ int bytes_returned;
+ __u16 params, byte_count;
+QFileInfoRetry:
+ rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+ (void **) &pSMBr);
+ if (rc)
+ return rc;
+ params = 2 /* level */ + 2 /* fid */;
+ pSMB->t2.TotalDataCount = 0;
+ pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+ /* BB find exact max data count below from sess structure BB */
+ pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+ pSMB->t2.MaxSetupCount = 0;
+ pSMB->t2.Reserved = 0;
+ pSMB->t2.Flags = 0;
+ pSMB->t2.Timeout = 0;
+ pSMB->t2.Reserved2 = 0;
+ pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+ Fid) - 4);
+ pSMB->t2.DataCount = 0;
+ pSMB->t2.DataOffset = 0;
+ pSMB->t2.SetupCount = 1;
+ pSMB->t2.Reserved3 = 0;
+ pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+ byte_count = params + 1 /* pad */ ;
+ pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+ pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+ pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+ pSMB->Pad = 0;
+ pSMB->Fid = netfid;
+ pSMB->hdr.smb_buf_length += byte_count;
+
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+ (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+ if (rc) {
+ cFYI(1, ("Send error in QPathInfo = %d", rc));
+ } else { /* decode response */
+ rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+ if (rc) /* BB add auto retry on EOPNOTSUPP? */
+ rc = -EIO;
+ else if (pSMBr->ByteCount < 40)
+ rc = -EIO; /* bad smb */
+ else if (pFindData) {
+ __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+ memcpy((char *) pFindData,
+ (char *) &pSMBr->hdr.Protocol +
+ data_offset, sizeof(FILE_ALL_INFO));
+ } else
+ rc = -ENOMEM;
+ }
+ cifs_buf_release(pSMB);
+ if (rc == -EAGAIN)
+ goto QFileInfoRetry;
+
+ return rc;
+}
int
CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
@@ -3335,6 +3399,75 @@ QPathInfoRetry:
}
int
+CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
+ u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
+{
+ struct smb_t2_qfi_req *pSMB = NULL;
+ struct smb_t2_qfi_rsp *pSMBr = NULL;
+ int rc = 0;
+ int bytes_returned;
+ __u16 params, byte_count;
+
+UnixQFileInfoRetry:
+ rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
+ (void **) &pSMBr);
+ if (rc)
+ return rc;
+
+ params = 2 /* level */ + 2 /* fid */;
+ pSMB->t2.TotalDataCount = 0;
+ pSMB->t2.MaxParameterCount = cpu_to_le16(4);
+ /* BB find exact max data count below from sess structure BB */
+ pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
+ pSMB->t2.MaxSetupCount = 0;
+ pSMB->t2.Reserved = 0;
+ pSMB->t2.Flags = 0;
+ pSMB->t2.Timeout = 0;
+ pSMB->t2.Reserved2 = 0;
+ pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
+ Fid) - 4);
+ pSMB->t2.DataCount = 0;
+ pSMB->t2.DataOffset = 0;
+ pSMB->t2.SetupCount = 1;
+ pSMB->t2.Reserved3 = 0;
+ pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
+ byte_count = params + 1 /* pad */ ;
+ pSMB->t2.TotalParameterCount = cpu_to_le16(params);
+ pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
+ pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
+ pSMB->Pad = 0;
+ pSMB->Fid = netfid;
+ pSMB->hdr.smb_buf_length += byte_count;
+
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+ (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+ if (rc) {
+ cFYI(1, ("Send error in QPathInfo = %d", rc));
+ } else { /* decode response */
+ rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+
+ if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+ cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
+ "Unix Extensions can be disabled on mount "
+ "by specifying the nosfu mount option."));
+ rc = -EIO; /* bad smb */
+ } else {
+ __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+ memcpy((char *) pFindData,
+ (char *) &pSMBr->hdr.Protocol +
+ data_offset,
+ sizeof(FILE_UNIX_BASIC_INFO));
+ }
+ }
+
+ cifs_buf_release(pSMB);
+ if (rc == -EAGAIN)
+ goto UnixQFileInfoRetry;
+
+ return rc;
+}
+
+int
CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName,
FILE_UNIX_BASIC_INFO *pFindData,
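
Both new file-handle queries build the same TRANS2 frame; the only delta from their path-based cousins is that the parameter area carries a 2-byte info level plus a 2-byte fid, and ParameterOffset is computed with offsetof() minus 4 to skip the RFC1001 length prefix that precedes the SMB header. A minimal userspace sketch of that offset math (the struct layout here is illustrative, not the kernel's exact one):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the on-the-wire request: 4-byte transport length
 * prefix, a fixed header, then the T2 parameter area (level + fid).
 * Field names and sizes are illustrative only. */
struct demo_t2_req {
	uint32_t rfc1001_len;	/* transport framing, not part of SMB */
	uint8_t  hdr[32];	/* pretend SMB header + setup words */
	uint16_t InformationLevel;
	uint16_t Fid;
} __attribute__((packed));

int main(void)
{
	/* T2 offsets are measured from the start of the SMB header,
	 * so subtract the 4-byte framing prefix. */
	size_t param_offset = offsetof(struct demo_t2_req, InformationLevel) - 4;

	printf("ParameterOffset = %zu\n", param_offset);
	printf("ParameterCount  = %zu\n",
	       sizeof(uint16_t) /* level */ + sizeof(uint16_t) /* fid */);
	return 0;
}
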
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b..e9f7ecc2714 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -739,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
int isValid = 1;
if (direntry->d_inode) {
- if (cifs_revalidate(direntry))
+ if (cifs_revalidate_dentry(direntry))
return 0;
} else {
cFYI(1, ("neg dentry 0x%p name = %s",
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3d8f8a96f5a..ca2ba7a0193 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -219,8 +219,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
cFYI(1, ("inode unchanged on server"));
} else {
if (file->f_path.dentry->d_inode->i_mapping) {
- /* BB no need to lock inode until after invalidate
- since namei code should already have it locked? */
+ /* BB no need to lock inode until after invalidate
+ since namei code should already have it locked? */
rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
if (rc != 0)
CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
@@ -1890,11 +1890,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
- struct dentry *dentry = file->f_path.dentry;
int rc, xid;
xid = GetXid();
- rc = cifs_revalidate(dentry);
+ rc = cifs_revalidate_file(file);
if (rc) {
cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
FreeXid(xid);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8bdbc818164..723daaccbd0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -77,6 +77,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
}
}
+/* check inode attributes against fattr. If they don't match, tag the
+ * inode for cache invalidation
+ */
+static void
+cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
+{
+ struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+ cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
+
+ if (inode->i_state & I_NEW) {
+ cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
+ return;
+ }
+
+ /* don't bother with revalidation if we have an oplock */
+ if (cifs_i->clientCanCacheRead) {
+ cFYI(1, ("%s: inode %llu is oplocked", __func__,
+ cifs_i->uniqueid));
+ return;
+ }
+
+ /* revalidate if mtime or size have changed */
+ if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+ cifs_i->server_eof == fattr->cf_eof) {
+ cFYI(1, ("%s: inode %llu is unchanged", __func__,
+ cifs_i->uniqueid));
+ return;
+ }
+
+ cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
+ cifs_i->uniqueid));
+ cifs_i->invalid_mapping = true;
+}
+
/* populate an inode with info from a cifs_fattr struct */
void
cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -85,6 +120,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
unsigned long oldtime = cifs_i->time;
+ cifs_revalidate_cache(inode, fattr);
+
inode->i_atime = fattr->cf_atime;
inode->i_mtime = fattr->cf_mtime;
inode->i_ctime = fattr->cf_ctime;
@@ -231,6 +268,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
}
+int cifs_get_file_info_unix(struct file *filp)
+{
+ int rc;
+ int xid;
+ FILE_UNIX_BASIC_INFO find_data;
+ struct cifs_fattr fattr;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsTconInfo *tcon = cifs_sb->tcon;
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+ xid = GetXid();
+ rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
+ if (!rc) {
+ cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+ } else if (rc == -EREMOTE) {
+ cifs_create_dfs_fattr(&fattr, inode->i_sb);
+ rc = 0;
+ }
+
+ cifs_fattr_to_inode(inode, &fattr);
+ FreeXid(xid);
+ return rc;
+}
+
int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *full_path,
struct super_block *sb, int xid)
@@ -432,6 +494,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
fattr->cf_gid = cifs_sb->mnt_gid;
}
+int cifs_get_file_info(struct file *filp)
+{
+ int rc;
+ int xid;
+ FILE_ALL_INFO find_data;
+ struct cifs_fattr fattr;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsTconInfo *tcon = cifs_sb->tcon;
+ struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
+
+ xid = GetXid();
+ rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
+ if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+ /*
+ * FIXME: legacy server -- fall back to path-based call?
+ * for now, just skip revalidating and mark inode for
+ * immediate reval.
+ */
+ rc = 0;
+ CIFS_I(inode)->time = 0;
+ goto cgfi_exit;
+ } else if (rc == -EREMOTE) {
+ cifs_create_dfs_fattr(&fattr, inode->i_sb);
+ rc = 0;
+ } else if (rc)
+ goto cgfi_exit;
+
+ /*
+ * don't bother with SFU junk here -- just mark inode as needing
+ * revalidation.
+ */
+ cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+ fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
+ fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
+ cifs_fattr_to_inode(inode, &fattr);
+cgfi_exit:
+ FreeXid(xid);
+ return rc;
+}
+
int cifs_get_inode_info(struct inode **pinode,
const unsigned char *full_path, FILE_ALL_INFO *pfindData,
struct super_block *sb, int xid, const __u16 *pfid)
@@ -1389,135 +1492,103 @@ cifs_rename_exit:
return rc;
}
-int cifs_revalidate(struct dentry *direntry)
+static bool
+cifs_inode_needs_reval(struct inode *inode)
{
- int xid;
- int rc = 0, wbrc = 0;
- char *full_path;
- struct cifs_sb_info *cifs_sb;
- struct cifsInodeInfo *cifsInode;
- loff_t local_size;
- struct timespec local_mtime;
- bool invalidate_inode = false;
+ struct cifsInodeInfo *cifs_i = CIFS_I(inode);
- if (direntry->d_inode == NULL)
- return -ENOENT;
+ if (cifs_i->clientCanCacheRead)
+ return false;
- cifsInode = CIFS_I(direntry->d_inode);
+ if (!lookupCacheEnabled)
+ return true;
- if (cifsInode == NULL)
- return -ENOENT;
+ if (cifs_i->time == 0)
+ return true;
- /* no sense revalidating inode info on file that no one can write */
- if (CIFS_I(direntry->d_inode)->clientCanCacheRead)
- return rc;
+ /* FIXME: the actimeo should be tunable */
+ if (time_after_eq(jiffies, cifs_i->time + HZ))
+ return true;
+
+ return false;
+}
+
+/* check invalid_mapping flag and zap the cache if it's set */
+static void
+cifs_invalidate_mapping(struct inode *inode)
+{
+ int rc;
+ struct cifsInodeInfo *cifs_i = CIFS_I(inode);
+
+ cifs_i->invalid_mapping = false;
+
+ /* write back any cached data */
+ if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
+ rc = filemap_write_and_wait(inode->i_mapping);
+ if (rc)
+ cifs_i->write_behind_rc = rc;
+ }
+ invalidate_remote_inode(inode);
+}
+
+int cifs_revalidate_file(struct file *filp)
+{
+ int rc = 0;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+
+ if (!cifs_inode_needs_reval(inode))
+ goto check_inval;
+
+ if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
+ rc = cifs_get_file_info_unix(filp);
+ else
+ rc = cifs_get_file_info(filp);
+
+check_inval:
+ if (CIFS_I(inode)->invalid_mapping)
+ cifs_invalidate_mapping(inode);
+
+ return rc;
+}
+
+/* revalidate a dentry's inode attributes */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+ int xid;
+ int rc = 0;
+ char *full_path = NULL;
+ struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dentry->d_sb;
+
+ if (inode == NULL)
+ return -ENOENT;
xid = GetXid();
- cifs_sb = CIFS_SB(direntry->d_sb);
+ if (!cifs_inode_needs_reval(inode))
+ goto check_inval;
/* can not safely grab the rename sem here if rename calls revalidate
since that would deadlock */
- full_path = build_path_from_dentry(direntry);
+ full_path = build_path_from_dentry(dentry);
if (full_path == NULL) {
rc = -ENOMEM;
- FreeXid(xid);
- return rc;
- }
- cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
- "jiffies %ld", full_path, direntry->d_inode,
- direntry->d_inode->i_count.counter, direntry,
- direntry->d_time, jiffies));
-
- if (cifsInode->time == 0) {
- /* was set to zero previously to force revalidate */
- } else if (time_before(jiffies, cifsInode->time + HZ) &&
- lookupCacheEnabled) {
- if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
- (direntry->d_inode->i_nlink == 1)) {
- kfree(full_path);
- FreeXid(xid);
- return rc;
- } else {
- cFYI(1, ("Have to revalidate file due to hardlinks"));
- }
- }
-
- /* save mtime and size */
- local_mtime = direntry->d_inode->i_mtime;
- local_size = direntry->d_inode->i_size;
-
- if (cifs_sb->tcon->unix_ext) {
- rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
- direntry->d_sb, xid);
- if (rc) {
- cFYI(1, ("error on getting revalidate info %d", rc));
-/* if (rc != -ENOENT)
- rc = 0; */ /* BB should we cache info on
- certain errors? */
- }
- } else {
- rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
- direntry->d_sb, xid, NULL);
- if (rc) {
- cFYI(1, ("error on getting revalidate info %d", rc));
-/* if (rc != -ENOENT)
- rc = 0; */ /* BB should we cache info on
- certain errors? */
- }
+ goto check_inval;
}
- /* should we remap certain errors, access denied?, to zero */
- /* if not oplocked, we invalidate inode pages if mtime or file size
- had changed on server */
+ cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
+ "jiffies %ld", full_path, inode, inode->i_count.counter,
+ dentry, dentry->d_time, jiffies));
- if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) &&
- (local_size == direntry->d_inode->i_size)) {
- cFYI(1, ("cifs_revalidate - inode unchanged"));
- } else {
- /* file may have changed on server */
- if (cifsInode->clientCanCacheRead) {
- /* no need to invalidate inode pages since we were the
- only ones who could have modified the file and the
- server copy is staler than ours */
- } else {
- invalidate_inode = true;
- }
- }
+ if (CIFS_SB(sb)->tcon->unix_ext)
+ rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
+ else
+ rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
+ xid, NULL);
- /* can not grab this sem since kernel filesys locking documentation
- indicates i_mutex may be taken by the kernel on lookup and rename
- which could deadlock if we grab the i_mutex here as well */
-/* mutex_lock(&direntry->d_inode->i_mutex);*/
- /* need to write out dirty pages here */
- if (direntry->d_inode->i_mapping) {
- /* do we need to lock inode until after invalidate completes
- below? */
- wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
- if (wbrc)
- CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
- }
- if (invalidate_inode) {
- /* shrink_dcache not necessary now that cifs dentry ops
- are exported for negative dentries */
-/* if (S_ISDIR(direntry->d_inode->i_mode))
- shrink_dcache_parent(direntry); */
- if (S_ISREG(direntry->d_inode->i_mode)) {
- if (direntry->d_inode->i_mapping) {
- wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
- if (wbrc)
- CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
- }
- /* may eventually have to do this for open files too */
- if (list_empty(&(cifsInode->openFileList))) {
- /* changed on server - flush read ahead pages */
- cFYI(1, ("Invalidating read ahead data on "
- "closed file"));
- invalidate_remote_inode(direntry->d_inode);
- }
- }
- }
-/* mutex_unlock(&direntry->d_inode->i_mutex); */
+check_inval:
+ if (CIFS_I(inode)->invalid_mapping)
+ cifs_invalidate_mapping(inode);
kfree(full_path);
FreeXid(xid);
@@ -1527,7 +1598,7 @@ int cifs_revalidate(struct dentry *direntry)
int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- int err = cifs_revalidate(dentry);
+ int err = cifs_revalidate_dentry(dentry);
if (!err) {
generic_fillattr(dentry->d_inode, stat);
stat->blksize = CIFS_MAX_MSGSIZE;
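
The rewritten revalidation path funnels every decision through cifs_inode_needs_reval(): oplocked inodes never revalidate, a disabled lookup cache always does, time == 0 forces it, and otherwise a fixed 1*HZ attribute timeout applies (the FIXME notes it should be tunable). A userspace analogue with the timeout made tunable, as that FIXME suggests:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Userspace analogue of cifs_inode_needs_reval(); actimeo_secs
 * stands in for the tunable the FIXME asks for. */
struct demo_inode {
	bool   oplocked;	/* client holds a read oplock */
	time_t last_reval;	/* 0 == force revalidation */
};

static bool needs_reval(const struct demo_inode *i,
			bool lookup_cache_enabled, int actimeo_secs)
{
	if (i->oplocked)
		return false;	/* server promised exclusivity */
	if (!lookup_cache_enabled)
		return true;
	if (i->last_reval == 0)
		return true;	/* explicitly invalidated */
	return time(NULL) >= i->last_reval + actimeo_secs;
}

int main(void)
{
	struct demo_inode ino = { .oplocked = false, .last_reval = time(NULL) };

	printf("fresh attrs, actimeo=1s: %d\n", needs_reval(&ino, true, 1));
	return 0;
}
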
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index ef9008b885b..0d0e97ed3ff 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -582,7 +582,9 @@ got:
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
- ei->i_state = EXT3_STATE_NEW;
+ ei->i_state_flags = 0;
+ ext3_set_inode_state(inode, EXT3_STATE_NEW);
+
ei->i_extra_isize =
(EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) ?
sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 7f920b7263a..ea33bdf0a30 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2811,7 +2811,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
- ei->i_state = 0;
+ ei->i_state_flags = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
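
Both ext3 hunks stop assigning ei->i_state directly and instead zero i_state_flags and go through ext3_set_inode_state(), an accessor (defined in a header hunk outside this diff) that manipulates individual state bits. A rough userspace sketch of such bit-granular accessors, using GCC atomic builtins as a stand-in for set_bit()/test_bit():

#include <stdio.h>

enum { DEMO_STATE_NEW, DEMO_STATE_JDATA };

struct demo_inode { unsigned long i_state_flags; };

static void demo_set_state(struct demo_inode *i, int bit)
{
	__atomic_fetch_or(&i->i_state_flags, 1UL << bit, __ATOMIC_RELAXED);
}

static int demo_test_state(struct demo_inode *i, int bit)
{
	return !!(__atomic_load_n(&i->i_state_flags, __ATOMIC_RELAXED)
		  & (1UL << bit));
}

int main(void)
{
	struct demo_inode ino = { .i_state_flags = 0 };

	demo_set_state(&ino, DEMO_STATE_NEW);
	printf("NEW=%d JDATA=%d\n",
	       demo_test_state(&ino, DEMO_STATE_NEW),
	       demo_test_state(&ino, DEMO_STATE_JDATA));
	return 0;
}
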
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 361c0b9962a..57f6eef6ccd 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -263,7 +263,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
ext4_group_t f;
f = ext4_flex_group(sbi, block_group);
- atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+ atomic_dec(&sbi->s_flex_groups[f].used_dirs);
}
}
@@ -773,7 +773,7 @@ static int ext4_claim_inode(struct super_block *sb,
if (sbi->s_log_groups_per_flex) {
ext4_group_t f = ext4_flex_group(sbi, group);
- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ atomic_inc(&sbi->s_flex_groups[f].used_dirs);
}
}
gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 986120f3006..11119e07233 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1035,7 +1035,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
sector_t lblock)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+ sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
int blk_bits;
if (lblock < EXT4_NDIR_BLOCKS)
@@ -1050,7 +1050,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
}
ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
ei->i_da_metadata_calc_len = 1;
- blk_bits = roundup_pow_of_two(lblock + 1);
+ blk_bits = order_base_2(lblock);
return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}
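
The blk_bits fix hinges on roundup_pow_of_two() returning a value while order_base_2() returns a bit count; the old code stored the rounded value into blk_bits, inflating the metadata estimate. A small demo of the two semantics (both reimplemented here for userspace):

#include <stdio.h>

/* Userspace stand-ins: roundup_pow_of_two() yields a value,
 * order_base_2() yields a bit count (ceil(log2(n))). */
static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;
	while (p < n)
		p <<= 1;
	return p;
}

static unsigned int order_base_2(unsigned long n)
{
	unsigned int order = 0;
	while ((1UL << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned long lblock = 5000;

	printf("roundup_pow_of_two(%lu+1) = %lu (a value, not bits)\n",
	       lblock, roundup_pow_of_two(lblock + 1));
	printf("order_base_2(%lu)        = %u  (a bit count)\n",
	       lblock, order_base_2(lblock));
	return 0;
}
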
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ba191dae873..e14d22c170d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -68,7 +68,21 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static void ext4_write_super(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mnt);
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext3",
+ .get_sb = ext4_get_sb,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#else
+#define IS_EXT3_SB(sb) (0)
+#endif
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
struct ext4_group_desc *bg)
@@ -2539,7 +2553,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* enable delayed allocation by default
* Use -o nodelalloc to turn it off
*/
- set_opt(sbi->s_mount_opt, DELALLOC);
+ if (!IS_EXT3_SB(sb))
+ set_opt(sbi->s_mount_opt, DELALLOC);
if (!parse_options((char *) data, sb, &journal_devnum,
&journal_ioprio, NULL, 0))
@@ -4068,7 +4083,7 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
}
-#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext2_fs_type = {
.owner = THIS_MODULE,
.name = "ext2",
@@ -4095,15 +4110,7 @@ static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
#endif
-#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext3_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext3",
- .get_sb = ext4_get_sb,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-
+#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static inline void register_as_ext3(void)
{
int err = register_filesystem(&ext3_fs_type);
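
Hoisting ext3_fs_type above ext4_fill_super() lets IS_EXT3_SB() recover which registered personality claimed the block device, by comparing bd_holder against the borrowed file_system_type, so delalloc stays off for ext3-compatibility mounts. A toy sketch of the holder-pointer trick (all names hypothetical):

#include <stdio.h>

/* One fill routine serves two registered names; the device
 * remembers which type claimed it. */
struct fs_type { const char *name; };

static struct fs_type ext4_type = { "ext4" };
static struct fs_type ext3_type = { "ext3" };	/* ext4 in disguise */

struct bdev { const void *holder; };
struct sb   { struct bdev *dev; int delalloc; };

static void fill_super(struct sb *sb)
{
	/* Mimics IS_EXT3_SB(): only enable delalloc for real ext4. */
	sb->delalloc = (sb->dev->holder != &ext3_type);
}

int main(void)
{
	struct bdev d3 = { .holder = &ext3_type };
	struct bdev d4 = { .holder = &ext4_type };
	struct sb s3 = { .dev = &d3 }, s4 = { .dev = &d4 };

	fill_super(&s3);
	fill_super(&s4);
	printf("ext3 delalloc=%d, ext4 delalloc=%d\n",
	       s3.delalloc, s4.delalloc);
	return 0;
}
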
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c1ef5015486..6fcc7e71fba 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
{
struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options;
wchar_t *ip, *ext_start, *end, *name_start;
- unsigned char base[9], ext[4], buf[8], *p;
+ unsigned char base[9], ext[4], buf[5], *p;
unsigned char charbuf[NLS_MAX_CHARSET_SIZE];
int chl, chi;
int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen;
@@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
return 0;
}
- i = jiffies & 0xffff;
+ i = jiffies;
sz = (jiffies >> 16) & 0x7;
if (baselen > 2) {
baselen = numtail2_baselen;
@@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls,
name_res[baselen + 4] = '~';
name_res[baselen + 5] = '1' + sz;
while (1) {
- sprintf(buf, "%04X", i);
+ snprintf(buf, sizeof(buf), "%04X", i & 0xffff);
memcpy(&name_res[baselen], buf, 4);
if (vfat_find_form(dir, name_res) < 0)
break;
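
The vfat change closes a buffer overrun: once i carries the full jiffies value, sprintf("%04X", i) can emit up to eight hex digits ("%04X" is only a minimum width), so the tail generation now masks to 16 bits and writes through snprintf() into a right-sized buffer. A standalone demonstration:

#include <stdio.h>

int main(void)
{
	unsigned long jiffies = 0x12345678UL;	/* pretend tick counter */
	char buf[5];				/* 4 hex digits + NUL */

	/* An unmasked value here would emit up to 8 digits and
	 * overflow a 5-byte buffer with sprintf(). Masking plus
	 * snprintf() bounds the write either way. */
	snprintf(buf, sizeof(buf), "%04X", (unsigned)(jiffies & 0xffff));
	printf("tail = %s\n", buf);		/* tail = 5678 */
	return 0;
}
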
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index e513ac599c8..0b589a9b4ff 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -53,7 +53,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
static void fscache_object_slow_work_put_ref(struct slow_work *);
static int fscache_object_slow_work_get_ref(struct slow_work *);
static void fscache_object_slow_work_execute(struct slow_work *);
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
#endif
static void fscache_initialise_object(struct fscache_object *);
@@ -69,7 +69,7 @@ const struct slow_work_ops fscache_object_slow_work_ops = {
.get_ref = fscache_object_slow_work_get_ref,
.put_ref = fscache_object_slow_work_put_ref,
.execute = fscache_object_slow_work_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
.desc = fscache_object_slow_work_desc,
#endif
};
@@ -364,7 +364,7 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
/*
* describe an object for slow-work debugging
*/
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
static void fscache_object_slow_work_desc(struct slow_work *work,
struct seq_file *m)
{
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 313e79a1426..9f6c928d458 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -500,7 +500,7 @@ static void fscache_op_execute(struct slow_work *work)
/*
* describe an operation for slow-work debugging
*/
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
{
struct fscache_operation *op =
@@ -517,7 +517,7 @@ const struct slow_work_ops fscache_op_slow_work_ops = {
.get_ref = fscache_op_get_ref,
.put_ref = fscache_op_put_ref,
.execute = fscache_op_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
.desc = fscache_op_desc,
#endif
};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7..69809024d71 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -881,6 +881,7 @@ submit_failed:
goto nobufs;
nobufs_unlock_obj:
+ spin_unlock(&cookie->stores_lock);
spin_unlock(&object->lock);
nobufs:
spin_unlock(&cookie->lock);
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 9718c22f186..a5d0c56d3eb 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -80,6 +80,7 @@ static void writeseg_end_io(struct bio *bio, int err)
prefetchw(&bvec->bv_page->flags);
end_page_writeback(page);
+ page_cache_release(page);
} while (bvec >= bio->bi_io_vec);
bio_put(bio);
if (atomic_dec_and_test(&super->s_pending_writes))
@@ -97,8 +98,10 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
int i;
+ if (max_pages > BIO_MAX_PAGES)
+ max_pages = BIO_MAX_PAGES;
bio = bio_alloc(GFP_NOFS, max_pages);
- BUG_ON(!bio); /* FIXME: handle this */
+ BUG_ON(!bio);
for (i = 0; i < nr_pages; i++) {
if (i >= max_pages) {
@@ -191,8 +194,10 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
int i;
+ if (max_pages > BIO_MAX_PAGES)
+ max_pages = BIO_MAX_PAGES;
bio = bio_alloc(GFP_NOFS, max_pages);
- BUG_ON(!bio); /* FIXME: handle this */
+ BUG_ON(!bio);
for (i = 0; i < nr_pages; i++) {
if (i >= max_pages) {
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 56a8bfbb012..c76b4b5c7ff 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -303,12 +303,12 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
(filler_t *)logfs_readpage, NULL);
if (IS_ERR(page))
return PTR_ERR(page);
- dd = kmap_atomic(page, KM_USER0);
+ dd = kmap(page);
BUG_ON(dd->namelen == 0);
full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
pos, be64_to_cpu(dd->ino), dd->type);
- kunmap_atomic(dd, KM_USER0);
+ kunmap(page);
page_cache_release(page);
if (full)
break;
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 6ad30a4c905..d57c7b07b60 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -800,6 +800,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
{
struct logfs_super *super = logfs_super(sb);
struct logfs_area *area = super->s_journal_area;
+ struct btree_head32 *head = &super->s_reserved_segments;
u32 segno, ec;
int i, err;
@@ -807,6 +808,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
/* Drop old segments */
journal_for_each(i)
if (super->s_journal_seg[i]) {
+ btree_remove32(head, super->s_journal_seg[i]);
logfs_set_segment_unreserved(sb,
super->s_journal_seg[i],
super->s_journal_ec[i]);
@@ -819,8 +821,13 @@ void do_logfs_journal_wl_pass(struct super_block *sb)
super->s_journal_seg[i] = segno;
super->s_journal_ec[i] = ec;
logfs_set_segment_reserved(sb, segno);
+ err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
+ BUG_ON(err); /* mempool should prevent this */
+ err = logfs_erase_segment(sb, segno, 1);
+ BUG_ON(err); /* FIXME: remount-ro would be nicer */
}
/* Manually move journal_area */
+ freeseg(sb, area->a_segno);
area->a_segno = super->s_journal_seg[0];
area->a_is_open = 0;
area->a_used_bytes = 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 12977943137..b84b0eec602 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -587,6 +587,7 @@ void move_page_to_btree(struct page *page);
int logfs_init_mapping(struct super_block *sb);
void logfs_sync_area(struct logfs_area *area);
void logfs_sync_segments(struct super_block *sb);
+void freeseg(struct super_block *sb, u32 segno);
/* area handling */
int logfs_init_areas(struct super_block *sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 7a23b3e7c0a..c3a3a6814b8 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1594,7 +1594,6 @@ int logfs_delete(struct inode *inode, pgoff_t index,
return ret;
}
-/* Rewrite cannot mark the inode dirty but has to write it immediatly. */
int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
gc_level_t gc_level, long flags)
{
@@ -1611,6 +1610,18 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
if (level != 0)
alloc_indirect_block(inode, page, 0);
err = logfs_write_buf(inode, page, flags);
+ if (!err && shrink_level(gc_level) == 0) {
+ /* Rewrite cannot mark the inode dirty but has to
+ * write it immediately.
+ * Q: Can't we just create an alias for the inode
+ * instead? And if not, why not?
+ */
+ if (inode->i_ino == LOGFS_INO_MASTER)
+ logfs_write_anchor(inode->i_sb);
+ else {
+ err = __logfs_write_inode(inode, flags);
+ }
+ }
}
logfs_put_write_page(page);
return err;
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 1a14f9910d5..0ecd8f07c11 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -93,50 +93,58 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
} while (len);
}
-/*
- * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
- */
-static void pad_wbuf(struct logfs_area *area, int final)
+static void pad_partial_page(struct logfs_area *area)
{
struct super_block *sb = area->a_sb;
- struct logfs_super *super = logfs_super(sb);
struct page *page;
u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
pgoff_t index = ofs >> PAGE_SHIFT;
long offset = ofs & (PAGE_SIZE-1);
u32 len = PAGE_SIZE - offset;
- if (len == PAGE_SIZE) {
- /* The math in this function can surely use some love */
- len = 0;
- }
- if (len) {
- BUG_ON(area->a_used_bytes >= super->s_segsize);
-
- page = get_mapping_page(area->a_sb, index, 0);
+ if (len % PAGE_SIZE) {
+ page = get_mapping_page(sb, index, 0);
BUG_ON(!page); /* FIXME: reserve a pool */
memset(page_address(page) + offset, 0xff, len);
SetPagePrivate(page);
page_cache_release(page);
}
+}
- if (!final)
- return;
+static void pad_full_pages(struct logfs_area *area)
+{
+ struct super_block *sb = area->a_sb;
+ struct logfs_super *super = logfs_super(sb);
+ u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
+ u32 len = super->s_segsize - area->a_used_bytes;
+ pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
+ pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+ struct page *page;
- area->a_used_bytes += len;
- for ( ; area->a_used_bytes < super->s_segsize;
- area->a_used_bytes += PAGE_SIZE) {
- /* Memset another page */
- index++;
- page = get_mapping_page(area->a_sb, index, 0);
+ while (no_indizes) {
+ page = get_mapping_page(sb, index, 0);
BUG_ON(!page); /* FIXME: reserve a pool */
- memset(page_address(page), 0xff, PAGE_SIZE);
+ SetPageUptodate(page);
+ memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
SetPagePrivate(page);
page_cache_release(page);
+ index++;
+ no_indizes--;
}
}
/*
+ * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
+ * Also make sure we allocate (and memset) all pages for final writeout.
+ */
+static void pad_wbuf(struct logfs_area *area, int final)
+{
+ pad_partial_page(area);
+ if (final)
+ pad_full_pages(area);
+}
+
+/*
* We have to be careful with the alias tree. Since lookup is done by bix,
* it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
* indirect blocks. So always use it through accessor functions.
@@ -683,7 +691,7 @@ int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
return 0;
}
-static void freeseg(struct super_block *sb, u32 segno)
+void freeseg(struct super_block *sb, u32 segno)
{
struct logfs_super *super = logfs_super(sb);
struct address_space *mapping = super->s_mapping_inode->i_mapping;
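
Splitting pad_wbuf() into pad_partial_page() and pad_full_pages() also cleans up the tail math: the partial pad length is PAGE_SIZE minus the in-page offset, and padding is skipped when the write already ended on a page boundary. The arithmetic in isolation:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long used_bytes[] = { 4096, 5000, 8191 };

	for (int i = 0; i < 3; i++) {
		unsigned long ofs = used_bytes[i];
		unsigned long offset = ofs & (PAGE_SIZE - 1);
		unsigned long len = PAGE_SIZE - offset;

		/* len == PAGE_SIZE means the write ended exactly on a
		 * page boundary: no partial-page pad is needed. */
		printf("ofs=%lu pad=%lu\n", ofs,
		       len % PAGE_SIZE ? len : 0);
	}
	return 0;
}
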
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index c66beab78de..9d856c49afc 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -277,7 +277,7 @@ static int logfs_recover_sb(struct super_block *sb)
}
if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
printk(KERN_INFO"Superblocks don't match - fixing.\n");
- return write_one_sb(sb, super->s_devops->find_last_sb);
+ return logfs_write_sb(sb);
}
/* If neither is valid now, something's wrong. Didn't we properly
* check them before?!? */
@@ -289,6 +289,10 @@ static int logfs_make_writeable(struct super_block *sb)
{
int err;
+ err = logfs_open_segfile(sb);
+ if (err)
+ return err;
+
/* Repair any broken superblock copies */
err = logfs_recover_sb(sb);
if (err)
@@ -299,10 +303,6 @@ static int logfs_make_writeable(struct super_block *sb)
if (err)
return err;
- err = logfs_open_segfile(sb);
- if (err)
- return err;
-
/* Do one GC pass before any data gets dirtied */
logfs_gc_pass(sb);
@@ -328,7 +328,7 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
sb->s_root = d_alloc_root(rootdir);
if (!sb->s_root)
- goto fail;
+ goto fail2;
super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
if (!super->s_erase_page)
@@ -572,8 +572,7 @@ int logfs_get_sb_device(struct file_system_type *type, int flags,
return 0;
err1:
- up_write(&sb->s_umount);
- deactivate_super(sb);
+ deactivate_locked_super(sb);
return err;
err0:
kfree(super);
diff --git a/fs/namei.c b/fs/namei.c
index 1c0fca6e899..a7dce91a7e4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1610,8 +1610,7 @@ exit:
static struct file *do_last(struct nameidata *nd, struct path *path,
int open_flag, int acc_mode,
- int mode, const char *pathname,
- int *want_dir)
+ int mode, const char *pathname)
{
struct dentry *dir = nd->path.dentry;
struct file *filp;
@@ -1642,7 +1641,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
if (nd->last.name[nd->last.len]) {
if (open_flag & O_CREAT)
goto exit;
- *want_dir = 1;
+ nd->flags |= LOOKUP_DIRECTORY;
}
/* just plain open? */
@@ -1656,8 +1655,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
if (path->dentry->d_inode->i_op->follow_link)
return NULL;
error = -ENOTDIR;
- if (*want_dir && !path->dentry->d_inode->i_op->lookup)
- goto exit_dput;
+ if (nd->flags & LOOKUP_DIRECTORY) {
+ if (!path->dentry->d_inode->i_op->lookup)
+ goto exit_dput;
+ }
path_to_nameidata(path, nd);
audit_inode(pathname, nd->path.dentry);
goto ok;
@@ -1766,7 +1767,6 @@ struct file *do_filp_open(int dfd, const char *pathname,
int count = 0;
int flag = open_to_namei_flags(open_flag);
int force_reval = 0;
- int want_dir = open_flag & O_DIRECTORY;
if (!(open_flag & O_CREAT))
mode = 0;
@@ -1828,7 +1828,9 @@ reval:
if (open_flag & O_EXCL)
nd.flags |= LOOKUP_EXCL;
}
- filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+ if (open_flag & O_DIRECTORY)
+ nd.flags |= LOOKUP_DIRECTORY;
+ filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
while (unlikely(!filp)) { /* trailing symlink */
struct path holder;
struct inode *inode = path.dentry->d_inode;
@@ -1866,7 +1868,7 @@ reval:
}
holder = path;
nd.flags &= ~LOOKUP_PARENT;
- filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname, &want_dir);
+ filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
if (inode->i_op->put_link)
inode->i_op->put_link(holder.dentry, &nd, cookie);
path_put(&holder);
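
The namei change retires the want_dir out-parameter: both O_DIRECTORY and a trailing character after the last path component now fold into the single LOOKUP_DIRECTORY bit in nd->flags. A toy sketch of the folding (flag values hypothetical):

#include <stdio.h>
#include <string.h>

#define O_DIRECTORY_DEMO	0x1
#define LOOKUP_DIRECTORY_DEMO	0x2

static int lookup_flags(const char *path, int open_flag)
{
	int flags = 0;

	/* Either source of "must be a directory" sets the same bit,
	 * instead of threading a side-channel through do_last(). */
	if (open_flag & O_DIRECTORY_DEMO)
		flags |= LOOKUP_DIRECTORY_DEMO;
	if (path[0] && path[strlen(path) - 1] == '/')
		flags |= LOOKUP_DIRECTORY_DEMO;
	return flags;
}

int main(void)
{
	printf("%d %d\n",
	       lookup_flags("/tmp/dir/", 0) & LOOKUP_DIRECTORY_DEMO,
	       lookup_flags("/tmp/file", 0) & LOOKUP_DIRECTORY_DEMO);
	return 0;
}
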
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ae8d02294e4..ae0d9273653 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -491,7 +491,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
{
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
- if (gfp & __GFP_WAIT)
+ /* Only do I/O if gfp is a superset of GFP_KERNEL */
+ if ((gfp & GFP_KERNEL) == GFP_KERNEL)
nfs_wb_page(page->mapping->host, page);
/* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page))
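
The NFS hunk replaces the __GFP_WAIT test with a superset check: being allowed to sleep is not enough for writeback, the allocation must also permit I/O and FS recursion, i.e. carry every bit in GFP_KERNEL. The mask test in isolation (flag values illustrative):

#include <stdio.h>

#define DEMO_WAIT 0x1
#define DEMO_IO   0x2
#define DEMO_FS   0x4
#define DEMO_GFP_KERNEL (DEMO_WAIT | DEMO_IO | DEMO_FS)

static int can_do_io(unsigned gfp)
{
	/* True only if gfp is a superset of GFP_KERNEL: testing
	 * DEMO_WAIT alone would wrongly allow GFP_NOFS callers. */
	return (gfp & DEMO_GFP_KERNEL) == DEMO_GFP_KERNEL;
}

int main(void)
{
	unsigned gfp_nofs = DEMO_WAIT | DEMO_IO;	/* no FS bit */

	printf("GFP_KERNEL: %d\n", can_do_io(DEMO_GFP_KERNEL));
	printf("GFP_NOFS:   %d\n", can_do_io(gfp_nofs));
	return 0;
}
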
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4d338be492c..dd17713413a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5552,6 +5552,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
if (status != 0)
goto out;
status = decode_delegreturn(&xdr);
+ if (status != 0)
+ goto out;
decode_getfattr(&xdr, res->fattr, res->server,
!RPC_IS_ASYNC(rqstp->rq_task));
out:
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 636eaafd6ea..6129a431aa3 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -323,14 +323,14 @@ int nilfs_write_logs(struct list_head *logs, struct the_nilfs *nilfs)
int nilfs_wait_on_logs(struct list_head *logs)
{
struct nilfs_segment_buffer *segbuf;
- int err;
+ int err, ret = 0;
list_for_each_entry(segbuf, logs, sb_list) {
err = nilfs_segbuf_wait(segbuf);
- if (err)
- return err;
+ if (err && !ret)
+ ret = err;
}
- return 0;
+ return ret;
}
/*
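
nilfs_wait_on_logs() now waits on every segment buffer and reports only the first failure, instead of returning early and leaving later buffers unwaited. The pattern distilled:

#include <stdio.h>

/* Always visit every element; remember only the first failure. */
static int wait_all(const int *errs, int n)
{
	int ret = 0;

	for (int i = 0; i < n; i++) {
		int err = errs[i];	/* stand-in for a segbuf wait */

		if (err && !ret)
			ret = err;	/* keep first error, keep going */
	}
	return ret;
}

int main(void)
{
	int errs[] = { 0, -5, 0, -12 };

	printf("ret=%d\n", wait_all(errs, 4));	/* -5, but all waited */
	return 0;
}
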
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 69576a95e13..c161d89061b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1510,6 +1510,12 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
break;
+ nilfs_clear_logs(&sci->sc_segbufs);
+
+ err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+ if (unlikely(err))
+ return err;
+
if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
sci->sc_freesegs,
@@ -1517,12 +1523,6 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
NULL);
WARN_ON(err); /* do not happen */
}
- nilfs_clear_logs(&sci->sc_segbufs);
-
- err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
- if (unlikely(err))
- return err;
-
nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
sci->sc_stage = prev_stage;
}
@@ -1897,8 +1897,7 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
list_splice_tail_init(&sci->sc_write_logs, &logs);
ret = nilfs_wait_on_logs(&logs);
- if (ret)
- nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret);
+ nilfs_abort_logs(&logs, NULL, sci->sc_super_root, ret ? : err);
list_splice_tail_init(&sci->sc_segbufs, &logs);
nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 0501974bedd..8ccf0f8c9cc 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -30,6 +30,8 @@
#include "alloc.h"
#include "dlmglue.h"
#include "file.h"
+#include "inode.h"
+#include "journal.h"
#include "ocfs2_fs.h"
#include "xattr.h"
@@ -166,6 +168,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
}
/*
+ * Helper function to set i_mode in memory and disk. Some call paths
+ * will not have di_bh or a journal handle to pass, in which case it
+ * will create its own.
+ */
+static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
+ handle_t *handle, umode_t new_mode)
+{
+ int ret, commit_handle = 0;
+ struct ocfs2_dinode *di;
+
+ if (di_bh == NULL) {
+ ret = ocfs2_read_inode_block(inode, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else
+ get_bh(di_bh);
+
+ if (handle == NULL) {
+ handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_brelse;
+ }
+
+ commit_handle = 1;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ inode->i_mode = new_mode;
+ di->i_mode = cpu_to_le16(inode->i_mode);
+
+ ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+ if (commit_handle)
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out_brelse:
+ brelse(di_bh);
+out:
+ return ret;
+}
+
+/*
* Set the access or default ACL of an inode.
*/
static int ocfs2_set_acl(handle_t *handle,
@@ -193,9 +249,14 @@ static int ocfs2_set_acl(handle_t *handle,
if (ret < 0)
return ret;
else {
- inode->i_mode = mode;
if (ret == 0)
acl = NULL;
+
+ ret = ocfs2_acl_set_mode(inode, di_bh,
+ handle, mode);
+ if (ret)
+ return ret;
+
}
}
break;
@@ -283,6 +344,7 @@ int ocfs2_init_acl(handle_t *handle,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct posix_acl *acl = NULL;
int ret = 0;
+ mode_t mode;
if (!S_ISLNK(inode->i_mode)) {
if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
@@ -291,12 +353,17 @@ int ocfs2_init_acl(handle_t *handle,
if (IS_ERR(acl))
return PTR_ERR(acl);
}
- if (!acl)
- inode->i_mode &= ~current_umask();
+ if (!acl) {
+ mode = inode->i_mode & ~current_umask();
+ ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
}
if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
struct posix_acl *clone;
- mode_t mode;
if (S_ISDIR(inode->i_mode)) {
ret = ocfs2_set_acl(handle, inode, di_bh,
@@ -313,7 +380,7 @@ int ocfs2_init_acl(handle_t *handle,
mode = inode->i_mode;
ret = posix_acl_create_masq(clone, &mode);
if (ret >= 0) {
- inode->i_mode = mode;
+ ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
if (ret > 0) {
ret = ocfs2_set_acl(handle, inode,
di_bh, ACL_TYPE_ACCESS,
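
ocfs2_acl_set_mode() centralizes the journaled i_mode update; callers that already hold di_bh and a transaction handle pass them in, and everyone else gets a private handle that the helper commits itself (note the get_bh() balancing the unconditional brelse()). A userspace analogue of the borrow-or-create handle setup (all names hypothetical):

#include <stdio.h>

/* Borrow the caller's transaction handle when one is passed,
 * otherwise open and commit a private one, so every call path
 * journals the change exactly once. */
static void set_mode(int *caller_handle, int new_mode)
{
	int own = (caller_handle == NULL);
	int private_handle = 42;	/* pretend ocfs2_start_trans() */
	int *handle = own ? &private_handle : caller_handle;

	printf("mode %o journaled via handle %d%s\n", new_mode, *handle,
	       own ? " (private, committed here)" : " (borrowed)");
}

int main(void)
{
	int h = 7;

	set_mode(&h, 0644);	/* caller commits later */
	set_mode(NULL, 0755);	/* helper commits itself */
	return 0;
}
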
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a659606dcb9..9289b4357d2 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1875,7 +1875,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
ok:
spin_unlock(&res->spinlock);
}
- spin_unlock(&dlm->spinlock);
// mlog(0, "woo! got an assert_master from node %u!\n",
// assert->node_idx);
@@ -1926,7 +1925,6 @@ ok:
/* master is known, detach if not already detached.
* ensures that only one assert_master call will happen
* on this mle. */
- spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
rr = atomic_read(&mle->mle_refs.refcount);
@@ -1959,7 +1957,6 @@ ok:
__dlm_put_mle(mle);
}
spin_unlock(&dlm->master_lock);
- spin_unlock(&dlm->spinlock);
} else if (res) {
if (res->owner != assert->node_idx) {
mlog(0, "assert_master from %u, but current "
@@ -1967,6 +1964,7 @@ ok:
res->owner, namelen, name);
}
}
+ spin_unlock(&dlm->spinlock);
done:
ret = 0;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 278a223aae1..ab207901d32 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -891,6 +891,21 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
/* Do some basic inode verification... */
di = (struct ocfs2_dinode *) di_bh->b_data;
if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+ /*
+ * Inodes in the orphan dir must have ORPHANED_FL. The only
+ * inodes that come back out of the orphan dir are reflink
+ * targets. A reflink target may be moved out of the orphan
+ * dir between the time we scan the directory and the time we
+ * process it. This would lead to HAS_REFCOUNT_FL being set but
+ * ORPHANED_FL not.
+ */
+ if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
+ mlog(0, "Reflinked inode %llu is no longer orphaned. "
+ "it shouldn't be deleted\n",
+ (unsigned long long)oi->ip_blkno);
+ goto bail;
+ }
+
/* for lack of a better error? */
status = -EEXIST;
mlog(ML_ERROR,
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ca992d91f51..c983715d8d8 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -872,8 +872,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
(unsigned long long)la_start_blk,
(unsigned long long)blkno);
- status = ocfs2_free_clusters(handle, main_bm_inode,
- main_bm_bh, blkno, count);
+ status = ocfs2_release_clusters(handle,
+ main_bm_inode,
+ main_bm_bh, blkno,
+ count);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -984,8 +986,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
}
retry_enospc:
- (*ac)->ac_bits_wanted = osb->local_alloc_bits;
-
+ (*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
if (status == -ENOSPC) {
if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1061,6 +1062,7 @@ retry_enospc:
OCFS2_LA_DISABLED)
goto bail;
+ ac->ac_bits_wanted = osb->local_alloc_default_bits;
status = ocfs2_claim_clusters(osb, handle, ac,
osb->local_alloc_bits,
&cluster_off,
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 544ac624517..b5cb3ede940 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (__mandatory_lock(inode))
+ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
return -ENOLCK;
return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d9cd4e373a5..b1eb50ae409 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
static int ocfs2_orphan_add(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct ocfs2_dinode *fe,
+ struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
struct inode *orphan_dir_inode);
@@ -879,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir,
fe = (struct ocfs2_dinode *) fe_bh->b_data;
if (inode_is_unlinkable(inode)) {
- status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
+ status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name,
&orphan_insert, orphan_dir);
if (status < 0) {
mlog_errno(status);
@@ -1300,7 +1300,7 @@ static int ocfs2_rename(struct inode *old_dir,
if (S_ISDIR(new_inode->i_mode) ||
(ocfs2_read_links_count(newfe) == 1)) {
status = ocfs2_orphan_add(osb, handle, new_inode,
- newfe, orphan_name,
+ newfe_bh, orphan_name,
&orphan_insert, orphan_dir);
if (status < 0) {
mlog_errno(status);
@@ -1911,7 +1911,7 @@ leave:
static int ocfs2_orphan_add(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct ocfs2_dinode *fe,
+ struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
struct inode *orphan_dir_inode)
@@ -1919,6 +1919,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
struct buffer_head *orphan_dir_bh = NULL;
int status = 0;
struct ocfs2_dinode *orphan_fe;
+ struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
@@ -1959,6 +1960,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
goto leave;
}
+ /*
+ * We're going to journal the change of i_flags and i_orphaned_slot.
+ * It's safe anyway, though some callers may duplicate the journaling.
+ * Journaling within the func just make the logic look more
+ * straightforward.
+ */
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(inode),
+ fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
/* Record which orphan dir our inode now resides
@@ -1966,6 +1982,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
* dir to lock. */
fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
+ ocfs2_journal_dirty(handle, fe_bh);
+
mlog(0, "Inode %llu orphaned in slot %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
@@ -2123,7 +2141,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
}
di = (struct ocfs2_dinode *)new_di_bh->b_data;
- status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name,
+ status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
&orphan_insert, orphan_dir);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 1238b491db9..adf5e2ebc2c 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -763,8 +763,18 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
}
-#define ocfs2_set_bit ext2_set_bit
-#define ocfs2_clear_bit ext2_clear_bit
+static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
+{
+ ext2_set_bit(bit, bitmap);
+}
+#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
+
+static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
+{
+ ext2_clear_bit(bit, bitmap);
+}
+#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
+
#define ocfs2_test_bit ext2_test_bit
#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
#define ocfs2_find_next_bit ext2_find_next_bit
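
Plain #define aliases have no address, so they cannot be handed to the undo_fn-taking suballoc code further down; the new static inline wrappers give ocfs2 real functions (_ocfs2_set_bit/_ocfs2_clear_bit) to pass by pointer, while the macros preserve the old loosely-typed call sites. The macro-versus-function-pointer point in miniature:

#include <stdio.h>

/* A macro has no address; an inline wrapper does, so it can be
 * passed as a callback (the undo_fn in the suballoc rework). */
#define raw_set_bit(nr, map)	(*(map) |= 1UL << (nr))

static inline void _set_bit(unsigned int nr, unsigned long *map)
{
	raw_set_bit(nr, map);
}

static void apply(void (*fn)(unsigned int, unsigned long *),
		  unsigned int nr, unsigned long *map)
{
	fn(nr, map);
}

int main(void)
{
	unsigned long map = 0;

	/* apply(raw_set_bit, ...) would not compile: a macro is not
	 * a value. The wrapper is. */
	apply(_set_bit, 3, &map);
	printf("map=%#lx\n", map);
	return 0;
}
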
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9e96921dffd..29405f2ff61 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4075,6 +4075,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
spin_unlock(&OCFS2_I(t_inode)->ip_lock);
i_size_write(t_inode, size);
+ t_inode->i_blocks = s_inode->i_blocks;
di->i_xattr_inline_size = s_di->i_xattr_inline_size;
di->i_clusters = s_di->i_clusters;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c3c60bc3e07..19ba00f2854 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -95,13 +95,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
struct buffer_head *group_bh,
unsigned int bit_off,
unsigned int num_bits);
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits);
-
static int ocfs2_relink_block_group(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *fe_bh,
@@ -152,7 +145,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
#define do_error(fmt, ...) \
do{ \
- if (clean_error) \
+ if (resize) \
mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
else \
ocfs2_error(sb, fmt, ##__VA_ARGS__); \
@@ -160,7 +153,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
static int ocfs2_validate_gd_self(struct super_block *sb,
struct buffer_head *bh,
- int clean_error)
+ int resize)
{
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -211,7 +204,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
static int ocfs2_validate_gd_parent(struct super_block *sb,
struct ocfs2_dinode *di,
struct buffer_head *bh,
- int clean_error)
+ int resize)
{
unsigned int max_bits;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
@@ -233,8 +226,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
return -EINVAL;
}
- if (le16_to_cpu(gd->bg_chain) >=
- le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
+ /* During resize, bg_chain may legitimately equal cl_next_free_rec. */
+ if ((le16_to_cpu(gd->bg_chain) >
+ le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
+ ((le16_to_cpu(gd->bg_chain) ==
+ le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
do_error("Group descriptor #%llu has bad chain %u",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
@@ -1975,18 +1971,18 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
bits_wanted, cluster_start, num_clusters);
}
-static inline int ocfs2_block_group_clear_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits)
+static int ocfs2_block_group_clear_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bmap))
{
int status;
unsigned int tmp;
- int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
struct ocfs2_group_desc *undo_bg = NULL;
- int cluster_bitmap = 0;
mlog_entry_void();
@@ -1996,20 +1992,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-
+ BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
- group_bh, journal_type);
+ group_bh,
+ undo_fn ?
+ OCFS2_JOURNAL_ACCESS_UNDO :
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- cluster_bitmap = 1;
-
- if (cluster_bitmap) {
+ if (undo_fn) {
jbd_lock_bh_state(group_bh);
undo_bg = (struct ocfs2_group_desc *)
bh2jh(group_bh)->b_committed_data;
@@ -2020,13 +2014,13 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
while(tmp--) {
ocfs2_clear_bit((bit_off + tmp),
(unsigned long *) bg->bg_bitmap);
- if (cluster_bitmap)
- ocfs2_set_bit(bit_off + tmp,
- (unsigned long *) undo_bg->bg_bitmap);
+ if (undo_fn)
+ undo_fn(bit_off + tmp,
+ (unsigned long *) undo_bg->bg_bitmap);
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
- if (cluster_bitmap)
+ if (undo_fn)
jbd_unlock_bh_state(group_bh);
status = ocfs2_journal_dirty(handle, group_bh);
@@ -2039,12 +2033,14 @@ bail:
/*
* expects the suballoc inode to already be locked.
*/
-int ocfs2_free_suballoc_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct buffer_head *alloc_bh,
- unsigned int start_bit,
- u64 bg_blkno,
- unsigned int count)
+static int _ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bitmap))
{
int status = 0;
u32 tmp_used;
@@ -2079,7 +2075,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
status = ocfs2_block_group_clear_bits(handle, alloc_inode,
group, group_bh,
- start_bit, count);
+ start_bit, count, undo_fn);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -2110,6 +2106,17 @@ bail:
return status;
}
+int ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count)
+{
+ return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
+ start_bit, bg_blkno, count, NULL);
+}
+
int ocfs2_free_dinode(handle_t *handle,
struct inode *inode_alloc_inode,
struct buffer_head *inode_alloc_bh,
@@ -2123,11 +2130,13 @@ int ocfs2_free_dinode(handle_t *handle,
inode_alloc_bh, bit, bg_blkno, 1);
}
-int ocfs2_free_clusters(handle_t *handle,
- struct inode *bitmap_inode,
- struct buffer_head *bitmap_bh,
- u64 start_blk,
- unsigned int num_clusters)
+static int _ocfs2_free_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters,
+ void (*undo_fn)(unsigned int bit,
+ unsigned long *bitmap))
{
int status;
u16 bg_start_bit;
@@ -2154,9 +2163,9 @@ int ocfs2_free_clusters(handle_t *handle,
mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
(unsigned long long)bg_blkno, bg_start_bit);
- status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
- bg_start_bit, bg_blkno,
- num_clusters);
+ status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
+ bg_start_bit, bg_blkno,
+ num_clusters, undo_fn);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -2170,6 +2179,32 @@ out:
return status;
}
+int ocfs2_free_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters)
+{
+ return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+ start_blk, num_clusters,
+ _ocfs2_set_bit);
+}
+
+/*
+ * Give never-used clusters back to the global bitmap. We don't need
+ * to protect these bits in the undo buffer.
+ */
+int ocfs2_release_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters)
+{
+ return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
+ start_blk, num_clusters,
+ _ocfs2_clear_bit);
+}
+
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
{
printk("Block Group:\n");
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index fa60723c43e..e0f46df357e 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -127,6 +127,11 @@ int ocfs2_free_clusters(handle_t *handle,
struct buffer_head *bitmap_bh,
u64 start_blk,
unsigned int num_clusters);
+int ocfs2_release_clusters(handle_t *handle,
+ struct inode *bitmap_inode,
+ struct buffer_head *bitmap_bh,
+ u64 start_blk,
+ unsigned int num_clusters);
static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
{
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1b0d386f6d..3e7773089b9 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1622,7 +1622,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc)
/* Now tell xh->xh_entries about it */
for (i = 0; i < count; i++) {
offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset);
- if (offset < namevalue_offset)
+ if (offset <= namevalue_offset)
le16_add_cpu(&xh->xh_entries[i].xe_name_offset,
namevalue_size);
}
@@ -6528,13 +6528,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
int indexed)
{
int ret;
- struct ocfs2_alloc_context *meta_ac;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_xattr_set_ctxt ctxt = {
- .meta_ac = meta_ac,
- };
+ struct ocfs2_xattr_set_ctxt ctxt;
- ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+ memset(&ctxt, 0, sizeof(ctxt));
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -6556,7 +6554,7 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
ocfs2_commit_trans(osb, ctxt.handle);
out:
- ocfs2_free_alloc_context(meta_ac);
+ ocfs2_free_alloc_context(ctxt.meta_ac);
return ret;
}
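The hunk above fixes an uninitialized-variable bug: the designated initializer copied meta_ac into the context before ocfs2_reserve_new_metadata_blocks() ever assigned it. A minimal userspace sketch of the bug class, with hypothetical names:

        #include <stdio.h>
        #include <string.h>

        struct ctxt { int *meta_ac; };

        int main(void)
        {
                int *meta_ac;                             /* never assigned */
                struct ctxt bad = { .meta_ac = meta_ac }; /* BUG: copies garbage now */

                struct ctxt good;
                memset(&good, 0, sizeof(good)); /* zero first ... */
                /* ... then let the reservation routine fill good.meta_ac */

                (void)bad;
                printf("good.meta_ac = %p\n", (void *)good.meta_ac);
                return 0;
        }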
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef066..90be97f1f5a 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
*/
#include <asm/unaligned.h>
-#define SYS_IND(p) (get_unaligned(&p->sys_ind))
-#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
- le32_to_cpu(__a); \
- })
+#define SYS_IND(p) get_unaligned(&p->sys_ind)
-#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \
- le32_to_cpu(__a); \
- })
+static inline sector_t nr_sects(struct partition *p)
+{
+ return (sector_t)get_unaligned_le32(&p->nr_sects);
+}
+
+static inline sector_t start_sect(struct partition *p)
+{
+ return (sector_t)get_unaligned_le32(&p->start_sect);
+}
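The new typed helpers widen the on-disk 32-bit values to sector_t so later multiplications by sector_size cannot truncate on large devices. A self-contained sketch of the kind of unaligned little-endian load that get_unaligned_le32() performs (a portable stand-in, not the kernel implementation):

        #include <stdint.h>
        #include <stdio.h>

        /* Portable unaligned little-endian 32-bit load, byte by byte. */
        static uint32_t load_le32(const void *p)
        {
                const uint8_t *b = p;
                return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
                       (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
        }

        int main(void)
        {
                /* MBR partition entries start at offset 446 and are not
                 * 4-byte aligned, hence the unaligned accessors. */
                uint8_t raw[] = { 0x00, 0x08, 0x00, 0x00 }; /* 2048 in LE */
                uint64_t nr_sects = load_le32(raw);         /* widen like sector_t */
                printf("%llu\n", (unsigned long long)nr_sects);
                return 0;
        }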
static inline int is_extended_partition(struct partition *p)
{
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
static void
parse_extended(struct parsed_partitions *state, struct block_device *bdev,
- u32 first_sector, u32 first_size)
+ sector_t first_sector, sector_t first_size)
{
struct partition *p;
Sector sect;
unsigned char *data;
- u32 this_sector, this_size;
- int sector_size = bdev_logical_block_size(bdev) / 512;
+ sector_t this_sector, this_size;
+ sector_t sector_size = bdev_logical_block_size(bdev) / 512;
int loopct = 0; /* number of links followed
without finding a data partition */
int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
* First process the data partition(s)
*/
for (i=0; i<4; i++, p++) {
- u32 offs, size, next;
- if (!NR_SECTS(p) || is_extended_partition(p))
+ sector_t offs, size, next;
+ if (!nr_sects(p) || is_extended_partition(p))
continue;
/* Check the 3rd and 4th entries -
these sometimes contain random garbage */
- offs = START_SECT(p)*sector_size;
- size = NR_SECTS(p)*sector_size;
+ offs = start_sect(p)*sector_size;
+ size = nr_sects(p)*sector_size;
next = this_sector + offs;
if (i >= 2) {
if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
*/
p -= 4;
for (i=0; i<4; i++, p++)
- if (NR_SECTS(p) && is_extended_partition(p))
+ if (nr_sects(p) && is_extended_partition(p))
break;
if (i == 4)
goto done; /* nothing left to do */
- this_sector = first_sector + START_SECT(p) * sector_size;
- this_size = NR_SECTS(p) * sector_size;
+ this_sector = first_sector + start_sect(p) * sector_size;
+ this_size = nr_sects(p) * sector_size;
put_dev_sector(sect);
}
done:
@@ -197,7 +200,7 @@ done:
static void
parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_SOLARIS_X86_PARTITION
Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
*/
static void
parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin, char *flavour,
+ sector_t offset, sector_t size, int origin, char *flavour,
int max_partitions)
{
Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
if (le16_to_cpu(l->d_npartitions) < max_partitions)
max_partitions = le16_to_cpu(l->d_npartitions);
for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
- u32 bsd_start, bsd_size;
+ sector_t bsd_start, bsd_size;
if (state->next == state->limit)
break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
static void
parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
static void
parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
static void
parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
*/
static void
parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_UNIXWARE_DISKLABEL
Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
if (p->s_label != UNIXWARE_FS_UNUSED)
put_partition(state, state->next++,
- START_SECT(p), NR_SECTS(p));
+ le32_to_cpu(p->start_sect),
+ le32_to_cpu(p->nr_sects));
p++;
}
put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
*/
static void
parse_minix(struct parsed_partitions *state, struct block_device *bdev,
- u32 offset, u32 size, int origin)
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_MINIX_SUBPARTITION
Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
/* add each partition in use */
if (SYS_IND(p) == MINIX_PARTITION)
put_partition(state, state->next++,
- START_SECT(p), NR_SECTS(p));
+ start_sect(p), nr_sects(p));
}
printk(" >\n");
}
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
static struct {
unsigned char id;
void (*parse)(struct parsed_partitions *, struct block_device *,
- u32, u32, int);
+ sector_t, sector_t, int);
} subtypes[] = {
{FREEBSD_PARTITION, parse_freebsd},
{NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
{
- int sector_size = bdev_logical_block_size(bdev) / 512;
+ sector_t sector_size = bdev_logical_block_size(bdev) / 512;
Sector sect;
unsigned char *data;
struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
state->next = 5;
for (slot = 1 ; slot <= 4 ; slot++, p++) {
- u32 start = START_SECT(p)*sector_size;
- u32 size = NR_SECTS(p)*sector_size;
+ sector_t start = start_sect(p)*sector_size;
+ sector_t size = nr_sects(p)*sector_size;
if (!size)
continue;
if (is_extended_partition(p)) {
- /* prevent someone doing mkfs or mkswap on an
- extended partition, but leave room for LILO */
- put_partition(state, slot, start, size == 1 ? 1 : 2);
+ /*
+ * prevent someone doing mkfs or mkswap on an
+ * extended partition, but leave room for LILO
+ * FIXME: this uses one logical sector for sector sizes
+ * larger than 512b, although it may not be enough/proper.
+ */
+ sector_t n = 2;
+ n = min(size, max(sector_size, n));
+ put_partition(state, slot, start, n);
+
printk(" <");
parse_extended(state, bdev, start, size);
printk(" >");
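A worked example of the clamp above, assuming a 4KiB logical block size: sector_size = 4096 / 512 = 8, so the stub entry reserves min(size, max(8, 2)) = 8 sectors instead of the old hard-coded 2, unless the extended partition itself is smaller.

        #include <stdio.h>

        #define min(a, b) ((a) < (b) ? (a) : (b))
        #define max(a, b) ((a) > (b) ? (a) : (b))

        int main(void)
        {
                unsigned long long size = 1000000;           /* partition sectors */
                unsigned long long sector_size = 4096 / 512; /* 4KiB logical blocks */
                unsigned long long n = 2;

                n = min(size, max(sector_size, n));
                printf("stub = %llu sectors\n", n); /* prints 8 */
                return 0;
        }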
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
unsigned char id = SYS_IND(p);
int n;
- if (!NR_SECTS(p))
+ if (!nr_sects(p))
continue;
for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
if (!subtypes[n].parse)
continue;
- subtypes[n].parse(state, bdev, START_SECT(p)*sector_size,
- NR_SECTS(p)*sector_size, slot);
+ subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
+ nr_sects(p)*sector_size, slot);
}
put_dev_sector(sect);
return 1;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a7310841c83..b1f6e62773d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -442,12 +442,13 @@ static const struct file_operations proc_lstats_operations = {
unsigned long badness(struct task_struct *p, unsigned long uptime);
static int proc_oom_score(struct task_struct *task, char *buffer)
{
- unsigned long points;
+ unsigned long points = 0;
struct timespec uptime;
do_posix_clock_monotonic_gettime(&uptime);
read_lock(&tasklist_lock);
- points = badness(task->group_leader, uptime.tv_sec);
+ if (pid_alive(task))
+ points = badness(task, uptime.tv_sec);
read_unlock(&tasklist_lock);
return sprintf(buffer, "%lu\n", points);
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4..b442dac8f5f 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -490,7 +490,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
}
read_unlock(&kclist_lock);
- if (m == NULL) {
+ if (&m->list == &kclist_head) {
if (clear_user(buffer, tsz))
return -EFAULT;
} else if (is_vmalloc_or_module_addr((void *)start)) {
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d4..113386d6fd2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
+ kiocb.ki_nbytes = len;
for (;;) {
ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
+ kiocb.ki_nbytes = len;
for (;;) {
ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ba98546fabb..f3de5e8a2ae 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2217,6 +2217,15 @@ static int journal_read_transaction(struct super_block *sb,
brelse(d_bh);
return 1;
}
+
+ if (bdev_read_only(sb->s_bdev)) {
+ reiserfs_warning(sb, "clm-2076",
+ "device is readonly, unable to replay log");
+ brelse(c_bh);
+ brelse(d_bh);
+ return -EROFS;
+ }
+
trans_id = get_desc_trans_id(desc);
/* now we know we've got a good transaction, and it was inside the valid time ranges */
log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2468,6 @@ static int journal_read(struct super_block *sb)
goto start_log_replay;
}
- if (continue_replay && bdev_read_only(sb->s_bdev)) {
- reiserfs_warning(sb, "clm-2076",
- "device is readonly, unable to replay log");
- return -1;
- }
-
/* ok, there are transactions that need to be replayed. start with the first log block, find
** all the valid transactions, and pick out the oldest.
*/
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd3..de1fcffd906 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -76,7 +76,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
return error;
}
- if (sec->length) {
+ if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
blocks = reiserfs_xattr_jcreate_nblocks(inode) +
reiserfs_xattr_nblocks(inode, sec->length);
/* We don't want to count the directories twice if we have
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index 3a4767c01c5..4f7b44866b7 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -65,6 +65,8 @@
#define ACPI_VIDEO_HID "LNXVIDEO"
#define ACPI_BAY_HID "LNXIOBAY"
#define ACPI_DOCK_HID "LNXDOCK"
+/* Quirk for broken IBM BIOSes */
+#define ACPI_SMBUS_IBM_HID "SMBUSIBM"
/*
* For fixed hardware buttons, we fabricate acpi_devices with HID
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 4a3c4e44102..de2f82efb15 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -1545,39 +1545,7 @@ static __inline__ void drm_core_dropmap(struct drm_local_map *map)
{
}
-
-static __inline__ void *drm_calloc_large(size_t nmemb, size_t size)
-{
- if (size != 0 && nmemb > ULONG_MAX / size)
- return NULL;
-
- if (size * nmemb <= PAGE_SIZE)
- return kcalloc(nmemb, size, GFP_KERNEL);
-
- return __vmalloc(size * nmemb,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
-}
-
-/* Modeled after cairo's malloc_ab, it's like calloc but without the zeroing. */
-static __inline__ void *drm_malloc_ab(size_t nmemb, size_t size)
-{
- if (size != 0 && nmemb > ULONG_MAX / size)
- return NULL;
-
- if (size * nmemb <= PAGE_SIZE)
- return kmalloc(nmemb * size, GFP_KERNEL);
-
- return __vmalloc(size * nmemb,
- GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
-}
-
-static __inline void drm_free_large(void *ptr)
-{
- if (!is_vmalloc_addr(ptr))
- return kfree(ptr);
-
- vfree(ptr);
-}
+#include "drm_mem_util.h"
/*@}*/
#endif /* __KERNEL__ */
diff --git a/include/drm/drm_mem_util.h b/include/drm/drm_mem_util.h
new file mode 100644
index 00000000000..6bd325fedc8
--- /dev/null
+++ b/include/drm/drm_mem_util.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Jesse Barnes <jbarnes@virtuousgeek.org>
+ *
+ */
+#ifndef _DRM_MEM_UTIL_H_
+#define _DRM_MEM_UTIL_H_
+
+#include <linux/vmalloc.h>
+
+static __inline__ void *drm_calloc_large(size_t nmemb, size_t size)
+{
+ if (size != 0 && nmemb > ULONG_MAX / size)
+ return NULL;
+
+ if (size * nmemb <= PAGE_SIZE)
+ return kcalloc(nmemb, size, GFP_KERNEL);
+
+ return __vmalloc(size * nmemb,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+}
+
+/* Modeled after cairo's malloc_ab, it's like calloc but without the zeroing. */
+static __inline__ void *drm_malloc_ab(size_t nmemb, size_t size)
+{
+ if (size != 0 && nmemb > ULONG_MAX / size)
+ return NULL;
+
+ if (size * nmemb <= PAGE_SIZE)
+ return kmalloc(nmemb * size, GFP_KERNEL);
+
+ return __vmalloc(size * nmemb,
+ GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+}
+
+static __inline void drm_free_large(void *ptr)
+{
+ if (!is_vmalloc_addr(ptr))
+ return kfree(ptr);
+
+ vfree(ptr);
+}
+
+#endif
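The relocated helpers guard the nmemb * size multiplication before deciding between the slab and vmalloc. A hedged userspace analogue of the same overflow check (SIZE_MAX standing in for ULONG_MAX):

        #include <stdint.h>
        #include <stdlib.h>
        #include <stdio.h>

        /* malloc-like wrapper with the same overflow guard as drm_malloc_ab(). */
        static void *malloc_ab(size_t nmemb, size_t size)
        {
                if (size != 0 && nmemb > SIZE_MAX / size)
                        return NULL; /* nmemb * size would wrap */
                return malloc(nmemb * size);
        }

        int main(void)
        {
                void *p = malloc_ab((size_t)-1, 8); /* would overflow: NULL */
                printf("%p\n", p);
                free(p);
                return 0;
        }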
diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h
index 676104b7818..04a6ebc27b9 100644
--- a/include/drm/drm_pciids.h
+++ b/include/drm/drm_pciids.h
@@ -410,6 +410,7 @@
{0x1002, 0x9712, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \
{0x1002, 0x9713, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \
{0x1002, 0x9714, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \
+ {0x1002, 0x9715, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \
{0, 0, 0}
#define r128_PCI_IDS \
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index e3f1b4a4b60..e929c27ede2 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -115,7 +115,6 @@ struct ttm_backend {
struct ttm_backend_func *func;
};
-#define TTM_PAGE_FLAG_VMALLOC (1 << 0)
#define TTM_PAGE_FLAG_USER (1 << 1)
#define TTM_PAGE_FLAG_USER_DIRTY (1 << 2)
#define TTM_PAGE_FLAG_WRITE (1 << 3)
diff --git a/include/linux/circ_buf.h b/include/linux/circ_buf.h
index a2ed0591fb1..90f2471dc6f 100644
--- a/include/linux/circ_buf.h
+++ b/include/linux/circ_buf.h
@@ -1,3 +1,7 @@
+/*
+ * See Documentation/circular-buffers.txt for more information.
+ */
+
#ifndef _LINUX_CIRC_BUF_H
#define _LINUX_CIRC_BUF_H 1
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 0cf725bdd2a..fc53492b6ad 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -73,6 +73,7 @@ enum clock_event_nofitiers {
* @list: list head for the management code
* @mode: operating mode assigned by the management code
* @next_event: local storage for the next event in oneshot mode
+ * @retries: number of forced programming retries
*/
struct clock_event_device {
const char *name;
@@ -93,6 +94,7 @@ struct clock_event_device {
struct list_head list;
enum clock_event_mode mode;
ktime_t next_event;
+ unsigned long retries;
};
/*
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index cac84b00666..5f494b46509 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -565,17 +565,17 @@ enum {
static inline int ext3_test_inode_state(struct inode *inode, int bit)
{
- return test_bit(bit, &EXT3_I(inode)->i_state);
+ return test_bit(bit, &EXT3_I(inode)->i_state_flags);
}
static inline void ext3_set_inode_state(struct inode *inode, int bit)
{
- set_bit(bit, &EXT3_I(inode)->i_state);
+ set_bit(bit, &EXT3_I(inode)->i_state_flags);
}
static inline void ext3_clear_inode_state(struct inode *inode, int bit)
{
- clear_bit(bit, &EXT3_I(inode)->i_state);
+ clear_bit(bit, &EXT3_I(inode)->i_state_flags);
}
#else
/* Assume that user mode programs are passing in an ext3fs superblock, not
diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h
index 7679acdb519..f42c098aed8 100644
--- a/include/linux/ext3_fs_i.h
+++ b/include/linux/ext3_fs_i.h
@@ -87,7 +87,7 @@ struct ext3_inode_info {
* near to their parent directory's inode.
*/
__u32 i_block_group;
- unsigned long i_state; /* Dynamic state flags for ext3 */
+ unsigned long i_state_flags; /* Dynamic state flags for ext3 */
/* block reservation info */
struct ext3_block_alloc_info *i_block_alloc_info;
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 7be0c6fbe88..c57db27ac86 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -105,7 +105,7 @@ struct fscache_operation {
/* operation releaser */
fscache_operation_release_t release;
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
const char *name; /* operation name */
const char *state; /* operation state */
#define fscache_set_op_name(OP, N) do { (OP)->name = (N); } while(0)
diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h
index 1822d635be6..16b92d008be 100644
--- a/include/linux/if_tunnel.h
+++ b/include/linux/if_tunnel.h
@@ -2,6 +2,7 @@
#define _IF_TUNNEL_H_
#include <linux/types.h>
+#include <asm/byteorder.h>
#ifdef __KERNEL__
#include <linux/ip.h>
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 71ab79da7e7..26fad187d66 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -112,12 +112,14 @@ struct resource_list {
extern struct resource ioport_resource;
extern struct resource iomem_resource;
+extern struct resource *request_resource_conflict(struct resource *root, struct resource *new);
extern int request_resource(struct resource *root, struct resource *new);
extern int release_resource(struct resource *new);
void release_child_resources(struct resource *new);
extern void reserve_region_with_split(struct resource *root,
resource_size_t start, resource_size_t end,
const char *name);
+extern struct resource *insert_resource_conflict(struct resource *parent, struct resource *new);
extern int insert_resource(struct resource *parent, struct resource *new);
extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new);
extern int allocate_resource(struct resource *root, struct resource *new,
diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index bc0fc795bd3..ece0b1c3381 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -102,8 +102,6 @@ union { \
unsigned char name##kfifo_buffer[size]; \
struct kfifo name = __kfifo_initializer(size, name##kfifo_buffer)
-#undef __kfifo_initializer
-
extern void kfifo_init(struct kfifo *fifo, void *buffer,
unsigned int size);
extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size,
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index c02c8db7370..8a49cbf0376 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -268,6 +268,7 @@ struct _mmc_csd {
#define EXT_CSD_CARD_TYPE_26 (1<<0) /* Card can run at 26MHz */
#define EXT_CSD_CARD_TYPE_52 (1<<1) /* Card can run at 52MHz */
+#define EXT_CSD_CARD_TYPE_MASK 0x3 /* Mask out reserved and DDR bits */
#define EXT_CSD_BUS_WIDTH_1 0 /* Card is in 1 bit mode */
#define EXT_CSD_BUS_WIDTH_4 1 /* Card is in 4 bit mode */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c79a88be7c3..fa8b4763799 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2059,12 +2059,12 @@ static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
* duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
* ARP on active-backup slaves with arp_validate enabled.
*/
-static inline int skb_bond_should_drop(struct sk_buff *skb)
+static inline int skb_bond_should_drop(struct sk_buff *skb,
+ struct net_device *master)
{
- struct net_device *dev = skb->dev;
- struct net_device *master = dev->master;
-
if (master) {
+ struct net_device *dev = skb->dev;
+
if (master->priv_flags & IFF_MASTER_ARPMON)
dev->last_rx = jiffies;
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 53923868c9b..361d6b5630e 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -76,7 +76,7 @@ extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);
extern int nfnetlink_has_listeners(struct net *net, unsigned int group);
extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group,
int echo, gfp_t flags);
-extern void nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error);
+extern int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error);
extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags);
extern void nfnl_lock(void);
diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index d654873aa25..1f7e300094c 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -59,6 +59,7 @@
enum nf_ip6_hook_priorities {
NF_IP6_PRI_FIRST = INT_MIN,
NF_IP6_PRI_CONNTRACK_DEFRAG = -400,
+ NF_IP6_PRI_RAW = -300,
NF_IP6_PRI_SELINUX_FIRST = -225,
NF_IP6_PRI_CONNTRACK = -200,
NF_IP6_PRI_MANGLE = -150,
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index fde27c01732..6eaca5e1e8c 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -188,7 +188,7 @@ extern int netlink_has_listeners(struct sock *sk, unsigned int group);
extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock);
extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid,
__u32 group, gfp_t allocation);
-extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code);
+extern int netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code);
extern int netlink_register_notifier(struct notifier_block *nb);
extern int netlink_unregister_notifier(struct notifier_block *nb);
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 3024050c82a..872a98e13d6 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -123,22 +123,11 @@ static inline int rcu_read_lock_held(void)
return lock_is_held(&rcu_lock_map);
}
-/**
- * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
- *
- * If CONFIG_PROVE_LOCKING is selected and enabled, returns nonzero iff in
- * an RCU-bh read-side critical section. In absence of CONFIG_PROVE_LOCKING,
- * this assumes we are in an RCU-bh read-side critical section unless it can
- * prove otherwise.
- *
- * Check rcu_scheduler_active to prevent false positives during boot.
+/*
+ * rcu_read_lock_bh_held() is defined out of line to avoid #include-file
+ * hell.
*/
-static inline int rcu_read_lock_bh_held(void)
-{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- return lock_is_held(&rcu_bh_lock_map);
-}
+extern int rcu_read_lock_bh_held(void);
/**
* rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section?
@@ -160,7 +149,7 @@ static inline int rcu_read_lock_sched_held(void)
return 1;
if (debug_locks)
lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
- return lockdep_opinion || preempt_count() != 0;
+ return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
}
#else /* #ifdef CONFIG_PREEMPT */
static inline int rcu_read_lock_sched_held(void)
@@ -191,7 +180,7 @@ static inline int rcu_read_lock_bh_held(void)
#ifdef CONFIG_PREEMPT
static inline int rcu_read_lock_sched_held(void)
{
- return !rcu_scheduler_active || preempt_count() != 0;
+ return !rcu_scheduler_active || preempt_count() != 0 || irqs_disabled();
}
#else /* #ifdef CONFIG_PREEMPT */
static inline int rcu_read_lock_sched_held(void)
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index 99928dce37e..7fa02b4af83 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -70,6 +70,11 @@ int reiserfs_security_write(struct reiserfs_transaction_handle *th,
void reiserfs_security_free(struct reiserfs_security_handle *sec);
#endif
+static inline int reiserfs_xattrs_initialized(struct super_block *sb)
+{
+ return REISERFS_SB(sb)->priv_root != NULL;
+}
+
#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
{
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index 1b177d29a7f..193d4bfe42f 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -2,7 +2,9 @@
#define __LINUX_SERIAL_SCI_H
#include <linux/serial_core.h>
+#ifdef CONFIG_SERIAL_SH_SCI_DMA
#include <asm/dmaengine.h>
+#endif
/*
* Generic header for SuperH SCI(F) (used by sh/sh64/h8300 and related parts)
@@ -30,8 +32,10 @@ struct plat_sci_port {
upf_t flags; /* UPF_* flags */
char *clk; /* clock string */
struct device *dma_dev;
+#ifdef CONFIG_SERIAL_SH_SCI_DMA
enum sh_dmae_slave_chan_id dma_slave_tx;
enum sh_dmae_slave_chan_id dma_slave_rx;
+#endif
};
#endif /* __LINUX_SERIAL_SCI_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 03f816a9b65..124f90cd5a3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -190,9 +190,6 @@ struct skb_shared_info {
atomic_t dataref;
unsigned short nr_frags;
unsigned short gso_size;
-#ifdef CONFIG_HAS_DMA
- dma_addr_t dma_head;
-#endif
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
unsigned short gso_type;
@@ -201,9 +198,6 @@ struct skb_shared_info {
struct sk_buff *frag_list;
struct skb_shared_hwtstamps hwtstamps;
skb_frag_t frags[MAX_SKB_FRAGS];
-#ifdef CONFIG_HAS_DMA
- dma_addr_t dma_maps[MAX_SKB_FRAGS];
-#endif
/* Intermediate layers must ensure that destructor_arg
* remains valid until skb destructor */
void * destructor_arg;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 7b3aae2052a..354cc5617f8 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -255,6 +255,7 @@ struct ucred {
#define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */
#define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */
#define MSG_MORE 0x8000 /* Sender will send more */
+#define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */
#define MSG_EOF MSG_FIN
diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index d7152b451e2..7c91260c44a 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -36,7 +36,6 @@ struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt);
void xprt_free_bc_request(struct rpc_rqst *req);
int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
void xprt_destroy_backchannel(struct rpc_xprt *, int max_reqs);
-void bc_release_request(struct rpc_task *);
int bc_send(struct rpc_rqst *req);
/*
@@ -59,6 +58,10 @@ static inline int svc_is_backchannel(const struct svc_rqst *rqstp)
{
return 0;
}
+
+static inline void xprt_free_bc_request(struct rpc_rqst *req)
+{
+}
#endif /* CONFIG_NFS_V4_1 */
#endif /* _LINUX_SUNRPC_BC_XPRT_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f994ae58a00..057929b0a65 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -688,7 +688,7 @@ asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg);
asmlinkage long sys_shmget(key_t key, size_t size, int flag);
asmlinkage long sys_shmdt(char __user *shmaddr);
asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
-asmlinkage long sys_ipc(unsigned int call, int first, int second,
+asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second,
unsigned long third, void __user *ptr, long fifth);
asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr);
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index f59604ed0ec..78b4bd3be49 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -49,7 +49,7 @@ struct tracepoint {
void **it_func; \
\
rcu_read_lock_sched_notrace(); \
- it_func = rcu_dereference((tp)->funcs); \
+ it_func = rcu_dereference_sched((tp)->funcs); \
if (it_func) { \
do { \
((void(*)(proto))(*it_func))(args); \
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 568369a8630..4409967db0c 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -70,12 +70,13 @@ struct tty_buffer {
/*
* We default to dicing tty buffer allocations to this many characters
- * in order to avoid multiple page allocations. We assume tty_buffer itself
- * is under 256 bytes. See tty_buffer_find for the allocation logic this
- * must match
+ * in order to avoid multiple page allocations. We know the size of
+ * tty_buffer itself, but it must also be taken into account that
+ * the buffer is 256 byte aligned. See tty_buffer_find for the
+ * allocation logic this must match.
*/
-#define TTY_BUFFER_PAGE ((PAGE_SIZE - 256) / 2)
+#define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~0xFF)
struct tty_bufhead {
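A worked instance of the new TTY_BUFFER_PAGE formula, assuming PAGE_SIZE is 4096 and, purely for illustration, sizeof(struct tty_buffer) is 64: (4096 - 64) / 2 = 2016, and masking with ~0xFF rounds that down to 1792 so the 256-byte alignment still holds.

        #include <stdio.h>

        int main(void)
        {
                unsigned long page = 4096;
                unsigned long hdr  = 64; /* hypothetical sizeof(struct tty_buffer) */
                unsigned long n    = ((page - hdr) / 2) & ~0xFFUL;

                printf("TTY_BUFFER_PAGE = %lu\n", n); /* 1792 with these numbers */
                return 0;
        }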
@@ -223,6 +224,7 @@ struct tty_port {
wait_queue_head_t close_wait; /* Close waiters */
wait_queue_head_t delta_msr_wait; /* Modem status change */
unsigned long flags; /* TTY flags ASY_*/
+ unsigned char console:1; /* port is a console */
struct mutex mutex; /* Locking */
struct mutex buf_mutex; /* Buffer alloc lock */
unsigned char *xmit_buf; /* Optional buffer */
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 8c9f053111b..ce1323c4e47 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1055,7 +1055,8 @@ typedef void (*usb_complete_t)(struct urb *);
* @number_of_packets: Lists the number of ISO transfer buffers.
* @interval: Specifies the polling interval for interrupt or isochronous
* transfers. The units are frames (milliseconds) for full and low
- * speed devices, and microframes (1/8 millisecond) for highspeed ones.
+ * speed devices, and microframes (1/8 millisecond) for highspeed
+ * and SuperSpeed devices.
* @error_count: Returns the number of ISO transfers that reported errors.
* @context: For use in completion functions. This normally points to
* request-specific driver context.
@@ -1286,9 +1287,16 @@ static inline void usb_fill_bulk_urb(struct urb *urb,
*
* Initializes an interrupt urb with the proper information needed to submit
* it to a device.
- * Note that high speed interrupt endpoints use a logarithmic encoding of
- * the endpoint interval, and express polling intervals in microframes
- * (eight per millisecond) rather than in frames (one per millisecond).
+ *
+ * Note that High Speed and SuperSpeed interrupt endpoints use a logarithmic
+ * encoding of the endpoint interval, and express polling intervals in
+ * microframes (eight per millisecond) rather than in frames (one per
+ * millisecond).
+ *
+ * Wireless USB also uses the logarithmic encoding, but specifies it in units of
+ * 128us instead of 125us. For Wireless USB devices, the interval is passed
+ * through to the host controller, rather than being translated into microframe
+ * units.
*/
static inline void usb_fill_int_urb(struct urb *urb,
struct usb_device *dev,
@@ -1305,7 +1313,7 @@ static inline void usb_fill_int_urb(struct urb *urb,
urb->transfer_buffer_length = buffer_length;
urb->complete = complete_fn;
urb->context = context;
- if (dev->speed == USB_SPEED_HIGH)
+ if (dev->speed == USB_SPEED_HIGH || dev->speed == USB_SPEED_SUPER)
urb->interval = 1 << (interval - 1);
else
urb->interval = interval;
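A sketch of the interval encoding that the fixed usb_fill_int_urb() now applies to SuperSpeed devices as well (simplified stand-ins, not the USB core types): high-speed and SuperSpeed endpoints use the logarithmic form, so interval = 4 yields 1 << 3 = 8 microframes, i.e. one millisecond.

        #include <stdio.h>

        enum speed { SPEED_FULL, SPEED_HIGH, SPEED_SUPER };

        static int urb_interval(enum speed s, int interval)
        {
                /* logarithmic encoding for HS/SS, linear frames otherwise */
                if (s == SPEED_HIGH || s == SPEED_SUPER)
                        return 1 << (interval - 1);
                return interval;
        }

        int main(void)
        {
                printf("SS interval 4 -> %d microframes\n",
                       urb_interval(SPEED_SUPER, 4)); /* 8 (= 1 ms) */
                printf("FS interval 4 -> %d frames\n",
                       urb_interval(SPEED_FULL, 4));  /* 4 */
                return 0;
        }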
diff --git a/include/linux/vt.h b/include/linux/vt.h
index 778b7b2a47d..d5dd0bc408f 100644
--- a/include/linux/vt.h
+++ b/include/linux/vt.h
@@ -27,7 +27,7 @@ struct vt_mode {
#define VT_SETMODE 0x5602 /* set mode of active vt */
#define VT_AUTO 0x00 /* auto vt switching */
#define VT_PROCESS 0x01 /* process controls switching */
-#define VT_PROCESS_AUTO 0x02 /* process is notified of switching */
+#define VT_ACKACQ 0x02 /* acknowledge switch */
struct vt_stat {
unsigned short v_active; /* active vt */
@@ -38,7 +38,6 @@ struct vt_stat {
#define VT_SENDSIG 0x5604 /* signal to send to bitmask of vts */
#define VT_RELDISP 0x5605 /* release display */
-#define VT_ACKACQ 0x02 /* acknowledge switch */
#define VT_ACTIVATE 0x5606 /* make vt active */
#define VT_WAITACTIVE 0x5607 /* wait for vt active */
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 04a6908e38d..ff77e8f882f 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -176,6 +176,6 @@ extern void hci_sock_cleanup(void);
extern int bt_sysfs_init(void);
extern void bt_sysfs_cleanup(void);
-extern struct class *bt_class;
+extern struct dentry *bt_debugfs;
#endif /* __BLUETOOTH_H */
diff --git a/include/net/netlink.h b/include/net/netlink.h
index f82e463c875..4fc05b58503 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -945,7 +945,11 @@ static inline u64 nla_get_u64(const struct nlattr *nla)
*/
static inline __be64 nla_get_be64(const struct nlattr *nla)
{
- return *(__be64 *) nla_data(nla);
+ __be64 tmp;
+
+ nla_memcpy(&tmp, nla, sizeof(tmp));
+
+ return tmp;
}
/**
diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h
index 32896a77391..2e488b60bc7 100644
--- a/include/pcmcia/ss.h
+++ b/include/pcmcia/ss.h
@@ -277,12 +277,6 @@ extern struct pccard_resource_ops pccard_nonstatic_ops;
#endif
-/* socket drivers are expected to use these callbacks in their .drv struct */
-extern int pcmcia_socket_dev_suspend(struct device *dev);
-extern void pcmcia_socket_dev_early_resume(struct device *dev);
-extern void pcmcia_socket_dev_late_resume(struct device *dev);
-extern int pcmcia_socket_dev_resume(struct device *dev);
-
/* socket drivers use this callback in their IRQ handler */
extern void pcmcia_parse_events(struct pcmcia_socket *socket,
unsigned int events);
diff --git a/init/main.c b/init/main.c
index a1ab78ceb4b..cbead27caef 100644
--- a/init/main.c
+++ b/init/main.c
@@ -858,7 +858,7 @@ static int __init kernel_init(void * unused)
/*
* init can allocate pages on any node
*/
- set_mems_allowed(node_possible_map);
+ set_mems_allowed(node_states[N_HIGH_MEMORY]);
/*
* init can run on any cpu.
*/
diff --git a/ipc/syscall.c b/ipc/syscall.c
index 355a3da9ec7..1d6f53f6b56 100644
--- a/ipc/syscall.c
+++ b/ipc/syscall.c
@@ -13,7 +13,7 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
-SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, int, second,
+SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second,
unsigned long, third, void __user *, ptr, long, fifth)
{
int version, ret;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ef909a32975..e2769e13980 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -27,7 +27,6 @@
*/
#include <linux/cgroup.h>
-#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/fs.h>
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459..d10946748ec 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* call to guarantee_online_mems(), as we know no one is changing
* our task's cpuset.
*
- * Hold callback_mutex around the two modifications of our tasks
- * mems_allowed to synchronize with cpuset_mems_allowed().
- *
* While the mm_struct we are migrating is typically from some
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
@@ -973,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- nodemask_t newmems;
+ NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
+
+ if (!newmems)
+ return;
cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, &newmems);
+ guarantee_online_mems(cs, newmems);
task_lock(p);
- cpuset_change_task_nodemask(p, &newmems);
+ cpuset_change_task_nodemask(p, newmems);
task_unlock(p);
+ NODEMASK_FREE(newmems);
+
mm = get_task_mm(p);
if (!mm)
return;
@@ -1051,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- nodemask_t oldmem;
+ NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
int retval;
struct ptr_heap heap;
+ if (!oldmem)
+ return -ENOMEM;
+
/*
* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
* it's read-only
*/
- if (cs == &top_cpuset)
- return -EACCES;
+ if (cs == &top_cpuset) {
+ retval = -EACCES;
+ goto done;
+ }
/*
* An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_HIGH_MEMORY]))
- return -EINVAL;
+ node_states[N_HIGH_MEMORY])) {
+ retval = -EINVAL;
+ goto done;
+ }
}
- oldmem = cs->mems_allowed;
- if (nodes_equal(oldmem, trialcs->mems_allowed)) {
+ *oldmem = cs->mems_allowed;
+ if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
@@ -1096,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(cs, &oldmem, &heap);
+ update_tasks_nodemask(cs, oldmem, &heap);
heap_free(&heap);
done:
+ NODEMASK_FREE(oldmem);
return retval;
}
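update_nodemask() and its callers now heap-allocate nodemask_t because it can be large with big CONFIG_NODES_SHIFT values and would otherwise eat kernel stack. A minimal sketch of the alloc / single-exit / free shape that the NODEMASK_ALLOC()/NODEMASK_FREE() conversion imposes (userspace stand-ins, not the kernel macros):

        #include <stdlib.h>
        #include <string.h>

        typedef struct { unsigned long bits[64]; } nodemask_t; /* large on purpose */

        static int update_mask(const nodemask_t *newmask)
        {
                nodemask_t *oldmem = malloc(sizeof(*oldmem)); /* NODEMASK_ALLOC */
                int retval = 0;

                if (!oldmem)
                        return -12; /* -ENOMEM */

                memcpy(oldmem, newmask, sizeof(*oldmem));
                /* ... failing work sets retval and falls through to the exit ... */

                free(oldmem); /* NODEMASK_FREE: every path goes through here */
                return retval;
        }

        int main(void)
        {
                nodemask_t m = { { 0 } };
                return update_mask(&m) ? 1 : 0;
        }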
@@ -1384,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct cgroup *oldcont, struct task_struct *tsk,
bool threadgroup)
{
- nodemask_t from, to;
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
+ NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
+ NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
+
+ if (from == NULL || to == NULL)
+ goto alloc_fail;
if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
- to = node_possible_map;
} else {
guarantee_online_cpus(cs, cpus_attach);
- guarantee_online_mems(cs, &to);
}
+ guarantee_online_mems(cs, to);
/* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, &to, cs);
+ cpuset_attach_task(tsk, to, cs);
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, &to, cs);
+ cpuset_attach_task(c, to, cs);
}
rcu_read_unlock();
}
/* change mm; only needs to be done once even if threadgroup */
- from = oldcs->mems_allowed;
- to = cs->mems_allowed;
+ *from = oldcs->mems_allowed;
+ *to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, &to);
+ mpol_rebind_mm(mm, to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &from, &to);
+ cpuset_migrate_mm(mm, from, to);
mmput(mm);
}
+
+alloc_fail:
+ NODEMASK_FREE(from);
+ NODEMASK_FREE(to);
}
/* The various types of files and directories in a cpuset file system */
@@ -1562,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
- nodemask_t mask;
+ NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
+ int retval;
+
+ if (mask == NULL)
+ return -ENOMEM;
mutex_lock(&callback_mutex);
- mask = cs->mems_allowed;
+ *mask = cs->mems_allowed;
mutex_unlock(&callback_mutex);
- return nodelist_scnprintf(page, PAGE_SIZE, mask);
+ retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
+
+ NODEMASK_FREE(mask);
+
+ return retval;
}
static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1997,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
struct cgroup *cont;
- nodemask_t oldmems;
+ NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
+
+ if (oldmems == NULL)
+ return;
list_add_tail((struct list_head *)&root->stack_list, &queue);
@@ -2014,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
continue;
- oldmems = cp->mems_allowed;
+ *oldmems = cp->mems_allowed;
/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
@@ -2030,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, &oldmems, NULL);
+ update_tasks_nodemask(cp, oldmems, NULL);
}
}
+ NODEMASK_FREE(oldmems);
}
/*
@@ -2090,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
+ NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
+
+ if (oldmems == NULL)
+ return NOTIFY_DONE;
+
cgroup_lock();
switch (action) {
case MEM_ONLINE:
- case MEM_OFFLINE:
+ *oldmems = top_cpuset.mems_allowed;
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
mutex_unlock(&callback_mutex);
- if (action == MEM_OFFLINE)
- scan_for_empty_cpusets(&top_cpuset);
+ update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+ break;
+ case MEM_OFFLINE:
+ /*
+ * needn't update top_cpuset.mems_allowed explicitly because
+ * scan_for_empty_cpusets() will update it.
+ */
+ scan_for_empty_cpusets(&top_cpuset);
break;
default:
break;
}
cgroup_unlock();
+
+ NODEMASK_FREE(oldmems);
return NOTIFY_OK;
}
#endif
diff --git a/kernel/cred.c b/kernel/cred.c
index 1ed8ca18790..1b1129d0cce 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -364,7 +364,7 @@ struct cred *prepare_usermodehelper_creds(void)
new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
if (!new)
- return NULL;
+ goto free_tgcred;
kdebug("prepare_usermodehelper_creds() alloc %p", new);
@@ -397,6 +397,10 @@ struct cred *prepare_usermodehelper_creds(void)
error:
put_cred(new);
+free_tgcred:
+#ifdef CONFIG_KEYS
+ kfree(tgcred);
+#endif
return NULL;
}
diff --git a/kernel/early_res.c b/kernel/early_res.c
index 3cb2c661bb7..31aa9332ef3 100644
--- a/kernel/early_res.c
+++ b/kernel/early_res.c
@@ -333,6 +333,12 @@ void __init free_early_partial(u64 start, u64 end)
struct early_res *r;
int i;
+ if (start == end)
+ return;
+
+ if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
+ return;
+
try_next:
i = find_overlapped_early(start, end);
if (i >= max_early_res)
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 42ec11b2af8..b7091d5ca2f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
if (desc->chip->ack)
desc->chip->ack(irq);
}
+ desc->status |= IRQ_MASKED;
+}
+
+static inline void mask_irq(struct irq_desc *desc, int irq)
+{
+ if (desc->chip->mask) {
+ desc->chip->mask(irq);
+ desc->status |= IRQ_MASKED;
+ }
+}
+
+static inline void unmask_irq(struct irq_desc *desc, int irq)
+{
+ if (desc->chip->unmask) {
+ desc->chip->unmask(irq);
+ desc->status &= ~IRQ_MASKED;
+ }
}
/*
@@ -484,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
desc->status &= ~IRQ_INPROGRESS;
- if (unlikely(desc->status & IRQ_ONESHOT))
- desc->status |= IRQ_MASKED;
- else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
- desc->chip->unmask(irq);
+ if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
+ unmask_irq(desc, irq);
out_unlock:
raw_spin_unlock(&desc->lock);
}
@@ -524,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
action = desc->action;
if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
desc->status |= IRQ_PENDING;
- if (desc->chip->mask)
- desc->chip->mask(irq);
+ mask_irq(desc, irq);
goto out;
}
@@ -593,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
irqreturn_t action_ret;
if (unlikely(!action)) {
- desc->chip->mask(irq);
+ mask_irq(desc, irq);
goto out_unlock;
}
@@ -605,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
if (unlikely((desc->status &
(IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
(IRQ_PENDING | IRQ_MASKED))) {
- desc->chip->unmask(irq);
- desc->status &= ~IRQ_MASKED;
+ unmask_irq(desc, irq);
}
desc->status &= ~IRQ_PENDING;
@@ -716,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
__set_irq_handler(irq, handle, 0, name);
}
-void __init set_irq_noprobe(unsigned int irq)
+void set_irq_noprobe(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
@@ -731,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq)
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
-void __init set_irq_probe(unsigned int irq)
+void set_irq_probe(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb6078ca60c..398fda155f6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irqaction *action;
+ unsigned long flags;
if (!desc)
return 0;
@@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
if (desc->status & IRQ_NOREQUEST)
return 0;
+ raw_spin_lock_irqsave(&desc->lock, flags);
action = desc->action;
if (action)
if (irqflags & action->flags & IRQF_SHARED)
action = NULL;
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
return !action;
}
@@ -483,8 +487,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)
*/
static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
{
+again:
chip_bus_lock(irq, desc);
raw_spin_lock_irq(&desc->lock);
+
+ /*
+ * Implausible though it may be, we need to protect ourselves
+ * against the following scenario:
+ *
+ * The threaded handler can finish before the hard interrupt
+ * handler on the other CPU. If we unmask the irq line then the
+ * interrupt can come in again, mask the line, and leave due
+ * to IRQ_INPROGRESS, and the irq line is masked forever.
+ */
+ if (unlikely(desc->status & IRQ_INPROGRESS)) {
+ raw_spin_unlock_irq(&desc->lock);
+ chip_bus_sync_unlock(irq, desc);
+ cpu_relax();
+ goto again;
+ }
+
if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
desc->status &= ~IRQ_MASKED;
desc->chip->unmask(irq);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 82ed0ea1519..83911c78017 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -219,7 +219,7 @@ int kthreadd(void *unused)
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
set_cpus_allowed_ptr(tsk, cpu_all_mask);
- set_mems_allowed(node_possible_map);
+ set_mems_allowed(node_states[N_HIGH_MEMORY]);
current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 1a22dfd42df..bc7704b3a44 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1061,9 +1061,9 @@ static void check_thread_timers(struct task_struct *tsk,
}
}
-static void stop_process_timers(struct task_struct *tsk)
+static void stop_process_timers(struct signal_struct *sig)
{
- struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct thread_group_cputimer *cputimer = &sig->cputimer;
unsigned long flags;
if (!cputimer->running)
@@ -1072,6 +1072,10 @@ static void stop_process_timers(struct task_struct *tsk)
spin_lock_irqsave(&cputimer->lock, flags);
cputimer->running = 0;
spin_unlock_irqrestore(&cputimer->lock, flags);
+
+ sig->cputime_expires.prof_exp = cputime_zero;
+ sig->cputime_expires.virt_exp = cputime_zero;
+ sig->cputime_expires.sched_exp = 0;
}
static u32 onecputick;
@@ -1133,7 +1137,7 @@ static void check_process_timers(struct task_struct *tsk,
list_empty(&timers[CPUCLOCK_VIRT]) &&
cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
list_empty(&timers[CPUCLOCK_SCHED])) {
- stop_process_timers(tsk);
+ stop_process_timers(sig);
return;
}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f1125c1a632..63fe2543398 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
#include <linux/mutex.h>
#include <linux/module.h>
#include <linux/kernel_stat.h>
+#include <linux/hardirq.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
@@ -66,6 +67,28 @@ EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+/**
+ * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
+ *
+ * Check for bottom half being disabled, which covers both the
+ * CONFIG_PROVE_RCU and non-CONFIG_PROVE_RCU cases. Note that if someone uses
+ * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
+ * will show the situation.
+ *
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
+ */
+int rcu_read_lock_bh_held(void)
+{
+ if (!debug_lockdep_rcu_enabled())
+ return 1;
+ return in_softirq();
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
/*
* This function is invoked towards the end of the scheduler's initialization
* process. Before this is called, the idle task might contain
diff --git a/kernel/resource.c b/kernel/resource.c
index 2d5be5d9bf5..9c358e26353 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -219,19 +219,34 @@ void release_child_resources(struct resource *r)
}
/**
- * request_resource - request and reserve an I/O or memory resource
+ * request_resource_conflict - request and reserve an I/O or memory resource
* @root: root resource descriptor
* @new: resource descriptor desired by caller
*
- * Returns 0 for success, negative error code on error.
+ * Returns 0 for success, conflict resource on error.
*/
-int request_resource(struct resource *root, struct resource *new)
+struct resource *request_resource_conflict(struct resource *root, struct resource *new)
{
struct resource *conflict;
write_lock(&resource_lock);
conflict = __request_resource(root, new);
write_unlock(&resource_lock);
+ return conflict;
+}
+
+/**
+ * request_resource - request and reserve an I/O or memory resource
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns 0 for success, negative error code on error.
+ */
+int request_resource(struct resource *root, struct resource *new)
+{
+ struct resource *conflict;
+
+ conflict = request_resource_conflict(root, new);
return conflict ? -EBUSY : 0;
}
@@ -474,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
}
/**
- * insert_resource - Inserts a resource in the resource tree
+ * insert_resource_conflict - Inserts resource in the resource tree
* @parent: parent of the new resource
* @new: new resource to insert
*
- * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ * Returns 0 on success, conflict resource if the resource can't be inserted.
*
- * This function is equivalent to request_resource when no conflict
+ * This function is equivalent to request_resource_conflict when no conflict
* happens. If a conflict happens, and the conflicting resources
* entirely fit within the range of the new resource, then the new
* resource is inserted and the conflicting resources become children of
* the new resource.
*/
-int insert_resource(struct resource *parent, struct resource *new)
+struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
{
struct resource *conflict;
write_lock(&resource_lock);
conflict = __insert_resource(parent, new);
write_unlock(&resource_lock);
+ return conflict;
+}
+
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+ struct resource *conflict;
+
+ conflict = insert_resource_conflict(parent, new);
return conflict ? -EBUSY : 0;
}
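The new *_conflict variants return the conflicting resource instead of collapsing it to -EBUSY, so callers can report or react to the specific overlap. A hedged sketch of that calling pattern (stub types, not the kernel's struct resource):

        #include <stdio.h>

        struct resource { const char *name; };

        /* Stand-in for request_resource_conflict(): returns the conflicting
         * resource, or NULL on success (hypothetical stub). */
        static struct resource *request_resource_conflict(struct resource *root,
                                                          struct resource *new)
        {
                static struct resource busy = { "PCI mem" };
                (void)root;
                (void)new;
                return &busy;
        }

        int main(void)
        {
                struct resource root = { "iomem" }, new = { "BAR0" };
                struct resource *conflict = request_resource_conflict(&root, &new);

                if (conflict)
                        printf("%s conflicts with %s\n", new.name, conflict->name);
                return 0;
        }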
diff --git a/kernel/sched.c b/kernel/sched.c
index 9ab3cd7858d..49d2fa7b687 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2650,7 +2650,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
unsigned long flags;
struct rq *rq;
- int cpu = get_cpu();
+ int cpu __maybe_unused = get_cpu();
#ifdef CONFIG_SMP
/*
@@ -4902,7 +4902,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
int ret;
cpumask_var_t mask;
- if (len < cpumask_size())
+ if (len < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
return -EINVAL;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -4910,10 +4912,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+ size_t retlen = min_t(size_t, len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, mask, retlen))
ret = -EFAULT;
else
- ret = cpumask_size();
+ ret = retlen;
}
free_cpumask_var(mask);
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 7494bbf5a27..7d3f4fa9ef4 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
goto cancelled;
/* the timer holds a reference whilst it is pending */
- ret = work->ops->get_ref(work);
+ ret = slow_work_get_ref(work);
if (ret < 0)
goto cant_get_ref;
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d73..a29ebd1ef41 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
*/
static inline void slow_work_set_thread_pid(int id, pid_t pid)
{
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
slow_work_pids[id] = pid;
#endif
}
static inline void slow_work_mark_time(struct slow_work *work)
{
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
work->mark = CURRENT_TIME;
#endif
}
static inline void slow_work_begin_exec(int id, struct slow_work *work)
{
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
slow_work_execs[id] = work;
#endif
}
static inline void slow_work_end_exec(int id, struct slow_work *work)
{
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
write_lock(&slow_work_execs_lock);
slow_work_execs[id] = NULL;
write_unlock(&slow_work_execs_lock);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 0d4c7898ab8..4b493f67dcb 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -155,11 +155,11 @@ void softlockup_tick(void)
* Wake up the high-prio watchdog task twice per
* threshold timespan.
*/
- if (now > touch_ts + softlockup_thresh/2)
+ if (time_after(now - softlockup_thresh/2, touch_ts))
wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
/* Warn about unreasonable delays: */
- if (now <= (touch_ts + softlockup_thresh))
+ if (time_before_eq(now - softlockup_thresh, touch_ts))
return;
per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0a8a213016f..aada0e52680 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
#include "tick-internal.h"
+/* Limit min_delta to a jiffy */
+#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
+
+static int tick_increase_min_delta(struct clock_event_device *dev)
+{
+ /* Nothing to do if we already reached the limit */
+ if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
+ return -ETIME;
+
+ if (dev->min_delta_ns < 5000)
+ dev->min_delta_ns = 5000;
+ else
+ dev->min_delta_ns += dev->min_delta_ns >> 1;
+
+ if (dev->min_delta_ns > MIN_DELTA_LIMIT)
+ dev->min_delta_ns = MIN_DELTA_LIMIT;
+
+ printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
+ dev->name ? dev->name : "?",
+ (unsigned long long) dev->min_delta_ns);
+ return 0;
+}
+
/**
* tick_program_event internal worker function
*/
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
if (!ret || !force)
return ret;
+ dev->retries++;
/*
- * We tried 2 times to program the device with the given
- * min_delta_ns. If that's not working then we double it
+ * We tried 3 times to program the device with the given
+ * min_delta_ns. If that's not working then we increase it
* and emit a warning.
*/
if (++i > 2) {
/* Increase the min. delta and try again */
- if (!dev->min_delta_ns)
- dev->min_delta_ns = 5000;
- else
- dev->min_delta_ns += dev->min_delta_ns >> 1;
-
- printk(KERN_WARNING
- "CE: %s increasing min_delta_ns to %llu nsec\n",
- dev->name ? dev->name : "?",
- (unsigned long long) dev->min_delta_ns << 1);
-
+ if (tick_increase_min_delta(dev)) {
+ /*
+ * Get out of the loop if min_delta_ns
+ * hit the limit already. That's
+ * better than staying here forever.
+ *
+ * We clear next_event so we have a
+ * chance that the box survives.
+ */
+ printk(KERN_WARNING
+ "CE: Reprogramming failure. Giving up\n");
+ dev->next_event.tv64 = KTIME_MAX;
+ return -ETIME;
+ }
i = 0;
}
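A userspace model of the new backoff, for reference (HZ=250 is an assumed config value): min_delta_ns starts at 5 us, grows by 50% per failed reprogram, and is capped at one tick, after which tick_increase_min_delta() returns -ETIME and the loop bails out instead of spinning forever.

	#include <stdio.h>

	#define NSEC_PER_SEC	1000000000ULL
	#define HZ		250		/* assumed config value */
	#define MIN_DELTA_LIMIT	(NSEC_PER_SEC / HZ)

	int main(void)
	{
		unsigned long long min_delta_ns = 0;
		int step = 0;

		while (min_delta_ns < MIN_DELTA_LIMIT) {
			if (min_delta_ns < 5000)
				min_delta_ns = 5000;
			else
				min_delta_ns += min_delta_ns >> 1;
			if (min_delta_ns > MIN_DELTA_LIMIT)
				min_delta_ns = MIN_DELTA_LIMIT;
			printf("retry %2d: min_delta_ns = %llu\n", ++step,
			       min_delta_ns);
		}
		return 0;
	}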
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 16736379a9c..39f6177fafa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -818,7 +818,8 @@ void update_wall_time(void)
shift = min(shift, maxshift);
while (offset >= timekeeper.cycle_interval) {
offset = logarithmic_accumulation(offset, shift);
- shift--;
+ if (offset < (timekeeper.cycle_interval << shift))
+ shift--;
}
/* correct the clock when NTP error is too big */
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdfb8dd1050..1a4a7dd7877 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
SEQ_printf(m, " event_handler: ");
print_name_offset(m, dev->event_handler);
SEQ_printf(m, "\n");
+ SEQ_printf(m, " retries: %lu\n", dev->retries);
}
static void timer_list_show_tickdevices(struct seq_file *m)
@@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v)
u64 now = ktime_to_ns(ktime_get());
int cpu;
- SEQ_printf(m, "Timer List Version: v0.5\n");
+ SEQ_printf(m, "Timer List Version: v0.6\n");
SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
diff --git a/kernel/timer.c b/kernel/timer.c
index c61a7949387..fc965eae0e8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -880,6 +880,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
if (base->running_timer == timer)
goto out;
+ timer_stats_timer_clear_start_info(timer);
ret = 0;
if (timer_pending(timer)) {
detach_timer(timer, 1);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 05a9f83b881..d1187ef20ca 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -207,6 +207,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
+#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+# define RB_FORCE_8BYTE_ALIGNMENT 0
+# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT 1
+# define RB_ARCH_ALIGNMENT 8U
+#endif
+
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -1547,7 +1555,7 @@ rb_update_event(struct ring_buffer_event *event,
case 0:
length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA)
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
event->array[0] = length;
else
event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1722,11 +1730,11 @@ static unsigned rb_calculate_event_length(unsigned length)
if (!length)
length = 1;
- if (length > RB_MAX_SMALL_DATA)
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
length += sizeof(event.array[0]);
length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ALIGNMENT);
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
return length;
}
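RB_ALIGNMENT is 4, so on 64-bit targets without efficient unaligned loads events are now padded to 8 bytes and always carry their length in array[0]. The rounding itself is the standard power-of-two ALIGN(); a quick illustration:

	#include <stdio.h>

	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		unsigned int len;

		for (len = 1; len <= 16; len += 5)
			printf("payload %2u -> 4-byte: %2u  8-byte: %2u\n",
			       len, ALIGN(len, 4U), ALIGN(len, 8U));
		return 0;
	}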
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8e5ec5e1ab9..1fafb4b99c9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -103,7 +103,8 @@ config HEADERS_CHECK
config DEBUG_SECTION_MISMATCH
bool "Enable full Section mismatch analysis"
- depends on UNDEFINED
+ depends on UNDEFINED || (BLACKFIN)
+ default y
# This option is intentionally disabled for now.
# It will be enabled when we are down to a reasonable number
# of section mismatch warnings (< 10 for an allyesconfig build)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d7c791ef003..9b134460b01 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -180,19 +180,12 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
end_aligned = end & ~(BITS_PER_LONG - 1);
if (end_aligned <= start_aligned) {
-#if 1
- printk(KERN_DEBUG " %lx - %lx\n", start, end);
-#endif
for (i = start; i < end; i++)
__free_pages_bootmem(pfn_to_page(i), 0);
return;
}
-#if 1
- printk(KERN_DEBUG " %lx %lx - %lx %lx\n",
- start, start_aligned, end_aligned, end);
-#endif
for (i = start; i < start_aligned; i++)
__free_pages_bootmem(pfn_to_page(i), 0);
@@ -428,9 +421,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
{
#ifdef CONFIG_NO_BOOTMEM
free_early(physaddr, physaddr + size);
-#if 0
- printk(KERN_DEBUG "free %lx %lx\n", physaddr, size);
-#endif
#else
unsigned long start, end;
@@ -456,9 +446,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
{
#ifdef CONFIG_NO_BOOTMEM
free_early(addr, addr + size);
-#if 0
- printk(KERN_DEBUG "free %lx %lx\n", addr, size);
-#endif
#else
unsigned long start, end;
diff --git a/mm/ksm.c b/mm/ksm.c
index a93f1b7f508..8cdfc2a1e8b 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -751,7 +751,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* page
*/
if (page_mapcount(page) + 1 + swapped != page_count(page)) {
- set_pte_at_notify(mm, addr, ptep, entry);
+ set_pte_at(mm, addr, ptep, entry);
goto out_unlock;
}
entry = pte_wrprotect(entry);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7973b5221fb..9ed760dc744 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3691,8 +3691,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
else
mem = vmalloc(size);
- if (mem)
- memset(mem, 0, size);
+ if (!mem)
+ return NULL;
+
+ memset(mem, 0, size);
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!mem->stat) {
if (size < PAGE_SIZE)
@@ -3946,28 +3948,6 @@ one_by_one:
}
return ret;
}
-#else /* !CONFIG_MMU */
-static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
- struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
-{
- return 0;
-}
-static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
- struct cgroup *cgroup,
- struct task_struct *p,
- bool threadgroup)
-{
-}
-static void mem_cgroup_move_task(struct cgroup_subsys *ss,
- struct cgroup *cont,
- struct cgroup *old_cont,
- struct task_struct *p,
- bool threadgroup)
-{
-}
-#endif
/**
* is_target_pte_for_mc - check a pte whether it is valid for move charge
@@ -4330,6 +4310,28 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
}
mem_cgroup_clear_mc();
}
+#else /* !CONFIG_MMU */
+static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
+ struct cgroup *cgroup,
+ struct task_struct *p,
+ bool threadgroup)
+{
+ return 0;
+}
+static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
+ struct cgroup *cgroup,
+ struct task_struct *p,
+ bool threadgroup)
+{
+}
+static void mem_cgroup_move_task(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct cgroup *old_cont,
+ struct task_struct *p,
+ bool threadgroup)
+{
+}
+#endif
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
diff --git a/mm/memory.c b/mm/memory.c
index 5b7f2002e54..bc9ba5a1f5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -130,6 +130,7 @@ void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
for (i = 0; i < NR_MM_COUNTERS; i++) {
if (task->rss_stat.count[i]) {
+ BUG_ON(!mm);
add_mm_counter(mm, i, task->rss_stat.count[i]);
task->rss_stat.count[i] = 0;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 643f66e1018..8034abd3a13 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -806,9 +806,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
err = 0;
if (nmask) {
- task_lock(current);
- get_policy_nodemask(pol, nmask);
- task_unlock(current);
+ if (mpol_store_user_nodemask(pol)) {
+ *nmask = pol->w.user_nodemask;
+ } else {
+ task_lock(current);
+ get_policy_nodemask(pol, nmask);
+ task_unlock(current);
+ }
}
out:
@@ -2195,8 +2199,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
char *rest = nodelist;
while (isdigit(*rest))
rest++;
- if (!*rest)
- err = 0;
+ if (*rest)
+ goto out;
}
break;
case MPOL_INTERLEAVE:
@@ -2205,7 +2209,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
*/
if (!nodelist)
nodes = node_states[N_HIGH_MEMORY];
- err = 0;
break;
case MPOL_LOCAL:
/*
@@ -2215,11 +2218,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
goto out;
mode = MPOL_PREFERRED;
break;
-
- /*
- * case MPOL_BIND: mpol_new() enforces non-empty nodemask.
- * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
- */
+ case MPOL_DEFAULT:
+ /*
+ * Insist on an empty nodelist
+ */
+ if (!nodelist)
+ err = 0;
+ goto out;
+ case MPOL_BIND:
+ /*
+ * Insist on a nodelist
+ */
+ if (!nodelist)
+ goto out;
}
mode_flags = 0;
@@ -2233,13 +2244,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
else if (!strcmp(flags, "relative"))
mode_flags |= MPOL_F_RELATIVE_NODES;
else
- err = 1;
+ goto out;
}
new = mpol_new(mode, mode_flags, &nodes);
if (IS_ERR(new))
- err = 1;
- else {
+ goto out;
+
+ {
int ret;
NODEMASK_SCRATCH(scratch);
if (scratch) {
@@ -2250,13 +2262,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
ret = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
if (ret) {
- err = 1;
mpol_put(new);
- } else if (no_context) {
- /* save for contextualization */
- new->w.user_nodemask = nodes;
+ goto out;
}
}
+ err = 0;
+ if (no_context) {
+ /* save for contextualization */
+ new->w.user_nodemask = nodes;
+ }
out:
/* Restore string for error message */
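The rewrite converges on one error convention: err stays "failed" by default, every bad branch jumps to out, and err is cleared only after the last check passes. In miniature (illustrative, not the kernel code):

	#include <stdio.h>

	static int parse_thing(const char *s)
	{
		int err = 1;			/* assume failure */

		if (!s)
			goto out;
		if (s[0] != 'x')
			goto out;
		/* ... more validation ... */
		err = 0;			/* success only at the end */
	out:
		if (err)
			fprintf(stderr, "parse failed\n");
		return err;
	}

	int main(void)
	{
		return parse_thing("x=1");
	}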
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 0777654147c..9e82e937000 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,6 +53,7 @@ void unuse_mm(struct mm_struct *mm)
struct task_struct *tsk = current;
task_lock(tsk);
+ sync_mm_rss(tsk, mm);
tsk->mm = NULL;
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
diff --git a/mm/nommu.c b/mm/nommu.c
index 605ace8982a..63fa17d121f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -146,7 +146,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
for (i = 0; i < nr_pages; i++) {
- vma = find_extend_vma(mm, start);
+ vma = find_vma(mm, start);
if (!vma)
goto finish_or_fault;
@@ -162,7 +162,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
if (vmas)
vmas[i] = vma;
- start += PAGE_SIZE;
+ start = (start + PAGE_SIZE) & PAGE_MASK;
}
return i;
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(find_vma);
*/
struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
- return find_vma(mm, addr & PAGE_MASK);
+ return find_vma(mm, addr);
}
/*
@@ -1040,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
if (ret != -ENOSYS)
return ret;
- /* getting an ENOSYS error indicates that direct mmap isn't
- * possible (as opposed to tried but failed) so we'll fall
- * through to making a private copy of the data and mapping
- * that if we can */
+ /* getting -ENOSYS indicates that direct mmap isn't possible (as
+ * opposed to tried but failed), so we can only return a suitable error,
+ * as it's not possible to make a private copy if MAP_SHARED was given */
return -ENODEV;
}
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 453512266ea..db783d7af5a 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -378,6 +378,8 @@ static void vlan_transfer_features(struct net_device *dev,
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid;
#endif
+ vlandev->real_num_tx_queues = dev->real_num_tx_queues;
+ BUG_ON(vlandev->real_num_tx_queues > vlandev->num_tx_queues);
if (old_features != vlandev->features)
netdev_features_change(vlandev);
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index c0316e0ca6e..c584a0af77d 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -11,7 +11,7 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
if (netpoll_rx(skb))
return NET_RX_DROP;
- if (skb_bond_should_drop(skb))
+ if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
goto drop;
skb->skb_iif = skb->dev->ifindex;
@@ -83,7 +83,7 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
{
struct sk_buff *p;
- if (skb_bond_should_drop(skb))
+ if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
goto drop;
skb->skb_iif = skb->dev->ifindex;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 9e83272fc5b..2fd057c81bb 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -361,6 +361,14 @@ static netdev_tx_t vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb,
return ret;
}
+static u16 vlan_dev_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+ struct net_device *rdev = vlan_dev_info(dev)->real_dev;
+ const struct net_device_ops *ops = rdev->netdev_ops;
+
+ return ops->ndo_select_queue(rdev, skb);
+}
+
static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu)
{
/* TODO: gotta make sure the underlying layer can handle it,
@@ -688,7 +696,8 @@ static const struct header_ops vlan_header_ops = {
.parse = eth_header_parse,
};
-static const struct net_device_ops vlan_netdev_ops, vlan_netdev_accel_ops;
+static const struct net_device_ops vlan_netdev_ops, vlan_netdev_accel_ops,
+ vlan_netdev_ops_sq, vlan_netdev_accel_ops_sq;
static int vlan_dev_init(struct net_device *dev)
{
@@ -722,11 +731,17 @@ static int vlan_dev_init(struct net_device *dev)
if (real_dev->features & NETIF_F_HW_VLAN_TX) {
dev->header_ops = real_dev->header_ops;
dev->hard_header_len = real_dev->hard_header_len;
- dev->netdev_ops = &vlan_netdev_accel_ops;
+ if (real_dev->netdev_ops->ndo_select_queue)
+ dev->netdev_ops = &vlan_netdev_accel_ops_sq;
+ else
+ dev->netdev_ops = &vlan_netdev_accel_ops;
} else {
dev->header_ops = &vlan_header_ops;
dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN;
- dev->netdev_ops = &vlan_netdev_ops;
+ if (real_dev->netdev_ops->ndo_select_queue)
+ dev->netdev_ops = &vlan_netdev_ops_sq;
+ else
+ dev->netdev_ops = &vlan_netdev_ops;
}
if (is_vlan_dev(real_dev))
@@ -865,6 +880,56 @@ static const struct net_device_ops vlan_netdev_accel_ops = {
#endif
};
+static const struct net_device_ops vlan_netdev_ops_sq = {
+ .ndo_select_queue = vlan_dev_select_queue,
+ .ndo_change_mtu = vlan_dev_change_mtu,
+ .ndo_init = vlan_dev_init,
+ .ndo_uninit = vlan_dev_uninit,
+ .ndo_open = vlan_dev_open,
+ .ndo_stop = vlan_dev_stop,
+ .ndo_start_xmit = vlan_dev_hard_start_xmit,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_mac_address = vlan_dev_set_mac_address,
+ .ndo_set_rx_mode = vlan_dev_set_rx_mode,
+ .ndo_set_multicast_list = vlan_dev_set_rx_mode,
+ .ndo_change_rx_flags = vlan_dev_change_rx_flags,
+ .ndo_do_ioctl = vlan_dev_ioctl,
+ .ndo_neigh_setup = vlan_dev_neigh_setup,
+ .ndo_get_stats = vlan_dev_get_stats,
+#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
+ .ndo_fcoe_ddp_setup = vlan_dev_fcoe_ddp_setup,
+ .ndo_fcoe_ddp_done = vlan_dev_fcoe_ddp_done,
+ .ndo_fcoe_enable = vlan_dev_fcoe_enable,
+ .ndo_fcoe_disable = vlan_dev_fcoe_disable,
+ .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn,
+#endif
+};
+
+static const struct net_device_ops vlan_netdev_accel_ops_sq = {
+ .ndo_select_queue = vlan_dev_select_queue,
+ .ndo_change_mtu = vlan_dev_change_mtu,
+ .ndo_init = vlan_dev_init,
+ .ndo_uninit = vlan_dev_uninit,
+ .ndo_open = vlan_dev_open,
+ .ndo_stop = vlan_dev_stop,
+ .ndo_start_xmit = vlan_dev_hwaccel_hard_start_xmit,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_mac_address = vlan_dev_set_mac_address,
+ .ndo_set_rx_mode = vlan_dev_set_rx_mode,
+ .ndo_set_multicast_list = vlan_dev_set_rx_mode,
+ .ndo_change_rx_flags = vlan_dev_change_rx_flags,
+ .ndo_do_ioctl = vlan_dev_ioctl,
+ .ndo_neigh_setup = vlan_dev_neigh_setup,
+ .ndo_get_stats = vlan_dev_get_stats,
+#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
+ .ndo_fcoe_ddp_setup = vlan_dev_fcoe_ddp_setup,
+ .ndo_fcoe_ddp_done = vlan_dev_fcoe_ddp_done,
+ .ndo_fcoe_enable = vlan_dev_fcoe_enable,
+ .ndo_fcoe_disable = vlan_dev_fcoe_disable,
+ .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn,
+#endif
+};
+
void vlan_setup(struct net_device *dev)
{
ether_setup(dev);
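Because net_device_ops is const, the queue-selection hook cannot be patched in at runtime; instead four static ops tables exist and vlan_dev_init() picks the right one. Condensed (a sketch only, using the names from the diff):

	static void vlan_pick_ops(struct net_device *dev,
				  const struct net_device *real_dev,
				  bool hwaccel)
	{
		bool sq = real_dev->netdev_ops->ndo_select_queue != NULL;

		if (hwaccel)
			dev->netdev_ops = sq ? &vlan_netdev_accel_ops_sq :
					       &vlan_netdev_accel_ops;
		else
			dev->netdev_ops = sq ? &vlan_netdev_ops_sq :
					       &vlan_netdev_ops;
	}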
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index cafb55b0cea..05fd125f74f 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -8,8 +8,7 @@
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
-struct class *bt_class = NULL;
-EXPORT_SYMBOL_GPL(bt_class);
+static struct class *bt_class;
struct dentry *bt_debugfs = NULL;
EXPORT_SYMBOL_GPL(bt_debugfs);
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 4db7ae2fe07..7794a2e2adc 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -40,6 +40,8 @@
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/crc16.h>
#include <net/sock.h>
@@ -2830,6 +2832,11 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr
int len = cmd->len - sizeof(*rsp);
char req[64];
+ if (len > sizeof(req) - sizeof(struct l2cap_conf_req)) {
+ l2cap_send_disconn_req(conn, sk);
+ goto done;
+ }
+
/* throw out any old stored conf requests */
result = L2CAP_CONF_SUCCESS;
len = l2cap_parse_conf_rsp(sk, rsp->data,
@@ -3937,31 +3944,42 @@ drop:
return 0;
}
-static ssize_t l2cap_sysfs_show(struct class *dev,
- struct class_attribute *attr,
- char *buf)
+static int l2cap_debugfs_show(struct seq_file *f, void *p)
{
struct sock *sk;
struct hlist_node *node;
- char *str = buf;
read_lock_bh(&l2cap_sk_list.lock);
sk_for_each(sk, node, &l2cap_sk_list.head) {
struct l2cap_pinfo *pi = l2cap_pi(sk);
- str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
- sk->sk_state, __le16_to_cpu(pi->psm), pi->scid,
- pi->dcid, pi->imtu, pi->omtu, pi->sec_level);
+ seq_printf(f, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d\n",
+ batostr(&bt_sk(sk)->src),
+ batostr(&bt_sk(sk)->dst),
+ sk->sk_state, __le16_to_cpu(pi->psm),
+ pi->scid, pi->dcid,
+ pi->imtu, pi->omtu, pi->sec_level);
}
read_unlock_bh(&l2cap_sk_list.lock);
- return str - buf;
+ return 0;
}
-static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL);
+static int l2cap_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, l2cap_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations l2cap_debugfs_fops = {
+ .open = l2cap_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *l2cap_debugfs;
static const struct proto_ops l2cap_sock_ops = {
.family = PF_BLUETOOTH,
@@ -4021,8 +4039,12 @@ static int __init l2cap_init(void)
goto error;
}
- if (class_create_file(bt_class, &class_attr_l2cap) < 0)
- BT_ERR("Failed to create L2CAP info file");
+ if (bt_debugfs) {
+ l2cap_debugfs = debugfs_create_file("l2cap", 0444,
+ bt_debugfs, NULL, &l2cap_debugfs_fops);
+ if (!l2cap_debugfs)
+ BT_ERR("Failed to create L2CAP debug file");
+ }
BT_INFO("L2CAP ver %s", VERSION);
BT_INFO("L2CAP socket layer initialized");
@@ -4036,7 +4058,7 @@ error:
static void __exit l2cap_exit(void)
{
- class_remove_file(bt_class, &class_attr_l2cap);
+ debugfs_remove(l2cap_debugfs);
if (bt_sock_unregister(BTPROTO_L2CAP) < 0)
BT_ERR("L2CAP socket unregistration failed");
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index db8a68e1a5b..13f114e8b0f 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -33,6 +33,8 @@
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/net.h>
#include <linux/mutex.h>
#include <linux/kthread.h>
@@ -2098,13 +2100,10 @@ static struct hci_cb rfcomm_cb = {
.security_cfm = rfcomm_security_cfm
};
-static ssize_t rfcomm_dlc_sysfs_show(struct class *dev,
- struct class_attribute *attr,
- char *buf)
+static int rfcomm_dlc_debugfs_show(struct seq_file *f, void *x)
{
struct rfcomm_session *s;
struct list_head *pp, *p;
- char *str = buf;
rfcomm_lock();
@@ -2114,18 +2113,32 @@ static ssize_t rfcomm_dlc_sysfs_show(struct class *dev,
struct sock *sk = s->sock->sk;
struct rfcomm_dlc *d = list_entry(pp, struct rfcomm_dlc, list);
- str += sprintf(str, "%s %s %ld %d %d %d %d\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
- d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits);
+ seq_printf(f, "%s %s %ld %d %d %d %d\n",
+ batostr(&bt_sk(sk)->src),
+ batostr(&bt_sk(sk)->dst),
+ d->state, d->dlci, d->mtu,
+ d->rx_credits, d->tx_credits);
}
}
rfcomm_unlock();
- return (str - buf);
+ return 0;
}
-static CLASS_ATTR(rfcomm_dlc, S_IRUGO, rfcomm_dlc_sysfs_show, NULL);
+static int rfcomm_dlc_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rfcomm_dlc_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations rfcomm_dlc_debugfs_fops = {
+ .open = rfcomm_dlc_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rfcomm_dlc_debugfs;
/* ---- Initialization ---- */
static int __init rfcomm_init(void)
@@ -2142,8 +2155,12 @@ static int __init rfcomm_init(void)
goto unregister;
}
- if (class_create_file(bt_class, &class_attr_rfcomm_dlc) < 0)
- BT_ERR("Failed to create RFCOMM info file");
+ if (bt_debugfs) {
+ rfcomm_dlc_debugfs = debugfs_create_file("rfcomm_dlc", 0444,
+ bt_debugfs, NULL, &rfcomm_dlc_debugfs_fops);
+ if (!rfcomm_dlc_debugfs)
+ BT_ERR("Failed to create RFCOMM debug file");
+ }
err = rfcomm_init_ttys();
if (err < 0)
@@ -2171,7 +2188,7 @@ unregister:
static void __exit rfcomm_exit(void)
{
- class_remove_file(bt_class, &class_attr_rfcomm_dlc);
+ debugfs_remove(rfcomm_dlc_debugfs);
hci_unregister_cb(&rfcomm_cb);
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index ca87d6ac6a2..7f439765403 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -40,6 +40,8 @@
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <net/sock.h>
#include <asm/system.h>
@@ -1061,28 +1063,38 @@ done:
return result;
}
-static ssize_t rfcomm_sock_sysfs_show(struct class *dev,
- struct class_attribute *attr,
- char *buf)
+static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p)
{
struct sock *sk;
struct hlist_node *node;
- char *str = buf;
read_lock_bh(&rfcomm_sk_list.lock);
sk_for_each(sk, node, &rfcomm_sk_list.head) {
- str += sprintf(str, "%s %s %d %d\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
+ seq_printf(f, "%s %s %d %d\n",
+ batostr(&bt_sk(sk)->src),
+ batostr(&bt_sk(sk)->dst),
sk->sk_state, rfcomm_pi(sk)->channel);
}
read_unlock_bh(&rfcomm_sk_list.lock);
- return (str - buf);
+ return 0;
}
-static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL);
+static int rfcomm_sock_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rfcomm_sock_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations rfcomm_sock_debugfs_fops = {
+ .open = rfcomm_sock_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rfcomm_sock_debugfs;
static const struct proto_ops rfcomm_sock_ops = {
.family = PF_BLUETOOTH,
@@ -1122,8 +1134,12 @@ int __init rfcomm_init_sockets(void)
if (err < 0)
goto error;
- if (class_create_file(bt_class, &class_attr_rfcomm) < 0)
- BT_ERR("Failed to create RFCOMM info file");
+ if (bt_debugfs) {
+ rfcomm_sock_debugfs = debugfs_create_file("rfcomm", 0444,
+ bt_debugfs, NULL, &rfcomm_sock_debugfs_fops);
+ if (!rfcomm_sock_debugfs)
+ BT_ERR("Failed to create RFCOMM debug file");
+ }
BT_INFO("RFCOMM socket layer initialized");
@@ -1137,7 +1153,7 @@ error:
void rfcomm_cleanup_sockets(void)
{
- class_remove_file(bt_class, &class_attr_rfcomm);
+ debugfs_remove(rfcomm_sock_debugfs);
if (bt_sock_unregister(BTPROTO_RFCOMM) < 0)
BT_ERR("RFCOMM socket layer unregistration failed");
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index f93b939539b..e5b16b76b22 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -38,6 +38,8 @@
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/list.h>
#include <net/sock.h>
@@ -953,28 +955,36 @@ drop:
return 0;
}
-static ssize_t sco_sysfs_show(struct class *dev,
- struct class_attribute *attr,
- char *buf)
+static int sco_debugfs_show(struct seq_file *f, void *p)
{
struct sock *sk;
struct hlist_node *node;
- char *str = buf;
read_lock_bh(&sco_sk_list.lock);
sk_for_each(sk, node, &sco_sk_list.head) {
- str += sprintf(str, "%s %s %d\n",
- batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
- sk->sk_state);
+ seq_printf(f, "%s %s %d\n", batostr(&bt_sk(sk)->src),
+ batostr(&bt_sk(sk)->dst), sk->sk_state);
}
read_unlock_bh(&sco_sk_list.lock);
- return (str - buf);
+ return 0;
}
-static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL);
+static int sco_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, sco_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations sco_debugfs_fops = {
+ .open = sco_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *sco_debugfs;
static const struct proto_ops sco_sock_ops = {
.family = PF_BLUETOOTH,
@@ -1032,8 +1042,12 @@ static int __init sco_init(void)
goto error;
}
- if (class_create_file(bt_class, &class_attr_sco) < 0)
- BT_ERR("Failed to create SCO info file");
+ if (bt_debugfs) {
+ sco_debugfs = debugfs_create_file("sco", 0444,
+ bt_debugfs, NULL, &sco_debugfs_fops);
+ if (!sco_debugfs)
+ BT_ERR("Failed to create SCO debug file");
+ }
BT_INFO("SCO (Voice Link) ver %s", VERSION);
BT_INFO("SCO socket layer initialized");
@@ -1047,7 +1061,7 @@ error:
static void __exit sco_exit(void)
{
- class_remove_file(bt_class, &class_attr_sco);
+ debugfs_remove(sco_debugfs);
if (bt_sock_unregister(BTPROTO_SCO) < 0)
BT_ERR("SCO socket unregistration failed");
diff --git a/net/core/dev.c b/net/core/dev.c
index bcc490cc945..59d4394d2ce 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2483,6 +2483,7 @@ int netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
struct net_device *orig_dev;
+ struct net_device *master;
struct net_device *null_or_orig;
struct net_device *null_or_bond;
int ret = NET_RX_DROP;
@@ -2503,11 +2504,12 @@ int netif_receive_skb(struct sk_buff *skb)
null_or_orig = NULL;
orig_dev = skb->dev;
- if (orig_dev->master) {
- if (skb_bond_should_drop(skb))
+ master = ACCESS_ONCE(orig_dev->master);
+ if (master) {
+ if (skb_bond_should_drop(skb, master))
null_or_orig = orig_dev; /* deliver only exact match */
else
- skb->dev = orig_dev->master;
+ skb->dev = master;
}
__get_cpu_var(netdev_rx_stat).total++;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index d4ec38fa64e..6f9206b36dc 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -614,7 +614,7 @@ void netpoll_print_options(struct netpoll *np)
np->name, np->local_port);
printk(KERN_INFO "%s: local IP %pI4\n",
np->name, &np->local_ip);
- printk(KERN_INFO "%s: interface %s\n",
+ printk(KERN_INFO "%s: interface '%s'\n",
np->name, np->dev_name);
printk(KERN_INFO "%s: remote port %d\n",
np->name, np->remote_port);
@@ -661,6 +661,9 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
if ((delim = strchr(cur, '@')) == NULL)
goto parse_failed;
*delim = 0;
+ if (*cur == ' ' || *cur == '\t')
+ printk(KERN_INFO "%s: warning: whitespace"
+ "is not allowed\n", np->name);
np->remote_port = simple_strtol(cur, NULL, 10);
cur = delim;
}
@@ -708,7 +711,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
return 0;
parse_failed:
- printk(KERN_INFO "%s: couldn't parse config at %s!\n",
+ printk(KERN_INFO "%s: couldn't parse config at '%s'!\n",
np->name, cur);
return -1;
}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 51ca946e339..3feb2b39030 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1194,7 +1194,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
if (idx < s_idx)
goto cont;
- if (idx > s_idx)
+ if (h > s_h || idx > s_idx)
s_ip_idx = 0;
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index af5d8979286..01ef8ba9025 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -961,7 +961,9 @@ fib_find_node(struct trie *t, u32 key)
struct node *n;
pos = 0;
- n = rcu_dereference(t->trie);
+ n = rcu_dereference_check(t->trie,
+ rcu_read_lock_held() ||
+ lockdep_rtnl_is_held());
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f47c9f76754..f78402d097b 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -810,11 +810,13 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
tunnel->err_count = 0;
}
- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
+ max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;
if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
(skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (max_headroom > dev->needed_headroom)
+ dev->needed_headroom = max_headroom;
if (!new_skb) {
ip_rt_put(rt);
txq->tx_dropped++;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8582e12e4a6..d0a6092a67b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -802,6 +802,9 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
int line;
struct mfc_cache *uc, *c, **cp;
+ if (mfc->mfcc_parent >= MAXVIFS)
+ return -ENFILE;
+
line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
for (cp = &net->ipv4.mfc_cache_array[line];
@@ -1613,17 +1616,20 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
int ct;
struct rtnexthop *nhp;
struct net *net = mfc_net(c);
- struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev;
u8 *b = skb_tail_pointer(skb);
struct rtattr *mp_head;
- if (dev)
- RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
+ /* If cache is unresolved, don't try to parse IIF and OIF */
+ if (c->mfc_parent > MAXVIFS)
+ return -ENOENT;
+
+ if (VIF_EXISTS(net, c->mfc_parent))
+ RTA_PUT(skb, RTA_IIF, 4, &net->ipv4.vif_table[c->mfc_parent].dev->ifindex);
mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
- if (c->mfc_un.res.ttls[ct] < 255) {
+ if (VIF_EXISTS(net, ct) && c->mfc_un.res.ttls[ct] < 255) {
if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
goto rtattr_failure;
nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a770df2493d..d413b57be9b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1097,7 +1097,7 @@ static int slow_chain_length(const struct rtable *head)
}
static int rt_intern_hash(unsigned hash, struct rtable *rt,
- struct rtable **rp, struct sk_buff *skb)
+ struct rtable **rp, struct sk_buff *skb, int ifindex)
{
struct rtable *rth, **rthp;
unsigned long now;
@@ -1212,11 +1212,16 @@ restart:
slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
struct net *net = dev_net(rt->u.dst.dev);
int num = ++net->ipv4.current_rt_cache_rebuild_count;
- if (!rt_caching(dev_net(rt->u.dst.dev))) {
+ if (!rt_caching(net)) {
printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
rt->u.dst.dev->name, num);
}
- rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+ rt_emergency_hash_rebuild(net);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
+
+ hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
+ ifindex, rt_genid(net));
+ goto restart;
}
}
@@ -1441,7 +1446,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
dev_hold(rt->u.dst.dev);
if (rt->idev)
in_dev_hold(rt->idev);
- rt->u.dst.obsolete = 0;
+ rt->u.dst.obsolete = -1;
rt->u.dst.lastuse = jiffies;
rt->u.dst.path = &rt->u.dst;
rt->u.dst.neighbour = NULL;
@@ -1477,7 +1482,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
&netevent);
rt_del(hash, rth);
- if (!rt_intern_hash(hash, rt, &rt, NULL))
+ if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
ip_rt_put(rt);
goto do_next;
}
@@ -1506,11 +1511,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
struct dst_entry *ret = dst;
if (rt) {
- if (dst->obsolete) {
+ if (dst->obsolete > 0) {
ip_rt_put(rt);
ret = NULL;
} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- rt->u.dst.expires) {
+ (rt->u.dst.expires &&
+ time_after_eq(jiffies, rt->u.dst.expires))) {
unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
rt->fl.oif,
rt_genid(dev_net(dst->dev)));
@@ -1726,7 +1732,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
- return NULL;
+ if (rt_is_expired((struct rtable *)dst))
+ return NULL;
+ return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
@@ -1888,7 +1896,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (!rth)
goto e_nobufs;
- rth->u.dst.output= ip_rt_bug;
+ rth->u.dst.output = ip_rt_bug;
+ rth->u.dst.obsolete = -1;
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
@@ -1927,7 +1936,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
in_dev_put(in_dev);
hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
- return rt_intern_hash(hash, rth, NULL, skb);
+ return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
e_nobufs:
in_dev_put(in_dev);
@@ -2054,6 +2063,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth->fl.oif = 0;
rth->rt_spec_dst= spec_dst;
+ rth->u.dst.obsolete = -1;
rth->u.dst.input = ip_forward;
rth->u.dst.output = ip_output;
rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
@@ -2093,7 +2103,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
/* put it into the cache */
hash = rt_hash(daddr, saddr, fl->iif,
rt_genid(dev_net(rth->u.dst.dev)));
- return rt_intern_hash(hash, rth, NULL, skb);
+ return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}
/*
@@ -2218,6 +2228,7 @@ local_input:
goto e_nobufs;
rth->u.dst.output= ip_rt_bug;
+ rth->u.dst.obsolete = -1;
rth->rt_genid = rt_genid(net);
atomic_set(&rth->u.dst.__refcnt, 1);
@@ -2249,7 +2260,7 @@ local_input:
}
rth->rt_type = res.type;
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
- err = rt_intern_hash(hash, rth, NULL, skb);
+ err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
goto done;
no_route:
@@ -2444,6 +2455,7 @@ static int __mkroute_output(struct rtable **result,
rth->rt_spec_dst= fl->fl4_src;
rth->u.dst.output=ip_output;
+ rth->u.dst.obsolete = -1;
rth->rt_genid = rt_genid(dev_net(dev_out));
RT_CACHE_STAT_INC(out_slow_tot);
@@ -2495,7 +2507,7 @@ static int ip_mkroute_output(struct rtable **rp,
if (err == 0) {
hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
rt_genid(dev_net(dev_out)));
- err = rt_intern_hash(hash, rth, rp, NULL);
+ err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
}
return err;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5901010fad5..6afb6d8662b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -429,7 +429,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->urg_seq == tp->copied_seq &&
!sock_flag(sk, SOCK_URGINLINE) &&
tp->urg_data)
- target--;
+ target++;
/* Potential race condition. If read of tp below will
* escape above sk->sk_state, we can be illegally awaken
@@ -1254,6 +1254,39 @@ static void tcp_prequeue_process(struct sock *sk)
tp->ucopy.memory = 0;
}
+#ifdef CONFIG_NET_DMA
+static void tcp_service_net_dma(struct sock *sk, bool wait)
+{
+ dma_cookie_t done, used;
+ dma_cookie_t last_issued;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->ucopy.dma_chan)
+ return;
+
+ last_issued = tp->ucopy.dma_cookie;
+ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+ do {
+ if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+ last_issued, &done,
+ &used) == DMA_SUCCESS) {
+ /* Safe to free early-copied skbs now */
+ __skb_queue_purge(&sk->sk_async_wait_queue);
+ break;
+ } else {
+ struct sk_buff *skb;
+ while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+ (dma_async_is_complete(skb->dma_cookie, done,
+ used) == DMA_SUCCESS)) {
+ __skb_dequeue(&sk->sk_async_wait_queue);
+ kfree_skb(skb);
+ }
+ }
+ } while (wait);
+}
+#endif
+
static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1546,6 +1579,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* __ Set realtime policy in scheduler __ */
}
+#ifdef CONFIG_NET_DMA
+ if (tp->ucopy.dma_chan)
+ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+#endif
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
@@ -1554,6 +1591,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sk_wait_data(sk, &timeo);
#ifdef CONFIG_NET_DMA
+ tcp_service_net_dma(sk, false); /* Don't block */
tp->ucopy.wakeup = 0;
#endif
@@ -1633,6 +1671,9 @@ do_prequeue:
copied = -EFAULT;
break;
}
+
+ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
if ((offset + used) == skb->len)
copied_early = 1;
@@ -1702,27 +1743,9 @@ skip_copy:
}
#ifdef CONFIG_NET_DMA
- if (tp->ucopy.dma_chan) {
- dma_cookie_t done, used;
-
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
-
- while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
- tp->ucopy.dma_cookie, &done,
- &used) == DMA_IN_PROGRESS) {
- /* do partial cleanup of sk_async_wait_queue */
- while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
- (dma_async_is_complete(skb->dma_cookie, done,
- used) == DMA_SUCCESS)) {
- __skb_dequeue(&sk->sk_async_wait_queue);
- kfree_skb(skb);
- }
- }
+ tcp_service_net_dma(sk, true); /* Wait for queue to drain */
+ tp->ucopy.dma_chan = NULL;
- /* Safe to free early-copied skbs now */
- __skb_queue_purge(&sk->sk_async_wait_queue);
- tp->ucopy.dma_chan = NULL;
- }
if (tp->ucopy.pinned_list) {
dma_unpin_iovec_pages(tp->ucopy.pinned_list);
tp->ucopy.pinned_list = NULL;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 788851ca8c5..c096a4218b8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2511,6 +2511,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
int err;
unsigned int mss;
+ if (packets == 0)
+ return;
+
WARN_ON(packets > tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 70df40980a8..f4df5f931f3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -370,6 +370,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (sk->sk_state == TCP_CLOSE)
goto out;
+ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
+
icsk = inet_csk(sk);
tp = tcp_sk(sk);
seq = ntohl(th->seq);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 3381b4317c2..7e567ae5eaa 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3610,7 +3610,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
if (idx < s_idx)
goto cont;
- if (idx > s_idx)
+ if (h > s_h || idx > s_idx)
s_ip_idx = 0;
ip_idx = 0;
if ((idev = __in6_dev_get(dev)) == NULL)
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 52e0f74fdfe..27acfb58650 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1113,6 +1113,9 @@ static int ip6mr_mfc_add(struct net *net, struct mf6cctl *mfc, int mrtsock)
unsigned char ttls[MAXMIFS];
int i;
+ if (mfc->mf6cc_parent >= MAXMIFS)
+ return -ENFILE;
+
memset(ttls, 255, MAXMIFS);
for (i = 0; i < MAXMIFS; i++) {
if (IF_ISSET(i, &mfc->mf6cc_ifset))
@@ -1692,17 +1695,20 @@ ip6mr_fill_mroute(struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm)
int ct;
struct rtnexthop *nhp;
struct net *net = mfc6_net(c);
- struct net_device *dev = net->ipv6.vif6_table[c->mf6c_parent].dev;
u8 *b = skb_tail_pointer(skb);
struct rtattr *mp_head;
- if (dev)
- RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
+ /* If cache is unresolved, don't try to parse IIF and OIF */
+ if (c->mf6c_parent > MAXMIFS)
+ return -ENOENT;
+
+ if (MIF_EXISTS(net, c->mf6c_parent))
+ RTA_PUT(skb, RTA_IIF, 4, &net->ipv6.vif6_table[c->mf6c_parent].dev->ifindex);
mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
- if (c->mfc_un.res.ttls[ct] < 255) {
+ if (MIF_EXISTS(net, ct) && c->mfc_un.res.ttls[ct] < 255) {
if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
goto rtattr_failure;
nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index aef31a29de9..b9cf7cd6192 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -13,7 +13,7 @@ static const struct xt_table packet_raw = {
.valid_hooks = RAW_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
- .priority = NF_IP6_PRI_FIRST,
+ .priority = NF_IP6_PRI_RAW,
};
/* The work comes in here from netfilter.c. */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 52cd3eff31d..0d7713c5c20 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -879,7 +879,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
rt = (struct rt6_info *) dst;
- if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
+ if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
return dst;
return NULL;
@@ -890,12 +890,17 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
struct rt6_info *rt = (struct rt6_info *) dst;
if (rt) {
- if (rt->rt6i_flags & RTF_CACHE)
- ip6_del_rt(rt);
- else
+ if (rt->rt6i_flags & RTF_CACHE) {
+ if (rt6_check_expired(rt)) {
+ ip6_del_rt(rt);
+ dst = NULL;
+ }
+ } else {
dst_release(dst);
+ dst = NULL;
+ }
}
- return NULL;
+ return dst;
}
static void ip6_link_failure(struct sk_buff *skb)
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 36870788264..344145f23c3 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -2129,10 +2129,9 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c
int err;
out_skb = pfkey_xfrm_policy2msg_prep(xp);
- if (IS_ERR(out_skb)) {
- err = PTR_ERR(out_skb);
- goto out;
- }
+ if (IS_ERR(out_skb))
+ return PTR_ERR(out_skb);
+
err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
if (err < 0)
return err;
@@ -2148,7 +2147,6 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c
out_hdr->sadb_msg_seq = c->seq;
out_hdr->sadb_msg_pid = c->pid;
pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp));
-out:
return 0;
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2b2af631d2b..569410a8595 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -582,7 +582,9 @@ nla_put_failure:
nlmsg_failure:
kfree_skb(skb);
errout:
- nfnetlink_set_err(net, 0, group, -ENOBUFS);
+ if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0)
+ return -ENOBUFS;
+
return 0;
}
#endif /* CONFIG_NF_CONNTRACK_EVENTS */
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 8eb0cc23ada..6afa3d52ea5 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -113,9 +113,9 @@ int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid,
}
EXPORT_SYMBOL_GPL(nfnetlink_send);
-void nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error)
+int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error)
{
- netlink_set_err(net->nfnl, pid, group, error);
+ return netlink_set_err(net->nfnl, pid, group, error);
}
EXPORT_SYMBOL_GPL(nfnetlink_set_err);
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9e9c4896394..215a64835de 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -493,6 +493,7 @@ static void hashlimit_ipv6_mask(__be32 *i, unsigned int p)
case 64 ... 95:
i[2] = maskl(i[2], p - 64);
i[3] = 0;
+ break;
case 96 ... 127:
i[3] = maskl(i[3], p - 96);
break;
@@ -879,7 +880,8 @@ static void dl_seq_stop(struct seq_file *s, void *v)
struct xt_hashlimit_htable *htable = s->private;
unsigned int *bucket = (unsigned int *)v;
- kfree(bucket);
+ if (!IS_ERR(bucket))
+ kfree(bucket);
spin_unlock_bh(&htable->lock);
}
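The added break matters because the 64..95 case otherwise fell through and re-masked word 3 with an out-of-range shift count. A tiny demo of the fallthrough failure mode (not the kernel code):

	#include <stdio.h>

	int main(void)
	{
		int prefix = 80;	/* falls in the 64..95 range */

		switch (prefix / 32) {
		case 2:			/* 64..95 */
			printf("mask word 2 for /%d\n", prefix);
			/* missing break: control falls into the next case */
		case 3:			/* 96..127 */
			printf("mask word 3 for /%d (bogus shift count)\n",
			       prefix);
			break;
		}
		return 0;
	}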
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 7073dbb8100..971d172afec 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -267,7 +267,7 @@ recent_mt(const struct sk_buff *skb, const struct xt_match_param *par)
for (i = 0; i < e->nstamps; i++) {
if (info->seconds && time_after(time, e->stamps[i]))
continue;
- if (info->hit_count && ++hits >= info->hit_count) {
+ if (!info->hit_count || ++hits >= info->hit_count) {
ret = !ret;
break;
}
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 320d0423a24..acbbae1e89b 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1093,6 +1093,7 @@ static inline int do_one_set_err(struct sock *sk,
struct netlink_set_err_data *p)
{
struct netlink_sock *nlk = nlk_sk(sk);
+ int ret = 0;
if (sk == p->exclude_sk)
goto out;
@@ -1104,10 +1105,15 @@ static inline int do_one_set_err(struct sock *sk,
!test_bit(p->group - 1, nlk->groups))
goto out;
+ if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) {
+ ret = 1;
+ goto out;
+ }
+
sk->sk_err = p->code;
sk->sk_error_report(sk);
out:
- return 0;
+ return ret;
}
/**
@@ -1116,12 +1122,16 @@ out:
* @pid: the PID of a process that we want to skip (if any)
* @groups: the broadcast group that will notice the error
* @code: error code, must be negative (as usual in kernelspace)
+ *
+ * This function returns the number of broadcast listeners that have set the
+ * NETLINK_RECV_NO_ENOBUFS socket option.
*/
-void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
+int netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
{
struct netlink_set_err_data info;
struct hlist_node *node;
struct sock *sk;
+ int ret = 0;
info.exclude_sk = ssk;
info.pid = pid;
@@ -1132,9 +1142,10 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
read_lock(&nl_table_lock);
sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
- do_one_set_err(sk, &info);
+ ret += do_one_set_err(sk, &info);
read_unlock(&nl_table_lock);
+ return ret;
}
EXPORT_SYMBOL(netlink_set_err);
diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
index 77228f28fa3..2d744f22a9a 100644
--- a/net/rxrpc/ar-accept.c
+++ b/net/rxrpc/ar-accept.c
@@ -88,6 +88,11 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
/* get a notification message to send to the server app */
notification = alloc_skb(0, GFP_NOFS);
+ if (!notification) {
+ _debug("no memory");
+ ret = -ENOMEM;
+ goto error_nofree;
+ }
rxrpc_new_skb(notification);
notification->mark = RXRPC_SKB_MARK_NEW_CALL;
@@ -189,6 +194,7 @@ invalid_service:
ret = -ECONNREFUSED;
error:
rxrpc_free_skb(notification);
+error_nofree:
_leave(" = %d", ret);
return ret;
}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 21f9c7678aa..2f691fb180d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -328,13 +328,16 @@ config NET_CLS_FLOW
module will be called cls_flow.
config NET_CLS_CGROUP
- bool "Control Group Classifier"
+ tristate "Control Group Classifier"
select NET_CLS
depends on CGROUPS
---help---
Say Y here if you want to classify packets based on the control
cgroup of their process.
+ To compile this code as a module, choose M here: the
+ module will be called cls_cgroup.
+
config NET_EMATCH
bool "Extended Matches"
select NET_CLS
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index e4877ca6727..7f27d2c15e0 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -24,6 +24,25 @@ struct cgroup_cls_state
u32 classid;
};
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+ struct cgroup *cgrp);
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
+
+struct cgroup_subsys net_cls_subsys = {
+ .name = "net_cls",
+ .create = cgrp_create,
+ .destroy = cgrp_destroy,
+ .populate = cgrp_populate,
+#ifdef CONFIG_NET_CLS_CGROUP
+ .subsys_id = net_cls_subsys_id,
+#else
+#define net_cls_subsys_id net_cls_subsys.subsys_id
+#endif
+ .module = THIS_MODULE,
+};
+
+
static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
{
return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id),
@@ -79,14 +98,6 @@ static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
}
-struct cgroup_subsys net_cls_subsys = {
- .name = "net_cls",
- .create = cgrp_create,
- .destroy = cgrp_destroy,
- .populate = cgrp_populate,
- .subsys_id = net_cls_subsys_id,
-};
-
struct cls_cgroup_head
{
u32 handle;
@@ -277,12 +288,19 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
static int __init init_cgroup_cls(void)
{
- return register_tcf_proto_ops(&cls_cgroup_ops);
+ int ret = register_tcf_proto_ops(&cls_cgroup_ops);
+ if (ret)
+ return ret;
+ ret = cgroup_load_subsys(&net_cls_subsys);
+ if (ret)
+ unregister_tcf_proto_ops(&cls_cgroup_ops);
+ return ret;
}
static void __exit exit_cgroup_cls(void)
{
unregister_tcf_proto_ops(&cls_cgroup_ops);
+ cgroup_unload_subsys(&net_cls_subsys);
}
module_init(init_cgroup_cls);
diff --git a/net/socket.c b/net/socket.c
index 769c386bd42..f55ffe9f8c8 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2135,6 +2135,10 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
break;
++datagrams;
+ /* MSG_WAITFORONE turns on MSG_DONTWAIT after one packet */
+ if (flags & MSG_WAITFORONE)
+ flags |= MSG_DONTWAIT;
+
if (timeout) {
ktime_get_ts(timeout);
*timeout = timespec_sub(end_time, *timeout);
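So MSG_WAITFORONE blocks for the first datagram only and drains the rest without sleeping again. A sketch of the intended use, assuming fd is a bound UDP socket and the C library already defines MSG_WAITFORONE:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>

	#define VLEN 8

	static int recv_burst(int fd)
	{
		struct mmsghdr msgs[VLEN];
		struct iovec iovs[VLEN];
		static char bufs[VLEN][1500];
		int i, n;

		memset(msgs, 0, sizeof(msgs));
		for (i = 0; i < VLEN; i++) {
			iovs[i].iov_base = bufs[i];
			iovs[i].iov_len  = sizeof(bufs[i]);
			msgs[i].msg_hdr.msg_iov    = &iovs[i];
			msgs[i].msg_hdr.msg_iovlen = 1;
		}
		/* block for the first datagram only; the kernel ORs in
		 * MSG_DONTWAIT after it, so queued followers are picked
		 * up without a second sleep */
		n = recvmmsg(fd, msgs, VLEN, MSG_WAITFORONE, NULL);
		if (n > 0)
			printf("received %d datagrams\n", n);
		return n;
	}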
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 0cfccc2a029..c389ccf6437 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1280,9 +1280,8 @@ alloc_enc_pages(struct rpc_rqst *rqstp)
rqstp->rq_release_snd_buf = priv_release_snd_buf;
return 0;
out_free:
- for (i--; i >= 0; i--) {
- __free_page(rqstp->rq_enc_pages[i]);
- }
+ rqstp->rq_enc_pages_num = i;
+ priv_release_snd_buf(rqstp);
out:
return -EAGAIN;
}
diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c
index 13f214f5312..f0c05d3311c 100644
--- a/net/sunrpc/bc_svc.c
+++ b/net/sunrpc/bc_svc.c
@@ -37,21 +37,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define RPCDBG_FACILITY RPCDBG_SVCDSP
-void bc_release_request(struct rpc_task *task)
-{
- struct rpc_rqst *req = task->tk_rqstp;
-
- dprintk("RPC: bc_release_request: task= %p\n", task);
-
- /*
- * Release this request only if it's a backchannel
- * preallocated request
- */
- if (!bc_prealloc(req))
- return;
- xprt_free_bc_request(req);
-}
-
/* Empty callback ops */
static const struct rpc_call_ops nfs41_callback_ops = {
};
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 154034b675b..19c9983d536 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -659,6 +659,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
task = rpc_new_task(&task_setup_data);
if (!task) {
xprt_free_bc_request(req);
+ task = ERR_PTR(-ENOMEM);
goto out;
}
task->tk_rqstp = req;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 8d63f8fd29b..20e30c6f835 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -587,6 +587,8 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
struct dentry *dentry;
dentry = __rpc_lookup_create(parent, name);
+ if (IS_ERR(dentry))
+ return dentry;
if (dentry->d_inode == NULL)
return dentry;
dput(dentry);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 469de292c23..42f09ade004 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -46,6 +46,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/bc_xprt.h>
#include "sunrpc.h"
@@ -1032,21 +1033,16 @@ void xprt_release(struct rpc_task *task)
if (req->rq_release_snd_buf)
req->rq_release_snd_buf(req);
- /*
- * Early exit if this is a backchannel preallocated request.
- * There is no need to have it added to the RPC slot list.
- */
- if (is_bc_request)
- return;
-
- memset(req, 0, sizeof(*req)); /* mark unused */
-
dprintk("RPC: %5u release request %p\n", task->tk_pid, req);
+ if (likely(!is_bc_request)) {
+ memset(req, 0, sizeof(*req)); /* mark unused */
- spin_lock(&xprt->reserve_lock);
- list_add(&req->rq_list, &xprt->free);
- rpc_wake_up_next(&xprt->backlog);
- spin_unlock(&xprt->reserve_lock);
+ spin_lock(&xprt->reserve_lock);
+ list_add(&req->rq_list, &xprt->free);
+ rpc_wake_up_next(&xprt->backlog);
+ spin_unlock(&xprt->reserve_lock);
+ } else
+ xprt_free_bc_request(req);
}
/**
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e4839c07c91..9847c30b500 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2251,9 +2251,6 @@ static struct rpc_xprt_ops xs_tcp_ops = {
.buf_free = rpc_free,
.send_request = xs_tcp_send_request,
.set_retrans_timeout = xprt_set_retrans_timeout_def,
-#if defined(CONFIG_NFS_V4_1)
- .release_request = bc_release_request,
-#endif /* CONFIG_NFS_V4_1 */
.close = xs_tcp_close,
.destroy = xs_destroy,
.print_stats = xs_tcp_print_stats,
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index f76f3d13276..6f97a13bcee 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -284,7 +284,7 @@ foreach my $file (@ARGV) {
my $file_cnt = @files;
my $lastfile;
- open(my $patch, '<', $file)
+ open(my $patch, "< $file")
or die "$P: Can't open $file: $!\n";
while (<$patch>) {
my $patch_line = $_;
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index c7865c362d2..fcdfb245a57 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1424,6 +1424,8 @@ sub dump_struct($$) {
$nested =~ s/\/\*.*?\*\///gos;
# strip kmemcheck_bitfield_{begin,end}.*;
$members =~ s/kmemcheck_bitfield_.*?;//gos;
+ # strip attributes
+ $members =~ s/__aligned\s*\(\d+\)//gos;
create_parameterlist($members, ';', $file);
check_sections($file, $declaration_name, "struct", $sectcheck, $struct_actual, $nested);
@@ -1728,6 +1730,7 @@ sub dump_function($$) {
$prototype =~ s/^noinline +//;
$prototype =~ s/__devinit +//;
$prototype =~ s/__init +//;
+ $prototype =~ s/__init_or_module +//;
$prototype =~ s/^#\s*define\s+//; #ak added
$prototype =~ s/__attribute__\s*\(\([a-z,]*\)\)//;
diff --git a/sound/arm/pxa2xx-pcm-lib.c b/sound/arm/pxa2xx-pcm-lib.c
index 743ac6a2906..fd51fa8b06a 100644
--- a/sound/arm/pxa2xx-pcm-lib.c
+++ b/sound/arm/pxa2xx-pcm-lib.c
@@ -205,6 +205,7 @@ int __pxa2xx_pcm_open(struct snd_pcm_substream *substream)
if (!rtd->dma_desc_array)
goto err1;
+ rtd->dma_ch = -1;
runtime->private_data = rtd;
return 0;
diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c
index b546ac2660f..a2ff86189d2 100644
--- a/sound/core/pcm_lib.c
+++ b/sound/core/pcm_lib.c
@@ -148,6 +148,9 @@ static void pcm_debug_name(struct snd_pcm_substream *substream,
#define xrun_debug(substream, mask) \
((substream)->pstr->xrun_debug & (mask))
+#else
+#define xrun_debug(substream, mask) 0
+#endif
#define dump_stack_on_xrun(substream) do { \
if (xrun_debug(substream, XRUN_DEBUG_STACK)) \
@@ -169,6 +172,7 @@ static void xrun(struct snd_pcm_substream *substream)
}
}
+#ifdef CONFIG_SND_PCM_XRUN_DEBUG
#define hw_ptr_error(substream, fmt, args...) \
do { \
if (xrun_debug(substream, XRUN_DEBUG_BASIC)) { \
@@ -255,8 +259,6 @@ static void xrun_log_show(struct snd_pcm_substream *substream)
#else /* ! CONFIG_SND_PCM_XRUN_DEBUG */
-#define xrun_debug(substream, mask) 0
-#define xrun(substream) do { } while (0)
#define hw_ptr_error(substream, fmt, args...) do { } while (0)
#define xrun_log(substream, pos) do { } while (0)
#define xrun_log_show(substream) do { } while (0)
diff --git a/sound/oss/vidc.c b/sound/oss/vidc.c
index 725fef0f59a..a4127bab923 100644
--- a/sound/oss/vidc.c
+++ b/sound/oss/vidc.c
@@ -363,13 +363,13 @@ static void vidc_audio_trigger(int dev, int enable_bits)
struct audio_operations *adev = audio_devs[dev];
if (enable_bits & PCM_ENABLE_OUTPUT) {
- if (!(adev->flags & DMA_ACTIVE)) {
+ if (!(adev->dmap_out->flags & DMA_ACTIVE)) {
unsigned long flags;
local_irq_save(flags);
/* prevent recursion */
- adev->flags |= DMA_ACTIVE;
+ adev->dmap_out->flags |= DMA_ACTIVE;
dma_interrupt = vidc_audio_dma_interrupt;
vidc_sound_dma_irq(0, NULL);
diff --git a/sound/pci/ac97/ac97_patch.c b/sound/pci/ac97/ac97_patch.c
index 1caf5e3c1f6..e68c98ef404 100644
--- a/sound/pci/ac97/ac97_patch.c
+++ b/sound/pci/ac97/ac97_patch.c
@@ -1852,12 +1852,14 @@ static unsigned int ad1981_jacks_blacklist[] = {
0x10140523, /* Thinkpad R40 */
0x10140534, /* Thinkpad X31 */
0x10140537, /* Thinkpad T41p */
+ 0x1014053e, /* Thinkpad R40e */
0x10140554, /* Thinkpad T42p/R50p */
0x10140567, /* Thinkpad T43p 2668-G7U */
0x10140581, /* Thinkpad X41-2527 */
0x10280160, /* Dell Dimension 2400 */
0x104380b0, /* Asus A7V8X-MX */
0x11790241, /* Toshiba Satellite A-15 S127 */
+ 0x1179ff10, /* Toshiba P500 */
0x144dc01a, /* Samsung NP-X20C004/SEG */
0 /* end */
};
diff --git a/sound/pci/cmipci.c b/sound/pci/cmipci.c
index 1ded64e0564..329968edca9 100644
--- a/sound/pci/cmipci.c
+++ b/sound/pci/cmipci.c
@@ -941,13 +941,21 @@ static snd_pcm_uframes_t snd_cmipci_pcm_pointer(struct cmipci *cm, struct cmipci
struct snd_pcm_substream *substream)
{
size_t ptr;
- unsigned int reg;
+ unsigned int reg, rem, tries;
+
if (!rec->running)
return 0;
#if 1 // this seems better..
reg = rec->ch ? CM_REG_CH1_FRAME2 : CM_REG_CH0_FRAME2;
- ptr = rec->dma_size - (snd_cmipci_read_w(cm, reg) + 1);
- ptr >>= rec->shift;
+ for (tries = 0; tries < 3; tries++) {
+ rem = snd_cmipci_read_w(cm, reg);
+ if (rem < rec->dma_size)
+ goto ok;
+ }
+ printk(KERN_ERR "cmipci: invalid PCM pointer: %#x\n", rem);
+ return SNDRV_PCM_POS_XRUN;
+ok:
+ ptr = (rec->dma_size - (rem + 1)) >> rec->shift;
#else
reg = rec->ch ? CM_REG_CH1_FRAME1 : CM_REG_CH0_FRAME1;
ptr = snd_cmipci_read(cm, reg) - rec->offset;
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 8b2915631cc..4bb90675f70 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2269,6 +2269,7 @@ static struct snd_pci_quirk position_fix_list[] __devinitdata = {
SND_PCI_QUIRK(0x103c, 0x306d, "HP dv3", POS_FIX_LPIB),
SND_PCI_QUIRK(0x1106, 0x3288, "ASUS M2V-MX SE", POS_FIX_LPIB),
SND_PCI_QUIRK(0x1043, 0x813d, "ASUS P5AD2", POS_FIX_LPIB),
+ SND_PCI_QUIRK(0x1458, 0xa022, "ga-ma770-ud3", POS_FIX_LPIB),
SND_PCI_QUIRK(0x1462, 0x1002, "MSI Wind U115", POS_FIX_LPIB),
SND_PCI_QUIRK(0x1565, 0x820f, "Biostar Microtech", POS_FIX_LPIB),
SND_PCI_QUIRK(0x8086, 0xd601, "eMachines T5212", POS_FIX_LPIB),
diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index 194a28c5499..61682e1d09d 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -1591,6 +1591,21 @@ static int patch_cxt5047(struct hda_codec *codec)
#endif
}
spec->vmaster_nid = 0x13;
+
+ switch (codec->subsystem_id >> 16) {
+ case 0x103c:
+ /* HP laptops have really bad sound over 0 dB on NID 0x10.
+ * Fix max PCM level to 0 dB (originally it has 0x1e steps
+ * with 0 dB offset 0x17)
+ */
+ snd_hda_override_amp_caps(codec, 0x10, HDA_INPUT,
+ (0x17 << AC_AMPCAP_OFFSET_SHIFT) |
+ (0x17 << AC_AMPCAP_NUM_STEPS_SHIFT) |
+ (0x05 << AC_AMPCAP_STEP_SIZE_SHIFT) |
+ (1 << AC_AMPCAP_MUTE_SHIFT));
+ break;
+ }
+
return 0;
}
diff --git a/sound/pci/hda/patch_nvhdmi.c b/sound/pci/hda/patch_nvhdmi.c
index 70669a24690..3c10c0b149f 100644
--- a/sound/pci/hda/patch_nvhdmi.c
+++ b/sound/pci/hda/patch_nvhdmi.c
@@ -538,8 +538,6 @@ static int patch_nvhdmi_2ch(struct hda_codec *codec)
* patch entries
*/
static struct hda_codec_preset snd_hda_preset_nvhdmi[] = {
- { .id = 0x10de0067, .name = "MCP67 HDMI", .patch = patch_nvhdmi_2ch },
- { .id = 0x10de8001, .name = "MCP73 HDMI", .patch = patch_nvhdmi_2ch },
{ .id = 0x10de0002, .name = "MCP77/78 HDMI",
.patch = patch_nvhdmi_8ch_7x },
{ .id = 0x10de0003, .name = "MCP77/78 HDMI",
@@ -550,12 +548,16 @@ static struct hda_codec_preset snd_hda_preset_nvhdmi[] = {
.patch = patch_nvhdmi_8ch_7x },
{ .id = 0x10de0007, .name = "MCP79/7A HDMI",
.patch = patch_nvhdmi_8ch_7x },
- { .id = 0x10de000c, .name = "MCP89 HDMI",
+ { .id = 0x10de000a, .name = "GT220 HDMI",
.patch = patch_nvhdmi_8ch_89 },
{ .id = 0x10de000b, .name = "GT21x HDMI",
.patch = patch_nvhdmi_8ch_89 },
+ { .id = 0x10de000c, .name = "MCP89 HDMI",
+ .patch = patch_nvhdmi_8ch_89 },
{ .id = 0x10de000d, .name = "GT240 HDMI",
.patch = patch_nvhdmi_8ch_89 },
+ { .id = 0x10de0067, .name = "MCP67 HDMI", .patch = patch_nvhdmi_2ch },
+ { .id = 0x10de8001, .name = "MCP73 HDMI", .patch = patch_nvhdmi_2ch },
{} /* terminator */
};
@@ -564,11 +566,12 @@ MODULE_ALIAS("snd-hda-codec-id:10de0003");
MODULE_ALIAS("snd-hda-codec-id:10de0005");
MODULE_ALIAS("snd-hda-codec-id:10de0006");
MODULE_ALIAS("snd-hda-codec-id:10de0007");
-MODULE_ALIAS("snd-hda-codec-id:10de0067");
-MODULE_ALIAS("snd-hda-codec-id:10de8001");
-MODULE_ALIAS("snd-hda-codec-id:10de000c");
+MODULE_ALIAS("snd-hda-codec-id:10de000a");
MODULE_ALIAS("snd-hda-codec-id:10de000b");
+MODULE_ALIAS("snd-hda-codec-id:10de000c");
MODULE_ALIAS("snd-hda-codec-id:10de000d");
+MODULE_ALIAS("snd-hda-codec-id:10de0067");
+MODULE_ALIAS("snd-hda-codec-id:10de8001");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("NVIDIA HDMI HD-audio codec");
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 4ec57633af8..9a23444e9e7 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -2532,8 +2532,6 @@ static int alc_build_controls(struct hda_codec *codec)
return err;
}
- alc_free_kctls(codec); /* no longer needed */
-
/* assign Capture Source enums to NID */
kctl = snd_hda_find_mixer_ctl(codec, "Capture Source");
if (!kctl)
@@ -2602,6 +2600,9 @@ static int alc_build_controls(struct hda_codec *codec)
}
}
}
+
+ alc_free_kctls(codec); /* no longer needed */
+
return 0;
}
@@ -10042,8 +10043,11 @@ static void alc882_auto_set_output_and_unmute(struct hda_codec *codec,
alc_set_pin_output(codec, nid, pin_type);
if (spec->multiout.dac_nids[dac_idx] == 0x25)
idx = 4;
- else
+ else {
+ if (dac_idx >= spec->multiout.num_dacs)
+ return;
idx = spec->multiout.dac_nids[dac_idx] - 2;
+ }
snd_hda_codec_write(codec, nid, 0, AC_VERB_SET_CONNECT_SEL, idx);
}
diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c
index 8c416bb18a5..c4be3fab94e 100644
--- a/sound/pci/hda/patch_sigmatel.c
+++ b/sound/pci/hda/patch_sigmatel.c
@@ -1730,6 +1730,8 @@ static struct snd_pci_quirk stac92hd71bxx_cfg_tbl[] = {
"HP HDX", STAC_HP_HDX), /* HDX16 */
SND_PCI_QUIRK_MASK(PCI_VENDOR_ID_HP, 0xfff0, 0x3620,
"HP dv6", STAC_HP_DV5),
+ SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x3061,
+ "HP dv6", STAC_HP_DV5), /* HP dv6-1110ax */
SND_PCI_QUIRK_MASK(PCI_VENDOR_ID_HP, 0xfff0, 0x7010,
"HP", STAC_HP_DV5),
SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0233,
diff --git a/sound/soc/codecs/tlv320dac33.c b/sound/soc/codecs/tlv320dac33.c
index f9f367d29a9..d50f1699ccb 100644
--- a/sound/soc/codecs/tlv320dac33.c
+++ b/sound/soc/codecs/tlv320dac33.c
@@ -778,7 +778,7 @@ static int dac33_prepare_chip(struct snd_pcm_substream *substream)
if (dac33->fifo_mode) {
/* Generic for all FIFO modes */
/* 50-51 : ASRC Control registers */
- dac33_write(codec, DAC33_ASRC_CTRL_A, (1 << 4)); /* div=2 */
+ dac33_write(codec, DAC33_ASRC_CTRL_A, DAC33_SRCLKDIV(1));
dac33_write(codec, DAC33_ASRC_CTRL_B, 1); /* ??? */
/* Write registers 0x34 and 0x35 (MSB, LSB) */
@@ -1038,11 +1038,7 @@ static int dac33_set_dai_fmt(struct snd_soc_dai *codec_dai,
case SND_SOC_DAIFMT_DSP_A:
aictrl_a |= DAC33_AFMT_DSP;
aictrl_b &= ~DAC33_DATA_DELAY_MASK;
- aictrl_b |= DAC33_DATA_DELAY(1); /* 1 bit delay */
- break;
- case SND_SOC_DAIFMT_DSP_B:
- aictrl_a |= DAC33_AFMT_DSP;
- aictrl_b &= ~DAC33_DATA_DELAY_MASK; /* No delay */
+ aictrl_b |= DAC33_DATA_DELAY(0);
break;
case SND_SOC_DAIFMT_RIGHT_J:
aictrl_a |= DAC33_AFMT_RIGHT_J;
@@ -1066,7 +1062,7 @@ static void dac33_init_chip(struct snd_soc_codec *codec)
{
/* 44-46: DAC Control Registers */
/* A : DAC sample rate Fsref/1.5 */
- dac33_write(codec, DAC33_DAC_CTRL_A, DAC33_DACRATE(1));
+ dac33_write(codec, DAC33_DAC_CTRL_A, DAC33_DACRATE(0));
/* B : DAC src=normal, not muted */
dac33_write(codec, DAC33_DAC_CTRL_B, DAC33_DACSRCR_RIGHT |
DAC33_DACSRCL_LEFT);
diff --git a/sound/soc/codecs/wm_hubs.c b/sound/soc/codecs/wm_hubs.c
index 0ad9f5d536c..486bdd21a98 100644
--- a/sound/soc/codecs/wm_hubs.c
+++ b/sound/soc/codecs/wm_hubs.c
@@ -74,7 +74,7 @@ static void wait_for_dc_servo(struct snd_soc_codec *codec)
msleep(1);
reg = snd_soc_read(codec, WM8993_DC_SERVO_READBACK_0);
dev_dbg(codec->dev, "DC servo: %x\n", reg);
- } while (reg & WM8993_DCS_DATAPATH_BUSY);
+ } while (reg & WM8993_DCS_DATAPATH_BUSY && count < 400);
if (reg & WM8993_DCS_DATAPATH_BUSY)
dev_err(codec->dev, "Timed out waiting for DC Servo\n");
diff --git a/sound/soc/imx/Kconfig b/sound/soc/imx/Kconfig
index c7d0fd9b7de..7174b4c710d 100644
--- a/sound/soc/imx/Kconfig
+++ b/sound/soc/imx/Kconfig
@@ -1,6 +1,6 @@
config SND_IMX_SOC
tristate "SoC Audio for Freescale i.MX CPUs"
- depends on ARCH_MXC && BROKEN
+ depends on ARCH_MXC
select SND_PCM
select FIQ
select SND_SOC_AC97_BUS
diff --git a/sound/soc/sh/Kconfig b/sound/soc/sh/Kconfig
index 106674979b5..f07f6d8b93e 100644
--- a/sound/soc/sh/Kconfig
+++ b/sound/soc/sh/Kconfig
@@ -32,6 +32,7 @@ config SND_SOC_SH4_SIU
select DMA_ENGINE
select DMADEVICES
select SH_DMAE
+ select FW_LOADER
##
## Boards
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index c30a3359234..152d6c9b1fa 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -47,7 +47,6 @@
#include "util/probe-event.h"
#define MAX_PATH_LEN 256
-#define MAX_PROBES 128
/* Session management structure */
static struct {
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 0b719e3dde0..1f529321607 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -455,7 +455,7 @@ static void print_sym_table(void)
struct sym_entry *syme, *n;
struct rb_root tmp = RB_ROOT;
struct rb_node *nd;
- int sym_width = 0, dso_width = 0, max_dso_width;
+ int sym_width = 0, dso_width = 0, dso_short_width = 0;
const int win_width = winsize.ws_col - 1;
samples = userspace_samples = 0;
@@ -545,15 +545,20 @@ static void print_sym_table(void)
if (syme->map->dso->long_name_len > dso_width)
dso_width = syme->map->dso->long_name_len;
+ if (syme->map->dso->short_name_len > dso_short_width)
+ dso_short_width = syme->map->dso->short_name_len;
+
if (syme->name_len > sym_width)
sym_width = syme->name_len;
}
printed = 0;
- max_dso_width = winsize.ws_col - sym_width - 29;
- if (dso_width > max_dso_width)
- dso_width = max_dso_width;
+ if (sym_width + dso_width > winsize.ws_col - 29) {
+ dso_width = dso_short_width;
+ if (sym_width + dso_width > winsize.ws_col - 29)
+ sym_width = winsize.ws_col - dso_width - 29;
+ }
putchar('\n');
if (nr_counters == 1)
printf(" samples pcnt");
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 53181dbfe4a..7c004b6ef24 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -242,7 +242,7 @@ void parse_perf_probe_event(const char *str, struct probe_point *pp,
/* Parse probe point */
parse_perf_probe_probepoint(argv[0], pp);
- if (pp->file || pp->line)
+ if (pp->file || pp->line || pp->lazy_line)
*need_dwarf = true;
/* Copy arguments and ensure return probe has no C argument */
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 1e6c65ebbd8..c171a243d05 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -333,8 +333,8 @@ static void show_location(Dwarf_Op *op, struct probe_finder *pf)
die("%u exceeds max register number.", regn);
if (deref)
- ret = snprintf(pf->buf, pf->len, " %s=+%ju(%s)",
- pf->var, (uintmax_t)offs, regs);
+ ret = snprintf(pf->buf, pf->len, " %s=%+jd(%s)",
+ pf->var, (intmax_t)offs, regs);
else
ret = snprintf(pf->buf, pf->len, " %s=%s", pf->var, regs);
DIE_IF(ret < 0);
@@ -352,8 +352,7 @@ static void show_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
if (dwarf_attr(vr_die, DW_AT_location, &attr) == NULL)
goto error;
/* TODO: handle more than 1 exprs */
- ret = dwarf_getlocation_addr(&attr, (pf->addr - pf->cu_base),
- &expr, &nexpr, 1);
+ ret = dwarf_getlocation_addr(&attr, pf->addr, &expr, &nexpr, 1);
if (ret <= 0 || nexpr == 0)
goto error;
@@ -437,8 +436,7 @@ static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
/* Get the frame base attribute/ops */
dwarf_attr(sp_die, DW_AT_frame_base, &fb_attr);
- ret = dwarf_getlocation_addr(&fb_attr, (pf->addr - pf->cu_base),
- &pf->fb_ops, &nops, 1);
+ ret = dwarf_getlocation_addr(&fb_attr, pf->addr, &pf->fb_ops, &nops, 1);
if (ret <= 0 || nops == 0)
pf->fb_ops = NULL;
@@ -455,6 +453,9 @@ static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
/* *pf->fb_ops will be cached in libdw. Don't free it. */
pf->fb_ops = NULL;
+ if (pp->found == MAX_PROBES)
+ die("Too many( > %d) probe point found.\n", MAX_PROBES);
+
pp->probes[pp->found] = strdup(tmp);
pp->found++;
}
@@ -641,7 +642,6 @@ static void find_probe_point_by_func(struct probe_finder *pf)
int find_probe_point(int fd, struct probe_point *pp)
{
struct probe_finder pf = {.pp = pp};
- int ret;
Dwarf_Off off, noff;
size_t cuhl;
Dwarf_Die *diep;
@@ -668,10 +668,6 @@ int find_probe_point(int fd, struct probe_point *pp)
pf.fname = NULL;
if (!pp->file || pf.fname) {
- /* Save CU base address (for frame_base) */
- ret = dwarf_lowpc(&pf.cu_die, &pf.cu_base);
- if (ret != 0)
- pf.cu_base = 0;
if (pp->function)
find_probe_point_by_func(&pf);
else if (pp->lazy_line)
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index d1a651793ba..21f7354397b 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -71,7 +71,6 @@ struct probe_finder {
/* For variable searching */
Dwarf_Op *fb_ops; /* Frame base attribute */
- Dwarf_Addr cu_base; /* Current CU base address */
const char *var; /* Current variable name */
char *buf; /* Current output buffer */
int len; /* Length of output buffer */
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 323c0aea0a9..c458c4a371d 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -163,9 +163,17 @@ void dso__set_long_name(struct dso *self, char *name)
self->long_name_len = strlen(name);
}
+static void dso__set_short_name(struct dso *self, const char *name)
+{
+ if (name == NULL)
+ return;
+ self->short_name = name;
+ self->short_name_len = strlen(name);
+}
+
static void dso__set_basename(struct dso *self)
{
- self->short_name = basename(self->long_name);
+ dso__set_short_name(self, basename(self->long_name));
}
struct dso *dso__new(const char *name)
@@ -176,7 +184,7 @@ struct dso *dso__new(const char *name)
int i;
strcpy(self->name, name);
dso__set_long_name(self, self->name);
- self->short_name = self->name;
+ dso__set_short_name(self, self->name);
for (i = 0; i < MAP__NR_TYPES; ++i)
self->symbols[i] = self->symbol_names[i] = RB_ROOT;
self->slen_calculated = 0;
@@ -897,7 +905,6 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name,
struct kmap *kmap = self->kernel ? map__kmap(map) : NULL;
struct map *curr_map = map;
struct dso *curr_dso = self;
- size_t dso_name_len = strlen(self->short_name);
Elf_Data *symstrs, *secstrs;
uint32_t nr_syms;
int err = -1;
@@ -987,7 +994,8 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name,
char dso_name[PATH_MAX];
if (strcmp(section_name,
- curr_dso->short_name + dso_name_len) == 0)
+ (curr_dso->short_name +
+ self->short_name_len)) == 0)
goto new_symbol;
if (strcmp(section_name, ".text") == 0) {
@@ -1782,7 +1790,7 @@ struct dso *dso__new_kernel(const char *name)
struct dso *self = dso__new(name ?: "[kernel.kallsyms]");
if (self != NULL) {
- self->short_name = "[kernel]";
+ dso__set_short_name(self, "[kernel]");
self->kernel = 1;
}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 280dadd32a0..f30a3742891 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -110,9 +110,10 @@ struct dso {
u8 sorted_by_name;
u8 loaded;
u8 build_id[BUILD_ID_SIZE];
- u16 long_name_len;
const char *short_name;
char *long_name;
+ u16 long_name_len;
+ u16 short_name_len;
char name[0];
};