57 files changed, 1855 insertions, 973 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 159e2a0c3e8..76b44290c15 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -217,6 +217,12 @@ exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
 to represent the cpuset hierarchy provides for a familiar permission
 and name space for cpusets, with a minimum of additional kernel code.
 
+The cpus file in the root (top_cpuset) cpuset is read-only.
+It automatically tracks the value of cpu_online_map, using a CPU
+hotplug notifier.  If and when memory nodes can be hotplugged,
+we expect to make the mems file in the root cpuset read-only
+as well, and have it track the value of node_online_map.
+
 
 1.4 What are exclusive cpusets ?
 --------------------------------
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 66fdc0744fe..16dec61d767 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -62,8 +62,8 @@ ramfs-rootfs-initramfs.txt
 	- info on the 'in memory' filesystems ramfs, rootfs and initramfs.
 reiser4.txt
 	- info on the Reiser4 filesystem based on dancing tree algorithms.
-relayfs.txt
-	- info on relayfs, for efficient streaming from kernel to user space.
+relay.txt
+	- info on relay, for efficient streaming from kernel to user space.
 romfs.txt
 	- description of the ROMFS filesystem.
 smbfs.txt
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt
new file mode 100644
index 00000000000..d6788dae034
--- /dev/null
+++ b/Documentation/filesystems/relay.txt
@@ -0,0 +1,479 @@
+relay interface (formerly relayfs)
+==================================
+
+The relay interface provides a means for kernel applications to
+efficiently log and transfer large quantities of data from the kernel
+to userspace via user-defined 'relay channels'.
+
+A 'relay channel' is a kernel->user data relay mechanism implemented
+as a set of per-cpu kernel buffers ('channel buffers'), each
+represented as a regular file ('relay file') in user space.  Kernel
+clients write into the channel buffers using efficient write
+functions; these automatically log into the current cpu's channel
+buffer.  User space applications mmap() or read() from the relay files
+and retrieve the data as it becomes available.  The relay files
+themselves are files created in a host filesystem, e.g. debugfs, and
+are associated with the channel buffers using the API described below.
+
+The format of the data logged into the channel buffers is completely
+up to the kernel client; the relay interface does however provide
+hooks which allow kernel clients to impose some structure on the
+buffer data.  The relay interface doesn't implement any form of data
+filtering - this also is left to the kernel client.  The purpose is to
+keep things as simple as possible.
+
+This document provides an overview of the relay interface API.  The
+details of the function parameters are documented along with the
+functions in the relay interface code - please see that for details.
+
+Semantics
+=========
+
+Each relay channel has one buffer per CPU, each buffer has one or more
+sub-buffers.  Messages are written to the first sub-buffer until it is
+too full to contain a new message, in which case it it is written to
+the next (if available).  Messages are never split across sub-buffers.
+At this point, userspace can be notified so it empties the first
+sub-buffer, while the kernel continues writing to the next.
+
+When notified that a sub-buffer is full, the kernel knows how many
+bytes of it are padding i.e. unused space occurring because a complete
+message couldn't fit into a sub-buffer.  Userspace can use this
+knowledge to copy only valid data.
+
+After copying it, userspace can notify the kernel that a sub-buffer
+has been consumed.
+
+A relay channel can operate in a mode where it will overwrite data not
+yet collected by userspace, and not wait for it to be consumed.
+
+The relay channel itself does not provide for communication of such
+data between userspace and kernel, allowing the kernel side to remain
+simple and not impose a single interface on userspace.  It does
+provide a set of examples and a separate helper though, described
+below.
+
+The read() interface both removes padding and internally consumes the
+read sub-buffers; thus in cases where read(2) is being used to drain
+the channel buffers, special-purpose communication between kernel and
+user isn't necessary for basic operation.
+
+One of the major goals of the relay interface is to provide a low
+overhead mechanism for conveying kernel data to userspace.  While the
+read() interface is easy to use, it's not as efficient as the mmap()
+approach; the example code attempts to make the tradeoff between the
+two approaches as small as possible.
+
+klog and relay-apps example code
+================================
+
+The relay interface itself is ready to use, but to make things easier,
+a couple simple utility functions and a set of examples are provided.
+
+The relay-apps example tarball, available on the relay sourceforge
+site, contains a set of self-contained examples, each consisting of a
+pair of .c files containing boilerplate code for each of the user and
+kernel sides of a relay application.  When combined these two sets of
+boilerplate code provide glue to easily stream data to disk, without
+having to bother with mundane housekeeping chores.
+
+The 'klog debugging functions' patch (klog.patch in the relay-apps
+tarball) provides a couple of high-level logging functions to the
+kernel which allow writing formatted text or raw data to a channel,
+regardless of whether a channel to write into exists or not, or even
+whether the relay interface is compiled into the kernel or not.  These
+functions allow you to put unconditional 'trace' statements anywhere
+in the kernel or kernel modules; only when there is a 'klog handler'
+registered will data actually be logged (see the klog and kleak
+examples for details).
+
+It is of course possible to use the relay interface from scratch,
+i.e. without using any of the relay-apps example code or klog, but
+you'll have to implement communication between userspace and kernel,
+allowing both to convey the state of buffers (full, empty, amount of
+padding).  The read() interface both removes padding and internally
+consumes the read sub-buffers; thus in cases where read(2) is being
+used to drain the channel buffers, special-purpose communication
+between kernel and user isn't necessary for basic operation.  Things
+such as buffer-full conditions would still need to be communicated via
+some channel though.
+
+klog and the relay-apps examples can be found in the relay-apps
+tarball on http://relayfs.sourceforge.net
+
+The relay interface user space API
+==================================
+
+The relay interface implements basic file operations for user space
+access to relay channel buffer data.  Here are the file operations
+that are available and some comments regarding their behavior:
+
+open()	    enables user to open an _existing_ channel buffer.
+
+mmap()      results in channel buffer being mapped into the caller's
+	    memory space. Note that you can't do a partial mmap - you
+	    must map the entire file, which is NRBUF * SUBBUFSIZE.
+
+read()      read the contents of a channel buffer.  The bytes read are
+	    'consumed' by the reader, i.e. they won't be available
+	    again to subsequent reads.  If the channel is being used
+	    in no-overwrite mode (the default), it can be read at any
+	    time even if there's an active kernel writer.  If the
+	    channel is being used in overwrite mode and there are
+	    active channel writers, results may be unpredictable -
+	    users should make sure that all logging to the channel has
+	    ended before using read() with overwrite mode.  Sub-buffer
+	    padding is automatically removed and will not be seen by
+	    the reader.
+
+sendfile()  transfer data from a channel buffer to an output file
+	    descriptor. Sub-buffer padding is automatically removed
+	    and will not be seen by the reader.
+
+poll()      POLLIN/POLLRDNORM/POLLERR supported.  User applications are
+	    notified when sub-buffer boundaries are crossed.
+
+close()     decrements the channel buffer's refcount.  When the refcount
+	    reaches 0, i.e. when no process or kernel client has the
+	    buffer open, the channel buffer is freed.
+
+In order for a user application to make use of relay files, the
+host filesystem must be mounted.  For example,
+
+	mount -t debugfs debugfs /debug
+
+NOTE:   the host filesystem doesn't need to be mounted for kernel
+	clients to create or use channels - it only needs to be
+	mounted when user space applications need access to the buffer
+	data.
+
+
+The relay interface kernel API
+==============================
+
+Here's a summary of the API the relay interface provides to in-kernel clients:
+
+TBD(curr. line MT:/API/)
+  channel management functions:
+
+    relay_open(base_filename, parent, subbuf_size, n_subbufs,
+               callbacks)
+    relay_close(chan)
+    relay_flush(chan)
+    relay_reset(chan)
+
+  channel management typically called on instigation of userspace:
+
+    relay_subbufs_consumed(chan, cpu, subbufs_consumed)
+
+  write functions:
+
+    relay_write(chan, data, length)
+    __relay_write(chan, data, length)
+    relay_reserve(chan, length)
+
+  callbacks:
+
+    subbuf_start(buf, subbuf, prev_subbuf, prev_padding)
+    buf_mapped(buf, filp)
+    buf_unmapped(buf, filp)
+    create_buf_file(filename, parent, mode, buf, is_global)
+    remove_buf_file(dentry)
+
+  helper functions:
+
+    relay_buf_full(buf)
+    subbuf_start_reserve(buf, length)
+
+
+Creating a channel
+------------------
+
+relay_open() is used to create a channel, along with its per-cpu
+channel buffers.  Each channel buffer will have an associated file
+created for it in the host filesystem, which can be and mmapped or
+read from in user space.  The files are named basename0...basenameN-1
+where N is the number of online cpus, and by default will be created
+in the root of the filesystem (if the parent param is NULL).  If you
+want a directory structure to contain your relay files, you should
+create it using the host filesystem's directory creation function,
+e.g. debugfs_create_dir(), and pass the parent directory to
+relay_open().  Users are responsible for cleaning up any directory
+structure they create, when the channel is closed - again the host
+filesystem's directory removal functions should be used for that,
+e.g. debugfs_remove().
+
+In order for a channel to be created and the host filesystem's files
+associated with its channel buffers, the user must provide definitions
+for two callback functions, create_buf_file() and remove_buf_file().
+create_buf_file() is called once for each per-cpu buffer from
+relay_open() and allows the user to create the file which will be used
+to represent the corresponding channel buffer.  The callback should
+return the dentry of the file created to represent the channel buffer.
+remove_buf_file() must also be defined; it's responsible for deleting
+the file(s) created in create_buf_file() and is called during
+relay_close().
+
+Here are some typical definitions for these callbacks, in this case
+using debugfs:
+
+/*
+ * create_buf_file() callback.  Creates relay file in debugfs.
+ */
+static struct dentry *create_buf_file_handler(const char *filename,
+                                              struct dentry *parent,
+                                              int mode,
+                                              struct rchan_buf *buf,
+                                              int *is_global)
+{
+        return debugfs_create_file(filename, mode, parent, buf,
+	                           &relay_file_operations);
+}
+
+/*
+ * remove_buf_file() callback.  Removes relay file from debugfs.
+ */
+static int remove_buf_file_handler(struct dentry *dentry)
+{
+        debugfs_remove(dentry);
+
+        return 0;
+}
+
+/*
+ * relay interface callbacks
+ */
+static struct rchan_callbacks relay_callbacks =
+{
+        .create_buf_file = create_buf_file_handler,
+        .remove_buf_file = remove_buf_file_handler,
+};
+
+And an example relay_open() invocation using them:
+
+  chan = relay_open("cpu", NULL, SUBBUF_SIZE, N_SUBBUFS, &relay_callbacks);
+
+If the create_buf_file() callback fails, or isn't defined, channel
+creation and thus relay_open() will fail.
+
+The total size of each per-cpu buffer is calculated by multiplying the
+number of sub-buffers by the sub-buffer size passed into relay_open().
+The idea behind sub-buffers is that they're basically an extension of
+double-buffering to N buffers, and they also allow applications to
+easily implement random-access-on-buffer-boundary schemes, which can
+be important for some high-volume applications.  The number and size
+of sub-buffers is completely dependent on the application and even for
+the same application, different conditions will warrant different
+values for these parameters at different times.  Typically, the right
+values to use are best decided after some experimentation; in general,
+though, it's safe to assume that having only 1 sub-buffer is a bad
+idea - you're guaranteed to either overwrite data or lose events
+depending on the channel mode being used.
+
+The create_buf_file() implementation can also be defined in such a way
+as to allow the creation of a single 'global' buffer instead of the
+default per-cpu set.  This can be useful for applications interested
+mainly in seeing the relative ordering of system-wide events without
+the need to bother with saving explicit timestamps for the purpose of
+merging/sorting per-cpu files in a postprocessing step.
+
+To have relay_open() create a global buffer, the create_buf_file()
+implementation should set the value of the is_global outparam to a
+non-zero value in addition to creating the file that will be used to
+represent the single buffer.  In the case of a global buffer,
+create_buf_file() and remove_buf_file() will be called only once.  The
+normal channel-writing functions, e.g. relay_write(), can still be
+used - writes from any cpu will transparently end up in the global
+buffer - but since it is a global buffer, callers should make sure
+they use the proper locking for such a buffer, either by wrapping
+writes in a spinlock, or by copying a write function from relay.h and
+creating a local version that internally does the proper locking.
+
+Channel 'modes'
+---------------
+
+relay channels can be used in either of two modes - 'overwrite' or
+'no-overwrite'.  The mode is entirely determined by the implementation
+of the subbuf_start() callback, as described below.  The default if no
+subbuf_start() callback is defined is 'no-overwrite' mode.  If the
+default mode suits your needs, and you plan to use the read()
+interface to retrieve channel data, you can ignore the details of this
+section, as it pertains mainly to mmap() implementations.
+
+In 'overwrite' mode, also known as 'flight recorder' mode, writes
+continuously cycle around the buffer and will never fail, but will
+unconditionally overwrite old data regardless of whether it's actually
+been consumed.  In no-overwrite mode, writes will fail, i.e. data will
+be lost, if the number of unconsumed sub-buffers equals the total
+number of sub-buffers in the channel.  It should be clear that if
+there is no consumer or if the consumer can't consume sub-buffers fast
+enough, data will be lost in either case; the only difference is
+whether data is lost from the beginning or the end of a buffer.
+
+As explained above, a relay channel is made of up one or more
+per-cpu channel buffers, each implemented as a circular buffer
+subdivided into one or more sub-buffers.  Messages are written into
+the current sub-buffer of the channel's current per-cpu buffer via the
+write functions described below.  Whenever a message can't fit into
+the current sub-buffer, because there's no room left for it, the
+client is notified via the subbuf_start() callback that a switch to a
+new sub-buffer is about to occur.  The client uses this callback to 1)
+initialize the next sub-buffer if appropriate 2) finalize the previous
+sub-buffer if appropriate and 3) return a boolean value indicating
+whether or not to actually move on to the next sub-buffer.
+
+To implement 'no-overwrite' mode, the userspace client would provide
+an implementation of the subbuf_start() callback something like the
+following:
+
+static int subbuf_start(struct rchan_buf *buf,
+                        void *subbuf,
+			void *prev_subbuf,
+			unsigned int prev_padding)
+{
+	if (prev_subbuf)
+		*((unsigned *)prev_subbuf) = prev_padding;
+
+	if (relay_buf_full(buf))
+		return 0;
+
+	subbuf_start_reserve(buf, sizeof(unsigned int));
+
+	return 1;
+}
+
+If the current buffer is full, i.e. all sub-buffers remain unconsumed,
+the callback returns 0 to indicate that the buffer switch should not
+occur yet, i.e. until the consumer has had a chance to read the
+current set of ready sub-buffers.  For the relay_buf_full() function
+to make sense, the consumer is reponsible for notifying the relay
+interface when sub-buffers have been consumed via
+relay_subbufs_consumed().  Any subsequent attempts to write into the
+buffer will again invoke the subbuf_start() callback with the same
+parameters; only when the consumer has consumed one or more of the
+ready sub-buffers will relay_buf_full() return 0, in which case the
+buffer switch can continue.
+
+The implementation of the subbuf_start() callback for 'overwrite' mode
+would be very similar:
+
+static int subbuf_start(struct rchan_buf *buf,
+                        void *subbuf,
+			void *prev_subbuf,
+			unsigned int prev_padding)
+{
+	if (prev_subbuf)
+		*((unsigned *)prev_subbuf) = prev_padding;
+
+	subbuf_start_reserve(buf, sizeof(unsigned int));
+
+	return 1;
+}
+
+In this case, the relay_buf_full() check is meaningless and the
+callback always returns 1, causing the buffer switch to occur
+unconditionally.  It's also meaningless for the client to use the
+relay_subbufs_consumed() function in this mode, as it's never
+consulted.
+
+The default subbuf_start() implementation, used if the client doesn't
+define any callbacks, or doesn't define the subbuf_start() callback,
+implements the simplest possible 'no-overwrite' mode, i.e. it does
+nothing but return 0.
+
+Header information can be reserved at the beginning of each sub-buffer
+by calling the subbuf_start_reserve() helper function from within the
+subbuf_start() callback.  This reserved area can be used to store
+whatever information the client wants.  In the example above, room is
+reserved in each sub-buffer to store the padding count for that
+sub-buffer.  This is filled in for the previous sub-buffer in the
+subbuf_start() implementation; the padding value for the previous
+sub-buffer is passed into the subbuf_start() callback along with a
+pointer to the previous sub-buffer, since the padding value isn't
+known until a sub-buffer is filled.  The subbuf_start() callback is
+also called for the first sub-buffer when the channel is opened, to
+give the client a chance to reserve space in it.  In this case the
+previous sub-buffer pointer passed into the callback will be NULL, so
+the client should check the value of the prev_subbuf pointer before
+writing into the previous sub-buffer.
+
+Writing to a channel
+--------------------
+
+Kernel clients write data into the current cpu's channel buffer using
+relay_write() or __relay_write().  relay_write() is the main logging
+function - it uses local_irqsave() to protect the buffer and should be
+used if you might be logging from interrupt context.  If you know
+you'll never be logging from interrupt context, you can use
+__relay_write(), which only disables preemption.  These functions
+don't return a value, so you can't determine whether or not they
+failed - the assumption is that you wouldn't want to check a return
+value in the fast logging path anyway, and that they'll always succeed
+unless the buffer is full and no-overwrite mode is being used, in
+which case you can detect a failed write in the subbuf_start()
+callback by calling the relay_buf_full() helper function.
+
+relay_reserve() is used to reserve a slot in a channel buffer which
+can be written to later.  This would typically be used in applications
+that need to write directly into a channel buffer without having to
+stage data in a temporary buffer beforehand.  Because the actual write
+may not happen immediately after the slot is reserved, applications
+using relay_reserve() can keep a count of the number of bytes actually
+written, either in space reserved in the sub-buffers themselves or as
+a separate array.  See the 'reserve' example in the relay-apps tarball
+at http://relayfs.sourceforge.net for an example of how this can be
+done.  Because the write is under control of the client and is
+separated from the reserve, relay_reserve() doesn't protect the buffer
+at all - it's up to the client to provide the appropriate
+synchronization when using relay_reserve().
+
+Closing a channel
+-----------------
+
+The client calls relay_close() when it's finished using the channel.
+The channel and its associated buffers are destroyed when there are no
+longer any references to any of the channel buffers.  relay_flush()
+forces a sub-buffer switch on all the channel buffers, and can be used
+to finalize and process the last sub-buffers before the channel is
+closed.
+
+Misc
+----
+
+Some applications may want to keep a channel around and re-use it
+rather than open and close a new channel for each use.  relay_reset()
+can be used for this purpose - it resets a channel to its initial
+state without reallocating channel buffer memory or destroying
+existing mappings.  It should however only be called when it's safe to
+do so, i.e. when the channel isn't currently being written to.
+
+Finally, there are a couple of utility callbacks that can be used for
+different purposes.  buf_mapped() is called whenever a channel buffer
+is mmapped from user space and buf_unmapped() is called when it's
+unmapped.  The client can use this notification to trigger actions
+within the kernel application, such as enabling/disabling logging to
+the channel.
+
+
+Resources
+=========
+
+For news, example code, mailing list, etc. see the relay interface homepage:
+
+    http://relayfs.sourceforge.net
+
+
+Credits
+=======
+
+The ideas and specs for the relay interface came about as a result of
+discussions on tracing involving the following:
+
+Michel Dagenais		<michel.dagenais@polymtl.ca>
+Richard Moore		<richardj_moore@uk.ibm.com>
+Bob Wisniewski		<bob@watson.ibm.com>
+Karim Yaghmour		<karim@opersys.com>
+Tom Zanussi		<zanussi@us.ibm.com>
+
+Also thanks to Hubertus Franke for a lot of useful suggestions and bug
+reports.
diff --git a/Documentation/filesystems/relayfs.txt b/Documentation/filesystems/relayfs.txt
deleted file mode 100644
index 5832377b734..00000000000
--- a/Documentation/filesystems/relayfs.txt
+++ /dev/null
@@ -1,442 +0,0 @@
-
-relayfs - a high-speed data relay filesystem
-============================================
-
-relayfs is a filesystem designed to provide an efficient mechanism for
-tools and facilities to relay large and potentially sustained streams
-of data from kernel space to user space.
-
-The main abstraction of relayfs is the 'channel'.  A channel consists
-of a set of per-cpu kernel buffers each represented by a file in the
-relayfs filesystem.  Kernel clients write into a channel using
-efficient write functions which automatically log to the current cpu's
-channel buffer.  User space applications mmap() the per-cpu files and
-retrieve the data as it becomes available.
-
-The format of the data logged into the channel buffers is completely
-up to the relayfs client; relayfs does however provide hooks which
-allow clients to impose some structure on the buffer data.  Nor does
-relayfs implement any form of data filtering - this also is left to
-the client.  The purpose is to keep relayfs as simple as possible.
-
-This document provides an overview of the relayfs API.  The details of
-the function parameters are documented along with the functions in the
-filesystem code - please see that for details.
-
-Semantics
-=========
-
-Each relayfs channel has one buffer per CPU, each buffer has one or
-more sub-buffers. Messages are written to the first sub-buffer until
-it is too full to contain a new message, in which case it it is
-written to the next (if available).  Messages are never split across
-sub-buffers.  At this point, userspace can be notified so it empties
-the first sub-buffer, while the kernel continues writing to the next.
-
-When notified that a sub-buffer is full, the kernel knows how many
-bytes of it are padding i.e. unused.  Userspace can use this knowledge
-to copy only valid data.
-
-After copying it, userspace can notify the kernel that a sub-buffer
-has been consumed.
-
-relayfs can operate in a mode where it will overwrite data not yet
-collected by userspace, and not wait for it to consume it.
-
-relayfs itself does not provide for communication of such data between
-userspace and kernel, allowing the kernel side to remain simple and
-not impose a single interface on userspace. It does provide a set of
-examples and a separate helper though, described below.
-
-klog and relay-apps example code
-================================
-
-relayfs itself is ready to use, but to make things easier, a couple
-simple utility functions and a set of examples are provided.
-
-The relay-apps example tarball, available on the relayfs sourceforge
-site, contains a set of self-contained examples, each consisting of a
-pair of .c files containing boilerplate code for each of the user and
-kernel sides of a relayfs application; combined these two sets of
-boilerplate code provide glue to easily stream data to disk, without
-having to bother with mundane housekeeping chores.
-
-The 'klog debugging functions' patch (klog.patch in the relay-apps
-tarball) provides a couple of high-level logging functions to the
-kernel which allow writing formatted text or raw data to a channel,
-regardless of whether a channel to write into exists or not, or
-whether relayfs is compiled into the kernel or is configured as a
-module.  These functions allow you to put unconditional 'trace'
-statements anywhere in the kernel or kernel modules; only when there
-is a 'klog handler' registered will data actually be logged (see the
-klog and kleak examples for details).
-
-It is of course possible to use relayfs from scratch i.e. without
-using any of the relay-apps example code or klog, but you'll have to
-implement communication between userspace and kernel, allowing both to
-convey the state of buffers (full, empty, amount of padding).
-
-klog and the relay-apps examples can be found in the relay-apps
-tarball on http://relayfs.sourceforge.net
-
-
-The relayfs user space API
-==========================
-
-relayfs implements basic file operations for user space access to
-relayfs channel buffer data.  Here are the file operations that are
-available and some comments regarding their behavior:
-
-open()	 enables user to open an _existing_ buffer.
-
-mmap()	 results in channel buffer being mapped into the caller's
-	 memory space. Note that you can't do a partial mmap - you must
-	 map the entire file, which is NRBUF * SUBBUFSIZE.
-
-read()	 read the contents of a channel buffer.  The bytes read are
-	 'consumed' by the reader i.e. they won't be available again
-	 to subsequent reads.  If the channel is being used in
-	 no-overwrite mode (the default), it can be read at any time
-	 even if there's an active kernel writer.  If the channel is
-	 being used in overwrite mode and there are active channel
-	 writers, results may be unpredictable - users should make
-	 sure that all logging to the channel has ended before using
-	 read() with overwrite mode.
-
-poll()	 POLLIN/POLLRDNORM/POLLERR supported.  User applications are
-	 notified when sub-buffer boundaries are crossed.
-
-close() decrements the channel buffer's refcount.  When the refcount
-	reaches 0 i.e. when no process or kernel client has the buffer
-	open, the channel buffer is freed.
-
-
-In order for a user application to make use of relayfs files, the
-relayfs filesystem must be mounted.  For example,
-
-	mount -t relayfs relayfs /mnt/relay
-
-NOTE:	relayfs doesn't need to be mounted for kernel clients to create
-	or use channels - it only needs to be mounted when user space
-	applications need access to the buffer data.
-
-
-The relayfs kernel API
-======================
-
-Here's a summary of the API relayfs provides to in-kernel clients:
-
-
-  channel management functions:
-
-    relay_open(base_filename, parent, subbuf_size, n_subbufs,
-               callbacks)
-    relay_close(chan)
-    relay_flush(chan)
-    relay_reset(chan)
-    relayfs_create_dir(name, parent)
-    relayfs_remove_dir(dentry)
-    relayfs_create_file(name, parent, mode, fops, data)
-    relayfs_remove_file(dentry)
-
-  channel management typically called on instigation of userspace:
-
-    relay_subbufs_consumed(chan, cpu, subbufs_consumed)
-
-  write functions:
-
-    relay_write(chan, data, length)
-    __relay_write(chan, data, length)
-    relay_reserve(chan, length)
-
-  callbacks:
-
-    subbuf_start(buf, subbuf, prev_subbuf, prev_padding)
-    buf_mapped(buf, filp)
-    buf_unmapped(buf, filp)
-    create_buf_file(filename, parent, mode, buf, is_global)
-    remove_buf_file(dentry)
-
-  helper functions:
-
-    relay_buf_full(buf)
-    subbuf_start_reserve(buf, length)
-
-
-Creating a channel
-------------------
-
-relay_open() is used to create a channel, along with its per-cpu
-channel buffers.  Each channel buffer will have an associated file
-created for it in the relayfs filesystem, which can be opened and
-mmapped from user space if desired.  The files are named
-basename0...basenameN-1 where N is the number of online cpus, and by
-default will be created in the root of the filesystem.  If you want a
-directory structure to contain your relayfs files, you can create it
-with relayfs_create_dir() and pass the parent directory to
-relay_open().  Clients are responsible for cleaning up any directory
-structure they create when the channel is closed - use
-relayfs_remove_dir() for that.
-
-The total size of each per-cpu buffer is calculated by multiplying the
-number of sub-buffers by the sub-buffer size passed into relay_open().
-The idea behind sub-buffers is that they're basically an extension of
-double-buffering to N buffers, and they also allow applications to
-easily implement random-access-on-buffer-boundary schemes, which can
-be important for some high-volume applications.  The number and size
-of sub-buffers is completely dependent on the application and even for
-the same application, different conditions will warrant different
-values for these parameters at different times.  Typically, the right
-values to use are best decided after some experimentation; in general,
-though, it's safe to assume that having only 1 sub-buffer is a bad
-idea - you're guaranteed to either overwrite data or lose events
-depending on the channel mode being used.
-
-Channel 'modes'
----------------
-
-relayfs channels can be used in either of two modes - 'overwrite' or
-'no-overwrite'.  The mode is entirely determined by the implementation
-of the subbuf_start() callback, as described below.  In 'overwrite'
-mode, also known as 'flight recorder' mode, writes continuously cycle
-around the buffer and will never fail, but will unconditionally
-overwrite old data regardless of whether it's actually been consumed.
-In no-overwrite mode, writes will fail i.e. data will be lost, if the
-number of unconsumed sub-buffers equals the total number of
-sub-buffers in the channel.  It should be clear that if there is no
-consumer or if the consumer can't consume sub-buffers fast enought,
-data will be lost in either case; the only difference is whether data
-is lost from the beginning or the end of a buffer.
-
-As explained above, a relayfs channel is made of up one or more
-per-cpu channel buffers, each implemented as a circular buffer
-subdivided into one or more sub-buffers.  Messages are written into
-the current sub-buffer of the channel's current per-cpu buffer via the
-write functions described below.  Whenever a message can't fit into
-the current sub-buffer, because there's no room left for it, the
-client is notified via the subbuf_start() callback that a switch to a
-new sub-buffer is about to occur.  The client uses this callback to 1)
-initialize the next sub-buffer if appropriate 2) finalize the previous
-sub-buffer if appropriate and 3) return a boolean value indicating
-whether or not to actually go ahead with the sub-buffer switch.
-
-To implement 'no-overwrite' mode, the userspace client would provide
-an implementation of the subbuf_start() callback something like the
-following:
-
-static int subbuf_start(struct rchan_buf *buf,
-                        void *subbuf,
-			void *prev_subbuf,
-			unsigned int prev_padding)
-{
-	if (prev_subbuf)
-		*((unsigned *)prev_subbuf) = prev_padding;
-
-	if (relay_buf_full(buf))
-		return 0;
-
-	subbuf_start_reserve(buf, sizeof(unsigned int));
-
-	return 1;
-}
-
-If the current buffer is full i.e. all sub-buffers remain unconsumed,
-the callback returns 0 to indicate that the buffer switch should not
-occur yet i.e. until the consumer has had a chance to read the current
-set of ready sub-buffers.  For the relay_buf_full() function to make
-sense, the consumer is reponsible for notifying relayfs when
-sub-buffers have been consumed via relay_subbufs_consumed().  Any
-subsequent attempts to write into the buffer will again invoke the
-subbuf_start() callback with the same parameters; only when the
-consumer has consumed one or more of the ready sub-buffers will
-relay_buf_full() return 0, in which case the buffer switch can
-continue.
-
-The implementation of the subbuf_start() callback for 'overwrite' mode
-would be very similar:
-
-static int subbuf_start(struct rchan_buf *buf,
-                        void *subbuf,
-			void *prev_subbuf,
-			unsigned int prev_padding)
-{
-	if (prev_subbuf)
-		*((unsigned *)prev_subbuf) = prev_padding;
-
-	subbuf_start_reserve(buf, sizeof(unsigned int));
-
-	return 1;
-}
-
-In this case, the relay_buf_full() check is meaningless and the
-callback always returns 1, causing the buffer switch to occur
-unconditionally.  It's also meaningless for the client to use the
-relay_subbufs_consumed() function in this mode, as it's never
-consulted.
-
-The default subbuf_start() implementation, used if the client doesn't
-define any callbacks, or doesn't define the subbuf_start() callback,
-implements the simplest possible 'no-overwrite' mode i.e. it does
-nothing but return 0.
-
-Header information can be reserved at the beginning of each sub-buffer
-by calling the subbuf_start_reserve() helper function from within the
-subbuf_start() callback.  This reserved area can be used to store
-whatever information the client wants.  In the example above, room is
-reserved in each sub-buffer to store the padding count for that
-sub-buffer.  This is filled in for the previous sub-buffer in the
-subbuf_start() implementation; the padding value for the previous
-sub-buffer is passed into the subbuf_start() callback along with a
-pointer to the previous sub-buffer, since the padding value isn't
-known until a sub-buffer is filled.  The subbuf_start() callback is
-also called for the first sub-buffer when the channel is opened, to
-give the client a chance to reserve space in it.  In this case the
-previous sub-buffer pointer passed into the callback will be NULL, so
-the client should check the value of the prev_subbuf pointer before
-writing into the previous sub-buffer.
-
-Writing to a channel
---------------------
-
-kernel clients write data into the current cpu's channel buffer using
-relay_write() or __relay_write().  relay_write() is the main logging
-function - it uses local_irqsave() to protect the buffer and should be
-used if you might be logging from interrupt context.  If you know
-you'll never be logging from interrupt context, you can use
-__relay_write(), which only disables preemption.  These functions
-don't return a value, so you can't determine whether or not they
-failed - the assumption is that you wouldn't want to check a return
-value in the fast logging path anyway, and that they'll always succeed
-unless the buffer is full and no-overwrite mode is being used, in
-which case you can detect a failed write in the subbuf_start()
-callback by calling the relay_buf_full() helper function.
-
-relay_reserve() is used to reserve a slot in a channel buffer which
-can be written to later.  This would typically be used in applications
-that need to write directly into a channel buffer without having to
-stage data in a temporary buffer beforehand.  Because the actual write
-may not happen immediately after the slot is reserved, applications
-using relay_reserve() can keep a count of the number of bytes actually
-written, either in space reserved in the sub-buffers themselves or as
-a separate array.  See the 'reserve' example in the relay-apps tarball
-at http://relayfs.sourceforge.net for an example of how this can be
-done.  Because the write is under control of the client and is
-separated from the reserve, relay_reserve() doesn't protect the buffer
-at all - it's up to the client to provide the appropriate
-synchronization when using relay_reserve().
-
-Closing a channel
------------------
-
-The client calls relay_close() when it's finished using the channel.
-The channel and its associated buffers are destroyed when there are no
-longer any references to any of the channel buffers.  relay_flush()
-forces a sub-buffer switch on all the channel buffers, and can be used
-to finalize and process the last sub-buffers before the channel is
-closed.
-
-Creating non-relay files
-------------------------
-
-relay_open() automatically creates files in the relayfs filesystem to
-represent the per-cpu kernel buffers; it's often useful for
-applications to be able to create their own files alongside the relay
-files in the relayfs filesystem as well e.g. 'control' files much like
-those created in /proc or debugfs for similar purposes, used to
-communicate control information between the kernel and user sides of a
-relayfs application.  For this purpose the relayfs_create_file() and
-relayfs_remove_file() API functions exist.  For relayfs_create_file(),
-the caller passes in a set of user-defined file operations to be used
-for the file and an optional void * to a user-specified data item,
-which will be accessible via inode->u.generic_ip (see the relay-apps
-tarball for examples).  The file_operations are a required parameter
-to relayfs_create_file() and thus the semantics of these files are
-completely defined by the caller.
-
-See the relay-apps tarball at http://relayfs.sourceforge.net for
-examples of how these non-relay files are meant to be used.
-
-Creating relay files in other filesystems
------------------------------------------
-
-By default of course, relay_open() creates relay files in the relayfs
-filesystem.  Because relay_file_operations is exported, however, it's
-also possible to create and use relay files in other pseudo-filesytems
-such as debugfs.
-
-For this purpose, two callback functions are provided,
-create_buf_file() and remove_buf_file().  create_buf_file() is called
-once for each per-cpu buffer from relay_open() to allow the client to
-create a file to be used to represent the corresponding buffer; if
-this callback is not defined, the default implementation will create
-and return a file in the relayfs filesystem to represent the buffer.
-The callback should return the dentry of the file created to represent
-the relay buffer.  Note that the parent directory passed to
-relay_open() (and passed along to the callback), if specified, must
-exist in the same filesystem the new relay file is created in.  If
-create_buf_file() is defined, remove_buf_file() must also be defined;
-it's responsible for deleting the file(s) created in create_buf_file()
-and is called during relay_close().
-
-The create_buf_file() implementation can also be defined in such a way
-as to allow the creation of a single 'global' buffer instead of the
-default per-cpu set.  This can be useful for applications interested
-mainly in seeing the relative ordering of system-wide events without
-the need to bother with saving explicit timestamps for the purpose of
-merging/sorting per-cpu files in a postprocessing step.
-
-To have relay_open() create a global buffer, the create_buf_file()
-implementation should set the value of the is_global outparam to a
-non-zero value in addition to creating the file that will be used to
-represent the single buffer.  In the case of a global buffer,
-create_buf_file() and remove_buf_file() will be called only once.  The
-normal channel-writing functions e.g. relay_write() can still be used
-- writes from any cpu will transparently end up in the global buffer -
-but since it is a global buffer, callers should make sure they use the
-proper locking for such a buffer, either by wrapping writes in a
-spinlock, or by copying a write function from relayfs_fs.h and
-creating a local version that internally does the proper locking.
-
-See the 'exported-relayfile' examples in the relay-apps tarball for
-examples of creating and using relay files in debugfs.
-
-Misc
-----
-
-Some applications may want to keep a channel around and re-use it
-rather than open and close a new channel for each use.  relay_reset()
-can be used for this purpose - it resets a channel to its initial
-state without reallocating channel buffer memory or destroying
-existing mappings.  It should however only be called when it's safe to
-do so i.e. when the channel isn't currently being written to.
-
-Finally, there are a couple of utility callbacks that can be used for
-different purposes.  buf_mapped() is called whenever a channel buffer
-is mmapped from user space and buf_unmapped() is called when it's
-unmapped.  The client can use this notification to trigger actions
-within the kernel application, such as enabling/disabling logging to
-the channel.
-
-
-Resources
-=========
-
-For news, example code, mailing list, etc. see the relayfs homepage:
-
-    http://relayfs.sourceforge.net
-
-
-Credits
-=======
-
-The ideas and specs for relayfs came about as a result of discussions
-on tracing involving the following:
-
-Michel Dagenais		<michel.dagenais@polymtl.ca>
-Richard Moore		<richardj_moore@uk.ibm.com>
-Bob Wisniewski		<bob@watson.ibm.com>
-Karim Yaghmour		<karim@opersys.com>
-Tom Zanussi		<zanussi@us.ibm.com>
-
-Also thanks to Hubertus Franke for a lot of useful suggestions and bug
-reports.
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 0b62c62142c..5c3a5190596 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -25,6 +25,7 @@ Currently, these files are in /proc/sys/fs:
 - inode-state
 - overflowuid
 - overflowgid
+- suid_dumpable
 - super-max
 - super-nr
 
@@ -131,6 +132,25 @@ The default is 65534.
 
 ==============================================================
 
+suid_dumpable:
+
+This value can be used to query and set the core dump mode for setuid
+or otherwise protected/tainted binaries. The modes are
+
+0 - (default) - traditional behaviour. Any process which has changed
+	privilege levels or is execute only will not be dumped
+1 - (debug) - all processes dump core when possible. The core dump is
+	owned by the current user and no security is applied. This is
+	intended for system debugging situations only. Ptrace is unchecked.
+2 - (suidsafe) - any binary which normally would not be dumped is dumped
+	readable by root only. This allows the end user to remove
+	such a dump but not access it directly. For security reasons
+	core dumps in this mode will not overwrite one another or
+	other files. This mode is appropriate when adminstrators are
+	attempting to debug problems in a normal environment.
+
+==============================================================
+
 super-max & super-nr:
 
 These numbers control the maximum number of superblocks, and
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 7345c338080..89bf8c20a58 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -50,7 +50,6 @@ show up in /proc/sys/kernel:
 - shmmax                      [ sysv ipc ]
 - shmmni
 - stop-a                      [ SPARC only ]
-- suid_dumpable
 - sysrq                       ==> Documentation/sysrq.txt
 - tainted
 - threads-max
@@ -310,25 +309,6 @@ kernel.  This value defaults to SHMMAX.
 
 ==============================================================
 
-suid_dumpable:
-
-This value can be used to query and set the core dump mode for setuid
-or otherwise protected/tainted binaries. The modes are
-
-0 - (default) - traditional behaviour. Any process which has changed
-	privilege levels or is execute only will not be dumped
-1 - (debug) - all processes dump core when possible. The core dump is
-	owned by the current user and no security is applied. This is
-	intended for system debugging situations only. Ptrace is unchecked.
-2 - (suidsafe) - any binary which normally would not be dumped is dumped
-	readable by root only. This allows the end user to remove
-	such a dump but not access it directly. For security reasons
-	core dumps in this mode will not overwrite one another or
-	other files. This mode is appropriate when adminstrators are
-	attempting to debug problems in a normal environment.
-
-==============================================================
-
 tainted: 
 
 Non-zero if the kernel has been tainted.  Numeric values, which
diff --git a/Makefile b/Makefile
index 8406d02c638..33559b56644 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 18
-EXTRAVERSION = -rc4
+EXTRAVERSION = -rc5
 NAME=Crazed Snow-Weasel
 
 # *DOCUMENTATION*
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f71fb4a029c..b2751eadbc5 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -142,6 +142,7 @@ config X86_SUMMIT
 	  In particular, it is needed for the x440.
 
 	  If you don't have one of these computers, you should say N here.
+	  If you want to build a NUMA kernel, you must select ACPI.
 
 config X86_BIGSMP
 	bool "Support for other sub-arch SMP systems with more than 8 CPUs"
@@ -169,6 +170,7 @@ config X86_GENERICARCH
        help
           This option compiles in the Summit, bigsmp, ES7000, default subarchitectures.
 	  It is intended for a generic binary kernel.
+	  If you want a NUMA kernel, select ACPI.   We need SRAT for NUMA.
 
 config X86_ES7000
 	bool "Support for Unisys ES7000 IA32 series"
@@ -542,7 +544,7 @@ config X86_PAE
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support"
-	depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI))
+	depends on SMP && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI)
 	default n if X86_PC
 	default y if (X86_NUMAQ || X86_SUMMIT)
 
diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
index efb41e81351..e6ea00edcb5 100644
--- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -567,16 +567,11 @@ static struct cpufreq_driver acpi_cpufreq_driver = {
 static int __init
 acpi_cpufreq_init (void)
 {
-	int                     result = 0;
-
 	dprintk("acpi_cpufreq_init\n");
 
-	result = acpi_cpufreq_early_init_acpi();
+	acpi_cpufreq_early_init_acpi();
 
-	if (!result)
- 		result = cpufreq_register_driver(&acpi_cpufreq_driver);
-	
-	return (result);
+ 	return cpufreq_register_driver(&acpi_cpufreq_driver);
 }
 
 
diff --git a/drivers/base/node.c b/drivers/base/node.c
index d7de1753e09..e9b0957f15d 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -64,7 +64,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf)
 		       "Node %d Mapped:       %8lu kB\n"
 		       "Node %d AnonPages:    %8lu kB\n"
 		       "Node %d PageTables:   %8lu kB\n"
-		       "Node %d NFS Unstable: %8lu kB\n"
+		       "Node %d NFS_Unstable: %8lu kB\n"
 		       "Node %d Bounce:       %8lu kB\n"
 		       "Node %d Slab:         %8lu kB\n",
 		       nid, K(i.totalram),
diff --git a/drivers/cdrom/gscd.c b/drivers/cdrom/gscd.c
index b6ee50a2916..fa708248976 100644
--- a/drivers/cdrom/gscd.c
+++ b/drivers/cdrom/gscd.c
@@ -266,7 +266,7 @@ repeat:
 		goto out;
 
 	if (req->cmd != READ) {
-		printk("GSCD: bad cmd %lu\n", rq_data_dir(req));
+		printk("GSCD: bad cmd %u\n", rq_data_dir(req));
 		end_request(req, 0);
 		goto repeat;
 	}
diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c
index 4ea7bd5f4f5..a369dd6877d 100644
--- a/drivers/char/moxa.c
+++ b/drivers/char/moxa.c
@@ -142,6 +142,7 @@ typedef struct _moxa_board_conf {
 
 static moxa_board_conf moxa_boards[MAX_BOARDS];
 static void __iomem *moxaBaseAddr[MAX_BOARDS];
+static int loadstat[MAX_BOARDS];
 
 struct moxa_str {
 	int type;
@@ -1688,6 +1689,8 @@ int MoxaDriverPoll(void)
 	if (moxaCard == 0)
 		return (-1);
 	for (card = 0; card < MAX_BOARDS; card++) {
+	        if (loadstat[card] == 0)
+			continue;
 		if ((ports = moxa_boards[card].numPorts) == 0)
 			continue;
 		if (readb(moxaIntPend[card]) == 0xff) {
@@ -2903,6 +2906,7 @@ static int moxaloadcode(int cardno, unsigned char __user *tmp, int len)
 		}
 		break;
 	}
+	loadstat[cardno] = 1;
 	return (0);
 }
 
@@ -2920,7 +2924,7 @@ static int moxaloadc218(int cardno, void __iomem *baseAddr, int len)
 	len1 = len >> 1;
 	ptr = (ushort *) moxaBuff;
 	for (i = 0; i < len1; i++)
-		usum += *(ptr + i);
+		usum += le16_to_cpu(*(ptr + i));
 	retry = 0;
 	do {
 		len1 = len >> 1;
@@ -2992,7 +2996,7 @@ static int moxaloadc320(int cardno, void __iomem *baseAddr, int len, int *numPor
 	wlen = len >> 1;
 	uptr = (ushort *) moxaBuff;
 	for (i = 0; i < wlen; i++)
-		usum += uptr[i];
+		usum += le16_to_cpu(uptr[i]);
 	retry = 0;
 	j = 0;
 	do {
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index bfdb90242a9..bb0d9199e99 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -153,6 +153,15 @@ int tty_ioctl(struct inode * inode, struct file * file,
 static int tty_fasync(int fd, struct file * filp, int on);
 static void release_mem(struct tty_struct *tty, int idx);
 
+/**
+ *	alloc_tty_struct	-	allocate a tty object
+ *
+ *	Return a new empty tty structure. The data fields have not
+ *	been initialized in any way but has been zeroed
+ *
+ *	Locking: none
+ *	FIXME: use kzalloc
+ */
 
 static struct tty_struct *alloc_tty_struct(void)
 {
@@ -166,6 +175,15 @@ static struct tty_struct *alloc_tty_struct(void)
 
 static void tty_buffer_free_all(struct tty_struct *);
 
+/**
+ *	free_tty_struct		-	free a disused tty
+ *	@tty: tty struct to free
+ *
+ *	Free the write buffers, tty queue and tty memory itself.
+ *
+ *	Locking: none. Must be called after tty is definitely unused
+ */
+
 static inline void free_tty_struct(struct tty_struct *tty)
 {
 	kfree(tty->write_buf);
@@ -175,6 +193,17 @@ static inline void free_tty_struct(struct tty_struct *tty)
 
 #define TTY_NUMBER(tty) ((tty)->index + (tty)->driver->name_base)
 
+/**
+ *	tty_name	-	return tty naming
+ *	@tty: tty structure
+ *	@buf: buffer for output
+ *
+ *	Convert a tty structure into a name. The name reflects the kernel
+ *	naming policy and if udev is in use may not reflect user space
+ *
+ *	Locking: none
+ */
+
 char *tty_name(struct tty_struct *tty, char *buf)
 {
 	if (!tty) /* Hmm.  NULL pointer.  That's fun. */
@@ -235,6 +264,28 @@ static int check_tty_count(struct tty_struct *tty, const char *routine)
  * Tty buffer allocation management
  */
 
+
+/**
+ *	tty_buffer_free_all		-	free buffers used by a tty
+ *	@tty: tty to free from
+ *
+ *	Remove all the buffers pending on a tty whether queued with data
+ *	or in the free ring. Must be called when the tty is no longer in use
+ *
+ *	Locking: none
+ */
+
+
+/**
+ *	tty_buffer_free_all		-	free buffers used by a tty
+ *	@tty: tty to free from
+ *
+ *	Remove all the buffers pending on a tty whether queued with data
+ *	or in the free ring. Must be called when the tty is no longer in use
+ *
+ *	Locking: none
+ */
+
 static void tty_buffer_free_all(struct tty_struct *tty)
 {
 	struct tty_buffer *thead;
@@ -247,19 +298,47 @@ static void tty_buffer_free_all(struct tty_struct *tty)
 		kfree(thead);
 	}
 	tty->buf.tail = NULL;
+	tty->buf.memory_used = 0;
 }
 
+/**
+ *	tty_buffer_init		-	prepare a tty buffer structure
+ *	@tty: tty to initialise
+ *
+ *	Set up the initial state of the buffer management for a tty device.
+ *	Must be called before the other tty buffer functions are used.
+ *
+ *	Locking: none
+ */
+
 static void tty_buffer_init(struct tty_struct *tty)
 {
 	spin_lock_init(&tty->buf.lock);
 	tty->buf.head = NULL;
 	tty->buf.tail = NULL;
 	tty->buf.free = NULL;
+	tty->buf.memory_used = 0;
 }
 
-static struct tty_buffer *tty_buffer_alloc(size_t size)
+/**
+ *	tty_buffer_alloc	-	allocate a tty buffer
+ *	@tty: tty device
+ *	@size: desired size (characters)
+ *
+ *	Allocate a new tty buffer to hold the desired number of characters.
+ *	Return NULL if out of memory or the allocation would exceed the
+ *	per device queue
+ *
+ *	Locking: Caller must hold tty->buf.lock
+ */
+
+static struct tty_buffer *tty_buffer_alloc(struct tty_struct *tty, size_t size)
 {
-	struct tty_buffer *p = kmalloc(sizeof(struct tty_buffer) + 2 * size, GFP_ATOMIC);
+	struct tty_buffer *p;
+
+	if (tty->buf.memory_used + size > 65536)
+		return NULL;
+	p = kmalloc(sizeof(struct tty_buffer) + 2 * size, GFP_ATOMIC);
 	if(p == NULL)
 		return NULL;
 	p->used = 0;
@@ -269,17 +348,27 @@ static struct tty_buffer *tty_buffer_alloc(size_t size)
 	p->read = 0;
 	p->char_buf_ptr = (char *)(p->data);
 	p->flag_buf_ptr = (unsigned char *)p->char_buf_ptr + size;
-/* 	printk("Flip create %p\n", p); */
+	tty->buf.memory_used += size;
 	return p;
 }
 
-/* Must be called with the tty_read lock held. This needs to acquire strategy
-   code to decide if we should kfree or relink a given expired buffer */
+/**
+ *	tty_buffer_free		-	free a tty buffer
+ *	@tty: tty owning the buffer
+ *	@b: the buffer to free
+ *
+ *	Free a tty buffer, or add it to the free list according to our
+ *	internal strategy
+ *
+ *	Locking: Caller must hold tty->buf.lock
+ */
 
 static void tty_buffer_free(struct tty_struct *tty, struct tty_buffer *b)
 {
 	/* Dumb strategy for now - should keep some stats */
-/* 	printk("Flip dispose %p\n", b); */
+	tty->buf.memory_used -= b->size;
+	WARN_ON(tty->buf.memory_used < 0);
+
 	if(b->size >= 512)
 		kfree(b);
 	else {
@@ -288,6 +377,18 @@ static void tty_buffer_free(struct tty_struct *tty, struct tty_buffer *b)
 	}
 }
 
+/**
+ *	tty_buffer_find		-	find a free tty buffer
+ *	@tty: tty owning the buffer
+ *	@size: characters wanted
+ *
+ *	Locate an existing suitable tty buffer or if we are lacking one then
+ *	allocate a new one. We round our buffers off in 256 character chunks
+ *	to get better allocation behaviour.
+ *
+ *	Locking: Caller must hold tty->buf.lock
+ */
+
 static struct tty_buffer *tty_buffer_find(struct tty_struct *tty, size_t size)
 {
 	struct tty_buffer **tbh = &tty->buf.free;
@@ -299,20 +400,28 @@ static struct tty_buffer *tty_buffer_find(struct tty_struct *tty, size_t size)
 			t->used = 0;
 			t->commit = 0;
 			t->read = 0;
-			/* DEBUG ONLY */
-/*			memset(t->data, '*', size); */
-/* 			printk("Flip recycle %p\n", t); */
+			tty->buf.memory_used += t->size;
 			return t;
 		}
 		tbh = &((*tbh)->next);
 	}
 	/* Round the buffer size out */
 	size = (size + 0xFF) & ~ 0xFF;
-	return tty_buffer_alloc(size);
+	return tty_buffer_alloc(tty, size);
 	/* Should possibly check if this fails for the largest buffer we
 	   have queued and recycle that ? */
 }
 
+/**
+ *	tty_buffer_request_room		-	grow tty buffer if needed
+ *	@tty: tty structure
+ *	@size: size desired
+ *
+ *	Make at least size bytes of linear space available for the tty
+ *	buffer. If we fail return the size we managed to find.
+ *
+ *	Locking: Takes tty->buf.lock
+ */
 int tty_buffer_request_room(struct tty_struct *tty, size_t size)
 {
 	struct tty_buffer *b, *n;
@@ -347,6 +456,18 @@ int tty_buffer_request_room(struct tty_struct *tty, size_t size)
 }
 EXPORT_SYMBOL_GPL(tty_buffer_request_room);
 
+/**
+ *	tty_insert_flip_string	-	Add characters to the tty buffer
+ *	@tty: tty structure
+ *	@chars: characters
+ *	@size: size
+ *
+ *	Queue a series of bytes to the tty buffering. All the characters
+ *	passed are marked as without error. Returns the number added.
+ *
+ *	Locking: Called functions may take tty->buf.lock
+ */
+
 int tty_insert_flip_string(struct tty_struct *tty, const unsigned char *chars,
 				size_t size)
 {
@@ -370,6 +491,20 @@ int tty_insert_flip_string(struct tty_struct *tty, const unsigned char *chars,
 }
 EXPORT_SYMBOL(tty_insert_flip_string);
 
+/**
+ *	tty_insert_flip_string_flags	-	Add characters to the tty buffer
+ *	@tty: tty structure
+ *	@chars: characters
+ *	@flags: flag bytes
+ *	@size: size
+ *
+ *	Queue a series of bytes to the tty buffering. For each character
+ *	the flags array indicates the status of the character. Returns the
+ *	number added.
+ *
+ *	Locking: Called functions may take tty->buf.lock
+ */
+
 int tty_insert_flip_string_flags(struct tty_struct *tty,
 		const unsigned char *chars, const char *flags, size_t size)
 {
@@ -394,6 +529,17 @@ int tty_insert_flip_string_flags(struct tty_struct *tty,
 }
 EXPORT_SYMBOL(tty_insert_flip_string_flags);
 
+/**
+ *	tty_schedule_flip	-	push characters to ldisc
+ *	@tty: tty to push from
+ *
+ *	Takes any pending buffers and transfers their ownership to the
+ *	ldisc side of the queue. It then schedules those characters for
+ *	processing by the line discipline.
+ *
+ *	Locking: Takes tty->buf.lock
+ */
+
 void tty_schedule_flip(struct tty_struct *tty)
 {
 	unsigned long flags;
@@ -405,12 +551,19 @@ void tty_schedule_flip(struct tty_struct *tty)
 }
 EXPORT_SYMBOL(tty_schedule_flip);
 
-/*
+/**
+ *	tty_prepare_flip_string		-	make room for characters
+ *	@tty: tty
+ *	@chars: return pointer for character write area
+ *	@size: desired size
+ *
  *	Prepare a block of space in the buffer for data. Returns the length
  *	available and buffer pointer to the space which is now allocated and
  *	accounted for as ready for normal characters. This is used for drivers
  *	that need their own block copy routines into the buffer. There is no
  *	guarantee the buffer is a DMA target!
+ *
+ *	Locking: May call functions taking tty->buf.lock
  */
 
 int tty_prepare_flip_string(struct tty_struct *tty, unsigned char **chars, size_t size)
@@ -427,12 +580,20 @@ int tty_prepare_flip_string(struct tty_struct *tty, unsigned char **chars, size_
 
 EXPORT_SYMBOL_GPL(tty_prepare_flip_string);
 
-/*
+/**
+ *	tty_prepare_flip_string_flags	-	make room for characters
+ *	@tty: tty
+ *	@chars: return pointer for character write area
+ *	@flags: return pointer for status flag write area
+ *	@size: desired size
+ *
  *	Prepare a block of space in the buffer for data. Returns the length
  *	available and buffer pointer to the space which is now allocated and
  *	accounted for as ready for characters. This is used for drivers
  *	that need their own block copy routines into the buffer. There is no
  *	guarantee the buffer is a DMA target!
+ *
+ *	Locking: May call functions taking tty->buf.lock
  */
 
 int tty_prepare_flip_string_flags(struct tty_struct *tty, unsigned char **chars, char **flags, size_t size)
@@ -451,10 +612,16 @@ EXPORT_SYMBOL_GPL(tty_prepare_flip_string_flags);
 
 
 
-/*
+/**
+ *	tty_set_termios_ldisc		-	set ldisc field
+ *	@tty: tty structure
+ *	@num: line discipline number
+ *
  *	This is probably overkill for real world processors but
  *	they are not on hot paths so a little discipline won't do 
  *	any harm.
+ *
+ *	Locking: takes termios_sem
  */
  
 static void tty_set_termios_ldisc(struct tty_struct *tty, int num)
@@ -474,6 +641,19 @@ static DEFINE_SPINLOCK(tty_ldisc_lock);
 static DECLARE_WAIT_QUEUE_HEAD(tty_ldisc_wait);
 static struct tty_ldisc tty_ldiscs[NR_LDISCS];	/* line disc dispatch table */
 
+/**
+ *	tty_register_ldisc	-	install a line discipline
+ *	@disc: ldisc number
+ *	@new_ldisc: pointer to the ldisc object
+ *
+ *	Installs a new line discipline into the kernel. The discipline
+ *	is set up as unreferenced and then made available to the kernel
+ *	from this point onwards.
+ *
+ *	Locking:
+ *		takes tty_ldisc_lock to guard against ldisc races
+ */
+
 int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc)
 {
 	unsigned long flags;
@@ -493,6 +673,18 @@ int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc)
 }
 EXPORT_SYMBOL(tty_register_ldisc);
 
+/**
+ *	tty_unregister_ldisc	-	unload a line discipline
+ *	@disc: ldisc number
+ *	@new_ldisc: pointer to the ldisc object
+ *
+ *	Remove a line discipline from the kernel providing it is not
+ *	currently in use.
+ *
+ *	Locking:
+ *		takes tty_ldisc_lock to guard against ldisc races
+ */
+
 int tty_unregister_ldisc(int disc)
 {
 	unsigned long flags;
@@ -512,6 +704,19 @@ int tty_unregister_ldisc(int disc)
 }
 EXPORT_SYMBOL(tty_unregister_ldisc);
 
+/**
+ *	tty_ldisc_get		-	take a reference to an ldisc
+ *	@disc: ldisc number
+ *
+ *	Takes a reference to a line discipline. Deals with refcounts and
+ *	module locking counts. Returns NULL if the discipline is not available.
+ *	Returns a pointer to the discipline and bumps the ref count if it is
+ *	available
+ *
+ *	Locking:
+ *		takes tty_ldisc_lock to guard against ldisc races
+ */
+
 struct tty_ldisc *tty_ldisc_get(int disc)
 {
 	unsigned long flags;
@@ -540,6 +745,17 @@ struct tty_ldisc *tty_ldisc_get(int disc)
 
 EXPORT_SYMBOL_GPL(tty_ldisc_get);
 
+/**
+ *	tty_ldisc_put		-	drop ldisc reference
+ *	@disc: ldisc number
+ *
+ *	Drop a reference to a line discipline. Manage refcounts and
+ *	module usage counts
+ *
+ *	Locking:
+ *		takes tty_ldisc_lock to guard against ldisc races
+ */
+
 void tty_ldisc_put(int disc)
 {
 	struct tty_ldisc *ld;
@@ -557,6 +773,19 @@ void tty_ldisc_put(int disc)
 	
 EXPORT_SYMBOL_GPL(tty_ldisc_put);
 
+/**
+ *	tty_ldisc_assign	-	set ldisc on a tty
+ *	@tty: tty to assign
+ *	@ld: line discipline
+ *
+ *	Install an instance of a line discipline into a tty structure. The
+ *	ldisc must have a reference count above zero to ensure it remains/
+ *	The tty instance refcount starts at zero.
+ *
+ *	Locking:
+ *		Caller must hold references
+ */
+
 static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld)
 {
 	tty->ldisc = *ld;
@@ -571,6 +800,8 @@ static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld)
  *	the tty ldisc. Return 0 on failure or 1 on success. This is
  *	used to implement both the waiting and non waiting versions
  *	of tty_ldisc_ref
+ *
+ *	Locking: takes tty_ldisc_lock
  */
 
 static int tty_ldisc_try(struct tty_struct *tty)
@@ -602,6 +833,8 @@ static int tty_ldisc_try(struct tty_struct *tty)
  *	must also be careful not to hold other locks that will deadlock
  *	against a discipline change, such as an existing ldisc reference
  *	(which we check for)
+ *
+ *	Locking: call functions take tty_ldisc_lock
  */
  
 struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty)
@@ -622,6 +855,8 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
  *	Dereference the line discipline for the terminal and take a 
  *	reference to it. If the line discipline is in flux then 
  *	return NULL. Can be called from IRQ and timer functions.
+ *
+ *	Locking: called functions take tty_ldisc_lock
  */
  
 struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty)
@@ -639,6 +874,8 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref);
  *
  *	Undoes the effect of tty_ldisc_ref or tty_ldisc_ref_wait. May
  *	be called in IRQ context.
+ *
+ *	Locking: takes tty_ldisc_lock
  */
  
 void tty_ldisc_deref(struct tty_ldisc *ld)
@@ -683,6 +920,9 @@ static void tty_ldisc_enable(struct tty_struct *tty)
  *
  *	Set the discipline of a tty line. Must be called from a process
  *	context.
+ *
+ *	Locking: takes tty_ldisc_lock.
+ *		called functions take termios_sem
  */
  
 static int tty_set_ldisc(struct tty_struct *tty, int ldisc)
@@ -846,9 +1086,17 @@ restart:
 	return retval;
 }
 
-/*
- * This routine returns a tty driver structure, given a device number
+/**
+ *	get_tty_driver		-	find device of a tty
+ *	@dev_t: device identifier
+ *	@index: returns the index of the tty
+ *
+ *	This routine returns a tty driver structure, given a device number
+ *	and also passes back the index number.
+ *
+ *	Locking: caller must hold tty_mutex
  */
+
 static struct tty_driver *get_tty_driver(dev_t device, int *index)
 {
 	struct tty_driver *p;
@@ -863,11 +1111,17 @@ static struct tty_driver *get_tty_driver(dev_t device, int *index)
 	return NULL;
 }
 
-/*
- * If we try to write to, or set the state of, a terminal and we're
- * not in the foreground, send a SIGTTOU.  If the signal is blocked or
- * ignored, go ahead and perform the operation.  (POSIX 7.2)
+/**
+ *	tty_check_change	-	check for POSIX terminal changes
+ *	@tty: tty to check
+ *
+ *	If we try to write to, or set the state of, a terminal and we're
+ *	not in the foreground, send a SIGTTOU.  If the signal is blocked or
+ *	ignored, go ahead and perform the operation.  (POSIX 7.2)
+ *
+ *	Locking: none
  */
+
 int tty_check_change(struct tty_struct * tty)
 {
 	if (current->signal->tty != tty)
@@ -1005,10 +1259,27 @@ void tty_ldisc_flush(struct tty_struct *tty)
 
 EXPORT_SYMBOL_GPL(tty_ldisc_flush);
 	
-/*
- * This can be called by the "eventd" kernel thread.  That is process synchronous,
- * but doesn't hold any locks, so we need to make sure we have the appropriate
- * locks for what we're doing..
+/**
+ *	do_tty_hangup		-	actual handler for hangup events
+ *	@data: tty device
+ *
+ *	This can be called by the "eventd" kernel thread.  That is process
+ *	synchronous but doesn't hold any locks, so we need to make sure we
+ *	have the appropriate locks for what we're doing.
+ *
+ *	The hangup event clears any pending redirections onto the hung up
+ *	device. It ensures future writes will error and it does the needed
+ *	line discipline hangup and signal delivery. The tty object itself
+ *	remains intact.
+ *
+ *	Locking:
+ *		BKL
+ *		redirect lock for undoing redirection
+ *		file list lock for manipulating list of ttys
+ *		tty_ldisc_lock from called functions
+ *		termios_sem resetting termios data
+ *		tasklist_lock to walk task list for hangup event
+ *
  */
 static void do_tty_hangup(void *data)
 {
@@ -1133,6 +1404,14 @@ static void do_tty_hangup(void *data)
 		fput(f);
 }
 
+/**
+ *	tty_hangup		-	trigger a hangup event
+ *	@tty: tty to hangup
+ *
+ *	A carrier loss (virtual or otherwise) has occurred on this like
+ *	schedule a hangup sequence to run after this event.
+ */
+
 void tty_hangup(struct tty_struct * tty)
 {
 #ifdef TTY_DEBUG_HANGUP
@@ -1145,6 +1424,15 @@ void tty_hangup(struct tty_struct * tty)
 
 EXPORT_SYMBOL(tty_hangup);
 
+/**
+ *	tty_vhangup		-	process vhangup
+ *	@tty: tty to hangup
+ *
+ *	The user has asked via system call for the terminal to be hung up.
+ *	We do this synchronously so that when the syscall returns the process
+ *	is complete. That guarantee is neccessary for security reasons.
+ */
+
 void tty_vhangup(struct tty_struct * tty)
 {
 #ifdef TTY_DEBUG_HANGUP
@@ -1156,6 +1444,14 @@ void tty_vhangup(struct tty_struct * tty)
 }
 EXPORT_SYMBOL(tty_vhangup);
 
+/**
+ *	tty_hung_up_p		-	was tty hung up
+ *	@filp: file pointer of tty
+ *
+ *	Return true if the tty has been subject to a vhangup or a carrier
+ *	loss
+ */
+
 int tty_hung_up_p(struct file * filp)
 {
 	return (filp->f_op == &hung_up_tty_fops);
@@ -1163,19 +1459,28 @@ int tty_hung_up_p(struct file * filp)
 
 EXPORT_SYMBOL(tty_hung_up_p);
 
-/*
- * This function is typically called only by the session leader, when
- * it wants to disassociate itself from its controlling tty.
+/**
+ *	disassociate_ctty	-	disconnect controlling tty
+ *	@on_exit: true if exiting so need to "hang up" the session
+ *
+ *	This function is typically called only by the session leader, when
+ *	it wants to disassociate itself from its controlling tty.
  *
- * It performs the following functions:
+ *	It performs the following functions:
  * 	(1)  Sends a SIGHUP and SIGCONT to the foreground process group
  * 	(2)  Clears the tty from being controlling the session
  * 	(3)  Clears the controlling tty for all processes in the
  * 		session group.
  *
- * The argument on_exit is set to 1 if called when a process is
- * exiting; it is 0 if called by the ioctl TIOCNOTTY.
+ *	The argument on_exit is set to 1 if called when a process is
+ *	exiting; it is 0 if called by the ioctl TIOCNOTTY.
+ *
+ *	Locking: tty_mutex is taken to protect current->signal->tty
+ *		BKL is taken for hysterical raisins
+ *		Tasklist lock is taken (under tty_mutex) to walk process
+ *		lists for the session.
  */
+
 void disassociate_ctty(int on_exit)
 {
 	struct tty_struct *tty;
@@ -1222,6 +1527,25 @@ void disassociate_ctty(int on_exit)
 	unlock_kernel();
 }
 
+
+/**
+ *	stop_tty	-	propogate flow control
+ *	@tty: tty to stop
+ *
+ *	Perform flow control to the driver. For PTY/TTY pairs we
+ *	must also propogate the TIOCKPKT status. May be called
+ *	on an already stopped device and will not re-call the driver
+ *	method.
+ *
+ *	This functionality is used by both the line disciplines for
+ *	halting incoming flow and by the driver. It may therefore be
+ *	called from any context, may be under the tty atomic_write_lock
+ *	but not always.
+ *
+ *	Locking:
+ *		Broken. Relies on BKL which is unsafe here.
+ */
+
 void stop_tty(struct tty_struct *tty)
 {
 	if (tty->stopped)
@@ -1238,6 +1562,19 @@ void stop_tty(struct tty_struct *tty)
 
 EXPORT_SYMBOL(stop_tty);
 
+/**
+ *	start_tty	-	propogate flow control
+ *	@tty: tty to start
+ *
+ *	Start a tty that has been stopped if at all possible. Perform
+ *	any neccessary wakeups and propogate the TIOCPKT status. If this
+ *	is the tty was previous stopped and is being started then the
+ *	driver start method is invoked and the line discipline woken.
+ *
+ *	Locking:
+ *		Broken. Relies on BKL which is unsafe here.
+ */
+
 void start_tty(struct tty_struct *tty)
 {
 	if (!tty->stopped || tty->flow_stopped)
@@ -1258,6 +1595,23 @@ void start_tty(struct tty_struct *tty)
 
 EXPORT_SYMBOL(start_tty);
 
+/**
+ *	tty_read	-	read method for tty device files
+ *	@file: pointer to tty file
+ *	@buf: user buffer
+ *	@count: size of user buffer
+ *	@ppos: unused
+ *
+ *	Perform the read system call function on this terminal device. Checks
+ *	for hung up devices before calling the line discipline method.
+ *
+ *	Locking:
+ *		Locks the line discipline internally while needed
+ *		For historical reasons the line discipline read method is
+ *	invoked under the BKL. This will go away in time so do not rely on it
+ *	in new code. Multiple read calls may be outstanding in parallel.
+ */
+
 static ssize_t tty_read(struct file * file, char __user * buf, size_t count, 
 			loff_t *ppos)
 {
@@ -1302,6 +1656,7 @@ static inline ssize_t do_tty_write(
 	ssize_t ret = 0, written = 0;
 	unsigned int chunk;
 	
+	/* FIXME: O_NDELAY ... */
 	if (mutex_lock_interruptible(&tty->atomic_write_lock)) {
 		return -ERESTARTSYS;
 	}
@@ -1318,6 +1673,9 @@ static inline ssize_t do_tty_write(
 	 * layer has problems with bigger chunks. It will
 	 * claim to be able to handle more characters than
 	 * it actually does.
+	 *
+	 * FIXME: This can probably go away now except that 64K chunks
+	 * are too likely to fail unless switched to vmalloc...
 	 */
 	chunk = 2048;
 	if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags))
@@ -1375,6 +1733,24 @@ static inline ssize_t do_tty_write(
 }
 
 
+/**
+ *	tty_write		-	write method for tty device file
+ *	@file: tty file pointer
+ *	@buf: user data to write
+ *	@count: bytes to write
+ *	@ppos: unused
+ *
+ *	Write data to a tty device via the line discipline.
+ *
+ *	Locking:
+ *		Locks the line discipline as required
+ *		Writes to the tty driver are serialized by the atomic_write_lock
+ *	and are then processed in chunks to the device. The line discipline
+ *	write method will not be involked in parallel for each device
+ *		The line discipline write method is called under the big
+ *	kernel lock for historical reasons. New code should not rely on this.
+ */
+
 static ssize_t tty_write(struct file * file, const char __user * buf, size_t count,
 			 loff_t *ppos)
 {
@@ -1422,7 +1798,18 @@ ssize_t redirected_tty_write(struct file * file, const char __user * buf, size_t
 
 static char ptychar[] = "pqrstuvwxyzabcde";
 
-static inline void pty_line_name(struct tty_driver *driver, int index, char *p)
+/**
+ *	pty_line_name	-	generate name for a pty
+ *	@driver: the tty driver in use
+ *	@index: the minor number
+ *	@p: output buffer of at least 6 bytes
+ *
+ *	Generate a name from a driver reference and write it to the output
+ *	buffer.
+ *
+ *	Locking: None
+ */
+static void pty_line_name(struct tty_driver *driver, int index, char *p)
 {
 	int i = index + driver->name_base;
 	/* ->name is initialized to "ttyp", but "tty" is expected */
@@ -1431,24 +1818,53 @@ static inline void pty_line_name(struct tty_driver *driver, int index, char *p)
 			ptychar[i >> 4 & 0xf], i & 0xf);
 }
 
-static inline void tty_line_name(struct tty_driver *driver, int index, char *p)
+/**
+ *	pty_line_name	-	generate name for a tty
+ *	@driver: the tty driver in use
+ *	@index: the minor number
+ *	@p: output buffer of at least 7 bytes
+ *
+ *	Generate a name from a driver reference and write it to the output
+ *	buffer.
+ *
+ *	Locking: None
+ */
+static void tty_line_name(struct tty_driver *driver, int index, char *p)
 {
 	sprintf(p, "%s%d", driver->name, index + driver->name_base);
 }
 
-/*
+/**
+ *	init_dev		-	initialise a tty device
+ *	@driver: tty driver we are opening a device on
+ *	@idx: device index
+ *	@tty: returned tty structure
+ *
+ *	Prepare a tty device. This may not be a "new" clean device but
+ *	could also be an active device. The pty drivers require special
+ *	handling because of this.
+ *
+ *	Locking:
+ *		The function is called under the tty_mutex, which
+ *	protects us from the tty struct or driver itself going away.
+ *
+ *	On exit the tty device has the line discipline attached and
+ *	a reference count of 1. If a pair was created for pty/tty use
+ *	and the other was a pty master then it too has a reference count of 1.
+ *
  * WSH 06/09/97: Rewritten to remove races and properly clean up after a
  * failed open.  The new code protects the open with a mutex, so it's
  * really quite straightforward.  The mutex locking can probably be
  * relaxed for the (most common) case of reopening a tty.
  */
+
 static int init_dev(struct tty_driver *driver, int idx,
 	struct tty_struct **ret_tty)
 {
 	struct tty_struct *tty, *o_tty;
 	struct termios *tp, **tp_loc, *o_tp, **o_tp_loc;
 	struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc;
-	int retval=0;
+	int retval = 0;
 
 	/* check whether we're reopening an existing tty */
 	if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
@@ -1662,10 +2078,20 @@ release_mem_out:
 	goto end_init;
 }
 
-/*
- * Releases memory associated with a tty structure, and clears out the
- * driver table slots.
+/**
+ *	release_mem		-	release tty structure memory
+ *
+ *	Releases memory associated with a tty structure, and clears out the
+ *	driver table slots. This function is called when a device is no longer
+ *	in use. It also gets called when setup of a device fails.
+ *
+ *	Locking:
+ *		tty_mutex - sometimes only
+ *		takes the file list lock internally when working on the list
+ *	of ttys that the driver keeps.
+ *		FIXME: should we require tty_mutex is held here ??
  */
+
 static void release_mem(struct tty_struct *tty, int idx)
 {
 	struct tty_struct *o_tty;
@@ -2006,18 +2432,27 @@ static void release_dev(struct file * filp)
 
 }
 
-/*
- * tty_open and tty_release keep up the tty count that contains the
- * number of opens done on a tty. We cannot use the inode-count, as
- * different inodes might point to the same tty.
+/**
+ *	tty_open		-	open a tty device
+ *	@inode: inode of device file
+ *	@filp: file pointer to tty
+ *
+ *	tty_open and tty_release keep up the tty count that contains the
+ *	number of opens done on a tty. We cannot use the inode-count, as
+ *	different inodes might point to the same tty.
  *
- * Open-counting is needed for pty masters, as well as for keeping
- * track of serial lines: DTR is dropped when the last close happens.
- * (This is not done solely through tty->count, now.  - Ted 1/27/92)
+ *	Open-counting is needed for pty masters, as well as for keeping
+ *	track of serial lines: DTR is dropped when the last close happens.
+ *	(This is not done solely through tty->count, now.  - Ted 1/27/92)
  *
- * The termios state of a pty is reset on first open so that
- * settings don't persist across reuse.
+ *	The termios state of a pty is reset on first open so that
+ *	settings don't persist across reuse.
+ *
+ *	Locking: tty_mutex protects current->signal->tty, get_tty_driver and
+ *		init_dev work. tty->count should protect the rest.
+ *		task_lock is held to update task details for sessions
  */
+
 static int tty_open(struct inode * inode, struct file * filp)
 {
 	struct tty_struct *tty;
@@ -2132,6 +2567,18 @@ got_driver:
 }
 
 #ifdef CONFIG_UNIX98_PTYS
+/**
+ *	ptmx_open		-	open a unix 98 pty master
+ *	@inode: inode of device file
+ *	@filp: file pointer to tty
+ *
+ *	Allocate a unix98 pty master device from the ptmx driver.
+ *
+ *	Locking: tty_mutex protects theinit_dev work. tty->count should
+ 		protect the rest.
+ *		allocated_ptys_lock handles the list of free pty numbers
+ */
+
 static int ptmx_open(struct inode * inode, struct file * filp)
 {
 	struct tty_struct *tty;
@@ -2191,6 +2638,18 @@ out:
 }
 #endif
 
+/**
+ *	tty_release		-	vfs callback for close
+ *	@inode: inode of tty
+ *	@filp: file pointer for handle to tty
+ *
+ *	Called the last time each file handle is closed that references
+ *	this tty. There may however be several such references.
+ *
+ *	Locking:
+ *		Takes bkl. See release_dev
+ */
+
 static int tty_release(struct inode * inode, struct file * filp)
 {
 	lock_kernel();
@@ -2199,7 +2658,18 @@ static int tty_release(struct inode * inode, struct file * filp)
 	return 0;
 }
 
-/* No kernel lock held - fine */
+/**
+ *	tty_poll	-	check tty status
+ *	@filp: file being polled
+ *	@wait: poll wait structures to update
+ *
+ *	Call the line discipline polling method to obtain the poll
+ *	status of the device.
+ *
+ *	Locking: locks called line discipline but ldisc poll method
+ *	may be re-entered freely by other callers.
+ */
+
 static unsigned int tty_poll(struct file * filp, poll_table * wait)
 {
 	struct tty_struct * tty;
@@ -2243,6 +2713,21 @@ static int tty_fasync(int fd, struct file * filp, int on)
 	return 0;
 }
 
+/**
+ *	tiocsti			-	fake input character
+ *	@tty: tty to fake input into
+ *	@p: pointer to character
+ *
+ *	Fake input to a tty device. Does the neccessary locking and
+ *	input management.
+ *
+ *	FIXME: does not honour flow control ??
+ *
+ *	Locking:
+ *		Called functions take tty_ldisc_lock
+ *		current->signal->tty check is safe without locks
+ */
+
 static int tiocsti(struct tty_struct *tty, char __user *p)
 {
 	char ch, mbz = 0;
@@ -2258,6 +2743,18 @@ static int tiocsti(struct tty_struct *tty, char __user *p)
 	return 0;
 }
 
+/**
+ *	tiocgwinsz		-	implement window query ioctl
+ *	@tty; tty
+ *	@arg: user buffer for result
+ *
+ *	Copies the kernel idea of the window size into the user buffer. No
+ *	locking is done.
+ *
+ *	FIXME: Returning random values racing a window size set is wrong
+ *	should lock here against that
+ */
+
 static int tiocgwinsz(struct tty_struct *tty, struct winsize __user * arg)
 {
 	if (copy_to_user(arg, &tty->winsize, sizeof(*arg)))
@@ -2265,6 +2762,24 @@ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user * arg)
 	return 0;
 }
 
+/**
+ *	tiocswinsz		-	implement window size set ioctl
+ *	@tty; tty
+ *	@arg: user buffer for result
+ *
+ *	Copies the user idea of the window size to the kernel. Traditionally
+ *	this is just advisory information but for the Linux console it
+ *	actually has driver level meaning and triggers a VC resize.
+ *
+ *	Locking:
+ *		The console_sem is used to ensure we do not try and resize
+ *	the console twice at once.
+ *	FIXME: Two racing size sets may leave the console and kernel
+ *		parameters disagreeing. Is this exploitable ?
+ *	FIXME: Random values racing a window size get is wrong
+ *	should lock here against that
+ */
+
 static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty,
 	struct winsize __user * arg)
 {
@@ -2294,6 +2809,15 @@ static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty,
 	return 0;
 }
 
+/**
+ *	tioccons	-	allow admin to move logical console
+ *	@file: the file to become console
+ *
+ *	Allow the adminstrator to move the redirected console device
+ *
+ *	Locking: uses redirect_lock to guard the redirect information
+ */
+
 static int tioccons(struct file *file)
 {
 	if (!capable(CAP_SYS_ADMIN))
@@ -2319,6 +2843,17 @@ static int tioccons(struct file *file)
 	return 0;
 }
 
+/**
+ *	fionbio		-	non blocking ioctl
+ *	@file: file to set blocking value
+ *	@p: user parameter
+ *
+ *	Historical tty interfaces had a blocking control ioctl before
+ *	the generic functionality existed. This piece of history is preserved
+ *	in the expected tty API of posix OS's.
+ *
+ *	Locking: none, the open fle handle ensures it won't go away.
+ */
 
 static int fionbio(struct file *file, int __user *p)
 {
@@ -2334,6 +2869,23 @@ static int fionbio(struct file *file, int __user *p)
 	return 0;
 }
 
+/**
+ *	tiocsctty	-	set controlling tty
+ *	@tty: tty structure
+ *	@arg: user argument
+ *
+ *	This ioctl is used to manage job control. It permits a session
+ *	leader to set this tty as the controlling tty for the session.
+ *
+ *	Locking:
+ *		Takes tasklist lock internally to walk sessions
+ *		Takes task_lock() when updating signal->tty
+ *
+ *	FIXME: tty_mutex is needed to protect signal->tty references.
+ *	FIXME: why task_lock on the signal->tty reference ??
+ *
+ */
+
 static int tiocsctty(struct tty_struct *tty, int arg)
 {
 	struct task_struct *p;
@@ -2374,6 +2926,18 @@ static int tiocsctty(struct tty_struct *tty, int arg)
 	return 0;
 }
 
+/**
+ *	tiocgpgrp		-	get process group
+ *	@tty: tty passed by user
+ *	@real_tty: tty side of the tty pased by the user if a pty else the tty
+ *	@p: returned pid
+ *
+ *	Obtain the process group of the tty. If there is no process group
+ *	return an error.
+ *
+ *	Locking: none. Reference to ->signal->tty is safe.
+ */
+
 static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p)
 {
 	/*
@@ -2385,6 +2949,20 @@ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t
 	return put_user(real_tty->pgrp, p);
 }
 
+/**
+ *	tiocspgrp		-	attempt to set process group
+ *	@tty: tty passed by user
+ *	@real_tty: tty side device matching tty passed by user
+ *	@p: pid pointer
+ *
+ *	Set the process group of the tty to the session passed. Only
+ *	permitted where the tty session is our session.
+ *
+ *	Locking: None
+ *
+ *	FIXME: current->signal->tty referencing is unsafe.
+ */
+
 static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p)
 {
 	pid_t pgrp;
@@ -2408,6 +2986,18 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t
 	return 0;
 }
 
+/**
+ *	tiocgsid		-	get session id
+ *	@tty: tty passed by user
+ *	@real_tty: tty side of the tty pased by the user if a pty else the tty
+ *	@p: pointer to returned session id
+ *
+ *	Obtain the session id of the tty. If there is no session
+ *	return an error.
+ *
+ *	Locking: none. Reference to ->signal->tty is safe.
+ */
+
 static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p)
 {
 	/*
@@ -2421,6 +3011,16 @@ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t _
 	return put_user(real_tty->session, p);
 }
 
+/**
+ *	tiocsetd	-	set line discipline
+ *	@tty: tty device
+ *	@p: pointer to user data
+ *
+ *	Set the line discipline according to user request.
+ *
+ *	Locking: see tty_set_ldisc, this function is just a helper
+ */
+
 static int tiocsetd(struct tty_struct *tty, int __user *p)
 {
 	int ldisc;
@@ -2430,6 +3030,21 @@ static int tiocsetd(struct tty_struct *tty, int __user *p)
 	return tty_set_ldisc(tty, ldisc);
 }
 
+/**
+ *	send_break	-	performed time break
+ *	@tty: device to break on
+ *	@duration: timeout in mS
+ *
+ *	Perform a timed break on hardware that lacks its own driver level
+ *	timed break functionality.
+ *
+ *	Locking:
+ *		None
+ *
+ *	FIXME:
+ *		What if two overlap
+ */
+
 static int send_break(struct tty_struct *tty, unsigned int duration)
 {
 	tty->driver->break_ctl(tty, -1);
@@ -2442,8 +3057,19 @@ static int send_break(struct tty_struct *tty, unsigned int duration)
 	return 0;
 }
 
-static int
-tty_tiocmget(struct tty_struct *tty, struct file *file, int __user *p)
+/**
+ *	tiocmget		-	get modem status
+ *	@tty: tty device
+ *	@file: user file pointer
+ *	@p: pointer to result
+ *
+ *	Obtain the modem status bits from the tty driver if the feature
+ *	is supported. Return -EINVAL if it is not available.
+ *
+ *	Locking: none (up to the driver)
+ */
+
+static int tty_tiocmget(struct tty_struct *tty, struct file *file, int __user *p)
 {
 	int retval = -EINVAL;
 
@@ -2456,8 +3082,20 @@ tty_tiocmget(struct tty_struct *tty, struct file *file, int __user *p)
 	return retval;
 }
 
-static int
-tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int cmd,
+/**
+ *	tiocmset		-	set modem status
+ *	@tty: tty device
+ *	@file: user file pointer
+ *	@cmd: command - clear bits, set bits or set all
+ *	@p: pointer to desired bits
+ *
+ *	Set the modem status bits from the tty driver if the feature
+ *	is supported. Return -EINVAL if it is not available.
+ *
+ *	Locking: none (up to the driver)
+ */
+
+static int tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int cmd,
 	     unsigned __user *p)
 {
 	int retval = -EINVAL;
@@ -2573,6 +3211,7 @@ int tty_ioctl(struct inode * inode, struct file * file,
 			clear_bit(TTY_EXCLUSIVE, &tty->flags);
 			return 0;
 		case TIOCNOTTY:
+			/* FIXME: taks lock or tty_mutex ? */
 			if (current->signal->tty != tty)
 				return -ENOTTY;
 			if (current->signal->leader)
@@ -2753,9 +3392,16 @@ void do_SAK(struct tty_struct *tty)
 
 EXPORT_SYMBOL(do_SAK);
 
-/*
- * This routine is called out of the software interrupt to flush data
- * from the buffer chain to the line discipline.
+/**
+ *	flush_to_ldisc
+ *	@private_: tty structure passed from work queue.
+ *
+ *	This routine is called out of the software interrupt to flush data
+ *	from the buffer chain to the line discipline.
+ *
+ *	Locking: holds tty->buf.lock to guard buffer list. Drops the lock
+ *	while invoking the line discipline receive_buf method. The
+ *	receive_buf method is single threaded for each tty instance.
  */
  
 static void flush_to_ldisc(void *private_)
@@ -2831,6 +3477,8 @@ static int n_baud_table = ARRAY_SIZE(baud_table);
  *	Convert termios baud rate data into a speed. This should be called
  *	with the termios lock held if this termios is a terminal termios
  *	structure. May change the termios data.
+ *
+ *	Locking: none
  */
  
 int tty_termios_baud_rate(struct termios *termios)
@@ -2859,6 +3507,8 @@ EXPORT_SYMBOL(tty_termios_baud_rate);
  *	Returns the baud rate as an integer for this terminal. The
  *	termios lock must be held by the caller and the terminal bit
  *	flags may be updated.
+ *
+ *	Locking: none
  */
  
 int tty_get_baud_rate(struct tty_struct *tty)
@@ -2888,6 +3538,8 @@ EXPORT_SYMBOL(tty_get_baud_rate);
  *
  *	In the event of the queue being busy for flipping the work will be
  *	held off and retried later.
+ *
+ *	Locking: tty buffer lock. Driver locks in low latency mode.
  */
 
 void tty_flip_buffer_push(struct tty_struct *tty)
@@ -2907,9 +3559,16 @@ void tty_flip_buffer_push(struct tty_struct *tty)
 EXPORT_SYMBOL(tty_flip_buffer_push);
 
 
-/*
- * This subroutine initializes a tty structure.
+/**
+ *	initialize_tty_struct
+ *	@tty: tty to initialize
+ *
+ *	This subroutine initializes a tty structure that has been newly
+ *	allocated.
+ *
+ *	Locking: none - tty in question must not be exposed at this point
  */
+
 static void initialize_tty_struct(struct tty_struct *tty)
 {
 	memset(tty, 0, sizeof(struct tty_struct));
@@ -2935,6 +3594,7 @@ static void initialize_tty_struct(struct tty_struct *tty)
 /*
  * The default put_char routine if the driver did not define one.
  */
+
 static void tty_default_put_char(struct tty_struct *tty, unsigned char ch)
 {
 	tty->driver->write(tty, &ch, 1);
@@ -2943,19 +3603,23 @@ static void tty_default_put_char(struct tty_struct *tty, unsigned char ch)
 static struct class *tty_class;
 
 /**
- * tty_register_device - register a tty device
- * @driver: the tty driver that describes the tty device
- * @index: the index in the tty driver for this tty device
- * @device: a struct device that is associated with this tty device.
- *	This field is optional, if there is no known struct device for this
- *	tty device it can be set to NULL safely.
+ *	tty_register_device - register a tty device
+ *	@driver: the tty driver that describes the tty device
+ *	@index: the index in the tty driver for this tty device
+ *	@device: a struct device that is associated with this tty device.
+ *		This field is optional, if there is no known struct device
+ *		for this tty device it can be set to NULL safely.
  *
- * Returns a pointer to the class device (or ERR_PTR(-EFOO) on error).
+ *	Returns a pointer to the class device (or ERR_PTR(-EFOO) on error).
  *
- * This call is required to be made to register an individual tty device if
- * the tty driver's flags have the TTY_DRIVER_DYNAMIC_DEV bit set.  If that
- * bit is not set, this function should not be called by a tty driver.
+ *	This call is required to be made to register an individual tty device
+ *	if the tty driver's flags have the TTY_DRIVER_DYNAMIC_DEV bit set.  If
+ *	that bit is not set, this function should not be called by a tty
+ *	driver.
+ *
+ *	Locking: ??
  */
+
 struct class_device *tty_register_device(struct tty_driver *driver,
 					 unsigned index, struct device *device)
 {
@@ -2977,13 +3641,16 @@ struct class_device *tty_register_device(struct tty_driver *driver,
 }
 
 /**
- * tty_unregister_device - unregister a tty device
- * @driver: the tty driver that describes the tty device
- * @index: the index in the tty driver for this tty device
+ * 	tty_unregister_device - unregister a tty device
+ * 	@driver: the tty driver that describes the tty device
+ * 	@index: the index in the tty driver for this tty device
  *
- * If a tty device is registered with a call to tty_register_device() then
- * this function must be made when the tty device is gone.
+ * 	If a tty device is registered with a call to tty_register_device() then
+ *	this function must be called when the tty device is gone.
+ *
+ *	Locking: ??
  */
+
 void tty_unregister_device(struct tty_driver *driver, unsigned index)
 {
 	class_device_destroy(tty_class, MKDEV(driver->major, driver->minor_start) + index);
@@ -3094,7 +3761,6 @@ int tty_register_driver(struct tty_driver *driver)
 	driver->cdev.owner = driver->owner;
 	error = cdev_add(&driver->cdev, dev, driver->num);
 	if (error) {
-		cdev_del(&driver->cdev);
 		unregister_chrdev_region(dev, driver->num);
 		driver->ttys = NULL;
 		driver->termios = driver->termios_locked = NULL;
diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c
index f19cf9d7792..4ad47d321bd 100644
--- a/drivers/char/tty_ioctl.c
+++ b/drivers/char/tty_ioctl.c
@@ -36,6 +36,18 @@
 #define TERMIOS_WAIT	2
 #define TERMIOS_TERMIO	4
 
+
+/**
+ *	tty_wait_until_sent	-	wait for I/O to finish
+ *	@tty: tty we are waiting for
+ *	@timeout: how long we will wait
+ *
+ *	Wait for characters pending in a tty driver to hit the wire, or
+ *	for a timeout to occur (eg due to flow control)
+ *
+ *	Locking: none
+ */
+
 void tty_wait_until_sent(struct tty_struct * tty, long timeout)
 {
 	DECLARE_WAITQUEUE(wait, current);
@@ -94,6 +106,18 @@ static void unset_locked_termios(struct termios *termios,
 			old->c_cc[i] : termios->c_cc[i];
 }
 
+/**
+ *	change_termios		-	update termios values
+ *	@tty: tty to update
+ *	@new_termios: desired new value
+ *
+ *	Perform updates to the termios values set on this terminal. There
+ *	is a bit of layering violation here with n_tty in terms of the
+ *	internal knowledge of this function.
+ *
+ *	Locking: termios_sem
+ */
+
 static void change_termios(struct tty_struct * tty, struct termios * new_termios)
 {
 	int canon_change;
@@ -155,6 +179,19 @@ static void change_termios(struct tty_struct * tty, struct termios * new_termios
 	up(&tty->termios_sem);
 }
 
+/**
+ *	set_termios		-	set termios values for a tty
+ *	@tty: terminal device
+ *	@arg: user data
+ *	@opt: option information
+ *
+ *	Helper function to prepare termios data and run neccessary other
+ *	functions before using change_termios to do the actual changes.
+ *
+ *	Locking:
+ *		Called functions take ldisc and termios_sem locks
+ */
+
 static int set_termios(struct tty_struct * tty, void __user *arg, int opt)
 {
 	struct termios tmp_termios;
@@ -284,6 +321,17 @@ static void set_sgflags(struct termios * termios, int flags)
 	}
 }
 
+/**
+ *	set_sgttyb		-	set legacy terminal values
+ *	@tty: tty structure
+ *	@sgttyb: pointer to old style terminal structure
+ *
+ *	Updates a terminal from the legacy BSD style terminal information
+ *	structure.
+ *
+ *	Locking: termios_sem
+ */
+
 static int set_sgttyb(struct tty_struct * tty, struct sgttyb __user * sgttyb)
 {
 	int retval;
@@ -369,9 +417,16 @@ static int set_ltchars(struct tty_struct * tty, struct ltchars __user * ltchars)
 }
 #endif
 
-/*
- * Send a high priority character to the tty.
+/**
+ *	send_prio_char		-	send priority character
+ *
+ *	Send a high priority character to the tty even if stopped
+ *
+ *	Locking: none
+ *
+ *	FIXME: overlapping calls with start/stop tty lose state of tty
  */
+
 static void send_prio_char(struct tty_struct *tty, char ch)
 {
 	int	was_stopped = tty->stopped;
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index eccffaf26fa..a5628a8b662 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -1011,6 +1011,8 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		   return -EPERM;
 		vt_dont_switch = 0;
 		return 0;
+	case VT_GETHIFONTMASK:
+		return put_user(vc->vc_hi_font_mask, (unsigned short __user *)arg);
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index d4bad6704bb..448df277337 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -3552,6 +3552,8 @@ static int ohci1394_pci_resume (struct pci_dev *pdev)
 
 static int ohci1394_pci_suspend (struct pci_dev *pdev, pm_message_t state)
 {
+	pci_save_state(pdev);
+
 #ifdef CONFIG_PPC_PMAC
 	if (machine_is(powermac)) {
 		struct device_node *of_node;
@@ -3563,8 +3565,6 @@ static int ohci1394_pci_suspend (struct pci_dev *pdev, pm_message_t state)
 	}
 #endif
 
-	pci_save_state(pdev);
-
 	return 0;
 }
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index be48cedf986..c54de989eb0 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -255,7 +255,9 @@ static struct region *__rh_alloc(struct region_hash *rh, region_t region)
 	struct region *reg, *nreg;
 
 	read_unlock(&rh->hash_lock);
-	nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
+	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
+	if (unlikely(!nreg))
+		nreg = kmalloc(sizeof(struct region), GFP_NOIO);
 	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
 		RH_CLEAN : RH_NOSYNC;
 	nreg->rh = rh;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b6d16022a53..8dbab2ef388 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1597,6 +1597,19 @@ void md_update_sb(mddev_t * mddev)
 
 repeat:
 	spin_lock_irq(&mddev->write_lock);
+
+	if (mddev->degraded && mddev->sb_dirty == 3)
+		/* If the array is degraded, then skipping spares is both
+		 * dangerous and fairly pointless.
+		 * Dangerous because a device that was removed from the array
+		 * might have a event_count that still looks up-to-date,
+		 * so it can be re-added without a resync.
+		 * Pointless because if there are any spares to skip,
+		 * then a recovery will happen and soon that array won't
+		 * be degraded any more and the spare can go back to sleep then.
+		 */
+		mddev->sb_dirty = 1;
+
 	sync_req = mddev->in_sync;
 	mddev->utime = get_seconds();
 	if (mddev->sb_dirty == 3)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1efe22a2d04..87bfe9e7d8c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1625,15 +1625,16 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return 0;
 	}
 
-	/* before building a request, check if we can skip these blocks..
-	 * This call the bitmap_start_sync doesn't actually record anything
-	 */
 	if (mddev->bitmap == NULL &&
 	    mddev->recovery_cp == MaxSector &&
+	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
 	    conf->fullsync == 0) {
 		*skipped = 1;
 		return max_sector - sector_nr;
 	}
+	/* before building a request, check if we can skip these blocks..
+	 * This call the bitmap_start_sync doesn't actually record anything
+	 */
 	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
 	    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		/* We can skip this block, and probably several more */
diff --git a/drivers/mtd/nand/ams-delta.c b/drivers/mtd/nand/ams-delta.c
index d7897dc6b3c..a0ba07c36ee 100644
--- a/drivers/mtd/nand/ams-delta.c
+++ b/drivers/mtd/nand/ams-delta.c
@@ -130,11 +130,13 @@ static void ams_delta_hwcontrol(struct mtd_info *mtd, int cmd,
 	if (ctrl & NAND_CTRL_CHANGE) {
 		unsigned long bits;
 
-		bits = (~ctrl & NAND_NCE) << 2;
-		bits |= (ctrl & NAND_CLE) << 7;
-		bits |= (ctrl & NAND_ALE) << 6;
+		bits = (~ctrl & NAND_NCE) ? AMS_DELTA_LATCH2_NAND_NCE : 0;
+		bits |= (ctrl & NAND_CLE) ? AMS_DELTA_LATCH2_NAND_CLE : 0;
+		bits |= (ctrl & NAND_ALE) ? AMS_DELTA_LATCH2_NAND_ALE : 0;
 
-		ams_delta_latch2_write(0xC2, bits);
+		ams_delta_latch2_write(AMS_DELTA_LATCH2_NAND_CLE |
+				AMS_DELTA_LATCH2_NAND_ALE |
+				AMS_DELTA_LATCH2_NAND_NCE, bits);
 	}
 
 	if (cmd != NAND_CMD_NONE)
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 62b861304e0..c8cbc00243f 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -1093,9 +1093,10 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len,
 
 	ret = nand_do_read_ops(mtd, from, &chip->ops);
 
+	*retlen = chip->ops.retlen;
+
 	nand_release_device(mtd);
 
-	*retlen = chip->ops.retlen;
 	return ret;
 }
 
@@ -1691,9 +1692,10 @@ static int nand_write(struct mtd_info *mtd, loff_t to, size_t len,
 
 	ret = nand_do_write_ops(mtd, to, &chip->ops);
 
+	*retlen = chip->ops.retlen;
+
 	nand_release_device(mtd);
 
-	*retlen = chip->ops.retlen;
 	return ret;
 }
 
diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index d6d1bff52b8..2c7de79c83b 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -69,12 +69,12 @@ static void s3c_rtc_setaie(int to)
 
 	pr_debug("%s: aie=%d\n", __FUNCTION__, to);
 
-	tmp = readb(S3C2410_RTCALM) & ~S3C2410_RTCALM_ALMEN;
+	tmp = readb(s3c_rtc_base + S3C2410_RTCALM) & ~S3C2410_RTCALM_ALMEN;
 
 	if (to)
 		tmp |= S3C2410_RTCALM_ALMEN;
 
-	writeb(tmp, S3C2410_RTCALM);
+	writeb(tmp, s3c_rtc_base + S3C2410_RTCALM);
 }
 
 static void s3c_rtc_setpie(int to)
@@ -84,12 +84,12 @@ static void s3c_rtc_setpie(int to)
 	pr_debug("%s: pie=%d\n", __FUNCTION__, to);
 
 	spin_lock_irq(&s3c_rtc_pie_lock);
-	tmp = readb(S3C2410_TICNT) & ~S3C2410_TICNT_ENABLE;
+	tmp = readb(s3c_rtc_base + S3C2410_TICNT) & ~S3C2410_TICNT_ENABLE;
 
 	if (to)
 		tmp |= S3C2410_TICNT_ENABLE;
 
-	writeb(tmp, S3C2410_TICNT);
+	writeb(tmp, s3c_rtc_base + S3C2410_TICNT);
 	spin_unlock_irq(&s3c_rtc_pie_lock);
 }
 
@@ -98,13 +98,13 @@ static void s3c_rtc_setfreq(int freq)
 	unsigned int tmp;
 
 	spin_lock_irq(&s3c_rtc_pie_lock);
-	tmp = readb(S3C2410_TICNT) & S3C2410_TICNT_ENABLE;
+	tmp = readb(s3c_rtc_base + S3C2410_TICNT) & S3C2410_TICNT_ENABLE;
 
 	s3c_rtc_freq = freq;
 
 	tmp |= (128 / freq)-1;
 
-	writeb(tmp, S3C2410_TICNT);
+	writeb(tmp, s3c_rtc_base + S3C2410_TICNT);
 	spin_unlock_irq(&s3c_rtc_pie_lock);
 }
 
@@ -113,14 +113,15 @@ static void s3c_rtc_setfreq(int freq)
 static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
 {
 	unsigned int have_retried = 0;
+	void __iomem *base = s3c_rtc_base;
 
  retry_get_time:
-	rtc_tm->tm_min  = readb(S3C2410_RTCMIN);
-	rtc_tm->tm_hour = readb(S3C2410_RTCHOUR);
-	rtc_tm->tm_mday = readb(S3C2410_RTCDATE);
-	rtc_tm->tm_mon  = readb(S3C2410_RTCMON);
-	rtc_tm->tm_year = readb(S3C2410_RTCYEAR);
-	rtc_tm->tm_sec  = readb(S3C2410_RTCSEC);
+	rtc_tm->tm_min  = readb(base + S3C2410_RTCMIN);
+	rtc_tm->tm_hour = readb(base + S3C2410_RTCHOUR);
+	rtc_tm->tm_mday = readb(base + S3C2410_RTCDATE);
+	rtc_tm->tm_mon  = readb(base + S3C2410_RTCMON);
+	rtc_tm->tm_year = readb(base + S3C2410_RTCYEAR);
+	rtc_tm->tm_sec  = readb(base + S3C2410_RTCSEC);
 
 	/* the only way to work out wether the system was mid-update
 	 * when we read it is to check the second counter, and if it
@@ -151,17 +152,26 @@ static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
 
 static int s3c_rtc_settime(struct device *dev, struct rtc_time *tm)
 {
-	/* the rtc gets round the y2k problem by just not supporting it */
+	void __iomem *base = s3c_rtc_base;
+	int year = tm->tm_year - 100;
 
-	if (tm->tm_year < 100)
+	pr_debug("set time %02d.%02d.%02d %02d/%02d/%02d\n",
+		 tm->tm_year, tm->tm_mon, tm->tm_mday,
+		 tm->tm_hour, tm->tm_min, tm->tm_sec);
+
+	/* we get around y2k by simply not supporting it */
+
+	if (year < 0 || year >= 100) {
+		dev_err(dev, "rtc only supports 100 years\n");
 		return -EINVAL;
+	}
 
-	writeb(BIN2BCD(tm->tm_sec),  S3C2410_RTCSEC);
-	writeb(BIN2BCD(tm->tm_min),  S3C2410_RTCMIN);
-	writeb(BIN2BCD(tm->tm_hour), S3C2410_RTCHOUR);
-	writeb(BIN2BCD(tm->tm_mday), S3C2410_RTCDATE);
-	writeb(BIN2BCD(tm->tm_mon + 1), S3C2410_RTCMON);
-	writeb(BIN2BCD(tm->tm_year - 100), S3C2410_RTCYEAR);
+	writeb(BIN2BCD(tm->tm_sec),  base + S3C2410_RTCSEC);
+	writeb(BIN2BCD(tm->tm_min),  base + S3C2410_RTCMIN);
+	writeb(BIN2BCD(tm->tm_hour), base + S3C2410_RTCHOUR);
+	writeb(BIN2BCD(tm->tm_mday), base + S3C2410_RTCDATE);
+	writeb(BIN2BCD(tm->tm_mon + 1), base + S3C2410_RTCMON);
+	writeb(BIN2BCD(year), base + S3C2410_RTCYEAR);
 
 	return 0;
 }
@@ -169,16 +179,17 @@ static int s3c_rtc_settime(struct device *dev, struct rtc_time *tm)
 static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
 	struct rtc_time *alm_tm = &alrm->time;
+	void __iomem *base = s3c_rtc_base;
 	unsigned int alm_en;
 
-	alm_tm->tm_sec  = readb(S3C2410_ALMSEC);
-	alm_tm->tm_min  = readb(S3C2410_ALMMIN);
-	alm_tm->tm_hour = readb(S3C2410_ALMHOUR);
-	alm_tm->tm_mon  = readb(S3C2410_ALMMON);
-	alm_tm->tm_mday = readb(S3C2410_ALMDATE);
-	alm_tm->tm_year = readb(S3C2410_ALMYEAR);
+	alm_tm->tm_sec  = readb(base + S3C2410_ALMSEC);
+	alm_tm->tm_min  = readb(base + S3C2410_ALMMIN);
+	alm_tm->tm_hour = readb(base + S3C2410_ALMHOUR);
+	alm_tm->tm_mon  = readb(base + S3C2410_ALMMON);
+	alm_tm->tm_mday = readb(base + S3C2410_ALMDATE);
+	alm_tm->tm_year = readb(base + S3C2410_ALMYEAR);
 
-	alm_en = readb(S3C2410_RTCALM);
+	alm_en = readb(base + S3C2410_RTCALM);
 
 	pr_debug("read alarm %02x %02x.%02x.%02x %02x/%02x/%02x\n",
 		 alm_en,
@@ -226,6 +237,7 @@ static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
 static int s3c_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 {
 	struct rtc_time *tm = &alrm->time;
+	void __iomem *base = s3c_rtc_base;
 	unsigned int alrm_en;
 
 	pr_debug("s3c_rtc_setalarm: %d, %02x/%02x/%02x %02x.%02x.%02x\n",
@@ -234,32 +246,32 @@ static int s3c_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 		 tm->tm_hour & 0xff, tm->tm_min & 0xff, tm->tm_sec);
 
 
-	alrm_en = readb(S3C2410_RTCALM) & S3C2410_RTCALM_ALMEN;
-	writeb(0x00, S3C2410_RTCALM);
+	alrm_en = readb(base + S3C2410_RTCALM) & S3C2410_RTCALM_ALMEN;
+	writeb(0x00, base + S3C2410_RTCALM);
 
 	if (tm->tm_sec < 60 && tm->tm_sec >= 0) {
 		alrm_en |= S3C2410_RTCALM_SECEN;
-		writeb(BIN2BCD(tm->tm_sec), S3C2410_ALMSEC);
+		writeb(BIN2BCD(tm->tm_sec), base + S3C2410_ALMSEC);
 	}
 
 	if (tm->tm_min < 60 && tm->tm_min >= 0) {
 		alrm_en |= S3C2410_RTCALM_MINEN;
-		writeb(BIN2BCD(tm->tm_min), S3C2410_ALMMIN);
+		writeb(BIN2BCD(tm->tm_min), base + S3C2410_ALMMIN);
 	}
 
 	if (tm->tm_hour < 24 && tm->tm_hour >= 0) {
 		alrm_en |= S3C2410_RTCALM_HOUREN;
-		writeb(BIN2BCD(tm->tm_hour), S3C2410_ALMHOUR);
+		writeb(BIN2BCD(tm->tm_hour), base + S3C2410_ALMHOUR);
 	}
 
 	pr_debug("setting S3C2410_RTCALM to %08x\n", alrm_en);
 
-	writeb(alrm_en, S3C2410_RTCALM);
+	writeb(alrm_en, base + S3C2410_RTCALM);
 
 	if (0) {
-		alrm_en = readb(S3C2410_RTCALM);
+		alrm_en = readb(base + S3C2410_RTCALM);
 		alrm_en &= ~S3C2410_RTCALM_ALMEN;
-		writeb(alrm_en, S3C2410_RTCALM);
+		writeb(alrm_en, base + S3C2410_RTCALM);
 		disable_irq_wake(s3c_rtc_alarmno);
 	}
 
@@ -319,8 +331,8 @@ static int s3c_rtc_ioctl(struct device *dev,
 
 static int s3c_rtc_proc(struct device *dev, struct seq_file *seq)
 {
-	unsigned int rtcalm = readb(S3C2410_RTCALM);
-	unsigned int ticnt = readb (S3C2410_TICNT);
+	unsigned int rtcalm = readb(s3c_rtc_base + S3C2410_RTCALM);
+	unsigned int ticnt = readb(s3c_rtc_base + S3C2410_TICNT);
 
 	seq_printf(seq, "alarm_IRQ\t: %s\n",
 		   (rtcalm & S3C2410_RTCALM_ALMEN) ? "yes" : "no" );
@@ -387,39 +399,40 @@ static struct rtc_class_ops s3c_rtcops = {
 
 static void s3c_rtc_enable(struct platform_device *pdev, int en)
 {
+	void __iomem *base = s3c_rtc_base;
 	unsigned int tmp;
 
 	if (s3c_rtc_base == NULL)
 		return;
 
 	if (!en) {
-		tmp = readb(S3C2410_RTCCON);
-		writeb(tmp & ~S3C2410_RTCCON_RTCEN, S3C2410_RTCCON);
+		tmp = readb(base + S3C2410_RTCCON);
+		writeb(tmp & ~S3C2410_RTCCON_RTCEN, base + S3C2410_RTCCON);
 
-		tmp = readb(S3C2410_TICNT);
-		writeb(tmp & ~S3C2410_TICNT_ENABLE, S3C2410_TICNT);
+		tmp = readb(base + S3C2410_TICNT);
+		writeb(tmp & ~S3C2410_TICNT_ENABLE, base + S3C2410_TICNT);
 	} else {
 		/* re-enable the device, and check it is ok */
 
-		if ((readb(S3C2410_RTCCON) & S3C2410_RTCCON_RTCEN) == 0){
+		if ((readb(base+S3C2410_RTCCON) & S3C2410_RTCCON_RTCEN) == 0){
 			dev_info(&pdev->dev, "rtc disabled, re-enabling\n");
 
-			tmp = readb(S3C2410_RTCCON);
-			writeb(tmp | S3C2410_RTCCON_RTCEN , S3C2410_RTCCON);
+			tmp = readb(base + S3C2410_RTCCON);
+			writeb(tmp|S3C2410_RTCCON_RTCEN, base+S3C2410_RTCCON);
 		}
 
-		if ((readb(S3C2410_RTCCON) & S3C2410_RTCCON_CNTSEL)){
+		if ((readb(base + S3C2410_RTCCON) & S3C2410_RTCCON_CNTSEL)){
 			dev_info(&pdev->dev, "removing RTCCON_CNTSEL\n");
 
-			tmp = readb(S3C2410_RTCCON);
-			writeb(tmp& ~S3C2410_RTCCON_CNTSEL , S3C2410_RTCCON);
+			tmp = readb(base + S3C2410_RTCCON);
+			writeb(tmp& ~S3C2410_RTCCON_CNTSEL, base+S3C2410_RTCCON);
 		}
 
-		if ((readb(S3C2410_RTCCON) & S3C2410_RTCCON_CLKRST)){
+		if ((readb(base + S3C2410_RTCCON) & S3C2410_RTCCON_CLKRST)){
 			dev_info(&pdev->dev, "removing RTCCON_CLKRST\n");
 
-			tmp = readb(S3C2410_RTCCON);
-			writeb(tmp & ~S3C2410_RTCCON_CLKRST, S3C2410_RTCCON);
+			tmp = readb(base + S3C2410_RTCCON);
+			writeb(tmp & ~S3C2410_RTCCON_CLKRST, base+S3C2410_RTCCON);
 		}
 	}
 }
@@ -475,8 +488,8 @@ static int s3c_rtc_probe(struct platform_device *pdev)
 	}
 
 	s3c_rtc_mem = request_mem_region(res->start,
-					     res->end-res->start+1,
-					     pdev->name);
+					 res->end-res->start+1,
+					 pdev->name);
 
 	if (s3c_rtc_mem == NULL) {
 		dev_err(&pdev->dev, "failed to reserve memory region\n");
@@ -495,7 +508,8 @@ static int s3c_rtc_probe(struct platform_device *pdev)
 
 	s3c_rtc_enable(pdev, 1);
 
- 	pr_debug("s3c2410_rtc: RTCCON=%02x\n", readb(S3C2410_RTCCON));
+ 	pr_debug("s3c2410_rtc: RTCCON=%02x\n",
+		 readb(s3c_rtc_base + S3C2410_RTCCON));
 
 	s3c_rtc_setfreq(s3c_rtc_freq);
 
@@ -543,7 +557,7 @@ static int s3c_rtc_suspend(struct platform_device *pdev, pm_message_t state)
 
 	/* save TICNT for anyone using periodic interrupts */
 
-	ticnt_save = readb(S3C2410_TICNT);
+	ticnt_save = readb(s3c_rtc_base + S3C2410_TICNT);
 
 	/* calculate time delta for suspend */
 
@@ -567,7 +581,7 @@ static int s3c_rtc_resume(struct platform_device *pdev)
 	rtc_tm_to_time(&tm, &time.tv_sec);
 	restore_time_delta(&s3c_rtc_delta, &time);
 
-	writeb(ticnt_save, S3C2410_TICNT);
+	writeb(ticnt_save, s3c_rtc_base + S3C2410_TICNT);
 	return 0;
 }
 #else
diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c
index f7b5d7372d2..94d1de55607 100644
--- a/drivers/scsi/ide-scsi.c
+++ b/drivers/scsi/ide-scsi.c
@@ -517,7 +517,7 @@ static ide_startstop_t idescsi_pc_intr (ide_drive_t *drive)
 		/* No more interrupts */
 		if (test_bit(IDESCSI_LOG_CMD, &scsi->log))
 			printk (KERN_INFO "Packet command completed, %d bytes transferred\n", pc->actually_transferred);
-		local_irq_enable();
+		local_irq_enable_in_hardirq();
 		if (status.b.check)
 			rq->errors++;
 		idescsi_end_request (drive, 1, 0);
diff --git a/drivers/video/imacfb.c b/drivers/video/imacfb.c
index b485bece5fc..18ea4a54910 100644
--- a/drivers/video/imacfb.c
+++ b/drivers/video/imacfb.c
@@ -71,10 +71,10 @@ static int set_system(struct dmi_system_id *id)
 static struct dmi_system_id __initdata dmi_system_table[] = {
 	{ set_system, "iMac4,1", {
 	  DMI_MATCH(DMI_BIOS_VENDOR,"Apple Computer, Inc."),
-	  DMI_MATCH(DMI_BIOS_VERSION,"iMac4,1") }, (void*)M_I17},
+	  DMI_MATCH(DMI_PRODUCT_NAME,"iMac4,1") }, (void*)M_I17},
 	{ set_system, "MacBookPro1,1", {
 	  DMI_MATCH(DMI_BIOS_VENDOR,"Apple Computer, Inc."),
-	  DMI_MATCH(DMI_BIOS_VERSION,"MacBookPro1,1") }, (void*)M_I17},
+	  DMI_MATCH(DMI_PRODUCT_NAME,"MacBookPro1,1") }, (void*)M_I17},
 	{ set_system, "MacBook1,1", {
 	  DMI_MATCH(DMI_BIOS_VENDOR,"Apple Computer, Inc."),
 	  DMI_MATCH(DMI_PRODUCT_NAME,"MacBook1,1")}, (void *)M_MACBOOK},
diff --git a/drivers/video/matrox/g450_pll.c b/drivers/video/matrox/g450_pll.c
index 440272ad10e..7c76e079ca7 100644
--- a/drivers/video/matrox/g450_pll.c
+++ b/drivers/video/matrox/g450_pll.c
@@ -331,7 +331,15 @@ static int __g450_setclk(WPMINFO unsigned int fout, unsigned int pll,
 					tmp |= M1064_XPIXCLKCTRL_PLL_UP;
 				}
 				matroxfb_DAC_out(PMINFO M1064_XPIXCLKCTRL, tmp);
+#ifdef __powerpc__
+				/* This is necessary to avoid jitter on PowerPC
+				 * (OpenFirmware) systems, but apparently
+				 * introduces jitter, at least on a x86-64
+				 * using DVI.
+				 * A simple workaround is disable for non-PPC.
+				 */
 				matroxfb_DAC_out(PMINFO M1064_XDVICLKCTRL, 0);
+#endif /* __powerpc__ */
 				matroxfb_DAC_out(PMINFO M1064_XPWRCTRL, xpwrctrl);
 
 				matroxfb_DAC_unlock_irqrestore(flags);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 37534573960..045f98854f1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -884,6 +884,61 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
+static int __blkdev_put(struct block_device *bdev, unsigned int subclass)
+{
+	int ret = 0;
+	struct inode *bd_inode = bdev->bd_inode;
+	struct gendisk *disk = bdev->bd_disk;
+
+	mutex_lock_nested(&bdev->bd_mutex, subclass);
+	lock_kernel();
+	if (!--bdev->bd_openers) {
+		sync_blockdev(bdev);
+		kill_bdev(bdev);
+	}
+	if (bdev->bd_contains == bdev) {
+		if (disk->fops->release)
+			ret = disk->fops->release(bd_inode, NULL);
+	} else {
+		mutex_lock_nested(&bdev->bd_contains->bd_mutex,
+				  subclass + 1);
+		bdev->bd_contains->bd_part_count--;
+		mutex_unlock(&bdev->bd_contains->bd_mutex);
+	}
+	if (!bdev->bd_openers) {
+		struct module *owner = disk->fops->owner;
+
+		put_disk(disk);
+		module_put(owner);
+
+		if (bdev->bd_contains != bdev) {
+			kobject_put(&bdev->bd_part->kobj);
+			bdev->bd_part = NULL;
+		}
+		bdev->bd_disk = NULL;
+		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+		if (bdev != bdev->bd_contains)
+			__blkdev_put(bdev->bd_contains, subclass + 1);
+		bdev->bd_contains = NULL;
+	}
+	unlock_kernel();
+	mutex_unlock(&bdev->bd_mutex);
+	bdput(bdev);
+	return ret;
+}
+
+int blkdev_put(struct block_device *bdev)
+{
+	return __blkdev_put(bdev, BD_MUTEX_NORMAL);
+}
+EXPORT_SYMBOL(blkdev_put);
+
+int blkdev_put_partition(struct block_device *bdev)
+{
+	return __blkdev_put(bdev, BD_MUTEX_PARTITION);
+}
+EXPORT_SYMBOL(blkdev_put_partition);
+
 static int
 blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags);
 
@@ -980,7 +1035,7 @@ out_first:
 	bdev->bd_disk = NULL;
 	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
 	if (bdev != bdev->bd_contains)
-		blkdev_put(bdev->bd_contains);
+		__blkdev_put(bdev->bd_contains, BD_MUTEX_WHOLE);
 	bdev->bd_contains = NULL;
 	put_disk(disk);
 	module_put(owner);
@@ -1079,63 +1134,6 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	return res;
 }
 
-static int __blkdev_put(struct block_device *bdev, unsigned int subclass)
-{
-	int ret = 0;
-	struct inode *bd_inode = bdev->bd_inode;
-	struct gendisk *disk = bdev->bd_disk;
-
-	mutex_lock_nested(&bdev->bd_mutex, subclass);
-	lock_kernel();
-	if (!--bdev->bd_openers) {
-		sync_blockdev(bdev);
-		kill_bdev(bdev);
-	}
-	if (bdev->bd_contains == bdev) {
-		if (disk->fops->release)
-			ret = disk->fops->release(bd_inode, NULL);
-	} else {
-		mutex_lock_nested(&bdev->bd_contains->bd_mutex,
-				  subclass + 1);
-		bdev->bd_contains->bd_part_count--;
-		mutex_unlock(&bdev->bd_contains->bd_mutex);
-	}
-	if (!bdev->bd_openers) {
-		struct module *owner = disk->fops->owner;
-
-		put_disk(disk);
-		module_put(owner);
-
-		if (bdev->bd_contains != bdev) {
-			kobject_put(&bdev->bd_part->kobj);
-			bdev->bd_part = NULL;
-		}
-		bdev->bd_disk = NULL;
-		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
-		if (bdev != bdev->bd_contains)
-			__blkdev_put(bdev->bd_contains, subclass + 1);
-		bdev->bd_contains = NULL;
-	}
-	unlock_kernel();
-	mutex_unlock(&bdev->bd_mutex);
-	bdput(bdev);
-	return ret;
-}
-
-int blkdev_put(struct block_device *bdev)
-{
-	return __blkdev_put(bdev, BD_MUTEX_NORMAL);
-}
-
-EXPORT_SYMBOL(blkdev_put);
-
-int blkdev_put_partition(struct block_device *bdev)
-{
-	return __blkdev_put(bdev, BD_MUTEX_PARTITION);
-}
-
-EXPORT_SYMBOL(blkdev_put_partition);
-
 static int blkdev_close(struct inode * inode, struct file * filp)
 {
 	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 19ffb043abb..3a3567433b9 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1168,7 +1168,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
 eexit_1:
 
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
-		     current, ep, epi->file, error));
+		     current, ep, epi->ffd.file, error));
 
 	return error;
 }
@@ -1236,7 +1236,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	struct eventpoll *ep = epi->ep;
 
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
-		     current, epi->file, epi, ep));
+		     current, epi->ffd.file, epi, ep));
 
 	write_lock_irqsave(&ep->lock, flags);
 
diff --git a/fs/exec.c b/fs/exec.c
index f7aabfeca03..54135df2a96 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -751,7 +751,7 @@ no_thread_group:
 
 		write_lock_irq(&tasklist_lock);
 		spin_lock(&oldsighand->siglock);
-		spin_lock(&newsighand->siglock);
+		spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING);
 
 		rcu_assign_pointer(current->sighand, newsighand);
 		recalc_sigpending();
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f2702cda977..681dea8f953 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -775,7 +775,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT2_INODE_SIZE(sb) == 0)
 		goto cantfind_ext2;
 	sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
-	if (sbi->s_inodes_per_block == 0)
+	if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0)
 		goto cantfind_ext2;
 	sbi->s_itb_per_group = sbi->s_inodes_per_group /
 					sbi->s_inodes_per_block;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index a504a40d6d2..063d994bda0 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1269,12 +1269,12 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
 		goal = le32_to_cpu(es->s_first_data_block);
 	group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
 			EXT3_BLOCKS_PER_GROUP(sb);
+	goal_group = group_no;
+retry_alloc:
 	gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
 	if (!gdp)
 		goto io_error;
 
-	goal_group = group_no;
-retry:
 	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
 	/*
 	 * if there is not enough free blocks to make a new resevation
@@ -1349,7 +1349,7 @@ retry:
 	if (my_rsv) {
 		my_rsv = NULL;
 		group_no = goal_group;
-		goto retry;
+		goto retry_alloc;
 	}
 	/* No space left on the device */
 	*errp = -ENOSPC;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 0971814c38b..42da6078431 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -261,7 +261,7 @@ void journal_commit_transaction(journal_t *journal)
 			struct buffer_head *bh = jh2bh(jh);
 
 			jbd_lock_bh_state(bh);
-			kfree(jh->b_committed_data);
+			jbd_slab_free(jh->b_committed_data, bh->b_size);
 			jh->b_committed_data = NULL;
 			jbd_unlock_bh_state(bh);
 		}
@@ -745,14 +745,14 @@ restart_loop:
 		 * Otherwise, we can just throw away the frozen data now.
 		 */
 		if (jh->b_committed_data) {
-			kfree(jh->b_committed_data);
+			jbd_slab_free(jh->b_committed_data, bh->b_size);
 			jh->b_committed_data = NULL;
 			if (jh->b_frozen_data) {
 				jh->b_committed_data = jh->b_frozen_data;
 				jh->b_frozen_data = NULL;
 			}
 		} else if (jh->b_frozen_data) {
-			kfree(jh->b_frozen_data);
+			jbd_slab_free(jh->b_frozen_data, bh->b_size);
 			jh->b_frozen_data = NULL;
 		}
 
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 8c9b28dff11..f66724ce443 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -84,6 +84,7 @@ EXPORT_SYMBOL(journal_force_commit);
 
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
+static int journal_create_jbd_slab(size_t slab_size);
 
 /*
  * Helper function used to manage commit timeouts
@@ -328,10 +329,10 @@ repeat:
 		char *tmp;
 
 		jbd_unlock_bh_state(bh_in);
-		tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS);
+		tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS);
 		jbd_lock_bh_state(bh_in);
 		if (jh_in->b_frozen_data) {
-			kfree(tmp);
+			jbd_slab_free(tmp, bh_in->b_size);
 			goto repeat;
 		}
 
@@ -1069,17 +1070,17 @@ static int load_superblock(journal_t *journal)
 int journal_load(journal_t *journal)
 {
 	int err;
+	journal_superblock_t *sb;
 
 	err = load_superblock(journal);
 	if (err)
 		return err;
 
+	sb = journal->j_superblock;
 	/* If this is a V2 superblock, then we have to check the
 	 * features flags on it. */
 
 	if (journal->j_format_version >= 2) {
-		journal_superblock_t *sb = journal->j_superblock;
-
 		if ((sb->s_feature_ro_compat &
 		     ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
 		    (sb->s_feature_incompat &
@@ -1090,6 +1091,13 @@ int journal_load(journal_t *journal)
 		}
 	}
 
+	/*
+	 * Create a slab for this blocksize
+	 */
+	err = journal_create_jbd_slab(cpu_to_be32(sb->s_blocksize));
+	if (err)
+		return err;
+
 	/* Let the recovery code check whether it needs to recover any
 	 * data from the journal. */
 	if (journal_recover(journal))
@@ -1612,6 +1620,77 @@ void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
 }
 
 /*
+ * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
+ * and allocate frozen and commit buffers from these slabs.
+ *
+ * Reason for doing this is to avoid, SLAB_DEBUG - since it could
+ * cause bh to cross page boundary.
+ */
+
+#define JBD_MAX_SLABS 5
+#define JBD_SLAB_INDEX(size)  (size >> 11)
+
+static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
+static const char *jbd_slab_names[JBD_MAX_SLABS] = {
+	"jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k"
+};
+
+static void journal_destroy_jbd_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < JBD_MAX_SLABS; i++) {
+		if (jbd_slab[i])
+			kmem_cache_destroy(jbd_slab[i]);
+		jbd_slab[i] = NULL;
+	}
+}
+
+static int journal_create_jbd_slab(size_t slab_size)
+{
+	int i = JBD_SLAB_INDEX(slab_size);
+
+	BUG_ON(i >= JBD_MAX_SLABS);
+
+	/*
+	 * Check if we already have a slab created for this size
+	 */
+	if (jbd_slab[i])
+		return 0;
+
+	/*
+	 * Create a slab and force alignment to be same as slabsize -
+	 * this will make sure that allocations won't cross the page
+	 * boundary.
+	 */
+	jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
+				slab_size, slab_size, 0, NULL, NULL);
+	if (!jbd_slab[i]) {
+		printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void * jbd_slab_alloc(size_t size, gfp_t flags)
+{
+	int idx;
+
+	idx = JBD_SLAB_INDEX(size);
+	BUG_ON(jbd_slab[idx] == NULL);
+	return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
+}
+
+void jbd_slab_free(void *ptr,  size_t size)
+{
+	int idx;
+
+	idx = JBD_SLAB_INDEX(size);
+	BUG_ON(jbd_slab[idx] == NULL);
+	kmem_cache_free(jbd_slab[idx], ptr);
+}
+
+/*
  * Journal_head storage management
  */
 static kmem_cache_t *journal_head_cache;
@@ -1799,13 +1878,13 @@ static void __journal_remove_journal_head(struct buffer_head *bh)
 				printk(KERN_WARNING "%s: freeing "
 						"b_frozen_data\n",
 						__FUNCTION__);
-				kfree(jh->b_frozen_data);
+				jbd_slab_free(jh->b_frozen_data, bh->b_size);
 			}
 			if (jh->b_committed_data) {
 				printk(KERN_WARNING "%s: freeing "
 						"b_committed_data\n",
 						__FUNCTION__);
-				kfree(jh->b_committed_data);
+				jbd_slab_free(jh->b_committed_data, bh->b_size);
 			}
 			bh->b_private = NULL;
 			jh->b_bh = NULL;	/* debug, really */
@@ -1961,6 +2040,7 @@ static void journal_destroy_caches(void)
 	journal_destroy_revoke_caches();
 	journal_destroy_journal_head_cache();
 	journal_destroy_handle_cache();
+	journal_destroy_jbd_slabs();
 }
 
 static int __init journal_init(void)
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 508b2ea91f4..de2e4cbbf79 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -666,8 +666,9 @@ repeat:
 			if (!frozen_buffer) {
 				JBUFFER_TRACE(jh, "allocate memory for buffer");
 				jbd_unlock_bh_state(bh);
-				frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
-							    GFP_NOFS);
+				frozen_buffer =
+					jbd_slab_alloc(jh2bh(jh)->b_size,
+							 GFP_NOFS);
 				if (!frozen_buffer) {
 					printk(KERN_EMERG
 					       "%s: OOM for frozen_buffer\n",
@@ -879,7 +880,7 @@ int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 
 repeat:
 	if (!jh->b_committed_data) {
-		committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS);
+		committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
 		if (!committed_data) {
 			printk(KERN_EMERG "%s: No memory for committed data\n",
 				__FUNCTION__);
@@ -906,7 +907,7 @@ repeat:
 out:
 	journal_put_journal_head(jh);
 	if (unlikely(committed_data))
-		kfree(committed_data);
+		jbd_slab_free(committed_data, bh->b_size);
 	return err;
 }
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 9ea91c5eeb7..330ff9fc7cf 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -204,6 +204,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
 	/*
 	 * Allocate the buffer map to keep the superblock small.
 	 */
+	if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
+		goto out_illegal_sb;
 	i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh);
 	map = kmalloc(i, GFP_KERNEL);
 	if (!map)
@@ -263,7 +265,7 @@ out_no_root:
 
 out_no_bitmap:
 	printk("MINIX-fs: bad superblock or unable to read bitmaps\n");
-    out_freemap:
+out_freemap:
 	for (i = 0; i < sbi->s_imap_blocks; i++)
 		brelse(sbi->s_imap[i]);
 	for (i = 0; i < sbi->s_zmap_blocks; i++)
@@ -276,11 +278,16 @@ out_no_map:
 		printk("MINIX-fs: can't allocate map\n");
 	goto out_release;
 
+out_illegal_sb:
+	if (!silent)
+		printk("MINIX-fs: bad superblock\n");
+	goto out_release;
+
 out_no_fs:
 	if (!silent)
 		printk("VFS: Can't find a Minix or Minix V2 filesystem "
 			"on device %s\n", s->s_id);
-    out_release:
+out_release:
 	brelse(bh);
 	goto out;
 
@@ -290,7 +297,7 @@ out_bad_hblock:
 
 out_bad_sb:
 	printk("MINIX-fs: unable to read superblock\n");
- out:
+out:
 	s->s_fs_info = NULL;
 	kfree(sbi);
 	return -EINVAL;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 9f2cfc30f9c..94215622544 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -169,7 +169,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"Mapped:       %8lu kB\n"
 		"Slab:         %8lu kB\n"
 		"PageTables:   %8lu kB\n"
-		"NFS Unstable: %8lu kB\n"
+		"NFS_Unstable: %8lu kB\n"
 		"Bounce:       %8lu kB\n"
 		"CommitLimit:  %8lu kB\n"
 		"Committed_AS: %8lu kB\n"
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 39fedaa88a0..d935fb9394e 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -424,7 +424,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf)
 	int res = -ENOTDIR;
 	if (!file->f_op || !file->f_op->readdir)
 		goto out;
-	mutex_lock(&inode->i_mutex);
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR);
 //        down(&inode->i_zombie);
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index e7c8615beb6..30c6e8a9446 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -169,18 +169,20 @@ static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh)
 
 static struct buffer_head *
 ufs_clear_frags(struct inode *inode, sector_t beg,
-		unsigned int n)
+		unsigned int n, sector_t want)
 {
-	struct buffer_head *res, *bh;
+	struct buffer_head *res = NULL, *bh;
 	sector_t end = beg + n;
 
-	res = sb_getblk(inode->i_sb, beg);
-	ufs_clear_frag(inode, res);
-	for (++beg; beg < end; ++beg) {
+	for (; beg < end; ++beg) {
 		bh = sb_getblk(inode->i_sb, beg);
 		ufs_clear_frag(inode, bh);
-		brelse(bh);
+		if (want != beg)
+			brelse(bh);
+		else
+			res = bh;
 	}
+	BUG_ON(!res);
 	return res;
 }
 
@@ -265,7 +267,9 @@ repeat:
 			lastfrag = ufsi->i_lastfrag;
 			
 		}
-		goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb;
+		tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]);
+		if (tmp)
+			goal = tmp + uspi->s_fpb;
 		tmp = ufs_new_fragments (inode, p, fragment - blockoff, 
 					 goal, required + blockoff,
 					 err, locked_page);
@@ -277,13 +281,15 @@ repeat:
 		tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff),
 					fs32_to_cpu(sb, *p), required +  (blockoff - lastblockoff),
 					err, locked_page);
-	}
+	} else /* (lastblock > block) */ {
 	/*
 	 * We will allocate new block before last allocated block
 	 */
-	else /* (lastblock > block) */ {
-		if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1])))
-			goal = tmp + uspi->s_fpb;
+		if (block) {
+			tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[block-1]);
+			if (tmp)
+				goal = tmp + uspi->s_fpb;
+		}
 		tmp = ufs_new_fragments(inode, p, fragment - blockoff,
 					goal, uspi->s_fpb, err, locked_page);
 	}
@@ -296,7 +302,7 @@ repeat:
 	}
 
 	if (!phys) {
-		result = ufs_clear_frags(inode, tmp + blockoff, required);
+		result = ufs_clear_frags(inode, tmp, required, tmp + blockoff);
 	} else {
 		*phys = tmp + blockoff;
 		result = NULL;
@@ -383,7 +389,7 @@ repeat:
 		}
 	}
 
-	if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1]) + uspi->s_fpb))
+	if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1])))
 		goal = tmp + uspi->s_fpb;
 	else
 		goal = bh->b_blocknr + uspi->s_fpb;
@@ -397,7 +403,8 @@ repeat:
 
 
 	if (!phys) {
-		result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb);
+		result = ufs_clear_frags(inode, tmp, uspi->s_fpb,
+					 tmp + blockoff);
 	} else {
 		*phys = tmp + blockoff;
 		*new = 1;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index c9b55872079..ea11d04c41a 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -375,17 +375,15 @@ static int ufs_alloc_lastblock(struct inode *inode)
 	int err = 0;
 	struct address_space *mapping = inode->i_mapping;
 	struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
-	struct ufs_inode_info *ufsi = UFS_I(inode);
 	unsigned lastfrag, i, end;
 	struct page *lastpage;
 	struct buffer_head *bh;
 
 	lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift;
 
-	if (!lastfrag) {
-		ufsi->i_lastfrag = 0;
+	if (!lastfrag)
 		goto out;
-	}
+
 	lastfrag--;
 
 	lastpage = ufs_get_locked_page(mapping, lastfrag >>
@@ -400,25 +398,25 @@ static int ufs_alloc_lastblock(struct inode *inode)
        for (i = 0; i < end; ++i)
                bh = bh->b_this_page;
 
-       if (!buffer_mapped(bh)) {
-               err = ufs_getfrag_block(inode, lastfrag, bh, 1);
-
-               if (unlikely(err))
-                       goto out_unlock;
-
-               if (buffer_new(bh)) {
-                       clear_buffer_new(bh);
-                       unmap_underlying_metadata(bh->b_bdev,
-						 bh->b_blocknr);
-		       /*
-			* we do not zeroize fragment, because of
-			* if it maped to hole, it already contains zeroes
-			*/
-                       set_buffer_uptodate(bh);
-                       mark_buffer_dirty(bh);
-                       set_page_dirty(lastpage);
-               }
+
+       err = ufs_getfrag_block(inode, lastfrag, bh, 1);
+
+       if (unlikely(err))
+	       goto out_unlock;
+
+       if (buffer_new(bh)) {
+	       clear_buffer_new(bh);
+	       unmap_underlying_metadata(bh->b_bdev,
+					 bh->b_blocknr);
+	       /*
+		* we do not zeroize fragment, because of
+		* if it maped to hole, it already contains zeroes
+		*/
+	       set_buffer_uptodate(bh);
+	       mark_buffer_dirty(bh);
+	       set_page_dirty(lastpage);
        }
+
 out_unlock:
        ufs_put_locked_page(lastpage);
 out:
@@ -440,23 +438,11 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return -EPERM;
 
-	if (inode->i_size > old_i_size) {
-		/*
-		 * if we expand file we should care about
-		 * allocation of block for last byte first of all
-		 */
-		err = ufs_alloc_lastblock(inode);
+	err = ufs_alloc_lastblock(inode);
 
-		if (err) {
-			i_size_write(inode, old_i_size);
-			goto out;
-		}
-		/*
-		 * go away, because of we expand file, and we do not
-		 * need free blocks, and zeroizes page
-		 */
-		lock_kernel();
-		goto almost_end;
+	if (err) {
+		i_size_write(inode, old_i_size);
+		goto out;
 	}
 
 	block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
@@ -477,21 +463,8 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
 		yield();
 	}
 
-	if (inode->i_size < old_i_size) {
-		/*
-		 * now we should have enough space
-		 * to allocate block for last byte
-		 */
-		err = ufs_alloc_lastblock(inode);
-		if (err)
-			/*
-			 * looks like all the same - we have no space,
-			 * but we truncate file already
-			 */
-			inode->i_size = (ufsi->i_lastfrag - 1) * uspi->s_fsize;
-	}
-almost_end:
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+	ufsi->i_lastfrag = DIRECT_FRAGMENT;
 	unlock_kernel();
 	mark_inode_dirty(inode);
 out:
diff --git a/include/asm-arm/arch-s3c2410/regs-rtc.h b/include/asm-arm/arch-s3c2410/regs-rtc.h
index 228983f89bc..0fbec07bb6b 100644
--- a/include/asm-arm/arch-s3c2410/regs-rtc.h
+++ b/include/asm-arm/arch-s3c2410/regs-rtc.h
@@ -18,7 +18,7 @@
 #ifndef __ASM_ARCH_REGS_RTC_H
 #define __ASM_ARCH_REGS_RTC_H __FILE__
 
-#define S3C2410_RTCREG(x) ((x) + S3C24XX_VA_RTC)
+#define S3C2410_RTCREG(x) (x)
 
 #define S3C2410_RTCCON	      S3C2410_RTCREG(0x40)
 #define S3C2410_RTCCON_RTCEN  (1<<0)
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index e33e9f9e4c6..22cb07cc8f3 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -14,7 +14,7 @@ extern struct pglist_data *node_data[];
 
 #ifdef CONFIG_X86_NUMAQ
 	#include <asm/numaq.h>
-#else	/* summit or generic arch */
+#elif defined(CONFIG_ACPI_SRAT)/* summit or generic arch */
 	#include <asm/srat.h>
 #endif
 
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index 269d000bb2a..bea0255196c 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -216,6 +216,7 @@ COMPATIBLE_IOCTL(VT_RESIZE)
 COMPATIBLE_IOCTL(VT_RESIZEX)
 COMPATIBLE_IOCTL(VT_LOCKSWITCH)
 COMPATIBLE_IOCTL(VT_UNLOCKSWITCH)
+COMPATIBLE_IOCTL(VT_GETHIFONTMASK)
 /* Little p (/dev/rtc, /dev/envctrl, etc.) */
 COMPATIBLE_IOCTL(RTC_AIE_ON)
 COMPATIBLE_IOCTL(RTC_AIE_OFF)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 25610205c90..555bc195c42 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -570,13 +570,14 @@ struct inode {
  * 3: quota file
  *
  * The locking order between these classes is
- * parent -> child -> normal -> quota
+ * parent -> child -> normal -> xattr -> quota
  */
 enum inode_i_mutex_lock_class
 {
 	I_MUTEX_NORMAL,
 	I_MUTEX_PARENT,
 	I_MUTEX_CHILD,
+	I_MUTEX_XATTR,
 	I_MUTEX_QUOTA
 };
 
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 20eb34403d0..a04c154c520 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -72,6 +72,9 @@ extern int journal_enable_debug;
 #endif
 
 extern void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry);
+extern void * jbd_slab_alloc(size_t size, gfp_t flags);
+extern void jbd_slab_free(void *ptr, size_t size);
+
 #define jbd_kmalloc(size, flags) \
 	__jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry)
 #define jbd_rep_kmalloc(size, flags) \
diff --git a/include/linux/node.h b/include/linux/node.h
index 81dcec84cd8..bc001bc225c 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -30,12 +30,20 @@ extern struct node node_devices[];
 
 extern int register_node(struct node *, int, struct node *);
 extern void unregister_node(struct node *node);
+#ifdef CONFIG_NUMA
 extern int register_one_node(int nid);
 extern void unregister_one_node(int nid);
-#ifdef CONFIG_NUMA
 extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
 extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
 #else
+static inline int register_one_node(int nid)
+{
+	return 0;
+}
+static inline int unregister_one_node(int nid)
+{
+	return 0;
+}
 static inline int register_cpu_under_node(unsigned int cpu, unsigned int nid)
 {
 	return 0;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index e421d5e3481..04827ca6578 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -59,6 +59,7 @@ struct tty_bufhead {
 	struct tty_buffer *head;	/* Queue head */
 	struct tty_buffer *tail;	/* Active buffer */
 	struct tty_buffer *free;	/* Free queue head */
+	int memory_used;		/* Buffer space used excluding free queue */
 };
 /*
  * The pty uses char_buf and flag_buf as a contiguous buffer
diff --git a/include/linux/vt.h b/include/linux/vt.h
index 8ab334a4822..ba806e8711b 100644
--- a/include/linux/vt.h
+++ b/include/linux/vt.h
@@ -60,5 +60,6 @@ struct vt_consize {
 #define VT_RESIZEX      0x560A  /* set kernel's idea of screensize + more */
 #define VT_LOCKSWITCH   0x560B  /* disallow vt switching */
 #define VT_UNLOCKSWITCH 0x560C  /* allow vt switching */
+#define VT_GETHIFONTMASK 0x560D  /* return hi font mask */
 
 #endif /* _LINUX_VT_H */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1a649f2bb9b..4ea6f0dc2fc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -816,6 +816,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	struct cpuset trialcs;
 	int retval, cpus_unchanged;
 
+	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = cpulist_parse(buf, trialcs.cpus_allowed);
 	if (retval < 0)
@@ -2033,6 +2037,33 @@ out:
 	return err;
 }
 
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period.  This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This handles CPU hotplug (cpuhp) events.  If someday Memory
+ * Nodes can be hotplugged (dynamically changing node_online_map)
+ * then we should handle that too, perhaps in a similar way.
+ */
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+				unsigned long phase, void *cpu)
+{
+	mutex_lock(&manage_mutex);
+	mutex_lock(&callback_mutex);
+
+	top_cpuset.cpus_allowed = cpu_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+
+	return 0;
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
@@ -2043,6 +2074,8 @@ void __init cpuset_init_smp(void)
 {
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_online_map;
+
+	hotcpu_notifier(cpuset_handle_cpuhp, 0);
 }
 
 /**
@@ -2387,7 +2420,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
 {
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
-	int overlap = 0;		/* do cpusets overlap? */
+	int overlap = 1;		/* do cpusets overlap? */
 
 	task_lock(current);
 	if (current->flags & PF_EXITING) {
diff --git a/kernel/futex.c b/kernel/futex.c
index d4633c588f3..b9b8aea5389 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -397,7 +397,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 		p = NULL;
 		goto out_unlock;
 	}
-	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+	if (p->exit_state != 0) {
 		p = NULL;
 		goto out_unlock;
 	}
diff --git a/kernel/sched.c b/kernel/sched.c
index a2be2d05529..a234fbee123 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4162,10 +4162,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 		read_unlock_irq(&tasklist_lock);
 		return -ESRCH;
 	}
-	get_task_struct(p);
-	read_unlock_irq(&tasklist_lock);
 	retval = sched_setscheduler(p, policy, &lparam);
-	put_task_struct(p);
+	read_unlock_irq(&tasklist_lock);
 
 	return retval;
 }
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d73146..51cacd111db 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -111,7 +111,6 @@ static int stop_machine(void)
 	/* If some failed, kill them all. */
 	if (ret < 0) {
 		stopmachine_set_state(STOPMACHINE_EXIT);
-		up(&stopmachine_mutex);
 		return ret;
 	}
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e70d6c6d6fe..f1f5ec78378 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -442,11 +442,12 @@ int swap_type_of(dev_t device)
 
 		if (!(swap_info[i].flags & SWP_WRITEOK))
 			continue;
+
 		if (!device) {
 			spin_unlock(&swap_lock);
 			return i;
 		}
-		inode = swap_info->swap_file->f_dentry->d_inode;
+		inode = swap_info[i].swap_file->f_dentry->d_inode;
 		if (S_ISBLK(inode->i_mode) &&
 		    device == MKDEV(imajor(inode), iminor(inode))) {
 			spin_unlock(&swap_lock);
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 0f85970ee6d..090bc39e819 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -342,6 +342,8 @@ static int ccid3_hc_tx_send_packet(struct sock *sk,
 		new_packet->dccphtx_ccval =
 			DCCP_SKB_CB(skb)->dccpd_ccval =
 				hctx->ccid3hctx_last_win_count;
+		timeval_add_usecs(&hctx->ccid3hctx_t_nom,
+				  hctx->ccid3hctx_t_ipi);
 	}
 out:
 	return rc;
@@ -413,7 +415,8 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len)
 	case TFRC_SSTATE_NO_FBACK:
 	case TFRC_SSTATE_FBACK:
 		if (len > 0) {
-			hctx->ccid3hctx_t_nom = now;
+			timeval_sub_usecs(&hctx->ccid3hctx_t_nom,
+				  hctx->ccid3hctx_t_ipi);
 			ccid3_calc_new_t_ipi(hctx);
 			ccid3_calc_new_delta(hctx);
 			timeval_add_usecs(&hctx->ccid3hctx_t_nom,
@@ -757,8 +760,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk)
 	}
 
 	hcrx->ccid3hcrx_tstamp_last_feedback = now;
-	hcrx->ccid3hcrx_last_counter	     = packet->dccphrx_ccval;
-	hcrx->ccid3hcrx_seqno_last_counter   = packet->dccphrx_seqno;
+	hcrx->ccid3hcrx_ccval_last_counter   = packet->dccphrx_ccval;
 	hcrx->ccid3hcrx_bytes_recv	     = 0;
 
 	/* Convert to multiples of 10us */
@@ -782,7 +784,7 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
 	if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
 		return 0;
 
-	DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter;
+	DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_ccval_last_counter;
 
 	if (dccp_packet_without_ack(skb))
 		return 0;
@@ -854,6 +856,11 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk)
 		interval = 1;
 	}
 found:
+	if (!tail) {
+		LIMIT_NETDEBUG(KERN_WARNING "%s: tail is null\n",
+		   __FUNCTION__);
+		return ~0;
+	}
 	rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval;
 	ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n",
 		       dccp_role(sk), sk, rtt);
@@ -864,9 +871,20 @@ found:
 	delta = timeval_delta(&tstamp, &hcrx->ccid3hcrx_tstamp_last_feedback);
 	x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv, delta);
 
+	if (x_recv == 0)
+		x_recv = hcrx->ccid3hcrx_x_recv;
+
 	tmp1 = (u64)x_recv * (u64)rtt;
 	do_div(tmp1,10000000);
 	tmp2 = (u32)tmp1;
+
+	if (!tmp2) {
+		LIMIT_NETDEBUG(KERN_WARNING "tmp2 = 0 "
+		   "%s: x_recv = %u, rtt =%u\n",
+		   __FUNCTION__, x_recv, rtt);
+		return ~0;
+	}
+
 	fval = (hcrx->ccid3hcrx_s * 100000) / tmp2;
 	/* do not alter order above or you will get overflow on 32 bit */
 	p = tfrc_calc_x_reverse_lookup(fval);
@@ -882,31 +900,101 @@ found:
 static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+	struct dccp_li_hist_entry *next, *head;
+	u64 seq_temp;
 
-	if (seq_loss != DCCP_MAX_SEQNO + 1 &&
-	    list_empty(&hcrx->ccid3hcrx_li_hist)) {
-		struct dccp_li_hist_entry *li_tail;
+	if (list_empty(&hcrx->ccid3hcrx_li_hist)) {
+		if (!dccp_li_hist_interval_new(ccid3_li_hist,
+		   &hcrx->ccid3hcrx_li_hist, seq_loss, win_loss))
+			return;
 
-		li_tail = dccp_li_hist_interval_new(ccid3_li_hist,
-						    &hcrx->ccid3hcrx_li_hist,
-						    seq_loss, win_loss);
-		if (li_tail == NULL)
+		next = (struct dccp_li_hist_entry *)
+		   hcrx->ccid3hcrx_li_hist.next;
+		next->dccplih_interval = ccid3_hc_rx_calc_first_li(sk);
+	} else {
+		struct dccp_li_hist_entry *entry;
+		struct list_head *tail;
+
+		head = (struct dccp_li_hist_entry *)
+		   hcrx->ccid3hcrx_li_hist.next;
+		/* FIXME win count check removed as was wrong */
+		/* should make this check with receive history */
+		/* and compare there as per section 10.2 of RFC4342 */
+
+		/* new loss event detected */
+		/* calculate last interval length */
+		seq_temp = dccp_delta_seqno(head->dccplih_seqno, seq_loss);
+		entry = dccp_li_hist_entry_new(ccid3_li_hist, SLAB_ATOMIC);
+
+		if (entry == NULL) {
+			printk(KERN_CRIT "%s: out of memory\n",__FUNCTION__);
+			dump_stack();
 			return;
-		li_tail->dccplih_interval = ccid3_hc_rx_calc_first_li(sk);
-	} else
-		    LIMIT_NETDEBUG(KERN_WARNING "%s: FIXME: find end of "
-				   "interval\n", __FUNCTION__);
+		}
+
+		list_add(&entry->dccplih_node, &hcrx->ccid3hcrx_li_hist);
+
+		tail = hcrx->ccid3hcrx_li_hist.prev;
+		list_del(tail);
+		kmem_cache_free(ccid3_li_hist->dccplih_slab, tail);
+
+		/* Create the newest interval */
+		entry->dccplih_seqno = seq_loss;
+		entry->dccplih_interval = seq_temp;
+		entry->dccplih_win_count = win_loss;
+	}
 }
 
-static void ccid3_hc_rx_detect_loss(struct sock *sk)
+static int ccid3_hc_rx_detect_loss(struct sock *sk,
+                                    struct dccp_rx_hist_entry *packet)
 {
 	struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-	u8 win_loss;
-	const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist,
-						      &hcrx->ccid3hcrx_li_hist,
-						      &win_loss);
+	struct dccp_rx_hist_entry *rx_hist = dccp_rx_hist_head(&hcrx->ccid3hcrx_hist);
+	u64 seqno = packet->dccphrx_seqno;
+	u64 tmp_seqno;
+	int loss = 0;
+	u8 ccval;
+
+
+	tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss;
+
+	if (!rx_hist ||
+	   follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) {
+		hcrx->ccid3hcrx_seqno_nonloss = seqno;
+		hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval;
+		goto detect_out;
+	}
+
 
-	ccid3_hc_rx_update_li(sk, seq_loss, win_loss);
+	while (dccp_delta_seqno(hcrx->ccid3hcrx_seqno_nonloss, seqno)
+	   > TFRC_RECV_NUM_LATE_LOSS) {
+		loss = 1;
+		ccid3_hc_rx_update_li(sk, hcrx->ccid3hcrx_seqno_nonloss,
+		   hcrx->ccid3hcrx_ccval_nonloss);
+		tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss;
+		dccp_inc_seqno(&tmp_seqno);
+		hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno;
+		dccp_inc_seqno(&tmp_seqno);
+		while (dccp_rx_hist_find_entry(&hcrx->ccid3hcrx_hist,
+		   tmp_seqno, &ccval)) {
+		   	hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno;
+			hcrx->ccid3hcrx_ccval_nonloss = ccval;
+			dccp_inc_seqno(&tmp_seqno);
+		}
+	}
+
+	/* FIXME - this code could be simplified with above while */
+	/* but works at moment */
+	if (follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) {
+		hcrx->ccid3hcrx_seqno_nonloss = seqno;
+		hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval;
+	}
+
+detect_out:
+	dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist,
+		   &hcrx->ccid3hcrx_li_hist, packet,
+		   hcrx->ccid3hcrx_seqno_nonloss);
+	return loss;
 }
 
 static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
@@ -916,8 +1004,8 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	struct dccp_rx_hist_entry *packet;
 	struct timeval now;
 	u8 win_count;
-	u32 p_prev, r_sample, t_elapsed;
-	int ins;
+	u32 p_prev, rtt_prev, r_sample, t_elapsed;
+	int loss;
 
 	BUG_ON(hcrx == NULL ||
 	       !(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA ||
@@ -932,7 +1020,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	case DCCP_PKT_DATAACK:
 		if (opt_recv->dccpor_timestamp_echo == 0)
 			break;
-		p_prev = hcrx->ccid3hcrx_rtt;
+		rtt_prev = hcrx->ccid3hcrx_rtt;
 		dccp_timestamp(sk, &now);
 		timeval_sub_usecs(&now, opt_recv->dccpor_timestamp_echo * 10);
 		r_sample = timeval_usecs(&now);
@@ -951,8 +1039,8 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			hcrx->ccid3hcrx_rtt = (hcrx->ccid3hcrx_rtt * 9) / 10 +
 					      r_sample / 10;
 
-		if (p_prev != hcrx->ccid3hcrx_rtt)
-			ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n",
+		if (rtt_prev != hcrx->ccid3hcrx_rtt)
+			ccid3_pr_debug("%s, New RTT=%uus, elapsed time=%u\n",
 				       dccp_role(sk), hcrx->ccid3hcrx_rtt,
 				       opt_recv->dccpor_elapsed_time);
 		break;
@@ -973,8 +1061,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 
 	win_count = packet->dccphrx_ccval;
 
-	ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist,
-				      &hcrx->ccid3hcrx_li_hist, packet);
+	loss = ccid3_hc_rx_detect_loss(sk, packet);
 
 	if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK)
 		return;
@@ -991,7 +1078,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	case TFRC_RSTATE_DATA:
 		hcrx->ccid3hcrx_bytes_recv += skb->len -
 					      dccp_hdr(skb)->dccph_doff * 4;
-		if (ins != 0)
+		if (loss)
 			break;
 
 		dccp_timestamp(sk, &now);
@@ -1012,7 +1099,6 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n",
 		       dccp_role(sk), sk, dccp_state_name(sk->sk_state));
 
-	ccid3_hc_rx_detect_loss(sk);
 	p_prev = hcrx->ccid3hcrx_p;
 	
 	/* Calculate loss event rate */
@@ -1022,6 +1108,9 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		/* Scaling up by 1000000 as fixed decimal */
 		if (i_mean != 0)
 			hcrx->ccid3hcrx_p = 1000000 / i_mean;
+	} else {
+		printk(KERN_CRIT "%s: empty loss hist\n",__FUNCTION__);
+		dump_stack();
 	}
 
 	if (hcrx->ccid3hcrx_p > p_prev) {
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 22cb9f80a09..0a2cb7536d2 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -120,9 +120,10 @@ struct ccid3_hc_rx_sock {
 #define ccid3hcrx_x_recv	ccid3hcrx_tfrc.tfrcrx_x_recv
 #define ccid3hcrx_rtt		ccid3hcrx_tfrc.tfrcrx_rtt
 #define ccid3hcrx_p		ccid3hcrx_tfrc.tfrcrx_p
-  	u64			ccid3hcrx_seqno_last_counter:48,
+  	u64			ccid3hcrx_seqno_nonloss:48,
+				ccid3hcrx_ccval_nonloss:4,
 				ccid3hcrx_state:8,
-				ccid3hcrx_last_counter:4;
+				ccid3hcrx_ccval_last_counter:4;
   	u32			ccid3hcrx_bytes_recv;
   	struct timeval		ccid3hcrx_tstamp_last_feedback;
   	struct timeval		ccid3hcrx_tstamp_last_ack;
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index b93d9fc98cb..906c81ab9d4 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -12,6 +12,7 @@
  */
 
 #include <linux/module.h>
+#include <net/sock.h>
 
 #include "loss_interval.h"
 
@@ -90,13 +91,13 @@ u32 dccp_li_hist_calc_i_mean(struct list_head *list)
 	u32 w_tot  = 0;
 
 	list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) {
-		if (i < DCCP_LI_HIST_IVAL_F_LENGTH) {
+		if (li_entry->dccplih_interval != ~0) {
 			i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i];
 			w_tot  += dccp_li_hist_w[i];
+			if (i != 0)
+				i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1];
 		}
 
-		if (i != 0)
-			i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1];
 
 		if (++i > DCCP_LI_HIST_IVAL_F_LENGTH)
 			break;
@@ -107,37 +108,36 @@ u32 dccp_li_hist_calc_i_mean(struct list_head *list)
 
 	i_tot = max(i_tot0, i_tot1);
 
-	/* FIXME: Why do we do this? -Ian McDonald */
-	if (i_tot * 4 < w_tot)
-		i_tot = w_tot * 4;
+	if (!w_tot) {
+		LIMIT_NETDEBUG(KERN_WARNING "%s: w_tot = 0\n", __FUNCTION__);
+		return 1;
+	}
 
-	return i_tot * 4 / w_tot;
+	return i_tot / w_tot;
 }
 
 EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean);
 
-struct dccp_li_hist_entry *dccp_li_hist_interval_new(struct dccp_li_hist *hist,
-						     struct list_head *list,
-						     const u64 seq_loss,
-						     const u8 win_loss)
+int dccp_li_hist_interval_new(struct dccp_li_hist *hist,
+   struct list_head *list, const u64 seq_loss, const u8 win_loss)
 {
-	struct dccp_li_hist_entry *tail = NULL, *entry;
+	struct dccp_li_hist_entry *entry;
 	int i;
 
-	for (i = 0; i <= DCCP_LI_HIST_IVAL_F_LENGTH; ++i) {
+	for (i = 0; i < DCCP_LI_HIST_IVAL_F_LENGTH; i++) {
 		entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC);
 		if (entry == NULL) {
 			dccp_li_hist_purge(hist, list);
-			return NULL;
+			dump_stack();
+			return 0;
 		}
-		if (tail == NULL)
-			tail = entry;
+		entry->dccplih_interval = ~0;
 		list_add(&entry->dccplih_node, list);
 	}
 
 	entry->dccplih_seqno     = seq_loss;
 	entry->dccplih_win_count = win_loss;
-	return tail;
+	return 1;
 }
 
 EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new);
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index dcb370a53f5..0ae85f0340b 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -52,9 +52,6 @@ extern void dccp_li_hist_purge(struct dccp_li_hist *hist,
 
 extern u32 dccp_li_hist_calc_i_mean(struct list_head *list);
 
-extern struct dccp_li_hist_entry *
-			dccp_li_hist_interval_new(struct dccp_li_hist *hist,
-						  struct list_head *list,
-						  const u64 seq_loss,
-						  const u8 win_loss);
+extern int dccp_li_hist_interval_new(struct dccp_li_hist *hist,
+   struct list_head *list, const u64 seq_loss, const u8 win_loss);
 #endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index 420c60f8604..b876c9c81c6 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -112,64 +112,27 @@ struct dccp_rx_hist_entry *
 
 EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet);
 
-int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
+void dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
 			    struct list_head *rx_list,
 			    struct list_head *li_list,
-			    struct dccp_rx_hist_entry *packet)
+			    struct dccp_rx_hist_entry *packet,
+			    u64 nonloss_seqno)
 {
-	struct dccp_rx_hist_entry *entry, *next, *iter;
+	struct dccp_rx_hist_entry *entry, *next;
 	u8 num_later = 0;
 
-	iter = dccp_rx_hist_head(rx_list);
-	if (iter == NULL)
-		dccp_rx_hist_add_entry(rx_list, packet);
-	else {
-		const u64 seqno = packet->dccphrx_seqno;
-
-		if (after48(seqno, iter->dccphrx_seqno))
-			dccp_rx_hist_add_entry(rx_list, packet);
-		else {
-			if (dccp_rx_hist_entry_data_packet(iter))
-				num_later = 1;
-
-			list_for_each_entry_continue(iter, rx_list,
-						     dccphrx_node) {
-				if (after48(seqno, iter->dccphrx_seqno)) {
-					dccp_rx_hist_add_entry(&iter->dccphrx_node,
-							       packet);
-					goto trim_history;
-				}
-
-				if (dccp_rx_hist_entry_data_packet(iter))
-					num_later++;
-
-				if (num_later == TFRC_RECV_NUM_LATE_LOSS) {
-					dccp_rx_hist_entry_delete(hist, packet);
-					return 1;
-				}
-			}
-
-			if (num_later < TFRC_RECV_NUM_LATE_LOSS)
-				dccp_rx_hist_add_entry(rx_list, packet);
-			/*
-			 * FIXME: else what? should we destroy the packet
-			 * like above?
-			 */
-		}
-	}
+	list_add(&packet->dccphrx_node, rx_list);
 
-trim_history:
-	/*
-	 * Trim history (remove all packets after the NUM_LATE_LOSS + 1
-	 * data packets)
-	 */
 	num_later = TFRC_RECV_NUM_LATE_LOSS + 1;
 
 	if (!list_empty(li_list)) {
 		list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
 			if (num_later == 0) {
-				list_del_init(&entry->dccphrx_node);
-				dccp_rx_hist_entry_delete(hist, entry);
+				if (after48(nonloss_seqno,
+				   entry->dccphrx_seqno)) {
+					list_del_init(&entry->dccphrx_node);
+					dccp_rx_hist_entry_delete(hist, entry);
+				}
 			} else if (dccp_rx_hist_entry_data_packet(entry))
 				--num_later;
 		}
@@ -217,94 +180,10 @@ trim_history:
 				--num_later;
 		}
 	}
-
-	return 0;
 }
 
 EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet);
 
-u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
-			     struct list_head *li_list, u8 *win_loss)
-{
-	struct dccp_rx_hist_entry *entry, *next, *packet;
-	struct dccp_rx_hist_entry *a_loss = NULL;
-	struct dccp_rx_hist_entry *b_loss = NULL;
-	u64 seq_loss = DCCP_MAX_SEQNO + 1;
-	u8 num_later = TFRC_RECV_NUM_LATE_LOSS;
-
-	list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
-		if (num_later == 0) {
-			b_loss = entry;
-			break;
-		} else if (dccp_rx_hist_entry_data_packet(entry))
-			--num_later;
-	}
-
-	if (b_loss == NULL)
-		goto out;
-
-	num_later = 1;
-	list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
-		if (num_later == 0) {
-			a_loss = entry;
-			break;
-		} else if (dccp_rx_hist_entry_data_packet(entry))
-			--num_later;
-	}
-
-	if (a_loss == NULL) {
-		if (list_empty(li_list)) {
-			/* no loss event have occured yet */
-			LIMIT_NETDEBUG("%s: TODO: find a lost data packet by "
-				       "comparing to initial seqno\n",
-				       __FUNCTION__);
-			goto out;
-		} else {
-			LIMIT_NETDEBUG("%s: Less than 4 data pkts in history!",
-				       __FUNCTION__);
-			goto out;
-		}
-	}
-
-	/* Locate a lost data packet */
-	entry = packet = b_loss;
-	list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
-		u64 delta = dccp_delta_seqno(entry->dccphrx_seqno,
-					     packet->dccphrx_seqno);
-
-		if (delta != 0) {
-			if (dccp_rx_hist_entry_data_packet(packet))
-				--delta;
-			/*
-			 * FIXME: check this, probably this % usage is because
-			 * in earlier drafts the ndp count was just 8 bits
-			 * long, but now it cam be up to 24 bits long.
-			 */
-#if 0
-			if (delta % DCCP_NDP_LIMIT !=
-			    (packet->dccphrx_ndp -
-			     entry->dccphrx_ndp) % DCCP_NDP_LIMIT)
-#endif
-			if (delta != packet->dccphrx_ndp - entry->dccphrx_ndp) {
-				seq_loss = entry->dccphrx_seqno;
-				dccp_inc_seqno(&seq_loss);
-			}
-		}
-		packet = entry;
-		if (packet == a_loss)
-			break;
-	}
-out:
-	if (seq_loss != DCCP_MAX_SEQNO + 1)
-		*win_loss = a_loss->dccphrx_ccval;
-	else
-		*win_loss = 0; /* Paranoia */
-
-	return seq_loss;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rx_hist_detect_loss);
-
 struct dccp_tx_hist *dccp_tx_hist_new(const char *name)
 {
 	struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index aea9c5d7091..067cf1c85a3 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -166,12 +166,6 @@ static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist,
 extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist,
 			       struct list_head *list);
 
-static inline void dccp_rx_hist_add_entry(struct list_head *list,
-					  struct dccp_rx_hist_entry *entry)
-{
-	list_add(&entry->dccphrx_node, list);
-}
-
 static inline struct dccp_rx_hist_entry *
 		dccp_rx_hist_head(struct list_head *list)
 {
@@ -190,10 +184,11 @@ static inline int
 	       entry->dccphrx_type == DCCP_PKT_DATAACK;
 }
 
-extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
+extern void dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
 				   struct list_head *rx_list,
 				   struct list_head *li_list,
-				   struct dccp_rx_hist_entry *packet);
+				   struct dccp_rx_hist_entry *packet,
+				   u64 nonloss_seqno);
 
 extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
 				    struct list_head *li_list, u8 *win_loss);