12 files changed, 417 insertions, 241 deletions
diff --git a/Documentation/auxdisplay/cfag12864b-example.c b/Documentation/auxdisplay/cfag12864b-example.c
index 1d2c010bae1..e7823ffb1ca 100644
--- a/Documentation/auxdisplay/cfag12864b-example.c
+++ b/Documentation/auxdisplay/cfag12864b-example.c
@@ -194,7 +194,6 @@ static void cfag12864b_blit(void)
  */
 
 #include <stdio.h>
-#include <string.h>
 
 #define EXAMPLES	6
 
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 6eb1a97e88c..455d4e6d346 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -408,6 +408,26 @@ You can attach the current shell task by echoing 0:
 
 # echo 0 > tasks
 
+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy.  This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems.  Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match [\w.-]+
+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a subsystem a name.
+
+The name of the subsystem appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroups.
+
+
 3. Kernel API
 =============
 
@@ -501,7 +521,7 @@ rmdir() will fail with it. From this behavior, pre_destroy() can be
 called multiple times against a cgroup.
 
 int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	       struct task_struct *task)
+	       struct task_struct *task, bool threadgroup)
 (cgroup_mutex held by caller)
 
 Called prior to moving a task into a cgroup; if the subsystem
@@ -509,14 +529,20 @@ returns an error, this will abort the attach operation.  If a NULL
 task is passed, then a successful result indicates that *any*
 unspecified task can be moved into the cgroup. Note that this isn't
 called on a fork. If this method returns 0 (success) then this should
-remain valid while the caller holds cgroup_mutex.
+remain valid while the caller holds cgroup_mutex. If threadgroup is
+true, then a successful result indicates that all threads in the given
+thread's threadgroup can be moved together.
 
 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-	    struct cgroup *old_cgrp, struct task_struct *task)
+	    struct cgroup *old_cgrp, struct task_struct *task,
+	    bool threadgroup)
 (cgroup_mutex held by caller)
 
 Called after the task has been attached to the cgroup, to allow any
 post-attachment activity that requires memory allocations or blocking.
+If threadgroup is true, the subsystem should take care of all threads
+in the specified thread's threadgroup. Currently does not support any
+subsystem that might need the old_cgrp for every thread in the group.
 
 void fork(struct cgroup_subsy *ss, struct task_struct *task)
 
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 23d1262c077..b871f2552b4 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -179,6 +179,9 @@ The reclaim algorithm has not been modified for cgroups, except that
 pages that are selected for reclaiming come from the per cgroup LRU
 list.
 
+NOTE: Reclaim does not work for the root cgroup, since we cannot set any
+limits on the root cgroup.
+
 2. Locking
 
 The memory controller uses the following hierarchy
@@ -210,6 +213,7 @@ We can alter the memory limit:
 NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
 mega or gigabytes.
 NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
+NOTE: We cannot set limits on the root cgroup any more.
 
 # cat /cgroups/0/memory.limit_in_bytes
 4194304
@@ -375,7 +379,42 @@ cgroups created below it.
 
 NOTE2: This feature can be enabled/disabled per subtree.
 
-7. TODO
+7. Soft limits
+
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+
+a. There is no memory contention
+b. They do not exceed their hard limit
+
+When the system detects memory contention or low memory control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+
+Please note that soft limits is a best effort feature, it comes with
+no guarantees, but it does its best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently soft limit based reclaim is setup such that
+it gets invoked from balance_pgdat (kswapd).
+
+7.1 Interface
+
+Soft limits can be setup by using the following commands (in this example we
+assume a soft limit of 256 megabytes)
+
+# echo 256M > memory.soft_limit_in_bytes
+
+If we want to change this to 1G, we can at any time use
+
+# echo 1G > memory.soft_limit_in_bytes
+
+NOTE1: Soft limits take effect over a long period of time, since they involve
+       reclaiming memory for balancing between memory cgroups
+NOTE2: It is recommended to set the soft limit always below the hard limit,
+       otherwise the hard limit will take precedence.
+
+8. TODO
 
 1. Add support for accounting huge pages (as a separate controller)
 2. Make per-cgroup scanner reclaim not-shared pages first
diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt
index 9f59fcbf5d8..ba046b8fa92 100644
--- a/Documentation/crypto/async-tx-api.txt
+++ b/Documentation/crypto/async-tx-api.txt
@@ -54,20 +54,23 @@ features surfaced as a result:
 
 3.1 General format of the API:
 struct dma_async_tx_descriptor *
-async_<operation>(<op specific parameters>,
-		  enum async_tx_flags flags,
-        	  struct dma_async_tx_descriptor *dependency,
-        	  dma_async_tx_callback callback_routine,
-		  void *callback_parameter);
+async_<operation>(<op specific parameters>, struct async_submit ctl *submit)
 
 3.2 Supported operations:
-memcpy       - memory copy between a source and a destination buffer
-memset       - fill a destination buffer with a byte value
-xor          - xor a series of source buffers and write the result to a
-	       destination buffer
-xor_zero_sum - xor a series of source buffers and set a flag if the
-	       result is zero.  The implementation attempts to prevent
-	       writes to memory
+memcpy  - memory copy between a source and a destination buffer
+memset  - fill a destination buffer with a byte value
+xor     - xor a series of source buffers and write the result to a
+	  destination buffer
+xor_val - xor a series of source buffers and set a flag if the
+	  result is zero.  The implementation attempts to prevent
+	  writes to memory
+pq	- generate the p+q (raid6 syndrome) from a series of source buffers
+pq_val  - validate that a p and or q buffer are in sync with a given series of
+	  sources
+datap	- (raid6_datap_recov) recover a raid6 data block and the p block
+	  from the given sources
+2data	- (raid6_2data_recov) recover 2 raid6 data blocks from the given
+	  sources
 
 3.3 Descriptor management:
 The return value is non-NULL and points to a 'descriptor' when the operation
@@ -80,8 +83,8 @@ acknowledged by the application before the offload engine driver is allowed to
 recycle (or free) the descriptor.  A descriptor can be acked by one of the
 following methods:
 1/ setting the ASYNC_TX_ACK flag if no child operations are to be submitted
-2/ setting the ASYNC_TX_DEP_ACK flag to acknowledge the parent
-   descriptor of a new operation.
+2/ submitting an unacknowledged descriptor as a dependency to another
+   async_tx call will implicitly set the acknowledged state.
 3/ calling async_tx_ack() on the descriptor.
 
 3.4 When does the operation execute?
@@ -119,30 +122,42 @@ of an operation.
 Perform a xor->copy->xor operation where each operation depends on the
 result from the previous operation:
 
-void complete_xor_copy_xor(void *param)
+void callback(void *param)
 {
-	printk("complete\n");
+	struct completion *cmp = param;
+
+	complete(cmp);
 }
 
-int run_xor_copy_xor(struct page **xor_srcs,
-		     int xor_src_cnt,
-		     struct page *xor_dest,
-		     size_t xor_len,
-		     struct page *copy_src,
-		     struct page *copy_dest,
-		     size_t copy_len)
+void run_xor_copy_xor(struct page **xor_srcs,
+		      int xor_src_cnt,
+		      struct page *xor_dest,
+		      size_t xor_len,
+		      struct page *copy_src,
+		      struct page *copy_dest,
+		      size_t copy_len)
 {
 	struct dma_async_tx_descriptor *tx;
+	addr_conv_t addr_conv[xor_src_cnt];
+	struct async_submit_ctl submit;
+	addr_conv_t addr_conv[NDISKS];
+	struct completion cmp;
+
+	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL,
+			  addr_conv);
+	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit)
 
-	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
-		       ASYNC_TX_XOR_DROP_DST, NULL, NULL, NULL);
-	tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len,
-			  ASYNC_TX_DEP_ACK, tx, NULL, NULL);
-	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len,
-		       ASYNC_TX_XOR_DROP_DST | ASYNC_TX_DEP_ACK | ASYNC_TX_ACK,
-		       tx, complete_xor_copy_xor, NULL);
+	submit->depend_tx = tx;
+	tx = async_memcpy(copy_dest, copy_src, 0, 0, copy_len, &submit);
+
+	init_completion(&cmp);
+	init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST | ASYNC_TX_ACK, tx,
+			  callback, &cmp, addr_conv);
+	tx = async_xor(xor_dest, xor_srcs, 0, xor_src_cnt, xor_len, &submit);
 
 	async_tx_issue_pending_all();
+
+	wait_for_completion(&cmp);
 }
 
 See include/linux/async_tx.h for more information on the flags.  See the
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 736540045dc..23a181074f9 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -4,7 +4,7 @@ Shared Subtrees
 Contents:
 	1) Overview
 	2) Features
-	3) smount command
+	3) Setting mount states
 	4) Use-case
 	5) Detailed semantics
 	6) Quiz
@@ -41,14 +41,14 @@ replicas continue to be exactly same.
 
 	Here is an example:
 
-	Lets say /mnt has a mount that is shared.
+	Let's say /mnt has a mount that is shared.
 	mount --make-shared /mnt
 
-	note: mount command does not yet support the --make-shared flag.
-	I have included a small C program which does the same by executing
-	'smount /mnt shared'
+	Note: mount(8) command now supports the --make-shared flag,
+	so the sample 'smount' program is no longer needed and has been
+	removed.
 
-	#mount --bind /mnt /tmp
+	# mount --bind /mnt /tmp
 	The above command replicates the mount at /mnt to the mountpoint /tmp
 	and the contents of both the mounts remain identical.
 
@@ -58,8 +58,8 @@ replicas continue to be exactly same.
 	#ls /tmp
 	a b c
 
-	Now lets say we mount a device at /tmp/a
-	#mount /dev/sd0  /tmp/a
+	Now let's say we mount a device at /tmp/a
+	# mount /dev/sd0  /tmp/a
 
 	#ls /tmp/a
 	t1 t2 t2
@@ -80,21 +80,20 @@ replicas continue to be exactly same.
 
 	Here is an example:
 
-	Lets say /mnt has a mount which is shared.
-	#mount --make-shared /mnt
+	Let's say /mnt has a mount which is shared.
+	# mount --make-shared /mnt
 
-	Lets bind mount /mnt to /tmp
-	#mount --bind /mnt /tmp
+	Let's bind mount /mnt to /tmp
+	# mount --bind /mnt /tmp
 
 	the new mount at /tmp becomes a shared mount and it is a replica of
 	the mount at /mnt.
 
-	Now lets make the mount at /tmp; a slave of /mnt
-	#mount --make-slave /tmp
-	[or smount /tmp slave]
+	Now let's make the mount at /tmp; a slave of /mnt
+	# mount --make-slave /tmp
 
-	lets mount /dev/sd0 on /mnt/a
-	#mount /dev/sd0 /mnt/a
+	let's mount /dev/sd0 on /mnt/a
+	# mount /dev/sd0 /mnt/a
 
 	#ls /mnt/a
 	t1 t2 t3
@@ -104,9 +103,9 @@ replicas continue to be exactly same.
 
 	Note the mount event has propagated to the mount at /tmp
 
-	However lets see what happens if we mount something on the mount at /tmp
+	However let's see what happens if we mount something on the mount at /tmp
 
-	#mount /dev/sd1 /tmp/b
+	# mount /dev/sd1 /tmp/b
 
 	#ls /tmp/b
 	s1 s2 s3
@@ -124,12 +123,11 @@ replicas continue to be exactly same.
 
 2d) A unbindable mount is a unbindable private mount
 
-	lets say we have a mount at /mnt and we make is unbindable
+	let's say we have a mount at /mnt and we make is unbindable
 
-	#mount --make-unbindable /mnt
-	 [ smount /mnt  unbindable ]
+	# mount --make-unbindable /mnt
 
-	 Lets try to bind mount this mount somewhere else.
+	 Let's try to bind mount this mount somewhere else.
 	 # mount --bind /mnt /tmp
 	 mount: wrong fs type, bad option, bad superblock on /mnt,
 	        or too many mounted file systems
@@ -137,149 +135,15 @@ replicas continue to be exactly same.
 	Binding a unbindable mount is a invalid operation.
 
 
-3) smount command
+3) Setting mount states
 
-	Currently the mount command is not aware of shared subtree features.
-	Work is in progress to add the support in mount ( util-linux package ).
-	Till then use the following program.
+	The mount command (util-linux package) can be used to set mount
+	states:
 
-	------------------------------------------------------------------------
-	//
-	//this code was developed my Miklos Szeredi <miklos@szeredi.hu>
-	//and modified by Ram Pai <linuxram@us.ibm.com>
-	// sample usage:
-	//              smount /tmp shared
-	//
-	#include <stdio.h>
-	#include <stdlib.h>
-	#include <unistd.h>
-	#include <string.h>
-	#include <sys/mount.h>
-	#include <sys/fsuid.h>
-
-	#ifndef MS_REC
-	#define MS_REC		0x4000	/* 16384: Recursive loopback */
-	#endif
-
-	#ifndef MS_SHARED
-	#define MS_SHARED		1<<20	/* Shared */
-	#endif
-
-	#ifndef MS_PRIVATE
-	#define MS_PRIVATE		1<<18	/* Private */
-	#endif
-
-	#ifndef MS_SLAVE
-	#define MS_SLAVE		1<<19	/* Slave */
-	#endif
-
-	#ifndef MS_UNBINDABLE
-	#define MS_UNBINDABLE		1<<17	/* Unbindable */
-	#endif
-
-	int main(int argc, char *argv[])
-	{
-		int type;
-		if(argc != 3) {
-			fprintf(stderr, "usage: %s dir "
-			"<rshared|rslave|rprivate|runbindable|shared|slave"
-			"|private|unbindable>\n" , argv[0]);
-			return 1;
-		}
-
-		fprintf(stdout, "%s %s %s\n", argv[0], argv[1], argv[2]);
-
-		if (strcmp(argv[2],"rshared")==0)
-			type=(MS_SHARED|MS_REC);
-		else if (strcmp(argv[2],"rslave")==0)
-			type=(MS_SLAVE|MS_REC);
-		else if (strcmp(argv[2],"rprivate")==0)
-			type=(MS_PRIVATE|MS_REC);
-		else if (strcmp(argv[2],"runbindable")==0)
-			type=(MS_UNBINDABLE|MS_REC);
-		else if (strcmp(argv[2],"shared")==0)
-			type=MS_SHARED;
-		else if (strcmp(argv[2],"slave")==0)
-			type=MS_SLAVE;
-		else if (strcmp(argv[2],"private")==0)
-			type=MS_PRIVATE;
-		else if (strcmp(argv[2],"unbindable")==0)
-			type=MS_UNBINDABLE;
-		else {
-			fprintf(stderr, "invalid operation: %s\n", argv[2]);
-			return 1;
-		}
-		setfsuid(getuid());
-
-		if(mount("", argv[1], "dontcare", type, "") == -1) {
-			perror("mount");
-			return 1;
-		}
-		return 0;
-	}
-	-----------------------------------------------------------------------
-
-	Copy the above code snippet into smount.c
-	gcc -o smount smount.c
-
-
-	(i) To mark all the mounts under /mnt as shared execute the following
-	command:
-
-	 	smount /mnt rshared
-		the corresponding syntax planned for mount command is
-		mount --make-rshared /mnt
-
-	    just to mark a mount /mnt as shared, execute the following
-	    command:
-	 	smount /mnt shared
-		the corresponding syntax planned for mount command is
-		mount --make-shared /mnt
-
-	(ii) To mark all the shared mounts under /mnt as slave execute the
-	following
-
-	     command:
-		smount /mnt rslave
-		the corresponding syntax planned for mount command is
-		mount --make-rslave /mnt
-
-	    just to mark a mount /mnt as slave, execute the following
-	    command:
-	 	smount /mnt slave
-		the corresponding syntax planned for mount command is
-		mount --make-slave /mnt
-
-	(iii) To mark all the mounts under /mnt as private execute the
-	following command:
-
-		smount /mnt rprivate
-		the corresponding syntax planned for mount command is
-		mount --make-rprivate /mnt
-
-	    just to mark a mount /mnt as private, execute the following
-	    command:
-	 	smount /mnt private
-		the corresponding syntax planned for mount command is
-		mount --make-private /mnt
-
-	      NOTE: by default all the mounts are created as private. But if
-	      you want to change some shared/slave/unbindable  mount as
-	      private at a later point in time, this command can help.
-
-	(iv) To mark all the mounts under /mnt as unbindable execute the
-	following
-
-	     command:
-		smount /mnt runbindable
-		the corresponding syntax planned for mount command is
-		mount --make-runbindable /mnt
-
-	    just to mark a mount /mnt as unbindable, execute the following
-	    command:
-	 	smount /mnt unbindable
-		the corresponding syntax planned for mount command is
-		mount --make-unbindable /mnt
+	mount --make-shared mountpoint
+	mount --make-slave mountpoint
+	mount --make-private mountpoint
+	mount --make-unbindable mountpoint
 
 
 4) Use cases
@@ -350,7 +214,7 @@ replicas continue to be exactly same.
 		mount --rbind / /view/v3
 		mount --rbind / /view/v4
 
-		and if /usr has a versioning filesystem mounted, than that
+		and if /usr has a versioning filesystem mounted, then that
 		mount appears at /view/v1/usr, /view/v2/usr, /view/v3/usr and
 		/view/v4/usr too
 
@@ -390,7 +254,7 @@ replicas continue to be exactly same.
 
 		For example:
 			mount --make-shared /mnt
-			mount --bin /mnt /tmp
+			mount --bind /mnt /tmp
 
 		The mount at /mnt and that at /tmp are both shared and belong
 		to the same peer group. Anything mounted or unmounted under
@@ -558,7 +422,7 @@ replicas continue to be exactly same.
 	then the subtree under the unbindable mount is pruned in the new
 	location.
 
-	eg: lets say we have the following mount tree.
+	eg: let's say we have the following mount tree.
 
 		A
 	      /   \
@@ -566,7 +430,7 @@ replicas continue to be exactly same.
 	     / \ / \
 	     D E F G
 
-	     Lets say all the mount except the mount C in the tree are
+	     Let's say all the mount except the mount C in the tree are
 	     of a type other than unbindable.
 
 	     If this tree is rbound to say Z
@@ -683,13 +547,13 @@ replicas continue to be exactly same.
 	'b' on mounts that receive propagation from mount 'B' and does not have
 	sub-mounts within them are unmounted.
 
-	Example: Lets say 'B1', 'B2', 'B3' are shared mounts that propagate to
+	Example: Let's say 'B1', 'B2', 'B3' are shared mounts that propagate to
 	each other.
 
-	lets say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
+	let's say 'A1', 'A2', 'A3' are first mounted at dentry 'b' on mount
 	'B1', 'B2' and 'B3' respectively.
 
-	lets say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
+	let's say 'C1', 'C2', 'C3' are next mounted at the same dentry 'b' on
 	mount 'B1', 'B2' and 'B3' respectively.
 
 	if 'C1' is unmounted, all the mounts that are most-recently-mounted on
@@ -710,7 +574,7 @@ replicas continue to be exactly same.
 	A cloned namespace contains all the mounts as that of the parent
 	namespace.
 
-	Lets say 'A' and 'B' are the corresponding mounts in the parent and the
+	Let's say 'A' and 'B' are the corresponding mounts in the parent and the
 	child namespace.
 
 	If 'A' is shared, then 'B' is also shared and 'A' and 'B' propagate to
@@ -759,11 +623,11 @@ replicas continue to be exactly same.
 		mount --make-slave /mnt
 
 		At this point we have the first mount at /tmp and
-		its root dentry is 1. Lets call this mount 'A'
+		its root dentry is 1. Let's call this mount 'A'
 		And then we have a second mount at /tmp1 with root
-		dentry 2. Lets call this mount 'B'
+		dentry 2. Let's call this mount 'B'
 		Next we have a third mount at /mnt with root dentry
-		mnt. Lets call this mount 'C'
+		mnt. Let's call this mount 'C'
 
 		'B' is the slave of 'A' and 'C' is a slave of 'B'
 		A -> B -> C
@@ -794,7 +658,7 @@ replicas continue to be exactly same.
 
 	Q3 Why is unbindable mount needed?
 
-		Lets say we want to replicate the mount tree at multiple
+		Let's say we want to replicate the mount tree at multiple
 		locations within the same subtree.
 
 		if one rbind mounts a tree within the same subtree 'n' times
@@ -803,7 +667,7 @@ replicas continue to be exactly same.
 		mounts. Here is a example.
 
 		step 1:
-		   lets say the root tree has just two directories with
+		   let's say the root tree has just two directories with
 		   one vfsmount.
 				    root
 				   /    \
@@ -875,7 +739,7 @@ replicas continue to be exactly same.
 		Unclonable mounts come in handy here.
 
 		step 1:
-		   lets say the root tree has just two directories with
+		   let's say the root tree has just two directories with
 		   one vfsmount.
 				    root
 				   /    \
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f49eecf2e57..623f094c9d8 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -536,6 +536,7 @@ struct address_space_operations {
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
+	int (*error_remove_page) (struct mapping *mapping, struct page *page);
 };
 
   writepage: called by the VM to write a dirty page to backing store.
@@ -694,6 +695,12 @@ struct address_space_operations {
   	prevent redirtying the page, it is kept locked during the whole
 	operation.
 
+  error_remove_page: normally set to generic_error_remove_page if truncation
+	is ok for this address space. Used for memory failure handling.
+	Setting this implies you deal with pages going away under you,
+	unless you have them locked or reference counts increased.
+
+
 The File Object
 ===============
 
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index aafca0a8f66..947374977ca 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -135,6 +135,7 @@ Code	Seq#	Include File		Comments
 					<http://mikonos.dia.unisa.it/tcfs>
 'l'	40-7F	linux/udf_fs_i.h	in development:
 					<http://sourceforge.net/projects/linux-udf/>
+'m'	00-09	linux/mmtimer.h
 'm'	all	linux/mtio.h		conflict!
 'm'	all	linux/soundcard.h	conflict!
 'm'	all	linux/synclink.h	conflict!
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 1458448436c..62682500878 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -96,13 +96,16 @@ handles that the Linux kernel will allocate. When you get lots
 of error messages about running out of file handles, you might
 want to increase this limit.
 
-The three values in file-nr denote the number of allocated
-file handles, the number of unused file handles and the maximum
-number of file handles. When the allocated file handles come
-close to the maximum, but the number of unused file handles is
-significantly greater than 0, you've encountered a peak in your 
-usage of file handles and you don't need to increase the maximum.
-
+Historically, the three values in file-nr denoted the number of
+allocated file handles, the number of allocated but unused file
+handles, and the maximum number of file handles. Linux 2.6 always
+reports 0 as the number of free file handles -- this is not an
+error, it just means that the number of allocated file handles
+exactly matches the number of used file handles.
+
+Attempts to allocate more file descriptors than file-max are
+reported with printk, look for "VFS: file-max limit <number>
+reached".
 ==============================================================
 
 nr_open:
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index b3d8b492274..a028b92001e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -22,6 +22,7 @@ show up in /proc/sys/kernel:
 - callhome		     [ S390 only ]
 - auto_msgmni
 - core_pattern
+- core_pipe_limit
 - core_uses_pid
 - ctrl-alt-del
 - dentry-state
@@ -135,6 +136,27 @@ core_pattern is used to specify a core dumpfile pattern name.
 
 ==============================================================
 
+core_pipe_limit:
+
+This sysctl is only applicable when core_pattern is configured to pipe core
+files to user space helper a (when the first character of core_pattern is a '|',
+see above).  When collecting cores via a pipe to an application, it is
+occasionally usefull for the collecting application to gather data about the
+crashing process from its /proc/pid directory.  In order to do this safely, the
+kernel must wait for the collecting process to exit, so as not to remove the
+crashing processes proc files prematurely.  This in turn creates the possibility
+that a misbehaving userspace collecting process can block the reaping of a
+crashed process simply by never exiting.  This sysctl defends against that.  It
+defines how many concurrent crashing processes may be piped to user space
+applications in parallel.  If this value is exceeded, then those crashing
+processes above that value are noted via the kernel log and their cores are
+skipped.  0 is a special value, indicating that unlimited processes may be
+captured in parallel, but that no waiting will take place (i.e. the collecting
+process is not guaranteed access to /proc/<crahing pid>/).  This value defaults
+to 0.
+
+==============================================================
+
 core_uses_pid:
 
 The default coredump filename is "core".  By setting
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index e6fb1ec2744..a6e360d2055 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/vm:
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
+- memory_failure_early_kill
+- memory_failure_recovery
 - min_free_kbytes
 - min_slab_ratio
 - min_unmapped_ratio
@@ -53,7 +55,6 @@ Currently, these files are in /proc/sys/vm:
 - vfs_cache_pressure
 - zone_reclaim_mode
 
-
 ==============================================================
 
 block_dump
@@ -275,6 +276,44 @@ e.g., up to one or two maps per allocation.
 
 The default value is 65536.
 
+=============================================================
+
+memory_failure_early_kill:
+
+Control how to kill processes when uncorrected memory error (typically
+a 2bit error in a memory module) is detected in the background by hardware
+that cannot be handled by the kernel. In some cases (like the page
+still having a valid copy on disk) the kernel will handle the failure
+transparently without affecting any applications. But if there is
+no other uptodate copy of the data it will kill to prevent any data
+corruptions from propagating.
+
+1: Kill all processes that have the corrupted and not reloadable page mapped
+as soon as the corruption is detected.  Note this is not supported
+for a few types of pages, like kernel internally allocated data or
+the swap cache, but works for the majority of user pages.
+
+0: Only unmap the corrupted page from all processes and only kill a process
+who tries to access it.
+
+The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
+handle this if they want to.
+
+This is only active on architectures/platforms with advanced machine
+check handling and depends on the hardware capabilities.
+
+Applications can override this setting individually with the PR_MCE_KILL prctl
+
+==============================================================
+
+memory_failure_recovery
+
+Enable memory failure recovery (when supported by the platform)
+
+1: Attempt recovery.
+
+0: Always panic on a memory failure.
+
 ==============================================================
 
 min_free_kbytes:
diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore
index 33e8a023df0..09b164a5700 100644
--- a/Documentation/vm/.gitignore
+++ b/Documentation/vm/.gitignore
@@ -1 +1,2 @@
+page-types
 slabinfo
diff --git a/Documentation/vm/page-types.c b/Documentation/vm/page-types.c
index 3eda8ea0085..fa1a30d9e9d 100644
--- a/Documentation/vm/page-types.c
+++ b/Documentation/vm/page-types.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2009 Wu Fengguang <fengguang.wu@intel.com>
  */
 
+#define _LARGEFILE64_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -13,12 +14,33 @@
 #include <string.h>
 #include <getopt.h>
 #include <limits.h>
+#include <assert.h>
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/fcntl.h>
 
 
 /*
+ * pagemap kernel ABI bits
+ */
+
+#define PM_ENTRY_BYTES      sizeof(uint64_t)
+#define PM_STATUS_BITS      3
+#define PM_STATUS_OFFSET    (64 - PM_STATUS_BITS)
+#define PM_STATUS_MASK      (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
+#define PM_STATUS(nr)       (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
+#define PM_PSHIFT_BITS      6
+#define PM_PSHIFT_OFFSET    (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
+#define PM_PSHIFT_MASK      (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
+#define PM_PSHIFT(x)        (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
+#define PM_PFRAME_MASK      ((1LL << PM_PSHIFT_OFFSET) - 1)
+#define PM_PFRAME(x)        ((x) & PM_PFRAME_MASK)
+
+#define PM_PRESENT          PM_STATUS(4LL)
+#define PM_SWAP             PM_STATUS(2LL)
+
+
+/*
  * kernel page flags
  */
 
@@ -126,6 +148,14 @@ static int		nr_addr_ranges;
 static unsigned long	opt_offset[MAX_ADDR_RANGES];
 static unsigned long	opt_size[MAX_ADDR_RANGES];
 
+#define MAX_VMAS	10240
+static int		nr_vmas;
+static unsigned long	pg_start[MAX_VMAS];
+static unsigned long	pg_end[MAX_VMAS];
+static unsigned long	voffset;
+
+static int		pagemap_fd;
+
 #define MAX_BIT_FILTERS	64
 static int		nr_bit_filters;
 static uint64_t		opt_mask[MAX_BIT_FILTERS];
@@ -135,7 +165,6 @@ static int		page_size;
 
 #define PAGES_BATCH	(64 << 10)	/* 64k pages */
 static int		kpageflags_fd;
-static uint64_t		kpageflags_buf[KPF_BYTES * PAGES_BATCH];
 
 #define HASH_SHIFT	13
 #define HASH_SIZE	(1 << HASH_SHIFT)
@@ -158,6 +187,11 @@ static uint64_t 	page_flags[HASH_SIZE];
 	type __min2 = (y);			\
 	__min1 < __min2 ? __min1 : __min2; })
 
+#define max_t(type, x, y) ({			\
+	type __max1 = (x);			\
+	type __max2 = (y);			\
+	__max1 > __max2 ? __max1 : __max2; })
+
 static unsigned long pages2mb(unsigned long pages)
 {
 	return (pages * page_size) >> 20;
@@ -224,26 +258,34 @@ static char *page_flag_longname(uint64_t flags)
 static void show_page_range(unsigned long offset, uint64_t flags)
 {
 	static uint64_t      flags0;
+	static unsigned long voff;
 	static unsigned long index;
 	static unsigned long count;
 
-	if (flags == flags0 && offset == index + count) {
+	if (flags == flags0 && offset == index + count &&
+	    (!opt_pid || voffset == voff + count)) {
 		count++;
 		return;
 	}
 
-	if (count)
-		printf("%lu\t%lu\t%s\n",
+	if (count) {
+		if (opt_pid)
+			printf("%lx\t", voff);
+		printf("%lx\t%lx\t%s\n",
 				index, count, page_flag_name(flags0));
+	}
 
 	flags0 = flags;
 	index  = offset;
+	voff   = voffset;
 	count  = 1;
 }
 
 static void show_page(unsigned long offset, uint64_t flags)
 {
-	printf("%lu\t%s\n", offset, page_flag_name(flags));
+	if (opt_pid)
+		printf("%lx\t", voffset);
+	printf("%lx\t%s\n", offset, page_flag_name(flags));
 }
 
 static void show_summary(void)
@@ -383,6 +425,8 @@ static void walk_pfn(unsigned long index, unsigned long count)
 	lseek(kpageflags_fd, index * KPF_BYTES, SEEK_SET);
 
 	while (count) {
+		uint64_t kpageflags_buf[KPF_BYTES * PAGES_BATCH];
+
 		batch = min_t(unsigned long, count, PAGES_BATCH);
 		n = read(kpageflags_fd, kpageflags_buf, batch * KPF_BYTES);
 		if (n == 0)
@@ -404,6 +448,81 @@ static void walk_pfn(unsigned long index, unsigned long count)
 	}
 }
 
+
+#define PAGEMAP_BATCH	4096
+static unsigned long task_pfn(unsigned long pgoff)
+{
+	static uint64_t buf[PAGEMAP_BATCH];
+	static unsigned long start;
+	static long count;
+	uint64_t pfn;
+
+	if (pgoff < start || pgoff >= start + count) {
+		if (lseek64(pagemap_fd,
+			    (uint64_t)pgoff * PM_ENTRY_BYTES,
+			    SEEK_SET) < 0) {
+			perror("pagemap seek");
+			exit(EXIT_FAILURE);
+		}
+		count = read(pagemap_fd, buf, sizeof(buf));
+		if (count == 0)
+			return 0;
+		if (count < 0) {
+			perror("pagemap read");
+			exit(EXIT_FAILURE);
+		}
+		if (count % PM_ENTRY_BYTES) {
+			fatal("pagemap read not aligned.\n");
+			exit(EXIT_FAILURE);
+		}
+		count /= PM_ENTRY_BYTES;
+		start = pgoff;
+	}
+
+	pfn = buf[pgoff - start];
+	if (pfn & PM_PRESENT)
+		pfn = PM_PFRAME(pfn);
+	else
+		pfn = 0;
+
+	return pfn;
+}
+
+static void walk_task(unsigned long index, unsigned long count)
+{
+	int i = 0;
+	const unsigned long end = index + count;
+
+	while (index < end) {
+
+		while (pg_end[i] <= index)
+			if (++i >= nr_vmas)
+				return;
+		if (pg_start[i] >= end)
+			return;
+
+		voffset = max_t(unsigned long, pg_start[i], index);
+		index   = min_t(unsigned long, pg_end[i], end);
+
+		assert(voffset < index);
+		for (; voffset < index; voffset++) {
+			unsigned long pfn = task_pfn(voffset);
+			if (pfn)
+				walk_pfn(pfn, 1);
+		}
+	}
+}
+
+static void add_addr_range(unsigned long offset, unsigned long size)
+{
+	if (nr_addr_ranges >= MAX_ADDR_RANGES)
+		fatal("too many addr ranges\n");
+
+	opt_offset[nr_addr_ranges] = offset;
+	opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
+	nr_addr_ranges++;
+}
+
 static void walk_addr_ranges(void)
 {
 	int i;
@@ -415,10 +534,13 @@ static void walk_addr_ranges(void)
 	}
 
 	if (!nr_addr_ranges)
-		walk_pfn(0, ULONG_MAX);
+		add_addr_range(0, ULONG_MAX);
 
 	for (i = 0; i < nr_addr_ranges; i++)
-		walk_pfn(opt_offset[i], opt_size[i]);
+		if (!opt_pid)
+			walk_pfn(opt_offset[i], opt_size[i]);
+		else
+			walk_task(opt_offset[i], opt_size[i]);
 
 	close(kpageflags_fd);
 }
@@ -446,8 +568,8 @@ static void usage(void)
 "            -r|--raw                  Raw mode, for kernel developers\n"
 "            -a|--addr    addr-spec    Walk a range of pages\n"
 "            -b|--bits    bits-spec    Walk pages with specified bits\n"
-#if 0 /* planned features */
 "            -p|--pid     pid          Walk process address space\n"
+#if 0 /* planned features */
 "            -f|--file    filename     Walk file address space\n"
 #endif
 "            -l|--list                 Show page details in ranges\n"
@@ -459,7 +581,7 @@ static void usage(void)
 "            N+M                       pages range from N to N+M-1\n"
 "            N,M                       pages range from N to M-1\n"
 "            N,                        pages range from N to end\n"
-"            ,M                        pages range from 0 to M\n"
+"            ,M                        pages range from 0 to M-1\n"
 "bits-spec:\n"
 "            bit1,bit2                 (flags & (bit1|bit2)) != 0\n"
 "            bit1,bit2=bit1            (flags & (bit1|bit2)) == bit1\n"
@@ -496,21 +618,57 @@ static unsigned long long parse_number(const char *str)
 
 static void parse_pid(const char *str)
 {
+	FILE *file;
+	char buf[5000];
+
 	opt_pid = parse_number(str);
-}
 
-static void parse_file(const char *name)
-{
+	sprintf(buf, "/proc/%d/pagemap", opt_pid);
+	pagemap_fd = open(buf, O_RDONLY);
+	if (pagemap_fd < 0) {
+		perror(buf);
+		exit(EXIT_FAILURE);
+	}
+
+	sprintf(buf, "/proc/%d/maps", opt_pid);
+	file = fopen(buf, "r");
+	if (!file) {
+		perror(buf);
+		exit(EXIT_FAILURE);
+	}
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		unsigned long vm_start;
+		unsigned long vm_end;
+		unsigned long long pgoff;
+		int major, minor;
+		char r, w, x, s;
+		unsigned long ino;
+		int n;
+
+		n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
+			   &vm_start,
+			   &vm_end,
+			   &r, &w, &x, &s,
+			   &pgoff,
+			   &major, &minor,
+			   &ino);
+		if (n < 10) {
+			fprintf(stderr, "unexpected line: %s\n", buf);
+			continue;
+		}
+		pg_start[nr_vmas] = vm_start / page_size;
+		pg_end[nr_vmas] = vm_end / page_size;
+		if (++nr_vmas >= MAX_VMAS) {
+			fprintf(stderr, "too many VMAs\n");
+			break;
+		}
+	}
+	fclose(file);
 }
 
-static void add_addr_range(unsigned long offset, unsigned long size)
+static void parse_file(const char *name)
 {
-	if (nr_addr_ranges >= MAX_ADDR_RANGES)
-		fatal("too much addr ranges\n");
-
-	opt_offset[nr_addr_ranges] = offset;
-	opt_size[nr_addr_ranges] = size;
-	nr_addr_ranges++;
 }
 
 static void parse_addr_range(const char *optarg)
@@ -676,8 +834,10 @@ int main(int argc, char *argv[])
 		}
 	}
 
+	if (opt_list && opt_pid)
+		printf("voffset\t");
 	if (opt_list == 1)
-		printf("offset\tcount\tflags\n");
+		printf("offset\tlen\tflags\n");
 	if (opt_list == 2)
 		printf("offset\tflags\n");