Diffstat (limited to 'kernel')
67 files changed, 3106 insertions, 1613 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index f2ab70073bd..ab4f1090f43 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -3,3 +3,4 @@
 #
 config_data.h
 config_data.gz
+timeconst.h
diff --git a/kernel/Makefile b/kernel/Makefile
index 135a1b94344..6c584c55a6e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,12 +4,12 @@
 
 obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    exit.o itimer.o time.o softirq.o resource.o \
-	    sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
+	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o \
-	    utsname.o notifier.o ksysfs.o pm_qos_params.o
+	    notifier.o ksysfs.o pm_qos_params.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -42,7 +42,11 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_UTS_NS) += utsname.o
+obj-$(CONFIG_USER_NS) += user_namespace.o
+obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -87,3 +91,11 @@ quiet_cmd_ikconfiggz = IKCFG   $@
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 	$(call if_changed,ikconfiggz)
+
+$(obj)/time.o: $(obj)/timeconst.h
+
+quiet_cmd_timeconst  = TIMEC   $@
+      cmd_timeconst  = $(PERL) $< $(CONFIG_HZ) > $@
+targets += timeconst.h
+$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
+	$(call if_changed,timeconst)
diff --git a/kernel/audit.c b/kernel/audit.c
index c8555b18021..2eeea9a1424 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1312,26 +1312,26 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 
 /* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
-		      struct dentry *dentry, struct vfsmount *vfsmnt)
+		      struct path *path)
 {
-	char *p, *path;
+	char *p, *pathname;
 
 	if (prefix)
 		audit_log_format(ab, " %s", prefix);
 
 	/* We will allow 11 spaces for ' (deleted)' to be appended */
-	path = kmalloc(PATH_MAX+11, ab->gfp_mask);
-	if (!path) {
+	pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
+	if (!pathname) {
 		audit_log_format(ab, "<no memory>");
 		return;
 	}
-	p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
+	p = d_path(path, pathname, PATH_MAX+11);
 	if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
 		/* FIXME: can we save some information here?
*/ audit_log_format(ab, "<too long>"); } else audit_log_untrustedstring(ab, p); - kfree(path); + kfree(pathname); } /** diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f4fcf58f20f..9ef5e0aacc3 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -549,8 +549,8 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(nd.mnt, nd.dentry); - path_release(&nd); + root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); + path_put(&nd.path); if (!root_mnt) goto skip_it; @@ -583,17 +583,17 @@ skip_it: static int is_under(struct vfsmount *mnt, struct dentry *dentry, struct nameidata *nd) { - if (mnt != nd->mnt) { + if (mnt != nd->path.mnt) { for (;;) { if (mnt->mnt_parent == mnt) return 0; - if (mnt->mnt_parent == nd->mnt) + if (mnt->mnt_parent == nd->path.mnt) break; mnt = mnt->mnt_parent; } dentry = mnt->mnt_mountpoint; } - return is_subdir(dentry, nd->dentry); + return is_subdir(dentry, nd->path.dentry); } int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) @@ -641,8 +641,8 @@ int audit_add_tree_rule(struct audit_krule *rule) err = path_lookup(tree->pathname, 0, &nd); if (err) goto Err; - mnt = collect_mounts(nd.mnt, nd.dentry); - path_release(&nd); + mnt = collect_mounts(nd.path.mnt, nd.path.dentry); + path_put(&nd.path); if (!mnt) { err = -ENOMEM; goto Err; @@ -701,8 +701,8 @@ int audit_tag_tree(char *old, char *new) err = path_lookup(new, 0, &nd); if (err) return err; - tagged = collect_mounts(nd.mnt, nd.dentry); - path_release(&nd); + tagged = collect_mounts(nd.path.mnt, nd.path.dentry); + path_put(&nd.path); if (!tagged) return -ENOMEM; @@ -711,9 +711,9 @@ int audit_tag_tree(char *old, char *new) drop_collected_mounts(tagged); return err; } - mnt = mntget(nd.mnt); - dentry = dget(nd.dentry); - path_release(&nd); + mnt = mntget(nd.path.mnt); + dentry = dget(nd.path.dentry); + path_put(&nd.path); if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) follow_up(&mnt, &dentry); @@ -744,13 +744,13 @@ int audit_tag_tree(char *old, char *new) spin_lock(&vfsmount_lock); if (!is_under(mnt, dentry, &nd)) { spin_unlock(&vfsmount_lock); - path_release(&nd); + path_put(&nd.path); put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } spin_unlock(&vfsmount_lock); - path_release(&nd); + path_put(&nd.path); list_for_each_entry(p, &list, mnt_list) { failed = tag_chunk(p->mnt_root->d_inode, tree); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6f19fd477aa..2f2914b7cc3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -169,8 +169,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp) inotify_init_watch(&parent->wdata); /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode, - AUDIT_IN_WATCH); + wd = inotify_add_watch(audit_ih, &parent->wdata, + ndp->path.dentry->d_inode, AUDIT_IN_WATCH); if (wd < 0) { audit_free_parent(&parent->wdata); return ERR_PTR(wd); @@ -1161,11 +1161,11 @@ static int audit_get_nd(char *path, struct nameidata **ndp, static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) { if (ndp) { - path_release(ndp); + path_put(&ndp->path); kfree(ndp); } if (ndw) { - path_release(ndw); + path_put(&ndw->path); kfree(ndw); } } @@ -1214,8 +1214,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, /* update watch filter fields */ if (ndw) { - watch->dev = ndw->dentry->d_inode->i_sb->s_dev; - watch->ino = 
ndw->dentry->d_inode->i_ino; + watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->path.dentry->d_inode->i_ino; } /* The audit_filter_mutex must not be held during inotify calls because @@ -1225,7 +1225,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, */ mutex_unlock(&audit_filter_mutex); - if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) { + if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, + &i_watch) < 0) { parent = audit_init_parent(ndp); if (IS_ERR(parent)) { /* caller expects mutex locked */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c06ecf38d7..ac6d9b23b01 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -208,8 +208,7 @@ struct audit_context { int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ - struct dentry * pwd; - struct vfsmount * pwdmnt; + struct path pwd; struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; struct audit_aux_data *aux_pids; @@ -786,12 +785,9 @@ static inline void audit_free_names(struct audit_context *context) __putname(context->names[i].name); } context->name_count = 0; - if (context->pwd) - dput(context->pwd); - if (context->pwdmnt) - mntput(context->pwdmnt); - context->pwd = NULL; - context->pwdmnt = NULL; + path_put(&context->pwd); + context->pwd.dentry = NULL; + context->pwd.mnt = NULL; } static inline void audit_free_aux(struct audit_context *context) @@ -930,8 +926,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { audit_log_d_path(ab, "exe=", - vma->vm_file->f_path.dentry, - vma->vm_file->f_path.mnt); + &vma->vm_file->f_path); break; } vma = vma->vm_next; @@ -1341,10 +1336,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->target_sid, context->target_comm)) call_panic = 1; - if (context->pwd && context->pwdmnt) { + if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); + audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } @@ -1367,8 +1362,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case 0: /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, " name=", context->pwd, - context->pwdmnt); + audit_log_d_path(ab, " name=", &context->pwd); break; default: /* log the name's directory component */ @@ -1695,10 +1689,10 @@ void __audit_getname(const char *name) context->names[context->name_count].ino = (unsigned long)-1; context->names[context->name_count].osid = 0; ++context->name_count; - if (!context->pwd) { + if (!context->pwd.dentry) { read_lock(¤t->fs->lock); - context->pwd = dget(current->fs->pwd); - context->pwdmnt = mntget(current->fs->pwdmnt); + context->pwd = current->fs->pwd; + path_get(¤t->fs->pwd); read_unlock(¤t->fs->lock); } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1a3c23936d4..4766bb65e4d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -141,7 +141,7 @@ enum { ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ }; -inline int cgroup_is_releasable(const struct cgroup *cgrp) +static int cgroup_is_releasable(const struct cgroup *cgrp) { const int bits = (1 << CGRP_RELEASABLE) | @@ -149,7 +149,7 @@ inline int cgroup_is_releasable(const struct cgroup *cgrp) return 
(cgrp->flags & bits) == bits; } -inline int notify_on_release(const struct cgroup *cgrp) +static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } @@ -489,7 +489,7 @@ static struct css_set *find_css_set( * Any task can increment and decrement the count field without lock. * So in general, code holding cgroup_mutex can't rely on the count * field not changing. However, if the count goes to zero, then only - * attach_task() can increment it again. Because a count of zero + * cgroup_attach_task() can increment it again. Because a count of zero * means that no tasks are currently attached, therefore there is no * way a task attached to that cgroup can fork (the other way to * increment the count). So code holding cgroup_mutex can safely @@ -520,17 +520,17 @@ static struct css_set *find_css_set( * The task_lock() exception * * The need for this exception arises from the action of - * attach_task(), which overwrites one tasks cgroup pointer with + * cgroup_attach_task(), which overwrites one tasks cgroup pointer with * another. It does so using cgroup_mutexe, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as - * in attach_task(), modifying a task'ss cgroup pointer we use + * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use * task_lock(), which acts on a spinlock (task->alloc_lock) already in * the task_struct routinely used for such matters. * * P.S. One more locking exception. RCU is used to guard the - * update of a tasks cgroup pointer by attach_task() + * update of a tasks cgroup pointer by cgroup_attach_task() */ /** @@ -586,11 +586,27 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) return inode; } +/* + * Call subsys's pre_destroy handler. + * This is called before css refcnt check. + */ + +static void cgroup_call_pre_destroy(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + for_each_subsys(cgrp->root, ss) + if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) + ss->pre_destroy(ss, cgrp); + return; +} + + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ if (S_ISDIR(inode->i_mode)) { struct cgroup *cgrp = dentry->d_fsdata; + struct cgroup_subsys *ss; BUG_ON(!(cgroup_is_removed(cgrp))); /* It's possible for external users to be holding css * reference counts on a cgroup; css_put() needs to @@ -599,6 +615,23 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * queue the cgroup to be handled by the release * agent */ synchronize_rcu(); + + mutex_lock(&cgroup_mutex); + /* + * Release the subsystem state objects. + */ + for_each_subsys(cgrp->root, ss) { + if (cgrp->subsys[ss->subsys_id]) + ss->destroy(ss, cgrp); + } + + cgrp->root->number_of_cgroups--; + mutex_unlock(&cgroup_mutex); + + /* Drop the active superblock reference that we took when we + * created the cgroup */ + deactivate_super(cgrp->root->sb); + kfree(cgrp); } iput(inode); @@ -1161,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp, * Call holding cgroup_mutex. May take task_lock of * the task 'pid' during call. 
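The "task_lock() exception" described above boils down to a short pattern. A minimal sketch, not part of this commit: the helper name is invented for illustration, while task_lock()/task_unlock() and the task->cgroups pointer are the kernel facilities the comment refers to.

static void example_peek_task_css_set(struct task_struct *tsk)
{
	struct css_set *cg;

	task_lock(tsk);		/* pins the pointer against cgroup_attach_task() */
	cg = tsk->cgroups;
	/* ... inspect cg; it cannot change under us while task_lock() is held ... */
	task_unlock(tsk);
}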
*/ -static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) +int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { int retval = 0; struct cgroup_subsys *ss; @@ -1181,9 +1214,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { retval = ss->can_attach(ss, cgrp, tsk); - if (retval) { + if (retval) return retval; - } } } @@ -1192,9 +1224,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) * based on its final set of cgroups */ newcg = find_css_set(cg, cgrp); - if (!newcg) { + if (!newcg) return -ENOMEM; - } task_lock(tsk); if (tsk->flags & PF_EXITING) { @@ -1214,9 +1245,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) write_unlock(&css_set_lock); for_each_subsys(root, ss) { - if (ss->attach) { + if (ss->attach) ss->attach(ss, cgrp, oldcgrp, tsk); - } } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1239,7 +1269,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) if (pid) { rcu_read_lock(); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); if (!tsk || tsk->flags & PF_EXITING) { rcu_read_unlock(); return -ESRCH; @@ -1257,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) get_task_struct(tsk); } - ret = attach_task(cgrp, tsk); + ret = cgroup_attach_task(cgrp, tsk); put_task_struct(tsk); return ret; } @@ -1329,9 +1359,14 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, goto out1; } buffer[nbytes] = 0; /* nul-terminate */ + strstrip(buffer); /* strip -just- trailing whitespace */ mutex_lock(&cgroup_mutex); + /* + * This was already checked for in cgroup_file_write(), but + * check again now we're holding cgroup_mutex. + */ if (cgroup_is_removed(cgrp)) { retval = -ENODEV; goto out2; @@ -1349,24 +1384,9 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); break; case FILE_RELEASE_AGENT: - { - struct cgroupfs_root *root = cgrp->root; - /* Strip trailing newline */ - if (nbytes && (buffer[nbytes-1] == '\n')) { - buffer[nbytes-1] = 0; - } - if (nbytes < sizeof(root->release_agent_path)) { - /* We never write anything other than '\0' - * into the last char of release_agent_path, - * so it always remains a NUL-terminated - * string */ - strncpy(root->release_agent_path, buffer, nbytes); - root->release_agent_path[nbytes] = 0; - } else { - retval = -ENOSPC; - } + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + strcpy(cgrp->root->release_agent_path, buffer); break; - } default: retval = -EINVAL; goto out2; @@ -1387,7 +1407,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft) + if (!cft || cgroup_is_removed(cgrp)) return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); @@ -1457,7 +1477,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (!cft) + if (!cft || cgroup_is_removed(cgrp)) return -ENODEV; if (cft->read) @@ -1675,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp, it->task = cg->tasks.next; } +/* + * To reduce the fork() overhead for systems that are not actually + * using their cgroups capability, we don't maintain the lists running + * through each css_set to its tasks 
until we see the list actually + * used - in other words after the first call to cgroup_iter_start(). + * + * The tasklist_lock is not held here, as do_each_thread() and + * while_each_thread() are protected by RCU. + */ +void cgroup_enable_task_cg_lists(void) +{ + struct task_struct *p, *g; + write_lock(&css_set_lock); + use_task_css_set_links = 1; + do_each_thread(g, p) { + task_lock(p); + if (list_empty(&p->cg_list)) + list_add(&p->cg_list, &p->cgroups->tasks); + task_unlock(p); + } while_each_thread(g, p); + write_unlock(&css_set_lock); +} + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) { /* @@ -1682,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) * we need to enable the list linking each css_set to its * tasks, and fix up all existing tasks. */ - if (!use_task_css_set_links) { - struct task_struct *p, *g; - write_lock(&css_set_lock); - use_task_css_set_links = 1; - do_each_thread(g, p) { - task_lock(p); - if (list_empty(&p->cg_list)) - list_add(&p->cg_list, &p->cgroups->tasks); - task_unlock(p); - } while_each_thread(g, p); - write_unlock(&css_set_lock); - } + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); + read_lock(&css_set_lock); it->cg_link = &cgrp->css_sets; cgroup_advance_iter(cgrp, it); @@ -1726,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) read_unlock(&css_set_lock); } +static inline int started_after_time(struct task_struct *t1, + struct timespec *time, + struct task_struct *t2) +{ + int start_diff = timespec_compare(&t1->start_time, time); + if (start_diff > 0) { + return 1; + } else if (start_diff < 0) { + return 0; + } else { + /* + * Arbitrarily, if two processes started at the same + * time, we'll say that the lower pointer value + * started first. Note that t2 may have exited by now + * so this may not be a valid pointer any longer, but + * that's fine - it still serves to distinguish + * between two tasks started (effectively) simultaneously. + */ + return t1 > t2; + } +} + +/* + * This function is a callback from heap_insert() and is used to order + * the heap. + * In this case we order the heap in descending task start time. + */ +static inline int started_after(void *p1, void *p2) +{ + struct task_struct *t1 = p1; + struct task_struct *t2 = p2; + return started_after_time(t1, &t2->start_time, t2); +} + +/** + * cgroup_scan_tasks - iterate though all the tasks in a cgroup + * @scan: struct cgroup_scanner containing arguments for the scan + * + * Arguments include pointers to callback functions test_task() and + * process_task(). + * Iterate through all the tasks in a cgroup, calling test_task() for each, + * and if it returns true, call process_task() for it also. + * The test_task pointer may be NULL, meaning always true (select all tasks). + * Effectively duplicates cgroup_iter_{start,next,end}() + * but does not lock css_set_lock for the call to process_task(). + * The struct cgroup_scanner may be embedded in any structure of the caller's + * creation. + * It is guaranteed that process_task() will act on every task that + * is a member of the cgroup for the duration of this call. This + * function may or may not call process_task() for tasks that exit + * or move to a different cgroup during the call, or are forked or + * move into the cgroup during the call. + * + * Note that test_task() may be called with locks held, and may in some + * situations be called multiple times for the same task, so it should + * be cheap. 
+ * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been + * pre-allocated and will be used for heap operations (and its "gt" member will + * be overwritten), else a temporary heap will be used (allocation of which + * may cause this function to fail). + */ +int cgroup_scan_tasks(struct cgroup_scanner *scan) +{ + int retval, i; + struct cgroup_iter it; + struct task_struct *p, *dropped; + /* Never dereference latest_task, since it's not refcounted */ + struct task_struct *latest_task = NULL; + struct ptr_heap tmp_heap; + struct ptr_heap *heap; + struct timespec latest_time = { 0, 0 }; + + if (scan->heap) { + /* The caller supplied our heap and pre-allocated its memory */ + heap = scan->heap; + heap->gt = &started_after; + } else { + /* We need to allocate our own heap memory */ + heap = &tmp_heap; + retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); + if (retval) + /* cannot allocate the heap */ + return retval; + } + + again: + /* + * Scan tasks in the cgroup, using the scanner's "test_task" callback + * to determine which are of interest, and using the scanner's + * "process_task" callback to process any of them that need an update. + * Since we don't want to hold any locks during the task updates, + * gather tasks to be processed in a heap structure. + * The heap is sorted by descending task start time. + * If the statically-sized heap fills up, we overflow tasks that + * started later, and in future iterations only consider tasks that + * started after the latest task in the previous pass. This + * guarantees forward progress and that we don't miss any tasks. + */ + heap->size = 0; + cgroup_iter_start(scan->cg, &it); + while ((p = cgroup_iter_next(scan->cg, &it))) { + /* + * Only affect tasks that qualify per the caller's callback, + * if he provided one + */ + if (scan->test_task && !scan->test_task(p, scan)) + continue; + /* + * Only process tasks that started after the last task + * we processed + */ + if (!started_after_time(p, &latest_time, latest_task)) + continue; + dropped = heap_insert(heap, p); + if (dropped == NULL) { + /* + * The new task was inserted; the heap wasn't + * previously full + */ + get_task_struct(p); + } else if (dropped != p) { + /* + * The new task was inserted, and pushed out a + * different task + */ + get_task_struct(p); + put_task_struct(dropped); + } + /* + * Else the new task was newer than anything already in + * the heap and wasn't inserted + */ + } + cgroup_iter_end(scan->cg, &it); + + if (heap->size) { + for (i = 0; i < heap->size; i++) { + struct task_struct *p = heap->ptrs[i]; + if (i == 0) { + latest_time = p->start_time; + latest_task = p; + } + /* Process the task per the caller's callback */ + scan->process_task(p, scan); + put_task_struct(p); + } + /* + * If we had to process any tasks at all, scan again + * in case some of them were in the middle of forking + * children that didn't get processed. + * Not the most efficient way to do it, but it avoids + * having to take callback_mutex in the fork path + */ + goto again; + } + if (heap == &tmp_heap) + heap_free(&tmp_heap); + return 0; +} + /* * Stuff for reading the 'tasks' file. 
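A minimal caller sketch for the new cgroup_scan_tasks() interface, modelled on the cpuset conversion later in this diff. The example_* names are invented for illustration; the cgroup_scanner fields and the callback signatures are the ones added above.

static int example_select(struct task_struct *p, struct cgroup_scanner *scan)
{
	/* return nonzero for tasks that process_task() should see */
	return !(p->flags & PF_EXITING);
}

static void example_update(struct task_struct *p, struct cgroup_scanner *scan)
{
	/* called once per selected task, without css_set_lock held */
	set_user_nice(p, 0);
}

static int example_scan_cgroup(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cg		= cgrp,
		.test_task	= example_select,	/* may be NULL: select all */
		.process_task	= example_update,
		.heap		= NULL,	/* let cgroup_scan_tasks() allocate a temporary heap */
	};

	return cgroup_scan_tasks(&scan);
}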
* @@ -1761,7 +1955,7 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) while ((tsk = cgroup_iter_next(cgrp, &it))) { if (unlikely(n == npids)) break; - pidarray[n++] = task_pid_nr(tsk); + pidarray[n++] = task_pid_vnr(tsk); } cgroup_iter_end(cgrp, &it); return n; @@ -2126,9 +2320,8 @@ static inline int cgroup_has_css_refs(struct cgroup *cgrp) * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the * release agent to be called anyway. */ - if (css && atomic_read(&css->refcnt)) { + if (css && atomic_read(&css->refcnt)) return 1; - } } return 0; } @@ -2138,7 +2331,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *cgrp = dentry->d_fsdata; struct dentry *d; struct cgroup *parent; - struct cgroup_subsys *ss; struct super_block *sb; struct cgroupfs_root *root; @@ -2157,17 +2349,19 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) parent = cgrp->parent; root = cgrp->root; sb = root->sb; + /* + * Call pre_destroy handlers of subsys + */ + cgroup_call_pre_destroy(cgrp); + /* + * Notify subsyses that rmdir() request comes. + */ if (cgroup_has_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } - for_each_subsys(root, ss) { - if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); - } - spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) @@ -2182,15 +2376,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) cgroup_d_remove_dir(d); dput(d); - root->number_of_cgroups--; set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); mutex_unlock(&cgroup_mutex); - /* Drop the active superblock reference that we took when we - * created the cgroup */ - deactivate_super(sb); return 0; } @@ -2324,7 +2514,7 @@ out: * - Used for /proc/<pid>/cgroup. * - No need to task_lock(tsk) on this tsk->cgroup reference, as it * doesn't really matter if tsk->cgroup changes after we read it, - * and we take cgroup_mutex, keeping attach_task() from changing it + * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it * anyway. No need to check that tsk->cgroup != NULL, thanks to * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks * cgroup to top_cgroup. @@ -2435,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = { * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer. attach_task() might + * might no longer be a valid cgroup pointer. cgroup_attach_task() might * have already changed current->cgroups, allowing the previously * referenced cgroup group to be removed and freed. * @@ -2514,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child) * attach us to a different cgroup, decrementing the count on * the first cgroup that we never incremented. But in this case, * top_cgroup isn't going away, and either task has PF_EXITING set, - * which wards off any attach_task() attempts, or task is a failed - * fork, never visible to attach_task. + * which wards off any cgroup_attach_task() attempts, or task is a failed + * fork, never visible to cgroup_attach_task. * */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) @@ -2655,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) } /* All seems fine. 
Finish by moving the task into the new cgroup */ - ret = attach_task(child, tsk); + ret = cgroup_attach_task(child, tsk); mutex_unlock(&cgroup_mutex); out_release: diff --git a/kernel/compat.c b/kernel/compat.c index 42a1ed4b61b..5f0e201bcfd 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -40,10 +40,35 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +static long compat_nanosleep_restart(struct restart_block *restart) +{ + struct compat_timespec __user *rmtp; + struct timespec rmt; + mm_segment_t oldfs; + long ret; + + rmtp = (struct compat_timespec __user *)(restart->arg1); + restart->arg1 = (unsigned long)&rmt; + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = hrtimer_nanosleep_restart(restart); + set_fs(oldfs); + + if (ret) { + restart->arg1 = (unsigned long)rmtp; + + if (rmtp && put_compat_timespec(&rmt, rmtp)) + return -EFAULT; + } + + return ret; +} + asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, struct compat_timespec __user *rmtp) { struct timespec tu, rmt; + mm_segment_t oldfs; long ret; if (get_compat_timespec(&tu, rqtp)) @@ -52,11 +77,21 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, if (!timespec_valid(&tu)) return -EINVAL; - ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL, - CLOCK_MONOTONIC); + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = hrtimer_nanosleep(&tu, + rmtp ? (struct timespec __user *)&rmt : NULL, + HRTIMER_MODE_REL, CLOCK_MONOTONIC); + set_fs(oldfs); + + if (ret) { + struct restart_block *restart + = ¤t_thread_info()->restart_block; + + restart->fn = compat_nanosleep_restart; + restart->arg1 = (unsigned long)rmtp; - if (ret && rmtp) { - if (put_compat_timespec(&rmt, rmtp)) + if (rmtp && put_compat_timespec(&rmt, rmtp)) return -EFAULT; } diff --git a/kernel/cpu.c b/kernel/cpu.c index e0d3a4f56ec..2eff3f63abe 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -389,7 +389,7 @@ int disable_nonboot_cpus(void) return error; } -void enable_nonboot_cpus(void) +void __ref enable_nonboot_cpus(void) { int cpu, error; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index cfaf6419d81..3e296ed81d4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -38,7 +38,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/pagemap.h> -#include <linux/prio_heap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> @@ -56,6 +55,8 @@ #include <asm/atomic.h> #include <linux/mutex.h> #include <linux/kfifo.h> +#include <linux/workqueue.h> +#include <linux/cgroup.h> /* * Tracks how many cpusets are currently defined in system. 
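The compat nanosleep changes above lean on the usual get_fs()/set_fs(KERNEL_DS) trick: a function that expects a __user pointer is handed a kernel-space timespec, and the result is copied back out as a compat_timespec. A condensed sketch of that pattern, with example_callee() as an invented stand-in for hrtimer_nanosleep():

static long example_compat_wrapper(struct compat_timespec __user *crmtp)
{
	struct timespec rmt;		/* kernel-space scratch result */
	mm_segment_t oldfs = get_fs();
	long ret;

	set_fs(KERNEL_DS);		/* let &rmt pass the __user checks */
	ret = example_callee((struct timespec __user *)&rmt);
	set_fs(oldfs);			/* always restore the previous limit */

	/* hand the remaining time back as a 32-bit compat_timespec */
	if (ret && crmtp && put_compat_timespec(&rmt, crmtp))
		return -EFAULT;
	return ret;
}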
@@ -64,7 +65,7 @@ */ int number_of_cpusets __read_mostly; -/* Retrieve the cpuset from a cgroup */ +/* Forward declare cgroup structures */ struct cgroup_subsys cpuset_subsys; struct cpuset; @@ -96,6 +97,9 @@ struct cpuset { /* partition number for rebuild_sched_domains() */ int pn; + + /* used for walking a cpuset heirarchy */ + struct list_head stack_list; }; /* Retrieve the cpuset for a cgroup */ @@ -111,7 +115,10 @@ static inline struct cpuset *task_cs(struct task_struct *task) return container_of(task_subsys_state(task, cpuset_subsys_id), struct cpuset, css); } - +struct cpuset_hotplug_scanner { + struct cgroup_scanner scan; + struct cgroup *to; +}; /* bits in struct cpuset flags field */ typedef enum { @@ -160,17 +167,17 @@ static inline int is_spread_slab(const struct cpuset *cs) * number, and avoid having to lock and reload mems_allowed unless * the cpuset they're using changes generation. * - * A single, global generation is needed because attach_task() could + * A single, global generation is needed because cpuset_attach_task() could * reattach a task to a different cpuset, which must not have its * generation numbers aliased with those of that tasks previous cpuset. * * Generations are needed for mems_allowed because one task cannot - * modify anothers memory placement. So we must enable every task, + * modify another's memory placement. So we must enable every task, * on every visit to __alloc_pages(), to efficiently check whether * its current->cpuset->mems_allowed has changed, requiring an update * of its current->mems_allowed. * - * Since cpuset_mems_generation is guarded by manage_mutex, + * Since writes to cpuset_mems_generation are guarded by the cgroup lock * there is no need to mark it atomic. */ static int cpuset_mems_generation; @@ -182,17 +189,20 @@ static struct cpuset top_cpuset = { }; /* - * We have two global cpuset mutexes below. They can nest. - * It is ok to first take manage_mutex, then nest callback_mutex. We also - * require taking task_lock() when dereferencing a tasks cpuset pointer. - * See "The task_lock() exception", at the end of this comment. + * There are two global mutexes guarding cpuset structures. The first + * is the main control groups cgroup_mutex, accessed via + * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific + * callback_mutex, below. They can nest. It is ok to first take + * cgroup_mutex, then nest callback_mutex. We also require taking + * task_lock() when dereferencing a task's cpuset pointer. See "The + * task_lock() exception", at the end of this comment. * * A task must hold both mutexes to modify cpusets. If a task - * holds manage_mutex, then it blocks others wanting that mutex, + * holds cgroup_mutex, then it blocks others wanting that mutex, * ensuring that it is the only task able to also acquire callback_mutex * and be able to modify cpusets. It can perform various checks on * the cpuset structure first, knowing nothing will change. It can - * also allocate memory while just holding manage_mutex. While it is + * also allocate memory while just holding cgroup_mutex. While it is * performing these checks, various callback routines can briefly * acquire callback_mutex to query cpusets. Once it is ready to make * the changes, it takes callback_mutex, blocking everyone else. @@ -208,60 +218,16 @@ static struct cpuset top_cpuset = { * The task_struct fields mems_allowed and mems_generation may only * be accessed in the context of that task, so require no locks. 
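A sketch of the lock nesting spelled out above, matching what update_cpumask() does further down in this diff: cgroup_mutex (taken via cgroup_lock()) is always the outer lock and callback_mutex the inner one. The function name is illustrative only.

static void example_write_cpus_allowed(struct cpuset *cs, cpumask_t newmask)
{
	cgroup_lock();			/* outer: serialises cpuset modifications */
	mutex_lock(&callback_mutex);	/* inner: excludes readers of the masks */
	cs->cpus_allowed = newmask;
	mutex_unlock(&callback_mutex);
	cgroup_unlock();
}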
* - * Any task can increment and decrement the count field without lock. - * So in general, code holding manage_mutex or callback_mutex can't rely - * on the count field not changing. However, if the count goes to - * zero, then only attach_task(), which holds both mutexes, can - * increment it again. Because a count of zero means that no tasks - * are currently attached, therefore there is no way a task attached - * to that cpuset can fork (the other way to increment the count). - * So code holding manage_mutex or callback_mutex can safely assume that - * if the count is zero, it will stay zero. Similarly, if a task - * holds manage_mutex or callback_mutex on a cpuset with zero count, it - * knows that the cpuset won't be removed, as cpuset_rmdir() needs - * both of those mutexes. - * * The cpuset_common_file_write handler for operations that modify - * the cpuset hierarchy holds manage_mutex across the entire operation, + * the cpuset hierarchy holds cgroup_mutex across the entire operation, * single threading all such cpuset modifications across the system. * * The cpuset_common_file_read() handlers only hold callback_mutex across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * - * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't - * (usually) take either mutex. These are the two most performance - * critical pieces of code here. The exception occurs on cpuset_exit(), - * when a task in a notify_on_release cpuset exits. Then manage_mutex - * is taken, and if the cpuset count is zero, a usermode call made - * to /sbin/cpuset_release_agent with the name of the cpuset (path - * relative to the root of cpuset file system) as the argument. - * - * A cpuset can only be deleted if both its 'count' of using tasks - * is zero, and its list of 'children' cpusets is empty. Since all - * tasks in the system use _some_ cpuset, and since there is always at - * least one task in the system (init), therefore, top_cpuset - * always has either children cpusets and/or using tasks. So we don't - * need a special hack to ensure that top_cpuset cannot be deleted. - * - * The above "Tale of Two Semaphores" would be complete, but for: - * - * The task_lock() exception - * - * The need for this exception arises from the action of attach_task(), - * which overwrites one tasks cpuset pointer with another. It does - * so using both mutexes, however there are several performance - * critical places that need to reference task->cpuset without the - * expense of grabbing a system global mutex. Therefore except as - * noted below, when dereferencing or, as in attach_task(), modifying - * a tasks cpuset pointer we use task_lock(), which acts on a spinlock - * (task->alloc_lock) already in the task_struct routinely used for - * such matters. - * - * P.S. One more locking exception. RCU is used to guard the - * update of a tasks cpuset pointer by attach_task() and the - * access of task->cpuset->mems_generation via that pointer in - * the routine cpuset_update_task_memory_state(). + * Accessing a task's cpuset should be done in accordance with the + * guidelines for accessing subsystem state in kernel/cgroup.c */ static DEFINE_MUTEX(callback_mutex); @@ -354,15 +320,14 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Do not call this routine if in_interrupt(). * * Call without callback_mutex or task_lock() held. May be - * called with or without manage_mutex held. 
Thanks in part to - * 'the_top_cpuset_hack', the tasks cpuset pointer will never + * called with or without cgroup_mutex held. Thanks in part to + * 'the_top_cpuset_hack', the task's cpuset pointer will never * be NULL. This routine also might acquire callback_mutex and * current->mm->mmap_sem during call. * * Reading current->cpuset->mems_generation doesn't need task_lock * to guard the current->cpuset derefence, because it is guarded - * from concurrent freeing of current->cpuset by attach_task(), - * using RCU. + * from concurrent freeing of current->cpuset using RCU. * * The rcu_dereference() is technically probably not needed, * as I don't actually mind if I see a new cpuset pointer but @@ -424,7 +389,7 @@ void cpuset_update_task_memory_state(void) * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding manage_mutex. + * are only set if the other's are set. Call holding cgroup_mutex. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -442,7 +407,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * manage_mutex held. + * cgroup_mutex held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -476,7 +441,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) if (!is_cpuset_subset(trial, par)) return -EACCES; - /* If either I or some sibling (!= me) is exclusive, we can't overlap */ + /* + * If either I or some sibling (!= me) is exclusive, we can't + * overlap + */ list_for_each_entry(cont, &par->css.cgroup->children, sibling) { c = cgroup_cs(cont); if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && @@ -732,22 +700,50 @@ static inline int started_after(void *p1, void *p2) return started_after_time(t1, &t2->start_time, t2); } -/* - * Call with manage_mutex held. May take callback_mutex during call. +/** + * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's + * @tsk: task to test + * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner + * + * Call with cgroup_mutex held. May take callback_mutex during call. + * Called for each task in a cgroup by cgroup_scan_tasks(). + * Return nonzero if this tasks's cpus_allowed mask should be changed (in other + * words, if its mask is not equal to its cpuset's mask). + */ +int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) +{ + return !cpus_equal(tsk->cpus_allowed, + (cgroup_cs(scan->cg))->cpus_allowed); +} + +/** + * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's + * @tsk: task to test + * @scan: struct cgroup_scanner containing the cgroup of the task + * + * Called by cgroup_scan_tasks() for each task in a cgroup whose + * cpus_allowed mask needs to be changed. + * + * We don't need to re-check for the cgroup/cpuset membership, since we're + * holding cgroup_lock() at this point. 
*/ +void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) +{ + set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); +} +/** + * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it + * @cs: the cpuset to consider + * @buf: buffer of cpu numbers written to this cpuset + */ static int update_cpumask(struct cpuset *cs, char *buf) { struct cpuset trialcs; - int retval, i; - int is_load_balanced; - struct cgroup_iter it; - struct cgroup *cgrp = cs->css.cgroup; - struct task_struct *p, *dropped; - /* Never dereference latest_task, since it's not refcounted */ - struct task_struct *latest_task = NULL; + struct cgroup_scanner scan; struct ptr_heap heap; - struct timespec latest_time = { 0, 0 }; + int retval; + int is_load_balanced; /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ if (cs == &top_cpuset) @@ -756,7 +752,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) trialcs = *cs; /* - * An empty cpus_allowed is ok iff there are no tasks in the cpuset. + * An empty cpus_allowed is ok only if the cpuset has no tasks. * Since cpulist_parse() fails on an empty mask, we special case * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. @@ -777,6 +773,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) /* Nothing to do if the cpus didn't change */ if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) return 0; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); if (retval) return retval; @@ -787,62 +784,19 @@ static int update_cpumask(struct cpuset *cs, char *buf) cs->cpus_allowed = trialcs.cpus_allowed; mutex_unlock(&callback_mutex); - again: /* * Scan tasks in the cpuset, and update the cpumasks of any - * that need an update. Since we can't call set_cpus_allowed() - * while holding tasklist_lock, gather tasks to be processed - * in a heap structure. If the statically-sized heap fills up, - * overflow tasks that started later, and in future iterations - * only consider tasks that started after the latest task in - * the previous pass. This guarantees forward progress and - * that we don't miss any tasks + * that need an update. */ - heap.size = 0; - cgroup_iter_start(cgrp, &it); - while ((p = cgroup_iter_next(cgrp, &it))) { - /* Only affect tasks that don't have the right cpus_allowed */ - if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) - continue; - /* - * Only process tasks that started after the last task - * we processed - */ - if (!started_after_time(p, &latest_time, latest_task)) - continue; - dropped = heap_insert(&heap, p); - if (dropped == NULL) { - get_task_struct(p); - } else if (dropped != p) { - get_task_struct(p); - put_task_struct(dropped); - } - } - cgroup_iter_end(cgrp, &it); - if (heap.size) { - for (i = 0; i < heap.size; i++) { - struct task_struct *p = heap.ptrs[i]; - if (i == 0) { - latest_time = p->start_time; - latest_task = p; - } - set_cpus_allowed(p, cs->cpus_allowed); - put_task_struct(p); - } - /* - * If we had to process any tasks at all, scan again - * in case some of them were in the middle of forking - * children that didn't notice the new cpumask - * restriction. 
Not the most efficient way to do it, - * but it avoids having to take callback_mutex in the - * fork path - */ - goto again; - } + scan.cg = cs->css.cgroup; + scan.test_task = cpuset_test_cpumask; + scan.process_task = cpuset_change_cpumask; + scan.heap = &heap; + cgroup_scan_tasks(&scan); heap_free(&heap); + if (is_load_balanced) rebuild_sched_domains(); - return 0; } @@ -854,11 +808,11 @@ static int update_cpumask(struct cpuset *cs, char *buf) * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding manage_mutex, so our current->cpuset won't change - * during this call, as manage_mutex holds off any attach_task() + * Call holding cgroup_mutex, so current's cpuset won't change + * during this call, as manage_mutex holds off any cpuset_attach() * calls. Therefore we don't need to take task_lock around the * call to guarantee_online_mems(), as we know no one is changing - * our tasks cpuset. + * our task's cpuset. * * Hold callback_mutex around the two modifications of our tasks * mems_allowed to synchronize with cpuset_mems_allowed(). @@ -903,7 +857,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, * the cpuset is marked 'memory_migrate', migrate the tasks * pages to the new memory. * - * Call with manage_mutex held. May take callback_mutex during call. + * Call with cgroup_mutex held. May take callback_mutex during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1016,7 +970,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) * tasklist_lock. Forks can happen again now - the mpol_copy() * cpuset_being_rebound check will catch such forks, and rebind * their vma mempolicies too. Because we still hold the global - * cpuset manage_mutex, we know that no other rebind effort will + * cgroup_mutex, we know that no other rebind effort will * be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1031,7 +985,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) mmput(mm); } - /* We're done rebinding vma's to this cpusets new mems_allowed. */ + /* We're done rebinding vmas to this cpuset's new mems_allowed. */ kfree(mmarray); cpuset_being_rebound = NULL; retval = 0; @@ -1045,7 +999,7 @@ int current_cpuset_is_being_rebound(void) } /* - * Call with manage_mutex held. + * Call with cgroup_mutex held. */ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) @@ -1066,7 +1020,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 * - * Call with manage_mutex held. + * Call with cgroup_mutex held. 
*/ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) @@ -1200,6 +1154,7 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct task_struct *tsk) { @@ -1547,7 +1502,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) * If this becomes a problem for some users who wish to * allow that scenario, then cpuset_post_clone() could be * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. + * (and likewise for mems) to the new cgroup. Called with cgroup_mutex + * held. */ static void cpuset_post_clone(struct cgroup_subsys *ss, struct cgroup *cgroup) @@ -1571,11 +1527,8 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, /* * cpuset_create - create a cpuset - * parent: cpuset that will be parent of the new cpuset. - * name: name of the new cpuset. Will be strcpy'ed. - * mode: mode to set on new inode - * - * Must be called with the mutex on the parent inode held + * ss: cpuset cgroup subsystem + * cont: control group that the new cpuset will be part of */ static struct cgroup_subsys_state *cpuset_create( @@ -1687,53 +1640,140 @@ int __init cpuset_init(void) return 0; } +/** + * cpuset_do_move_task - move a given task to another cpuset + * @tsk: pointer to task_struct the task to move + * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner + * + * Called by cgroup_scan_tasks() for each task in a cgroup. + * Return nonzero to stop the walk through the tasks. + */ +void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) +{ + struct cpuset_hotplug_scanner *chsp; + + chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); + cgroup_attach_task(chsp->to, tsk); +} + +/** + * move_member_tasks_to_cpuset - move tasks from one cpuset to another + * @from: cpuset in which the tasks currently reside + * @to: cpuset to which the tasks will be moved + * + * Called with cgroup_mutex held + * callback_mutex must not be held, as cpuset_attach() will take it. + * + * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * calling callback functions for each. + */ +static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) +{ + struct cpuset_hotplug_scanner scan; + + scan.scan.cg = from->css.cgroup; + scan.scan.test_task = NULL; /* select all tasks in cgroup */ + scan.scan.process_task = cpuset_do_move_task; + scan.scan.heap = NULL; + scan.to = to->css.cgroup; + + if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) + printk(KERN_ERR "move_member_tasks_to_cpuset: " + "cgroup_scan_tasks failed\n"); +} + /* * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then the guarantee_online_cpus() - * or guarantee_online_mems() code will use that emptied cpusets - * parent online CPUs or nodes. Cpusets that were already empty of - * CPUs or nodes are left empty. + * last CPU or node from a cpuset, then move the tasks in the empty + * cpuset to its next-highest non-empty parent. + * + * Called with cgroup_mutex held + * callback_mutex must not be held, as cpuset_attach() will take it. 
+ */ +static void remove_tasks_in_empty_cpuset(struct cpuset *cs) +{ + struct cpuset *parent; + + /* + * The cgroup's css_sets list is in use if there are tasks + * in the cpuset; the list is empty if there are none; + * the cs->css.refcnt seems always 0. + */ + if (list_empty(&cs->css.cgroup->css_sets)) + return; + + /* + * Find its next-highest non-empty parent, (top cpuset + * has online cpus, so can't be empty). + */ + parent = cs->parent; + while (cpus_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) + parent = parent->parent; + + move_member_tasks_to_cpuset(cs, parent); +} + +/* + * Walk the specified cpuset subtree and look for empty cpusets. + * The tasks of such cpuset must be moved to a parent cpuset. * - * This routine is intentionally inefficient in a couple of regards. - * It will check all cpusets in a subtree even if the top cpuset of - * the subtree has no offline CPUs or nodes. It checks both CPUs and - * nodes, even though the caller could have been coded to know that - * only one of CPUs or nodes needed to be checked on a given call. - * This was done to minimize text size rather than cpu cycles. + * Called with cgroup_mutex held. We take callback_mutex to modify + * cpus_allowed and mems_allowed. * - * Call with both manage_mutex and callback_mutex held. + * This walk processes the tree from top to bottom, completing one layer + * before dropping down to the next. It always processes a node before + * any of its children. * - * Recursive, on depth of cpuset subtree. + * For now, since we lack memory hot unplug, we'll never see a cpuset + * that has tasks along with an empty 'mems'. But if we did see such + * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ - -static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) +static void scan_for_empty_cpusets(const struct cpuset *root) { + struct cpuset *cp; /* scans cpusets being updated */ + struct cpuset *child; /* scans child cpusets of cp */ + struct list_head queue; struct cgroup *cont; - struct cpuset *c; - /* Each of our child cpusets mems must be online */ - list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { - c = cgroup_cs(cont); - guarantee_online_cpus_mems_in_subtree(c); - if (!cpus_empty(c->cpus_allowed)) - guarantee_online_cpus(c, &c->cpus_allowed); - if (!nodes_empty(c->mems_allowed)) - guarantee_online_mems(c, &c->mems_allowed); + INIT_LIST_HEAD(&queue); + + list_add_tail((struct list_head *)&root->stack_list, &queue); + + while (!list_empty(&queue)) { + cp = container_of(queue.next, struct cpuset, stack_list); + list_del(queue.next); + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, &queue); + } + cont = cp->css.cgroup; + + /* Continue past cpusets with all cpus, mems online */ + if (cpus_subset(cp->cpus_allowed, cpu_online_map) && + nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + + /* Remove offline cpus and mems from this cpuset. */ + mutex_lock(&callback_mutex); + cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); + nodes_and(cp->mems_allowed, cp->mems_allowed, + node_states[N_HIGH_MEMORY]); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ + if (cpus_empty(cp->cpus_allowed) || + nodes_empty(cp->mems_allowed)) + remove_tasks_in_empty_cpuset(cp); } } /* * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track * cpu_online_map and node_states[N_HIGH_MEMORY]. 
Force the top cpuset to - * track what's online after any CPU or memory node hotplug or unplug - * event. - * - * To ensure that we don't remove a CPU or node from the top cpuset - * that is currently in use by a child cpuset (which would violate - * the rule that cpusets must be subsets of their parent), we first - * call the recursive routine guarantee_online_cpus_mems_in_subtree(). + * track what's online after any CPU or memory node hotplug or unplug event. * * Since there are two callers of this routine, one for CPU hotplug * events and one for memory node hotplug events, we could have coded @@ -1744,13 +1784,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) static void common_cpu_mem_hotplug_unplug(void) { cgroup_lock(); - mutex_lock(&callback_mutex); - guarantee_online_cpus_mems_in_subtree(&top_cpuset); top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + scan_for_empty_cpusets(&top_cpuset); - mutex_unlock(&callback_mutex); cgroup_unlock(); } @@ -1826,7 +1864,7 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) /** * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. - * Must be called with callback_mutex held. + * Must be called with callback_mutex held. **/ cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) { @@ -2163,10 +2201,8 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take manage_mutex, keeping attach_task() from changing it - * anyway. No need to check that tsk->cpuset != NULL, thanks to - * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks - * cpuset to top_cpuset. + * and we take cgroup_mutex, keeping cpuset_attach() from changing it + * anyway. */ static int proc_cpuset_show(struct seq_file *m, void *unused_v) { @@ -2219,13 +2255,14 @@ const struct file_operations proc_cpuset_operations = { #endif /* CONFIG_PROC_PID_CPUSET */ /* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. 
*/ -char *cpuset_task_status_allowed(struct task_struct *task, char *buffer) -{ - buffer += sprintf(buffer, "Cpus_allowed:\t"); - buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed); - buffer += sprintf(buffer, "\n"); - buffer += sprintf(buffer, "Mems_allowed:\t"); - buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed); - buffer += sprintf(buffer, "\n"); - return buffer; +void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) +{ + seq_printf(m, "Cpus_allowed:\t"); + m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, + task->cpus_allowed); + seq_printf(m, "\n"); + seq_printf(m, "Mems_allowed:\t"); + m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, + task->mems_allowed); + seq_printf(m, "\n"); } diff --git a/kernel/exit.c b/kernel/exit.c index 9d3d0f0b27d..506a957b665 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -293,26 +293,27 @@ static void reparent_to_kthreadd(void) switch_uid(INIT_USER); } -void __set_special_pids(pid_t session, pid_t pgrp) +void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; + pid_t nr = pid_nr(pid); - if (task_session_nr(curr) != session) { + if (task_session(curr) != pid) { detach_pid(curr, PIDTYPE_SID); - set_task_session(curr, session); - attach_pid(curr, PIDTYPE_SID, find_pid(session)); + attach_pid(curr, PIDTYPE_SID, pid); + set_task_session(curr, nr); } - if (task_pgrp_nr(curr) != pgrp) { + if (task_pgrp(curr) != pid) { detach_pid(curr, PIDTYPE_PGID); - set_task_pgrp(curr, pgrp); - attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp)); + attach_pid(curr, PIDTYPE_PGID, pid); + set_task_pgrp(curr, nr); } } -static void set_special_pids(pid_t session, pid_t pgrp) +static void set_special_pids(struct pid *pid) { write_lock_irq(&tasklist_lock); - __set_special_pids(session, pgrp); + __set_special_pids(pid); write_unlock_irq(&tasklist_lock); } @@ -383,7 +384,11 @@ void daemonize(const char *name, ...) */ current->flags |= PF_NOFREEZE; - set_special_pids(1, 1); + if (current->nsproxy != &init_nsproxy) { + get_nsproxy(&init_nsproxy); + switch_task_namespaces(current, &init_nsproxy); + } + set_special_pids(&init_struct_pid); proc_clear_tty(current); /* Block and flush all signals */ @@ -398,11 +403,6 @@ void daemonize(const char *name, ...) 
current->fs = fs; atomic_inc(&fs->count); - if (current->nsproxy != init_task.nsproxy) { - get_nsproxy(init_task.nsproxy); - switch_task_namespaces(current, init_task.nsproxy); - } - exit_files(current); current->files = init_task.files; atomic_inc(&current->files->count); @@ -458,7 +458,7 @@ struct files_struct *get_files_struct(struct task_struct *task) return files; } -void fastcall put_files_struct(struct files_struct *files) +void put_files_struct(struct files_struct *files) { struct fdtable *fdt; @@ -512,14 +512,10 @@ static void __put_fs_struct(struct fs_struct *fs) { /* No need to hold fs->lock if we are killing it */ if (atomic_dec_and_test(&fs->count)) { - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { - dput(fs->altroot); - mntput(fs->altrootmnt); - } + path_put(&fs->root); + path_put(&fs->pwd); + if (fs->altroot.dentry) + path_put(&fs->altroot); kmem_cache_free(fs_cachep, fs); } } @@ -745,24 +741,6 @@ static void exit_notify(struct task_struct *tsk) struct task_struct *t; struct pid *pgrp; - if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) - && !thread_group_empty(tsk)) { - /* - * This occurs when there was a race between our exit - * syscall and a group signal choosing us as the one to - * wake up. It could be that we are the only thread - * alerted to check for pending signals, but another thread - * should be woken now to take the signal since we will not. - * Now we'll wake all the threads in the group just to make - * sure someone gets all the pending signals. - */ - spin_lock_irq(&tsk->sighand->siglock); - for (t = next_thread(tsk); t != tsk; t = next_thread(t)) - if (!signal_pending(t) && !(t->flags & PF_EXITING)) - recalc_sigpending_and_wake(t); - spin_unlock_irq(&tsk->sighand->siglock); - } - /* * This does two things: * @@ -905,7 +883,7 @@ static inline void exit_child_reaper(struct task_struct *tsk) zap_pid_ns_processes(tsk->nsproxy->pid_ns); } -fastcall NORET_TYPE void do_exit(long code) +NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -947,7 +925,7 @@ fastcall NORET_TYPE void do_exit(long code) schedule(); } - tsk->flags |= PF_EXITING; + exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against * an exiting task cleaning up the robust pi futexes. 
@@ -1108,20 +1086,23 @@ asmlinkage void sys_exit_group(int error_code) do_group_exit((error_code & 0xff) << 8); } -static int eligible_child(pid_t pid, int options, struct task_struct *p) +static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +{ + struct pid *pid = NULL; + if (type == PIDTYPE_PID) + pid = task->pids[type].pid; + else if (type < PIDTYPE_MAX) + pid = task->group_leader->pids[type].pid; + return pid; +} + +static int eligible_child(enum pid_type type, struct pid *pid, int options, + struct task_struct *p) { int err; - struct pid_namespace *ns; - ns = current->nsproxy->pid_ns; - if (pid > 0) { - if (task_pid_nr_ns(p, ns) != pid) - return 0; - } else if (!pid) { - if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current)) - return 0; - } else if (pid != -1) { - if (task_pgrp_nr_ns(p, ns) != -pid) + if (type < PIDTYPE_MAX) { + if (task_pid_type(p, type) != pid) return 0; } @@ -1140,18 +1121,16 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p) if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) && !(options & __WALL)) return 0; - /* - * Do not consider thread group leaders that are - * in a non-empty thread group: - */ - if (delay_group_leader(p)) - return 2; err = security_task_wait(p); - if (err) - return err; + if (likely(!err)) + return 1; - return 1; + if (type != PIDTYPE_PID) + return 0; + /* This child was explicitly requested, abort */ + read_unlock(&tasklist_lock); + return err; } static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, @@ -1191,20 +1170,13 @@ static int wait_task_zombie(struct task_struct *p, int noreap, { unsigned long state; int retval, status, traced; - struct pid_namespace *ns; - - ns = current->nsproxy->pid_ns; + pid_t pid = task_pid_vnr(p); if (unlikely(noreap)) { - pid_t pid = task_pid_nr_ns(p, ns); uid_t uid = p->uid; int exit_code = p->exit_code; int why, status; - if (unlikely(p->exit_state != EXIT_ZOMBIE)) - return 0; - if (unlikely(p->exit_signal == -1 && p->ptrace == 0)) - return 0; get_task_struct(p); read_unlock(&tasklist_lock); if ((exit_code & 0x7f) == 0) { @@ -1315,11 +1287,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap, retval = put_user(status, &infop->si_status); } if (!retval && infop) - retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid); + retval = put_user(pid, &infop->si_pid); if (!retval && infop) retval = put_user(p->uid, &infop->si_uid); if (!retval) - retval = task_pid_nr_ns(p, ns); + retval = pid; if (traced) { write_lock_irq(&tasklist_lock); @@ -1351,21 +1323,38 @@ static int wait_task_zombie(struct task_struct *p, int noreap, * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ -static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, +static int wait_task_stopped(struct task_struct *p, int noreap, struct siginfo __user *infop, int __user *stat_addr, struct rusage __user *ru) { - int retval, exit_code; + int retval, exit_code, why; + uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; - if (!p->exit_code) - return 0; - if (delayed_group_leader && !(p->ptrace & PT_PTRACED) && - p->signal->group_stop_count > 0) + exit_code = 0; + spin_lock_irq(&p->sighand->siglock); + + if (unlikely(!task_is_stopped_or_traced(p))) + goto unlock_sig; + + if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0) /* * A group stop is in progress and this is the group leader. * We won't report until all threads have stopped. 
*/ + goto unlock_sig; + + exit_code = p->exit_code; + if (!exit_code) + goto unlock_sig; + + if (!noreap) + p->exit_code = 0; + + uid = p->uid; +unlock_sig: + spin_unlock_irq(&p->sighand->siglock); + if (!exit_code) return 0; /* @@ -1375,65 +1364,15 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ - pid = task_pid_nr_ns(p, current->nsproxy->pid_ns); get_task_struct(p); + pid = task_pid_vnr(p); + why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); - if (unlikely(noreap)) { - uid_t uid = p->uid; - int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; - - exit_code = p->exit_code; - if (unlikely(!exit_code) || unlikely(p->exit_state)) - goto bail_ref; + if (unlikely(noreap)) return wait_noreap_copyout(p, pid, uid, why, exit_code, infop, ru); - } - - write_lock_irq(&tasklist_lock); - - /* - * This uses xchg to be atomic with the thread resuming and setting - * it. It must also be done with the write lock held to prevent a - * race with the EXIT_ZOMBIE case. - */ - exit_code = xchg(&p->exit_code, 0); - if (unlikely(p->exit_state)) { - /* - * The task resumed and then died. Let the next iteration - * catch it in EXIT_ZOMBIE. Note that exit_code might - * already be zero here if it resumed and did _exit(0). - * The task itself is dead and won't touch exit_code again; - * other processors in this function are locked out. - */ - p->exit_code = exit_code; - exit_code = 0; - } - if (unlikely(exit_code == 0)) { - /* - * Another thread in this function got to it first, or it - * resumed, or it resumed and then died. - */ - write_unlock_irq(&tasklist_lock); -bail_ref: - put_task_struct(p); - /* - * We are returning to the wait loop without having successfully - * removed the process and having released the lock. We cannot - * continue, since the "p" task pointer is potentially stale. - * - * Return -EAGAIN, and do_wait() will restart the loop from the - * beginning. Do _not_ re-acquire the lock. - */ - return -EAGAIN; - } - - /* move to end of parent's list to avoid starvation */ - remove_parent(p); - add_parent(p); - - write_unlock_irq(&tasklist_lock); retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; if (!retval && stat_addr) @@ -1443,15 +1382,13 @@ bail_ref: if (!retval && infop) retval = put_user(0, &infop->si_errno); if (!retval && infop) - retval = put_user((short)((p->ptrace & PT_PTRACED) - ? 
CLD_TRAPPED : CLD_STOPPED), - &infop->si_code); + retval = put_user(why, &infop->si_code); if (!retval && infop) retval = put_user(exit_code, &infop->si_status); if (!retval && infop) retval = put_user(pid, &infop->si_pid); if (!retval && infop) - retval = put_user(p->uid, &infop->si_uid); + retval = put_user(uid, &infop->si_uid); if (!retval) retval = pid; put_task_struct(p); @@ -1473,7 +1410,6 @@ static int wait_task_continued(struct task_struct *p, int noreap, int retval; pid_t pid; uid_t uid; - struct pid_namespace *ns; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; @@ -1488,8 +1424,7 @@ static int wait_task_continued(struct task_struct *p, int noreap, p->signal->flags &= ~SIGNAL_STOP_CONTINUED; spin_unlock_irq(&p->sighand->siglock); - ns = current->nsproxy->pid_ns; - pid = task_pid_nr_ns(p, ns); + pid = task_pid_vnr(p); uid = p->uid; get_task_struct(p); read_unlock(&tasklist_lock); @@ -1500,7 +1435,7 @@ static int wait_task_continued(struct task_struct *p, int noreap, if (!retval && stat_addr) retval = put_user(0xffff, stat_addr); if (!retval) - retval = task_pid_nr_ns(p, ns); + retval = pid; } else { retval = wait_noreap_copyout(p, pid, uid, CLD_CONTINUED, SIGCONT, @@ -1511,103 +1446,63 @@ static int wait_task_continued(struct task_struct *p, int noreap, return retval; } - -static inline int my_ptrace_child(struct task_struct *p) -{ - if (!(p->ptrace & PT_PTRACED)) - return 0; - if (!(p->ptrace & PT_ATTACHED)) - return 1; - /* - * This child was PTRACE_ATTACH'd. We should be seeing it only if - * we are the attacher. If we are the real parent, this is a race - * inside ptrace_attach. It is waiting for the tasklist_lock, - * which we have to switch the parent links, but has already set - * the flags in p->ptrace. - */ - return (p->parent != p->real_parent); -} - -static long do_wait(pid_t pid, int options, struct siginfo __user *infop, - int __user *stat_addr, struct rusage __user *ru) +static long do_wait(enum pid_type type, struct pid *pid, int options, + struct siginfo __user *infop, int __user *stat_addr, + struct rusage __user *ru) { DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int flag, retval; - int allowed, denied; add_wait_queue(&current->signal->wait_chldexit,&wait); repeat: + /* If there is nothing that can match our criteria just get out */ + retval = -ECHILD; + if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) + goto end; + /* * We will set this flag if we see any child that might later * match our criteria, even if we are not able to reap it yet. */ - flag = 0; - allowed = denied = 0; + flag = retval = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); tsk = current; do { struct task_struct *p; - int ret; list_for_each_entry(p, &tsk->children, sibling) { - ret = eligible_child(pid, options, p); + int ret = eligible_child(type, pid, options, p); if (!ret) continue; if (unlikely(ret < 0)) { - denied = ret; - continue; - } - allowed = 1; - - if (task_is_stopped_or_traced(p)) { + retval = ret; + } else if (task_is_stopped_or_traced(p)) { /* * It's stopped now, so it might later * continue, exit, or stop again. - * - * When we hit the race with PTRACE_ATTACH, we - * will not report this child. But the race - * means it has not yet been moved to our - * ptrace_children list, so we need to set the - * flag here to avoid a spurious ECHILD when - * the race happens with the only child. 
*/ flag = 1; + if (!(p->ptrace & PT_PTRACED) && + !(options & WUNTRACED)) + continue; - if (!my_ptrace_child(p)) { - if (task_is_traced(p)) - continue; - if (!(options & WUNTRACED)) - continue; - } - - retval = wait_task_stopped(p, ret == 2, + retval = wait_task_stopped(p, (options & WNOWAIT), infop, stat_addr, ru); - if (retval == -EAGAIN) - goto repeat; - if (retval != 0) /* He released the lock. */ - goto end; - } else if (p->exit_state == EXIT_DEAD) { - continue; - } else if (p->exit_state == EXIT_ZOMBIE) { + } else if (p->exit_state == EXIT_ZOMBIE && + !delay_group_leader(p)) { /* - * Eligible but we cannot release it yet: + * We don't reap group leaders with subthreads. */ - if (ret == 2) - goto check_continued; if (!likely(options & WEXITED)) continue; retval = wait_task_zombie(p, (options & WNOWAIT), infop, stat_addr, ru); - /* He released the lock. */ - if (retval != 0) - goto end; - } else { -check_continued: + } else if (p->exit_state != EXIT_DEAD) { /* * It's running now, so it might later * exit, stop, or stop and then continue. @@ -1618,17 +1513,20 @@ check_continued: retval = wait_task_continued(p, (options & WNOWAIT), infop, stat_addr, ru); - if (retval != 0) /* He released the lock. */ - goto end; } + if (retval != 0) /* tasklist_lock released */ + goto end; } if (!flag) { list_for_each_entry(p, &tsk->ptrace_children, - ptrace_list) { - if (!eligible_child(pid, options, p)) + ptrace_list) { + flag = eligible_child(type, pid, options, p); + if (!flag) continue; - flag = 1; - break; + if (likely(flag > 0)) + break; + retval = flag; + goto end; } } if (options & __WNOTHREAD) @@ -1636,10 +1534,9 @@ check_continued: tsk = next_thread(tsk); BUG_ON(tsk->signal != current->signal); } while (tsk != current); - read_unlock(&tasklist_lock); + if (flag) { - retval = 0; if (options & WNOHANG) goto end; retval = -ERESTARTSYS; @@ -1649,14 +1546,12 @@ check_continued: goto repeat; } retval = -ECHILD; - if (unlikely(denied) && !allowed) - retval = denied; end: current->state = TASK_RUNNING; remove_wait_queue(&current->signal->wait_chldexit,&wait); if (infop) { if (retval > 0) - retval = 0; + retval = 0; else { /* * For a WNOHANG return, clear out all the fields @@ -1680,10 +1575,12 @@ end: return retval; } -asmlinkage long sys_waitid(int which, pid_t pid, +asmlinkage long sys_waitid(int which, pid_t upid, struct siginfo __user *infop, int options, struct rusage __user *ru) { + struct pid *pid = NULL; + enum pid_type type; long ret; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED)) @@ -1693,37 +1590,58 @@ asmlinkage long sys_waitid(int which, pid_t pid, switch (which) { case P_ALL: - pid = -1; + type = PIDTYPE_MAX; break; case P_PID: - if (pid <= 0) + type = PIDTYPE_PID; + if (upid <= 0) return -EINVAL; break; case P_PGID: - if (pid <= 0) + type = PIDTYPE_PGID; + if (upid <= 0) return -EINVAL; - pid = -pid; break; default: return -EINVAL; } - ret = do_wait(pid, options, infop, NULL, ru); + if (type < PIDTYPE_MAX) + pid = find_get_pid(upid); + ret = do_wait(type, pid, options, infop, NULL, ru); + put_pid(pid); /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); return ret; } -asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr, +asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage __user *ru) { + struct pid *pid = NULL; + enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; - ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru); + + if (upid == -1) type = 
PIDTYPE_MAX; + else if (upid < 0) { + type = PIDTYPE_PGID; + pid = find_get_pid(-upid); + } else if (upid == 0) { + type = PIDTYPE_PGID; + pid = get_pid(task_pgrp(current)); + } else /* upid > 0 */ { + type = PIDTYPE_PID; + pid = find_get_pid(upid); + } + + ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); + put_pid(pid); /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); diff --git a/kernel/fork.c b/kernel/fork.c index 2b55b74cd99..dd249c37b3a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include <linux/ptrace.h> #include <linux/mount.h> #include <linux/audit.h> +#include <linux/memcontrol.h> #include <linux/profile.h> #include <linux/rmap.h> #include <linux/acct.h> @@ -340,7 +341,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); #include <linux/init_task.h> -static struct mm_struct * mm_init(struct mm_struct * mm) +static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); @@ -357,11 +358,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm) mm->ioctx_list = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + mm_init_cgroup(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; return mm; } + + mm_free_cgroup(mm); free_mm(mm); return NULL; } @@ -376,7 +380,7 @@ struct mm_struct * mm_alloc(void) mm = allocate_mm(); if (mm) { memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm); + mm = mm_init(mm, current); } return mm; } @@ -386,10 +390,11 @@ struct mm_struct * mm_alloc(void) * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ -void fastcall __mmdrop(struct mm_struct *mm) +void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); + mm_free_cgroup(mm); destroy_context(mm); free_mm(mm); } @@ -511,7 +516,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; - if (!mm_init(mm)) + if (!mm_init(mm, tsk)) goto fail_nomem; if (init_new_context(tsk, mm)) @@ -595,16 +600,16 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old) rwlock_init(&fs->lock); fs->umask = old->umask; read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); + if (old->altroot.dentry) { + fs->altroot = old->altroot; + path_get(&old->altroot); } else { - fs->altrootmnt = NULL; - fs->altroot = NULL; + fs->altroot.mnt = NULL; + fs->altroot.dentry = NULL; } read_unlock(&old->lock); } @@ -904,7 +909,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->tsk = tsk; sig->it_virt_expires = cputime_zero; sig->it_virt_incr = cputime_zero; @@ -1333,6 +1337,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_NEWPID) p->nsproxy->pid_ns->child_reaper = p; + p->signal->leader_pid = pid; p->signal->tty = current->signal->tty; set_task_pgrp(p, task_pgrp_nr(current)); set_task_session(p, task_session_nr(current)); @@ -1399,7 +1404,7 @@ fork_out: return ERR_PTR(retval); } -noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct 
pt_regs *regs) +noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); return regs; @@ -1483,13 +1488,7 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; - /* - * this is enough to call pid_nr_ns here, but this if - * improves optimisation of regular fork() - */ - nr = (clone_flags & CLONE_NEWPID) ? - task_pid_nr_ns(p, current->nsproxy->pid_ns) : - task_pid_vnr(p); + nr = task_pid_vnr(p); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, parent_tidptr); @@ -1510,7 +1509,7 @@ long do_fork(unsigned long clone_flags, if (!(clone_flags & CLONE_STOPPED)) wake_up_new_task(p, clone_flags); else - p->state = TASK_STOPPED; + __set_task_state(p, TASK_STOPPED); if (unlikely (trace)) { current->ptrace_message = nr; diff --git a/kernel/futex.c b/kernel/futex.c index a6baaec44b8..221f2128a43 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2116,7 +2116,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, t = timespec_to_ktime(ts); if (cmd == FUTEX_WAIT) - t = ktime_add(ktime_get(), t); + t = ktime_add_safe(ktime_get(), t); tp = &t; } /* diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 133d558db45..7d5e4b016f3 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -176,7 +176,7 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = timespec_to_ktime(ts); if (cmd == FUTEX_WAIT) - t = ktime_add(ktime_get(), t); + t = ktime_add_safe(ktime_get(), t); tp = &t; } if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 668f3967eb3..98bee013f71 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -326,6 +326,23 @@ u64 ktime_divns(const ktime_t kt, s64 div) #endif /* BITS_PER_LONG >= 64 */ /* + * Add two ktime values and do a safety check for overflow: + */ +ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) +{ + ktime_t res = ktime_add(lhs, rhs); + + /* + * We use KTIME_SEC_MAX here, the maximum timeout which we can + * return to user space in a timespec: + */ + if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) + res = ktime_set(KTIME_SEC_MAX, 0); + + return res; +} + +/* * Check, whether the timer is on the callback pending list */ static inline int hrtimer_cb_pending(const struct hrtimer *timer) @@ -425,6 +442,8 @@ static int hrtimer_reprogram(struct hrtimer *timer, ktime_t expires = ktime_sub(timer->expires, base->offset); int res; + WARN_ON_ONCE(timer->expires.tv64 < 0); + /* * When the callback is running, we do not reprogram the clock event * device. The timer callback is either running on a different CPU or @@ -435,6 +454,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (hrtimer_callback_running(timer)) return 0; + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Nothing wrong + * about that, just avoid to call into the tick code, which + * has now objections against negative expiry values. + */ + if (expires.tv64 < 0) + return -ETIME; + if (expires.tv64 >= expires_next->tv64) return 0; @@ -682,13 +710,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) */ orun++; } - timer->expires = ktime_add(timer->expires, interval); - /* - * Make sure, that the result did not wrap with a very large - * interval. 
- */ - if (timer->expires.tv64 < 0) - timer->expires = ktime_set(KTIME_SEC_MAX, 0); + timer->expires = ktime_add_safe(timer->expires, interval); return orun; } @@ -839,7 +861,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) new_base = switch_hrtimer_base(timer, base); if (mode == HRTIMER_MODE_REL) { - tim = ktime_add(tim, new_base->get_time()); + tim = ktime_add_safe(tim, new_base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures * to signal that they simply return xtime in @@ -848,16 +870,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) * timeouts. This will go away with the GTOD framework. */ #ifdef CONFIG_TIME_LOW_RES - tim = ktime_add(tim, base->resolution); + tim = ktime_add_safe(tim, base->resolution); #endif - /* - * Careful here: User space might have asked for a - * very long sleep, so the add above might result in a - * negative number, which enqueues the timer in front - * of the queue. - */ - if (tim.tv64 < 0) - tim.tv64 = KTIME_MAX; } timer->expires = tim; @@ -1319,13 +1333,26 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod return t->task == NULL; } +static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) +{ + struct timespec rmt; + ktime_t rem; + + rem = ktime_sub(timer->expires, timer->base->get_time()); + if (rem.tv64 <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + + return 1; +} + long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; - struct timespec *rmtp; - ktime_t time; - - restart->fn = do_no_restart_syscall; + struct timespec __user *rmtp; hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; @@ -1333,26 +1360,22 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) if (do_nanosleep(&t, HRTIMER_MODE_ABS)) return 0; - rmtp = (struct timespec *)restart->arg1; + rmtp = (struct timespec __user *)restart->arg1; if (rmtp) { - time = ktime_sub(t.timer.expires, t.timer.base->get_time()); - if (time.tv64 <= 0) - return 0; - *rmtp = ktime_to_timespec(time); + int ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + return ret; } - restart->fn = hrtimer_nanosleep_restart; - /* The other values in restart are already filled in */ return -ERESTART_RESTARTBLOCK; } -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp, +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; - ktime_t rem; hrtimer_init(&t.timer, clockid, mode); t.timer.expires = timespec_to_ktime(*rqtp); @@ -1364,10 +1387,9 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp, return -ERESTARTNOHAND; if (rmtp) { - rem = ktime_sub(t.timer.expires, t.timer.base->get_time()); - if (rem.tv64 <= 0) - return 0; - *rmtp = ktime_to_timespec(rem); + int ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + return ret; } restart = &current_thread_info()->restart_block; @@ -1383,8 +1405,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp, asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) { - struct timespec tu, rmt; - int ret; + struct timespec tu; if (copy_from_user(&tu, rqtp, sizeof(tu))) return -EFAULT; @@ -1392,15 +1413,7 @@ sys_nanosleep(struct 
timespec __user *rqtp, struct timespec __user *rmtp) if (!timespec_valid(&tu)) return -EINVAL; - ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL, - CLOCK_MONOTONIC); - - if (ret && rmtp) { - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - } - - return ret; + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 44019ce30a1..cc54c627635 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -286,7 +286,7 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. */ -void fastcall +void handle_simple_irq(unsigned int irq, struct irq_desc *desc) { struct irqaction *action; @@ -327,7 +327,7 @@ out_unlock: * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. */ -void fastcall +void handle_level_irq(unsigned int irq, struct irq_desc *desc) { unsigned int cpu = smp_processor_id(); @@ -375,7 +375,7 @@ out_unlock: * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. */ -void fastcall +void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { unsigned int cpu = smp_processor_id(); @@ -434,7 +434,7 @@ out: * the handler was running. If all pending interrupts are handled, the * loop is left. */ -void fastcall +void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { const unsigned int cpu = smp_processor_id(); @@ -505,7 +505,7 @@ out_unlock: * * Per CPU interrupts on SMP machines without locking requirements */ -void fastcall +void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; @@ -589,3 +589,39 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, set_irq_chip(irq, chip); __set_irq_handler(irq, handle, 0, name); } + +void __init set_irq_noprobe(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); + + return; + } + + desc = irq_desc + irq; + + spin_lock_irqsave(&desc->lock, flags); + desc->status |= IRQ_NOPROBE; + spin_unlock_irqrestore(&desc->lock, flags); +} + +void __init set_irq_probe(unsigned int irq) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); + + return; + } + + desc = irq_desc + irq; + + spin_lock_irqsave(&desc->lock, flags); + desc->status &= ~IRQ_NOPROBE; + spin_unlock_irqrestore(&desc->lock, flags); +} diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index dc335ad2752..5fa6198e913 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -25,7 +25,7 @@ * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void fastcall +void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); @@ -163,7 +163,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) * This is the original x86 implementation which is used for every * interrupt type. 
*/ -fastcall unsigned int __do_IRQ(unsigned int irq) +unsigned int __do_IRQ(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; struct irqaction *action; diff --git a/kernel/itimer.c b/kernel/itimer.c index 2fab344dbf5..ab982747d9b 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -132,7 +132,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); - send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); return HRTIMER_NORESTART; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7dadc71ce51..f091d13def0 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -53,14 +53,6 @@ static inline int is_kernel_inittext(unsigned long addr) return 0; } -static inline int is_kernel_extratext(unsigned long addr) -{ - if (addr >= (unsigned long)_sextratext - && addr <= (unsigned long)_eextratext) - return 1; - return 0; -} - static inline int is_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -80,8 +72,7 @@ static int is_ksym_addr(unsigned long addr) if (all_var) return is_kernel(addr); - return is_kernel_text(addr) || is_kernel_inittext(addr) || - is_kernel_extratext(addr); + return is_kernel_text(addr) || is_kernel_inittext(addr); } /* expand a compressed symbol data into the resulting uncompressed string, diff --git a/kernel/kexec.c b/kernel/kexec.c index 9a26eec9eb0..06a0e277565 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1361,8 +1361,8 @@ unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) static int __init crash_save_vmcoreinfo_init(void) { - vmcoreinfo_append_str("OSRELEASE=%s\n", init_uts_ns.name.release); - vmcoreinfo_append_str("PAGESIZE=%ld\n", PAGE_SIZE); + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); @@ -1376,15 +1376,15 @@ static int __init crash_save_vmcoreinfo_init(void) #ifdef CONFIG_SPARSEMEM VMCOREINFO_SYMBOL(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_SIZE(mem_section); + VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); #endif - VMCOREINFO_SIZE(page); - VMCOREINFO_SIZE(pglist_data); - VMCOREINFO_SIZE(zone); - VMCOREINFO_SIZE(free_area); - VMCOREINFO_SIZE(list_head); - VMCOREINFO_TYPEDEF_SIZE(nodemask_t); + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); VMCOREINFO_OFFSET(page, _count); VMCOREINFO_OFFSET(page, mapping); diff --git a/kernel/kmod.c b/kernel/kmod.c index bb7df2a28bd..22be3ff3f36 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -173,10 +173,7 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - retval = -EPERM; - if (current->fs->root) - retval = kernel_execve(sub_info->path, - sub_info->argv, sub_info->envp); + retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); /* Exec failed? 
*/ sub_info->retval = retval; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d0493eafea3..7a86e643233 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -699,6 +699,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, struct kretprobe_instance, uflist); ri->rp = rp; ri->task = current; + + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + spin_unlock_irqrestore(&kretprobe_lock, flags); + return 0; + } + arch_prepare_kretprobe(ri, regs); /* XXX(hch): why is there no hlist_move_head? */ @@ -745,7 +751,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp) INIT_HLIST_HEAD(&rp->used_instances); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { - inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); + inst = kmalloc(sizeof(struct kretprobe_instance) + + rp->data_size, GFP_KERNEL); if (inst == NULL) { free_rp_inst(rp); return -ENOMEM; diff --git a/kernel/marker.c b/kernel/marker.c index 5323cfaedbc..c4c2cd8b61f 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -27,35 +27,42 @@ extern struct marker __start___markers[]; extern struct marker __stop___markers[]; +/* Set to 1 to enable marker debug output */ +const int marker_debug; + /* * markers_mutex nests inside module_mutex. Markers mutex protects the builtin - * and module markers, the hash table and deferred_sync. + * and module markers and the hash table. */ static DEFINE_MUTEX(markers_mutex); /* - * Marker deferred synchronization. - * Upon marker probe_unregister, we delay call to synchronize_sched() to - * accelerate mass unregistration (only when there is no more reference to a - * given module do we call synchronize_sched()). However, we need to make sure - * every critical region has ended before we re-arm a marker that has been - * unregistered and then registered back with a different probe data. - */ -static int deferred_sync; - -/* * Marker hash table, containing the active markers. * Protected by module_mutex. */ #define MARKER_HASH_BITS 6 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +/* + * Note about RCU : + * It is used to make sure every handler has finished using its private data + * between two consecutive operation (add or remove) on a given marker. It is + * also used to delay the free of multiple probes array until a quiescent state + * is reached. + * marker entries modifications are protected by the markers_mutex. + */ struct marker_entry { struct hlist_node hlist; char *format; - marker_probe_func *probe; - void *private; + void (*call)(const struct marker *mdata, /* Probe wrapper */ + void *call_private, const char *fmt, ...); + struct marker_probe_closure single; + struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ + struct rcu_head rcu; + void *oldptr; + char rcu_pending:1; + char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; /** * __mark_empty_function - Empty probe callback - * @mdata: pointer of type const struct marker + * @probe_private: probe private data + * @call_private: call site private data * @fmt: format string * @...: variable argument list * @@ -72,13 +80,267 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; * though the function pointer change and the marker enabling are two distinct * operations that modifies the execution flow of preemptible code. */ -void __mark_empty_function(const struct marker *mdata, void *private, - const char *fmt, ...) 
+void __mark_empty_function(void *probe_private, void *call_private, + const char *fmt, va_list *args) { } EXPORT_SYMBOL_GPL(__mark_empty_function); /* + * marker_probe_cb Callback that prepares the variable argument list for probes. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @fmt: format string + * @...: Variable argument list. + * + * Since we do not use "typical" pointer based RCU in the 1 argument case, we + * need to put a full smp_rmb() in this branch. This is why we do not use + * rcu_dereference() for the pointer read. + */ +void marker_probe_cb(const struct marker *mdata, void *call_private, + const char *fmt, ...) +{ + va_list args; + char ptype; + + /* + * disabling preemption to make sure the teardown of the callbacks can + * be done correctly when they are in modules and they insure RCU read + * coherency. + */ + preempt_disable(); + ptype = ACCESS_ONCE(mdata->ptype); + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = ACCESS_ONCE(mdata->single.func); + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + va_start(args, fmt); + func(mdata->single.probe_private, call_private, fmt, &args); + va_end(args); + } else { + struct marker_probe_closure *multi; + int i; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + multi = ACCESS_ONCE(mdata->multi); + for (i = 0; multi[i].func; i++) { + va_start(args, fmt); + multi[i].func(multi[i].probe_private, call_private, fmt, + &args); + va_end(args); + } + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb); + +/* + * marker_probe_cb Callback that does not prepare the variable argument list. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @fmt: format string + * @...: Variable argument list. + * + * Should be connected to markers "MARK_NOARGS". + */ +void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, const char *fmt, ...) +{ + va_list args; /* not initialized */ + char ptype; + + preempt_disable(); + ptype = ACCESS_ONCE(mdata->ptype); + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = ACCESS_ONCE(mdata->single.func); + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + func(mdata->single.probe_private, call_private, fmt, &args); + } else { + struct marker_probe_closure *multi; + int i; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. 
+ */ + smp_read_barrier_depends(); + multi = ACCESS_ONCE(mdata->multi); + for (i = 0; multi[i].func; i++) + multi[i].func(multi[i].probe_private, call_private, fmt, + &args); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); + +static void free_old_closure(struct rcu_head *head) +{ + struct marker_entry *entry = container_of(head, + struct marker_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + +static void debug_print_probes(struct marker_entry *entry) +{ + int i; + + if (!marker_debug) + return; + + if (!entry->ptype) { + printk(KERN_DEBUG "Single probe : %p %p\n", + entry->single.func, + entry->single.probe_private); + } else { + for (i = 0; entry->multi[i].func; i++) + printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, + entry->multi[i].func, + entry->multi[i].probe_private); + } +} + +static struct marker_probe_closure * +marker_entry_add_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0; + struct marker_probe_closure *old, *new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->multi; + if (!entry->ptype) { + if (entry->single.func == probe && + entry->single.probe_private == probe_private) + return ERR_PTR(-EBUSY); + if (entry->single.func == __mark_empty_function) { + /* 0 -> 1 probes */ + entry->single.func = probe; + entry->single.probe_private = probe_private; + entry->refcount = 1; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* 1 -> 2 probes */ + nr_probes = 1; + old = NULL; + } + } else { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) + if (old[nr_probes].func == probe + && old[nr_probes].probe_private + == probe_private) + return ERR_PTR(-EBUSY); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), + GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (!old) + new[0] = entry->single; + else + memcpy(new, old, + nr_probes * sizeof(struct marker_probe_closure)); + new[nr_probes].func = probe; + new[nr_probes].probe_private = probe_private; + entry->refcount = nr_probes + 1; + entry->multi = new; + entry->ptype = 1; + debug_print_probes(entry); + return old; +} + +static struct marker_probe_closure * +marker_entry_remove_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0, nr_del = 0, i; + struct marker_probe_closure *old, *new; + + old = entry->multi; + + debug_print_probes(entry); + if (!entry->ptype) { + /* 0 -> N is an error */ + WARN_ON(entry->single.func == __mark_empty_function); + /* 1 -> 0 probes */ + WARN_ON(probe && entry->single.func != probe); + WARN_ON(entry->single.probe_private != probe_private); + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if ((!probe || old[nr_probes].func == probe) + && old[nr_probes].probe_private + == probe_private) + nr_del++; + } + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + } else if (nr_probes - nr_del == 1) { + /* N -> 1, (N > 1) */ + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private 
!= probe_private) + entry->single = old[i]; + entry->refcount = 1; + entry->ptype = 0; + } else { + int j = 0; + /* N -> M, (N > 1, M > 1) */ + /* + 1 for NULL */ + new = kzalloc((nr_probes - nr_del + 1) + * sizeof(struct marker_probe_closure), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private != probe_private) + new[j++] = old[i]; + entry->refcount = nr_probes - nr_del; + entry->ptype = 1; + entry->multi = new; + } + debug_print_probes(entry); + return old; +} + +/* * Get marker if the marker is present in the marker hash table. * Must be called with markers_mutex held. * Returns NULL if not present. @@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name) * Add the marker to the marker hash table. Must be called with markers_mutex * held. */ -static int add_marker(const char *name, const char *format, - marker_probe_func *probe, void *private) +static struct marker_entry *add_marker(const char *name, const char *format) { struct hlist_head *head; struct hlist_node *node; @@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format, hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { printk(KERN_NOTICE - "Marker %s busy, probe %p already installed\n", - name, e->probe); - return -EBUSY; /* Already there */ + "Marker %s busy\n", name); + return ERR_PTR(-EBUSY); /* Already there */ } } /* @@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format, e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, GFP_KERNEL); if (!e) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memcpy(&e->name[0], name, name_len); if (format) { e->format = &e->name[name_len]; memcpy(e->format, format, format_len); + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; trace_mark(core_marker_format, "name %s format %s", e->name, e->format); - } else + } else { e->format = NULL; - e->probe = probe; - e->private = private; + e->call = marker_probe_cb; + } + e->single.func = __mark_empty_function; + e->single.probe_private = NULL; + e->multi = NULL; + e->ptype = 0; e->refcount = 0; + e->rcu_pending = 0; hlist_add_head(&e->hlist, head); - return 0; + return e; } /* * Remove the marker from the marker hash table. Must be called with mutex_lock * held. 
*/ -static void *remove_marker(const char *name) +static int remove_marker(const char *name) { struct hlist_head *head; struct hlist_node *node; struct marker_entry *e; int found = 0; size_t len = strlen(name) + 1; - void *private = NULL; u32 hash = jhash(name, len-1, 0); head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; @@ -167,12 +435,16 @@ static void *remove_marker(const char *name) break; } } - if (found) { - private = e->private; - hlist_del(&e->hlist); - kfree(e); - } - return private; + if (!found) + return -ENOENT; + if (e->single.func != __mark_empty_function) + return -EBUSY; + hlist_del(&e->hlist); + /* Make sure the call_rcu has been executed */ + if (e->rcu_pending) + rcu_barrier(); + kfree(e); + return 0; } /* @@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format) size_t name_len = strlen((*entry)->name) + 1; size_t format_len = strlen(format) + 1; + e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, GFP_KERNEL); if (!e) @@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format) memcpy(&e->name[0], (*entry)->name, name_len); e->format = &e->name[name_len]; memcpy(e->format, format, format_len); - e->probe = (*entry)->probe; - e->private = (*entry)->private; + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; + e->single = (*entry)->single; + e->multi = (*entry)->multi; + e->ptype = (*entry)->ptype; e->refcount = (*entry)->refcount; + e->rcu_pending = 0; hlist_add_before(&e->hlist, &(*entry)->hlist); hlist_del(&(*entry)->hlist); + /* Make sure the call_rcu has been executed */ + if ((*entry)->rcu_pending) + rcu_barrier(); kfree(*entry); *entry = e; trace_mark(core_marker_format, "name %s format %s", @@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format) /* * Sets the probe callback corresponding to one marker. */ -static int set_marker(struct marker_entry **entry, struct marker *elem) +static int set_marker(struct marker_entry **entry, struct marker *elem, + int active) { int ret; WARN_ON(strcmp((*entry)->name, elem->name) != 0); @@ -226,9 +509,43 @@ static int set_marker(struct marker_entry **entry, struct marker *elem) if (ret) return ret; } - elem->call = (*entry)->probe; - elem->private = (*entry)->private; - elem->state = 1; + + /* + * probe_cb setup (statically known) is done here. It is + * asynchronous with the rest of execution, therefore we only + * pass from a "safe" callback (with argument) to an "unsafe" + * callback (does not set arguments). + */ + elem->call = (*entry)->call; + /* + * Sanity check : + * We only update the single probe private data when the ptr is + * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) + */ + WARN_ON(elem->single.func != __mark_empty_function + && elem->single.probe_private + != (*entry)->single.probe_private && + !elem->ptype); + elem->single.probe_private = (*entry)->single.probe_private; + /* + * Make sure the private data is valid when we update the + * single probe ptr. + */ + smp_wmb(); + elem->single.func = (*entry)->single.func; + /* + * We also make sure that the new probe callbacks array is consistent + * before setting a pointer to it. + */ + rcu_assign_pointer(elem->multi, (*entry)->multi); + /* + * Update the function or multi probe array pointer before setting the + * ptype. 
+ */ + smp_wmb(); + elem->ptype = (*entry)->ptype; + elem->state = active; + return 0; } @@ -240,8 +557,12 @@ static int set_marker(struct marker_entry **entry, struct marker *elem) */ static void disable_marker(struct marker *elem) { + /* leave "call" as is. It is known statically. */ elem->state = 0; - elem->call = __mark_empty_function; + elem->single.func = __mark_empty_function; + /* Update the function before setting the ptype */ + smp_wmb(); + elem->ptype = 0; /* single probe */ /* * Leave the private data and id there, because removal is racy and * should be done only after a synchronize_sched(). These are never used @@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem) * marker_update_probe_range - Update a probe range * @begin: beginning of the range * @end: end of the range - * @probe_module: module address of the probe being updated - * @refcount: number of references left to the given probe_module (out) * * Updates the probe callback corresponding to a range of markers. */ void marker_update_probe_range(struct marker *begin, - struct marker *end, struct module *probe_module, - int *refcount) + struct marker *end) { struct marker *iter; struct marker_entry *mark_entry; @@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin, mutex_lock(&markers_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_marker(iter->name); - if (mark_entry && mark_entry->refcount) { - set_marker(&mark_entry, iter); + if (mark_entry) { + set_marker(&mark_entry, iter, + !!mark_entry->refcount); /* * ignore error, continue */ - if (probe_module) - if (probe_module == - __module_text_address((unsigned long)mark_entry->probe)) - (*refcount)++; } else { disable_marker(iter); } @@ -289,20 +604,27 @@ void marker_update_probe_range(struct marker *begin, * Issues a synchronize_sched() when no reference to the module passed * as parameter is found in the probes so the probe module can be * safely unloaded from now on. + * + * Internal callback only changed before the first probe is connected to it. + * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 + * transitions. All other transitions will leave the old private data valid. + * This makes the non-atomicity of the callback/private data updates valid. + * + * "special case" updates : + * 0 -> 1 callback + * 1 -> 0 callback + * 1 -> 2 callbacks + * 2 -> 1 callbacks + * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. + * Side effect : marker_set_format may delete the marker entry (creating a + * replacement). */ -static void marker_update_probes(struct module *probe_module) +static void marker_update_probes(void) { - int refcount = 0; - /* Core kernel markers */ - marker_update_probe_range(__start___markers, - __stop___markers, probe_module, &refcount); + marker_update_probe_range(__start___markers, __stop___markers); /* Markers in modules. */ - module_update_markers(probe_module, &refcount); - if (probe_module && refcount == 0) { - synchronize_sched(); - deferred_sync = 0; - } + module_update_markers(); } /** @@ -310,33 +632,49 @@ static void marker_update_probes(struct module *probe_module) * @name: marker name * @format: format string * @probe: probe handler - * @private: probe private data + * @probe_private: probe private data * * private data must be a valid allocated memory address, or NULL. * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. 
*/ int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *private) + marker_probe_func *probe, void *probe_private) { struct marker_entry *entry; int ret = 0; + struct marker_probe_closure *old; mutex_lock(&markers_mutex); entry = get_marker(name); - if (entry && entry->refcount) { - ret = -EBUSY; - goto end; - } - if (deferred_sync) { - synchronize_sched(); - deferred_sync = 0; + if (!entry) { + entry = add_marker(name, format); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto end; + } } - ret = add_marker(name, format, probe, private); - if (ret) + /* + * If we detect that a call_rcu is pending for this marker, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_add_probe(entry, probe, probe_private); + if (IS_ERR(old)) { + ret = PTR_ERR(old); goto end; + } mutex_unlock(&markers_mutex); - marker_update_probes(NULL); - return ret; + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); return ret; @@ -346,171 +684,166 @@ EXPORT_SYMBOL_GPL(marker_probe_register); /** * marker_probe_unregister - Disconnect a probe from a marker * @name: marker name + * @probe: probe function pointer + * @probe_private: probe private data * * Returns the private data given to marker_probe_register, or an ERR_PTR(). + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. */ -void *marker_probe_unregister(const char *name) +int marker_probe_unregister(const char *name, + marker_probe_func *probe, void *probe_private) { - struct module *probe_module; struct marker_entry *entry; - void *private; + struct marker_probe_closure *old; + int ret = 0; mutex_lock(&markers_mutex); entry = get_marker(name); if (!entry) { - private = ERR_PTR(-ENOENT); + ret = -ENOENT; goto end; } - entry->refcount = 0; - /* In what module is the probe handler ? */ - probe_module = __module_text_address((unsigned long)entry->probe); - private = remove_marker(name); - deferred_sync = 1; + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(probe_module); - return private; + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); + remove_marker(name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); - return private; + return ret; } EXPORT_SYMBOL_GPL(marker_probe_unregister); -/** - * marker_probe_unregister_private_data - Disconnect a probe from a marker - * @private: probe private data - * - * Unregister a marker by providing the registered private data. - * Returns the private data given to marker_probe_register, or an ERR_PTR(). 
- */ -void *marker_probe_unregister_private_data(void *private) +static struct marker_entry * +get_marker_from_private_data(marker_probe_func *probe, void *probe_private) { - struct module *probe_module; - struct hlist_head *head; - struct hlist_node *node; struct marker_entry *entry; - int found = 0; unsigned int i; + struct hlist_head *head; + struct hlist_node *node; - mutex_lock(&markers_mutex); for (i = 0; i < MARKER_TABLE_SIZE; i++) { head = &marker_table[i]; hlist_for_each_entry(entry, node, head, hlist) { - if (entry->private == private) { - found = 1; - goto iter_end; + if (!entry->ptype) { + if (entry->single.func == probe + && entry->single.probe_private + == probe_private) + return entry; + } else { + struct marker_probe_closure *closure; + closure = entry->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func == probe && + closure[i].probe_private + == probe_private) + return entry; + } } } } -iter_end: - if (!found) { - private = ERR_PTR(-ENOENT); - goto end; - } - entry->refcount = 0; - /* In what module is the probe handler ? */ - probe_module = __module_text_address((unsigned long)entry->probe); - private = remove_marker(entry->name); - deferred_sync = 1; - mutex_unlock(&markers_mutex); - marker_update_probes(probe_module); - return private; -end: - mutex_unlock(&markers_mutex); - return private; + return NULL; } -EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); /** - * marker_arm - Arm a marker - * @name: marker name + * marker_probe_unregister_private_data - Disconnect a probe from a marker + * @probe: probe function + * @probe_private: probe private data * - * Activate a marker. It keeps a reference count of the number of - * arming/disarming done. - * Returns 0 if ok, error value on error. + * Unregister a probe by providing the registered private data. + * Only removes the first marker found in hash table. + * Return 0 on success or error value. + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. */ -int marker_arm(const char *name) +int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private) { struct marker_entry *entry; int ret = 0; + struct marker_probe_closure *old; mutex_lock(&markers_mutex); - entry = get_marker(name); + entry = get_marker_from_private_data(probe, probe_private); if (!entry) { ret = -ENOENT; goto end; } - /* - * Only need to update probes when refcount passes from 0 to 1. - */ - if (entry->refcount++) - goto end; -end: + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(NULL); - return ret; -} -EXPORT_SYMBOL_GPL(marker_arm); - -/** - * marker_disarm - Disarm a marker - * @name: marker name - * - * Disarm a marker. It keeps a reference count of the number of arming/disarming - * done. - * Returns 0 if ok, error value on error. - */ -int marker_disarm(const char *name) -{ - struct marker_entry *entry; - int ret = 0; - + marker_update_probes(); /* may update entry */ mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) { - ret = -ENOENT; - goto end; - } - /* - * Only permit decrement refcount if higher than 0. - * Do probe update only on 1 -> 0 transition. 
- */ - if (entry->refcount) { - if (--entry->refcount) - goto end; - } else { - ret = -EPERM; - goto end; - } + entry = get_marker_from_private_data(probe, probe_private); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu(&entry->rcu, free_old_closure); + remove_marker(entry->name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); - marker_update_probes(NULL); return ret; } -EXPORT_SYMBOL_GPL(marker_disarm); +EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); /** * marker_get_private_data - Get a marker's probe private data * @name: marker name + * @probe: probe to match + * @num: get the nth matching probe's private data * + * Returns the nth private data pointer (starting from 0) matching, or an + * ERR_PTR. * Returns the private data pointer, or an ERR_PTR. * The private data pointer should _only_ be dereferenced if the caller is the * owner of the data, or its content could vanish. This is mostly used to * confirm that a caller is the owner of a registered probe. */ -void *marker_get_private_data(const char *name) +void *marker_get_private_data(const char *name, marker_probe_func *probe, + int num) { struct hlist_head *head; struct hlist_node *node; struct marker_entry *e; size_t name_len = strlen(name) + 1; u32 hash = jhash(name, name_len-1, 0); - int found = 0; + int i; head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { - found = 1; - return e->private; + if (!e->ptype) { + if (num == 0 && e->single.func == probe) + return e->single.probe_private; + else + break; + } else { + struct marker_probe_closure *closure; + int match = 0; + closure = e->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func != probe) + continue; + if (match++ == num) + return closure[i].probe_private; + } + } } } return ERR_PTR(-ENOENT); diff --git a/kernel/module.c b/kernel/module.c index bd60278ee70..92595bad381 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -46,6 +46,7 @@ #include <asm/semaphore.h> #include <asm/cacheflush.h> #include <linux/license.h> +#include <asm/sections.h> #if 0 #define DEBUGP printk @@ -290,7 +291,7 @@ static unsigned long __find_symbol(const char *name, } } DEBUGP("Failed to find symbol %s\n", name); - return 0; + return -ENOENT; } /* Search for module by name: must hold module_mutex. 
*/ @@ -343,9 +344,6 @@ static inline unsigned int block_size(int val) return val; } -/* Created by linker magic */ -extern char __per_cpu_start[], __per_cpu_end[]; - static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -783,7 +781,7 @@ void __symbol_put(const char *symbol) const unsigned long *crc; preempt_disable(); - if (!__find_symbol(symbol, &owner, &crc, 1)) + if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1))) BUG(); module_put(owner); preempt_enable(); @@ -929,7 +927,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, const unsigned long *crc; struct module *owner; - if (!__find_symbol("struct_module", &owner, &crc, 1)) + if (IS_ERR_VALUE(__find_symbol("struct_module", + &owner, &crc, 1))) BUG(); return check_version(sechdrs, versindex, "struct_module", mod, crc); @@ -978,12 +977,12 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, ret = __find_symbol(name, &owner, &crc, !(mod->taints & TAINT_PROPRIETARY_MODULE)); - if (ret) { + if (!IS_ERR_VALUE(ret)) { /* use_module can fail due to OOM, or module initialization or unloading */ if (!check_version(sechdrs, versindex, name, mod, crc) || !use_module(mod, owner)) - ret = 0; + ret = -EINVAL; } return ret; } @@ -1371,7 +1370,9 @@ void *__symbol_get(const char *symbol) preempt_disable(); value = __find_symbol(symbol, &owner, &crc, 1); - if (value && strong_try_module_get(owner) != 0) + if (IS_ERR_VALUE(value)) + value = 0; + else if (strong_try_module_get(owner)) value = 0; preempt_enable(); @@ -1391,14 +1392,16 @@ static int verify_export_symbols(struct module *mod) const unsigned long *crc; for (i = 0; i < mod->num_syms; i++) - if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { + if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name, + &owner, &crc, 1))) { name = mod->syms[i].name; ret = -ENOEXEC; goto dup; } for (i = 0; i < mod->num_gpl_syms; i++) - if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { + if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name, + &owner, &crc, 1))) { name = mod->gpl_syms[i].name; ret = -ENOEXEC; goto dup; @@ -1448,7 +1451,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, strtab + sym[i].st_name, mod); /* Ok if resolved. */ - if (sym[i].st_value != 0) + if (!IS_ERR_VALUE(sym[i].st_value)) break; /* Ok if weak. */ if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) @@ -2035,7 +2038,7 @@ static struct module *load_module(void __user *umod, #ifdef CONFIG_MARKERS if (!mod->taints) marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers, NULL, NULL); + mod->markers + mod->num_markers); #endif err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2250,7 +2253,7 @@ static const char *get_ksymbol(struct module *mod, /* For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. 
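The module.c hunks above change __find_symbol() from returning 0 on failure to returning a -errno value folded into an unsigned long, which is why every caller now tests IS_ERR_VALUE(). A sketch of the resulting calling convention, mirroring the callers in the hunks ("foo_symbol" is a made-up name):

	struct module *owner;
	const unsigned long *crc;
	unsigned long addr;

	addr = __find_symbol("foo_symbol", &owner, &crc, 1);
	if (IS_ERR_VALUE(addr))		/* e.g. (unsigned long)-ENOENT */
		return (int)addr;	/* propagate the errno */
	/* addr is a usable address; 0 is no longer the failure marker */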
*/ -char *module_address_lookup(unsigned long addr, +const char *module_address_lookup(unsigned long addr, unsigned long *size, unsigned long *offset, char **modname, @@ -2275,7 +2278,7 @@ char *module_address_lookup(unsigned long addr, ret = namebuf; } preempt_enable(); - return (char *)ret; + return ret; } int lookup_module_symbol_name(unsigned long addr, char *symname) @@ -2561,7 +2564,7 @@ EXPORT_SYMBOL(struct_module); #endif #ifdef CONFIG_MARKERS -void module_update_markers(struct module *probe_module, int *refcount) +void module_update_markers(void) { struct module *mod; @@ -2569,8 +2572,7 @@ void module_update_markers(struct module *probe_module, int *refcount) list_for_each_entry(mod, &modules, list) if (!mod->taints) marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers, - probe_module, refcount); + mod->markers + mod->num_markers); mutex_unlock(&module_mutex); } #endif diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index d17436cdea1..3aaa06c561d 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -107,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name, * use of the mutex is forbidden. The mutex must not be locked when * this function is called. */ -void fastcall mutex_destroy(struct mutex *lock) +void mutex_destroy(struct mutex *lock) { DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); lock->magic = NULL; diff --git a/kernel/mutex.c b/kernel/mutex.c index d9ec9b66625..d046a345d36 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -58,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static void fastcall noinline __sched +static void noinline __sched __mutex_lock_slowpath(atomic_t *lock_count); /*** @@ -82,7 +82,7 @@ __mutex_lock_slowpath(atomic_t *lock_count); * * This function is similar to (but not equivalent to) down(). */ -void inline fastcall __sched mutex_lock(struct mutex *lock) +void inline __sched mutex_lock(struct mutex *lock) { might_sleep(); /* @@ -95,8 +95,7 @@ void inline fastcall __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -static void fastcall noinline __sched -__mutex_unlock_slowpath(atomic_t *lock_count); +static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /*** * mutex_unlock - release the mutex @@ -109,7 +108,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count); * * This function is similar to (but not equivalent to) up(). */ -void fastcall __sched mutex_unlock(struct mutex *lock) +void __sched mutex_unlock(struct mutex *lock) { /* * The unlocking fastpath is the 0->1 transition from 'locked' @@ -234,7 +233,7 @@ EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); /* * Release the lock, slowpath: */ -static fastcall inline void +static inline void __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) { struct mutex *lock = container_of(lock_count, struct mutex, count); @@ -271,7 +270,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) /* * Release the lock, slowpath: */ -static fastcall noinline void +static noinline void __mutex_unlock_slowpath(atomic_t *lock_count) { __mutex_unlock_common_slowpath(lock_count, 1); @@ -282,10 +281,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count) * Here come the less common (and hence less performance-critical) APIs: * mutex_lock_interruptible() and mutex_trylock(). 
*/ -static int fastcall noinline __sched +static noinline int __sched __mutex_lock_killable_slowpath(atomic_t *lock_count); -static noinline int fastcall __sched +static noinline int __sched __mutex_lock_interruptible_slowpath(atomic_t *lock_count); /*** @@ -299,7 +298,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count); * * This function is similar to (but not equivalent to) down_interruptible(). */ -int fastcall __sched mutex_lock_interruptible(struct mutex *lock) +int __sched mutex_lock_interruptible(struct mutex *lock) { might_sleep(); return __mutex_fastpath_lock_retval @@ -308,7 +307,7 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock) EXPORT_SYMBOL(mutex_lock_interruptible); -int fastcall __sched mutex_lock_killable(struct mutex *lock) +int __sched mutex_lock_killable(struct mutex *lock) { might_sleep(); return __mutex_fastpath_lock_retval @@ -316,7 +315,7 @@ int fastcall __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); -static void fastcall noinline __sched +static noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); @@ -324,7 +323,7 @@ __mutex_lock_slowpath(atomic_t *lock_count) __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); } -static int fastcall noinline __sched +static noinline int __sched __mutex_lock_killable_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); @@ -332,7 +331,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count) return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); } -static noinline int fastcall __sched +static noinline int __sched __mutex_lock_interruptible_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); @@ -381,7 +380,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) * This function must not be used in interrupt context. The * mutex must be released by the same task that acquired it. */ -int fastcall __sched mutex_trylock(struct mutex *lock) +int __sched mutex_trylock(struct mutex *lock) { return __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); diff --git a/kernel/notifier.c b/kernel/notifier.c index 4253f472f06..643360d1bb1 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -4,6 +4,7 @@ #include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/vmalloc.h> +#include <linux/reboot.h> /* * Notifier list for kernel code which wants to be called diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 79f871bc0ef..f5d332cf8c6 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -21,6 +21,7 @@ #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <net/net_namespace.h> +#include <linux/ipc_namespace.h> static struct kmem_cache *nsproxy_cachep; diff --git a/kernel/panic.c b/kernel/panic.c index d9e90cfe329..24af9f8bac9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -161,7 +161,7 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', @@ -169,7 +169,8 @@ const char *print_tainted(void) tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', tainted & TAINT_BAD_PAGE ? 'B' : ' ', tainted & TAINT_USER ? 'U' : ' ', - tainted & TAINT_DIE ? 
'D' : ' '); + tainted & TAINT_DIE ? 'D' : ' ', + tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); } else snprintf(buf, sizeof(buf), "Not tainted"); diff --git a/kernel/params.c b/kernel/params.c index 42fe5e6126c..afc46a23eb6 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -180,12 +180,12 @@ int parse_args(const char *name, #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ int param_set_##name(const char *val, struct kernel_param *kp) \ { \ - char *endp; \ tmptype l; \ + int ret; \ \ if (!val) return -EINVAL; \ - l = strtolfn(val, &endp, 0); \ - if (endp == val || ((type)l != l)) \ + ret = strtolfn(val, 0, &l); \ + if (ret == -EINVAL || ((type)l != l)) \ return -EINVAL; \ *((type *)kp->arg) = l; \ return 0; \ @@ -195,13 +195,13 @@ int parse_args(const char *name, return sprintf(buffer, format, *((type *)kp->arg)); \ } -STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul); +STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); +STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); +STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); +STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); int param_set_charp(const char *val, struct kernel_param *kp) { @@ -272,7 +272,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, struct kernel_param *kp), - int *num) + unsigned int *num) { int ret; struct kernel_param kp; diff --git a/kernel/pid.c b/kernel/pid.c index f815455431b..477691576b3 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,7 +41,6 @@ static struct hlist_head *pid_hash; static int pidhash_shift; struct pid init_struct_pid = INIT_STRUCT_PID; -static struct kmem_cache *pid_ns_cachep; int pid_max = PID_MAX_DEFAULT; @@ -112,7 +111,7 @@ EXPORT_SYMBOL(is_container_init); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) +static void free_pidmap(struct pid_namespace *pid_ns, int pid) { struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; @@ -181,7 +180,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) return -1; } -static int next_pidmap(struct pid_namespace *pid_ns, int last) +int next_pidmap(struct pid_namespace *pid_ns, int last) { int offset; struct pidmap *map, *end; @@ -199,7 +198,7 @@ static int next_pidmap(struct pid_namespace *pid_ns, int last) return -1; } -fastcall void put_pid(struct pid *pid) +void put_pid(struct pid *pid) { struct pid_namespace *ns; @@ -221,7 +220,7 @@ static void delayed_put_pid(struct rcu_head *rhp) put_pid(pid); } -fastcall void free_pid(struct pid *pid) +void free_pid(struct pid *pid) { /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; @@ -287,7 +286,7 @@ out_free: goto out; 
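The parameter parsers switch from simple_strtoul()/simple_strtol(), which only expose trailing garbage through their endp argument, to strict_strtoul()/strict_strtol(), which reject it with an error code. A small illustration of the behavioral difference (input strings are hypothetical):

	unsigned long v;
	char *endp;

	/* old style: junk after the number is only visible through endp */
	v = simple_strtoul("42junk", &endp, 0);		/* v == 42, *endp == 'j' */

	/* new style: the whole string must parse, or -EINVAL comes back */
	if (strict_strtoul("42junk", 0, &v))		/* fails with -EINVAL */
		pr_debug("rejected\n");
	if (strict_strtoul("42", 0, &v) == 0)		/* ok, v == 42 */
		pr_debug("accepted: %lu\n", v);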
} -struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns) +struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { struct hlist_node *elem; struct upid *pnr; @@ -317,7 +316,7 @@ EXPORT_SYMBOL_GPL(find_pid); /* * attach_pid() must be called with the tasklist_lock write-held. */ -int fastcall attach_pid(struct task_struct *task, enum pid_type type, +int attach_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { struct pid_link *link; @@ -329,7 +328,7 @@ int fastcall attach_pid(struct task_struct *task, enum pid_type type, return 0; } -void fastcall detach_pid(struct task_struct *task, enum pid_type type) +void detach_pid(struct task_struct *task, enum pid_type type) { struct pid_link *link; struct pid *pid; @@ -349,7 +348,7 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type) } /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ -void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, +void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { new->pids[type].pid = old->pids[type].pid; @@ -357,7 +356,7 @@ void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, old->pids[type].pid = NULL; } -struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) +struct task_struct *pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; if (pid) { @@ -368,6 +367,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) } return result; } +EXPORT_SYMBOL(pid_task); /* * Must be called under rcu_read_lock() or with tasklist_lock read-held. @@ -408,7 +408,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) return pid; } -struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) +struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; rcu_read_lock(); @@ -443,6 +443,12 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) return nr; } +pid_t pid_vnr(struct pid *pid) +{ + return pid_nr_ns(pid, current->nsproxy->pid_ns); +} +EXPORT_SYMBOL_GPL(pid_vnr); + pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return pid_nr_ns(task_pid(tsk), ns); @@ -487,180 +493,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) } EXPORT_SYMBOL_GPL(find_get_pid); -struct pid_cache { - int nr_ids; - char name[16]; - struct kmem_cache *cachep; - struct list_head list; -}; - -static LIST_HEAD(pid_caches_lh); -static DEFINE_MUTEX(pid_caches_mutex); - -/* - * creates the kmem cache to allocate pids from. 
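With pid_task() exported and pid_vnr() added above, the three numbering helpers relate as sketched below (local variable names are illustrative):

	struct pid *pid = task_pid(current);

	pid_t global_nr  = pid_nr(pid);		/* number in the initial namespace */
	pid_t virtual_nr = pid_vnr(pid);	/* number in the caller's namespace */

	/* pid_vnr() is just pid_nr_ns() bound to the current task's namespace */
	WARN_ON(virtual_nr != pid_nr_ns(pid, current->nsproxy->pid_ns));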
- * @nr_ids: the number of numerical ids this pid will have to carry - */ - -static struct kmem_cache *create_pid_cachep(int nr_ids) -{ - struct pid_cache *pcache; - struct kmem_cache *cachep; - - mutex_lock(&pid_caches_mutex); - list_for_each_entry (pcache, &pid_caches_lh, list) - if (pcache->nr_ids == nr_ids) - goto out; - - pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); - if (pcache == NULL) - goto err_alloc; - - snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); - cachep = kmem_cache_create(pcache->name, - sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (cachep == NULL) - goto err_cachep; - - pcache->nr_ids = nr_ids; - pcache->cachep = cachep; - list_add(&pcache->list, &pid_caches_lh); -out: - mutex_unlock(&pid_caches_mutex); - return pcache->cachep; - -err_cachep: - kfree(pcache); -err_alloc: - mutex_unlock(&pid_caches_mutex); - return NULL; -} - -#ifdef CONFIG_PID_NS -static struct pid_namespace *create_pid_namespace(int level) -{ - struct pid_namespace *ns; - int i; - - ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); - if (ns == NULL) - goto out; - - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!ns->pidmap[0].page) - goto out_free; - - ns->pid_cachep = create_pid_cachep(level + 1); - if (ns->pid_cachep == NULL) - goto out_free_map; - - kref_init(&ns->kref); - ns->last_pid = 0; - ns->child_reaper = NULL; - ns->level = level; - - set_bit(0, ns->pidmap[0].page); - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - - for (i = 1; i < PIDMAP_ENTRIES; i++) { - ns->pidmap[i].page = 0; - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - } - - return ns; - -out_free_map: - kfree(ns->pidmap[0].page); -out_free: - kmem_cache_free(pid_ns_cachep, ns); -out: - return ERR_PTR(-ENOMEM); -} - -static void destroy_pid_namespace(struct pid_namespace *ns) -{ - int i; - - for (i = 0; i < PIDMAP_ENTRIES; i++) - kfree(ns->pidmap[i].page); - kmem_cache_free(pid_ns_cachep, ns); -} - -struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) -{ - struct pid_namespace *new_ns; - - BUG_ON(!old_ns); - new_ns = get_pid_ns(old_ns); - if (!(flags & CLONE_NEWPID)) - goto out; - - new_ns = ERR_PTR(-EINVAL); - if (flags & CLONE_THREAD) - goto out_put; - - new_ns = create_pid_namespace(old_ns->level + 1); - if (!IS_ERR(new_ns)) - new_ns->parent = get_pid_ns(old_ns); - -out_put: - put_pid_ns(old_ns); -out: - return new_ns; -} - -void free_pid_ns(struct kref *kref) -{ - struct pid_namespace *ns, *parent; - - ns = container_of(kref, struct pid_namespace, kref); - - parent = ns->parent; - destroy_pid_namespace(ns); - - if (parent != NULL) - put_pid_ns(parent); -} -#endif /* CONFIG_PID_NS */ - -void zap_pid_ns_processes(struct pid_namespace *pid_ns) -{ - int nr; - int rc; - - /* - * The last thread in the cgroup-init thread group is terminating. - * Find remaining pid_ts in the namespace, signal and wait for them - * to exit. - * - * Note: This signals each threads in the namespace - even those that - * belong to the same thread group, To avoid this, we would have - * to walk the entire tasklist looking a processes in this - * namespace, but that could be unnecessarily expensive if the - * pid namespace has just a few processes. Or we need to - * maintain a tasklist for each pid namespace. 
- * - */ - read_lock(&tasklist_lock); - nr = next_pidmap(pid_ns, 1); - while (nr > 0) { - kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); - nr = next_pidmap(pid_ns, nr); - } - read_unlock(&tasklist_lock); - - do { - clear_thread_flag(TIF_SIGPENDING); - rc = sys_wait4(-1, NULL, __WALL, NULL); - } while (rc != -ECHILD); - - - /* Child reaper for the pid namespace is going away */ - pid_ns->child_reaper = NULL; - return; -} - /* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or @@ -693,9 +525,6 @@ void __init pidmap_init(void) set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - init_pid_ns.pid_cachep = create_pid_cachep(1); - if (init_pid_ns.pid_cachep == NULL) - panic("Can't create pid_1 cachep\n"); - - pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + init_pid_ns.pid_cachep = KMEM_CACHE(pid, + SLAB_HWCACHE_ALIGN | SLAB_PANIC); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c new file mode 100644 index 00000000000..6d792b66d85 --- /dev/null +++ b/kernel/pid_namespace.c @@ -0,0 +1,197 @@ +/* + * Pid namespaces + * + * Authors: + * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. + * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM + * Many thanks to Oleg Nesterov for comments and help + * + */ + +#include <linux/pid.h> +#include <linux/pid_namespace.h> +#include <linux/syscalls.h> +#include <linux/err.h> + +#define BITS_PER_PAGE (PAGE_SIZE*8) + +struct pid_cache { + int nr_ids; + char name[16]; + struct kmem_cache *cachep; + struct list_head list; +}; + +static LIST_HEAD(pid_caches_lh); +static DEFINE_MUTEX(pid_caches_mutex); +static struct kmem_cache *pid_ns_cachep; + +/* + * creates the kmem cache to allocate pids from. 
+ * @nr_ids: the number of numerical ids this pid will have to carry + */ + +static struct kmem_cache *create_pid_cachep(int nr_ids) +{ + struct pid_cache *pcache; + struct kmem_cache *cachep; + + mutex_lock(&pid_caches_mutex); + list_for_each_entry(pcache, &pid_caches_lh, list) + if (pcache->nr_ids == nr_ids) + goto out; + + pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); + if (pcache == NULL) + goto err_alloc; + + snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); + cachep = kmem_cache_create(pcache->name, + sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (cachep == NULL) + goto err_cachep; + + pcache->nr_ids = nr_ids; + pcache->cachep = cachep; + list_add(&pcache->list, &pid_caches_lh); +out: + mutex_unlock(&pid_caches_mutex); + return pcache->cachep; + +err_cachep: + kfree(pcache); +err_alloc: + mutex_unlock(&pid_caches_mutex); + return NULL; +} + +static struct pid_namespace *create_pid_namespace(int level) +{ + struct pid_namespace *ns; + int i; + + ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); + if (ns == NULL) + goto out; + + ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!ns->pidmap[0].page) + goto out_free; + + ns->pid_cachep = create_pid_cachep(level + 1); + if (ns->pid_cachep == NULL) + goto out_free_map; + + kref_init(&ns->kref); + ns->last_pid = 0; + ns->child_reaper = NULL; + ns->level = level; + + set_bit(0, ns->pidmap[0].page); + atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); + + for (i = 1; i < PIDMAP_ENTRIES; i++) { + ns->pidmap[i].page = 0; + atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + } + + return ns; + +out_free_map: + kfree(ns->pidmap[0].page); +out_free: + kmem_cache_free(pid_ns_cachep, ns); +out: + return ERR_PTR(-ENOMEM); +} + +static void destroy_pid_namespace(struct pid_namespace *ns) +{ + int i; + + for (i = 0; i < PIDMAP_ENTRIES; i++) + kfree(ns->pidmap[i].page); + kmem_cache_free(pid_ns_cachep, ns); +} + +struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) +{ + struct pid_namespace *new_ns; + + BUG_ON(!old_ns); + new_ns = get_pid_ns(old_ns); + if (!(flags & CLONE_NEWPID)) + goto out; + + new_ns = ERR_PTR(-EINVAL); + if (flags & CLONE_THREAD) + goto out_put; + + new_ns = create_pid_namespace(old_ns->level + 1); + if (!IS_ERR(new_ns)) + new_ns->parent = get_pid_ns(old_ns); + +out_put: + put_pid_ns(old_ns); +out: + return new_ns; +} + +void free_pid_ns(struct kref *kref) +{ + struct pid_namespace *ns, *parent; + + ns = container_of(kref, struct pid_namespace, kref); + + parent = ns->parent; + destroy_pid_namespace(ns); + + if (parent != NULL) + put_pid_ns(parent); +} + +void zap_pid_ns_processes(struct pid_namespace *pid_ns) +{ + int nr; + int rc; + + /* + * The last thread in the cgroup-init thread group is terminating. + * Find remaining pid_ts in the namespace, signal and wait for them + * to exit. + * + * Note: This signals each threads in the namespace - even those that + * belong to the same thread group, To avoid this, we would have + * to walk the entire tasklist looking a processes in this + * namespace, but that could be unnecessarily expensive if the + * pid namespace has just a few processes. Or we need to + * maintain a tasklist for each pid namespace. 
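A sketch of the clone-time behavior implemented by copy_pid_ns() above, from the point of view of a hypothetical caller (clone_flags stands for the flags passed to clone): without CLONE_NEWPID the parent namespace is reused with an extra reference, CLONE_NEWPID plus CLONE_THREAD is rejected, and otherwise a fresh namespace one level deeper is created with the parent pinned.

	struct pid_namespace *child_ns;

	child_ns = copy_pid_ns(clone_flags, current->nsproxy->pid_ns);
	if (IS_ERR(child_ns))
		return PTR_ERR(child_ns);	/* -EINVAL (CLONE_THREAD) or -ENOMEM */

	/* without CLONE_NEWPID, child_ns is the parent namespace with an
	 * extra reference; otherwise it is one level deeper than the parent */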
+ * + */ + read_lock(&tasklist_lock); + nr = next_pidmap(pid_ns, 1); + while (nr > 0) { + kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr); + nr = next_pidmap(pid_ns, nr); + } + read_unlock(&tasklist_lock); + + do { + clear_thread_flag(TIF_SIGPENDING); + rc = sys_wait4(-1, NULL, __WALL, NULL); + } while (rc != -ECHILD); + + + /* Child reaper for the pid namespace is going away */ + pid_ns->child_reaper = NULL; + return; +} + +static __init int pid_namespaces_init(void) +{ + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + return 0; +} + +__initcall(pid_namespaces_init); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 0b7c82ac467..2eae91f954c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -20,7 +20,7 @@ static int check_clock(const clockid_t which_clock) return 0; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? same_thread_group(p, current) : thread_group_leader(p))) { error = -EINVAL; @@ -305,7 +305,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) */ struct task_struct *p; rcu_read_lock(); - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (p) { if (CPUCLOCK_PERTHREAD(which_clock)) { if (same_thread_group(p, current)) { @@ -354,7 +354,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) if (pid == 0) { p = current; } else { - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (p && !same_thread_group(p, current)) p = NULL; } @@ -362,7 +362,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) if (pid == 0) { p = current->group_leader; } else { - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (p && !thread_group_leader(p)) p = NULL; } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 122d5c787fe..a9b04203a66 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -404,7 +404,7 @@ static struct task_struct * good_sigevent(sigevent_t * event) struct task_struct *rtn = current->group_leader; if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || + (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || !same_thread_group(rtn, current) || (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) return NULL; @@ -767,9 +767,11 @@ common_timer_set(struct k_itimer *timr, int flags, /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) - timer->expires = ktime_add(timer->expires, - timer->base->get_time()); + if (mode == HRTIMER_MODE_REL) { + timer->expires = + ktime_add_safe(timer->expires, + timer->base->get_time()); + } return 0; } @@ -982,20 +984,9 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) static int common_nsleep(const clockid_t which_clock, int flags, struct timespec *tsave, struct timespec __user *rmtp) { - struct timespec rmt; - int ret; - - ret = hrtimer_nanosleep(tsave, rmtp ? &rmt : NULL, - flags & TIMER_ABSTIME ? - HRTIMER_MODE_ABS : HRTIMER_MODE_REL, - which_clock); - - if (ret && rmtp) { - if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) - return -EFAULT; - } - - return ret; + return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 
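The timer paths above now resolve user-supplied pids with find_task_by_vpid(), i.e. relative to the caller's pid namespace, rather than the global find_task_by_pid(). The usual lookup pattern is sketched below (user_pid is a placeholder). The companion change to ktime_add_safe() makes the relative-timer case saturate instead of overflowing when a very large expiry is added to the current time.

	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(user_pid);	/* resolved in current's pid namespace */
	if (p)
		get_task_struct(p);		/* pin it beyond the RCU section if needed */
	rcu_read_unlock();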
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); } asmlinkage long diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ef9b802738a..79833170bb9 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -74,8 +74,8 @@ config PM_TRACE_RTC RTC across reboots, so that you can debug a machine that just hangs during suspend (or more commonly, during resume). - To use this debugging feature you should attempt to suspend the machine, - then reboot it, then run + To use this debugging feature you should attempt to suspend the + machine, reboot it and then run dmesg -s 1000000 | grep 'hash matches' @@ -123,7 +123,10 @@ config HIBERNATION called "hibernation" in user interfaces. STD checkpoints the system and powers it off; and restores that checkpoint on reboot. - You can suspend your machine with 'echo disk > /sys/power/state'. + You can suspend your machine with 'echo disk > /sys/power/state' + after placing resume=/dev/swappartition on the kernel command line + in your bootloader's configuration file. + Alternatively, you can use the additional userland tools available from <http://suspend.sf.net>. diff --git a/kernel/printk.c b/kernel/printk.c index 29ae1e99cde..bee36100f11 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -32,7 +32,6 @@ #include <linux/security.h> #include <linux/bootmem.h> #include <linux/syscalls.h> -#include <linux/jiffies.h> #include <asm/uaccess.h> @@ -93,16 +92,16 @@ static int console_locked, console_suspended; */ static DEFINE_SPINLOCK(logbuf_lock); -#define LOG_BUF_MASK (log_buf_len-1) +#define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) /* * The indices into log_buf are not constrained to log_buf_len - they * must be masked before subscripting */ -static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */ -static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */ -static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */ +static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ +static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ +static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ /* * Array of consoles built from command line options (console=) @@ -128,17 +127,17 @@ static int console_may_schedule; static char __log_buf[__LOG_BUF_LEN]; static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; -static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */ +static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ static int __init log_buf_len_setup(char *str) { - unsigned long size = memparse(str, &str); + unsigned size = memparse(str, &str); unsigned long flags; if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) { - unsigned long start, dest_idx, offset; + unsigned start, dest_idx, offset; char *new_log_buf; new_log_buf = alloc_bootmem(size); @@ -295,7 +294,7 @@ int log_buf_read(int idx) */ int do_syslog(int type, char __user *buf, int len) { - unsigned long i, j, limit, count; + unsigned i, j, limit, count; int do_clear = 0; char c; int error = 0; @@ -436,7 +435,7 @@ asmlinkage long sys_syslog(int type, char __user *buf, int len) /* * Call the console drivers on a range of log_buf */ -static void __call_console_drivers(unsigned long start, unsigned long end) +static void __call_console_drivers(unsigned start, 
unsigned end) { struct console *con; @@ -463,8 +462,8 @@ early_param("ignore_loglevel", ignore_loglevel_setup); /* * Write out chars from start to end - 1 inclusive */ -static void _call_console_drivers(unsigned long start, - unsigned long end, int msg_log_level) +static void _call_console_drivers(unsigned start, + unsigned end, int msg_log_level) { if ((msg_log_level < console_loglevel || ignore_loglevel) && console_drivers && start != end) { @@ -484,12 +483,12 @@ static void _call_console_drivers(unsigned long start, * log_buf[start] to log_buf[end - 1]. * The console_sem must be held. */ -static void call_console_drivers(unsigned long start, unsigned long end) +static void call_console_drivers(unsigned start, unsigned end) { - unsigned long cur_index, start_print; + unsigned cur_index, start_print; static int msg_level = -1; - BUG_ON(((long)(start - end)) > 0); + BUG_ON(((int)(start - end)) > 0); cur_index = start; start_print = start; @@ -567,19 +566,6 @@ static int printk_time = 0; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); -static int __init printk_time_setup(char *str) -{ - if (*str) - return 0; - printk_time = 1; - printk(KERN_NOTICE "The 'time' option is deprecated and " - "is scheduled for removal in early 2008\n"); - printk(KERN_NOTICE "Use 'printk.time=<value>' instead\n"); - return 1; -} - -__setup("time", printk_time_setup); - /* Check if we have any console registered that can be called early in boot. */ static int have_callable_console(void) { @@ -790,7 +776,7 @@ asmlinkage long sys_syslog(int type, char __user *buf, int len) return -ENOSYS; } -static void call_console_drivers(unsigned long start, unsigned long end) +static void call_console_drivers(unsigned start, unsigned end) { } @@ -983,8 +969,8 @@ void wake_up_klogd(void) void release_console_sem(void) { unsigned long flags; - unsigned long _con_start, _log_end; - unsigned long wake_klogd = 0; + unsigned _con_start, _log_end; + unsigned wake_klogd = 0; if (console_suspended) { up(&secondary_console_sem); @@ -1265,6 +1251,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) return; } +#if defined CONFIG_PRINTK /* * printk rate limiting, lifted from the networking subsystem. 
* @@ -1275,7 +1262,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { static DEFINE_SPINLOCK(ratelimit_lock); - static unsigned long toks = 10 * 5 * HZ; + static unsigned toks = 10 * 5 * HZ; static unsigned long last_msg; static int missed; unsigned long flags; @@ -1334,3 +1321,4 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } EXPORT_SYMBOL(printk_timed_ratelimit); +#endif diff --git a/kernel/profile.c b/kernel/profile.c index e64c2da11c0..3b7a1b05512 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -20,7 +20,6 @@ #include <linux/mm.h> #include <linux/cpumask.h> #include <linux/cpu.h> -#include <linux/profile.h> #include <linux/highmem.h> #include <linux/mutex.h> #include <asm/sections.h> diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b0d4ab4dfd3..fdb34e86f92 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -20,6 +20,7 @@ #include <linux/signal.h> #include <linux/audit.h> #include <linux/pid_namespace.h> +#include <linux/syscalls.h> #include <asm/pgtable.h> #include <asm/uaccess.h> @@ -53,7 +54,7 @@ void ptrace_untrace(struct task_struct *child) spin_lock(&child->sighand->siglock); if (task_is_traced(child)) { if (child->signal->flags & SIGNAL_STOP_STOPPED) { - child->state = TASK_STOPPED; + __set_task_state(child, TASK_STOPPED); } else { signal_wake_up(child, 1); } @@ -98,23 +99,23 @@ int ptrace_check_attach(struct task_struct *child, int kill) * be changed by us so it's not changing right after this. */ read_lock(&tasklist_lock); - if ((child->ptrace & PT_PTRACED) && child->parent == current && - (!(child->ptrace & PT_ATTACHED) || child->real_parent != current) - && child->signal != NULL) { + if ((child->ptrace & PT_PTRACED) && child->parent == current) { ret = 0; + /* + * child->sighand can't be NULL, release_task() + * does ptrace_unlink() before __exit_signal(). + */ spin_lock_irq(&child->sighand->siglock); - if (task_is_stopped(child)) { + if (task_is_stopped(child)) child->state = TASK_TRACED; - } else if (!task_is_traced(child) && !kill) { + else if (!task_is_traced(child) && !kill) ret = -ESRCH; - } spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !kill) { + if (!ret && !kill) wait_task_inactive(child); - } /* All systems go.. */ return ret; @@ -201,8 +202,7 @@ repeat: goto bad; /* Go */ - task->ptrace |= PT_PTRACED | ((task->real_parent != current) - ? PT_ATTACHED : 0); + task->ptrace |= PT_PTRACED; if (capable(CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 760dfc233a0..c09605f8d16 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -/* Because of FASTCALL declaration of complete, we use this wrapper */ +/* + * Awaken the corresponding synchronize_rcu() instance now that a + * grace period has elapsed. + */ static void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; diff --git a/kernel/relay.c b/kernel/relay.c index 7c0373322f1..d080b9d161a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -37,37 +37,31 @@ static void relay_file_mmap_close(struct vm_area_struct *vma) } /* - * nopage() vm_op implementation for relay file mapping. + * fault() vm_op implementation for relay file mapping. 
*/ -static struct page *relay_buf_nopage(struct vm_area_struct *vma, - unsigned long address, - int *type) +static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page; struct rchan_buf *buf = vma->vm_private_data; - unsigned long offset = address - vma->vm_start; + pgoff_t pgoff = vmf->pgoff; - if (address > vma->vm_end) - return NOPAGE_SIGBUS; /* Disallow mremap */ if (!buf) - return NOPAGE_OOM; + return VM_FAULT_OOM; - page = vmalloc_to_page(buf->start + offset); + page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT)); if (!page) - return NOPAGE_OOM; + return VM_FAULT_SIGBUS; get_page(page); + vmf->page = page; - if (type) - *type = VM_FAULT_MINOR; - - return page; + return 0; } /* * vm_ops for relay file mappings. */ static struct vm_operations_struct relay_file_mmap_ops = { - .nopage = relay_buf_nopage, + .fault = relay_buf_fault, .close = relay_file_mmap_close, }; diff --git a/kernel/res_counter.c b/kernel/res_counter.c new file mode 100644 index 00000000000..16cbec2d5d6 --- /dev/null +++ b/kernel/res_counter.c @@ -0,0 +1,134 @@ +/* + * resource cgroups + * + * Copyright 2007 OpenVZ SWsoft Inc + * + * Author: Pavel Emelianov <xemul@openvz.org> + * + */ + +#include <linux/types.h> +#include <linux/parser.h> +#include <linux/fs.h> +#include <linux/res_counter.h> +#include <linux/uaccess.h> + +void res_counter_init(struct res_counter *counter) +{ + spin_lock_init(&counter->lock); + counter->limit = (unsigned long long)LLONG_MAX; +} + +int res_counter_charge_locked(struct res_counter *counter, unsigned long val) +{ + if (counter->usage + val > counter->limit) { + counter->failcnt++; + return -ENOMEM; + } + + counter->usage += val; + return 0; +} + +int res_counter_charge(struct res_counter *counter, unsigned long val) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&counter->lock, flags); + ret = res_counter_charge_locked(counter, val); + spin_unlock_irqrestore(&counter->lock, flags); + return ret; +} + +void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) +{ + if (WARN_ON(counter->usage < val)) + val = counter->usage; + + counter->usage -= val; +} + +void res_counter_uncharge(struct res_counter *counter, unsigned long val) +{ + unsigned long flags; + + spin_lock_irqsave(&counter->lock, flags); + res_counter_uncharge_locked(counter, val); + spin_unlock_irqrestore(&counter->lock, flags); +} + + +static inline unsigned long long * +res_counter_member(struct res_counter *counter, int member) +{ + switch (member) { + case RES_USAGE: + return &counter->usage; + case RES_LIMIT: + return &counter->limit; + case RES_FAILCNT: + return &counter->failcnt; + }; + + BUG(); + return NULL; +} + +ssize_t res_counter_read(struct res_counter *counter, int member, + const char __user *userbuf, size_t nbytes, loff_t *pos, + int (*read_strategy)(unsigned long long val, char *st_buf)) +{ + unsigned long long *val; + char buf[64], *s; + + s = buf; + val = res_counter_member(counter, member); + if (read_strategy) + s += read_strategy(*val, s); + else + s += sprintf(s, "%llu\n", *val); + return simple_read_from_buffer((void __user *)userbuf, nbytes, + pos, buf, s - buf); +} + +ssize_t res_counter_write(struct res_counter *counter, int member, + const char __user *userbuf, size_t nbytes, loff_t *pos, + int (*write_strategy)(char *st_buf, unsigned long long *val)) +{ + int ret; + char *buf, *end; + unsigned long flags; + unsigned long long tmp, *val; + + buf = kmalloc(nbytes + 1, GFP_KERNEL); + ret = -ENOMEM; + if (buf == NULL) + goto 
out; + + buf[nbytes] = '\0'; + ret = -EFAULT; + if (copy_from_user(buf, userbuf, nbytes)) + goto out_free; + + ret = -EINVAL; + + if (write_strategy) { + if (write_strategy(buf, &tmp)) { + goto out_free; + } + } else { + tmp = simple_strtoull(buf, &end, 10); + if (*end != '\0') + goto out_free; + } + spin_lock_irqsave(&counter->lock, flags); + val = res_counter_member(counter, member); + *val = tmp; + spin_unlock_irqrestore(&counter->lock, flags); + ret = nbytes; +out_free: + kfree(buf); +out: + return ret; +} diff --git a/kernel/resource.c b/kernel/resource.c index 2eb553d9b51..82aea814d40 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -228,7 +228,7 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* * Finds the lowest memory reosurce exists within [res->start.res->end) * the caller must specify res->start, res->end, res->flags. diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 56d73cb8826..5fcb4fe645e 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -130,7 +130,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, task = rt_mutex_owner(act_waiter->lock); if (task && task != current) { - act_waiter->deadlock_task_pid = task->pid; + act_waiter->deadlock_task_pid = get_pid(task_pid(task)); act_waiter->deadlock_lock = lock; } } @@ -142,9 +142,12 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) if (!waiter->deadlock_lock || !rt_trace_on) return; - task = find_task_by_pid(waiter->deadlock_task_pid); - if (!task) + rcu_read_lock(); + task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); + if (!task) { + rcu_read_unlock(); return; + } TRACE_OFF_NOLOCK(); @@ -173,6 +176,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) current->comm, task_pid_nr(current)); dump_stack(); debug_show_all_locks(); + rcu_read_unlock(); printk("[ turning off deadlock detection." "Please report this trace. 
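For context on the new kernel/res_counter.c introduced above, a minimal sketch of how a resource controller might drive the counter; my_counter and the PAGE_SIZE charge are illustrative, the functions are the ones added by this file:

	static struct res_counter my_counter;

	res_counter_init(&my_counter);			/* limit starts at LLONG_MAX */

	if (res_counter_charge(&my_counter, PAGE_SIZE))
		return -ENOMEM;				/* over the limit; failcnt was bumped */

	/* ... the charged resource is in use ... */

	res_counter_uncharge(&my_counter, PAGE_SIZE);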
]\n\n"); @@ -203,10 +207,12 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) memset(waiter, 0x11, sizeof(*waiter)); plist_node_init(&waiter->list_entry, MAX_PRIO); plist_node_init(&waiter->pi_list_entry, MAX_PRIO); + waiter->deadlock_task_pid = NULL; } void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { + put_pid(waiter->deadlock_task_pid); TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); TRACE_WARN_ON(waiter->task); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 0deef71ff8d..6522ae5b14a 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) + if (unlikely(timeout)) { hrtimer_start(&timeout->timer, timeout->timer.expires, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } for (;;) { /* Try to acquire the lock: */ diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 2d3b83593ca..e124bf5800e 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -51,7 +51,7 @@ struct rt_mutex_waiter { struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; - pid_t deadlock_task_pid; + struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif }; diff --git a/kernel/sched.c b/kernel/sched.c index 9474b23c28b..f28f19e65b5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -155,7 +155,7 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED #include <linux/cgroup.h> @@ -165,19 +165,16 @@ static LIST_HEAD(task_groups); /* task group related information */ struct task_group { -#ifdef CONFIG_FAIR_CGROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED struct cgroup_subsys_state css; #endif + +#ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; - - unsigned int rt_ratio; - /* * shares assigned to a task group governs how much of cpu bandwidth * is allocated to the group. 
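The rtmutex-debug hunks above replace a stored pid_t with a reference-counted struct pid, which remains safe to resolve even after the task exits and its number is recycled. The general pattern, sketched with an illustrative task pointer:

	struct task_struct *p;
	struct pid *saved = get_pid(task_pid(task));	/* keep a reference, not a number */

	/* ... arbitrarily later, perhaps after the task has exited ... */
	rcu_read_lock();
	p = pid_task(saved, PIDTYPE_PID);	/* NULL once the task is really gone */
	if (p)
		printk(KERN_DEBUG "%s/%d is still around\n",
		       p->comm, task_pid_nr(p));
	rcu_read_unlock();

	put_pid(saved);				/* drop the reference when done */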
The more shares a group has, the more is @@ -213,33 +210,46 @@ struct task_group { * */ unsigned long shares; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + u64 rt_runtime; +#endif struct rcu_head rcu; struct list_head list; }; +#ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; - static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; static struct rt_rq *init_rt_rq_p[NR_CPUS]; +#endif -/* task_group_mutex serializes add/remove of task groups and also changes to +/* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. */ -static DEFINE_MUTEX(task_group_mutex); +static DEFINE_SPINLOCK(task_group_lock); /* doms_cur_mutex serializes access to doms_cur[] array */ static DEFINE_MUTEX(doms_cur_mutex); +#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP /* kernel thread that runs rebalance_shares() periodically */ static struct task_struct *lb_monitor_task; @@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused); static void set_se_shares(struct sched_entity *se, unsigned long shares); +#ifdef CONFIG_USER_SCHED +# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) +#else +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD +#endif + +#define MIN_GROUP_SHARES 2 + +static int init_task_group_load = INIT_TASK_GROUP_LOAD; +#endif + /* Default task group. * Every task in system belong to this group at bootup. 
*/ struct task_group init_task_group = { +#ifdef CONFIG_FAIR_GROUP_SCHED .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, +#endif +#ifdef CONFIG_RT_GROUP_SCHED .rt_se = init_sched_rt_entity_p, .rt_rq = init_rt_rq_p, -}; - -#ifdef CONFIG_FAIR_USER_SCHED -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else -# define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif - -#define MIN_GROUP_SHARES 2 - -static int init_task_group_load = INIT_TASK_GROUP_LOAD; +}; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) { struct task_group *tg; -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED tg = p->user->tg; -#elif defined(CONFIG_FAIR_CGROUP_SCHED) +#elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); #else @@ -288,21 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { +#ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; +#endif +#ifdef CONFIG_RT_GROUP_SCHED p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.parent = task_group(p)->rt_se[cpu]; -} - -static inline void lock_task_group_list(void) -{ - mutex_lock(&task_group_mutex); -} - -static inline void unlock_task_group_list(void) -{ - mutex_unlock(&task_group_mutex); +#endif } static inline void lock_doms_cur(void) @@ -318,12 +327,10 @@ static inline void unlock_doms_cur(void) #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_task_group_list(void) { } -static inline void unlock_task_group_list(void) { } static inline void lock_doms_cur(void) { } static inline void unlock_doms_cur(void) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -363,7 +370,7 @@ struct cfs_rq { struct rt_rq { struct rt_prio_array active; unsigned long rt_nr_running; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED int highest_prio; /* highest queued rt task prio */ #endif #ifdef CONFIG_SMP @@ -373,7 +380,9 @@ struct rt_rq { int rt_throttled; u64 rt_time; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + struct rq *rq; struct list_head leaf_rt_rq_list; struct task_group *tg; @@ -447,6 +456,8 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; +#endif +#ifdef CONFIG_RT_GROUP_SCHED struct list_head leaf_rt_rq_list; #endif @@ -652,19 +663,21 @@ const_debug unsigned int sysctl_sched_features = const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * period over which we measure -rt task cpu usage in ms. + * period over which we measure -rt task cpu usage in us. * default: 1s */ -const_debug unsigned int sysctl_sched_rt_period = 1000; +unsigned int sysctl_sched_rt_period = 1000000; -#define SCHED_RT_FRAC_SHIFT 16 -#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; /* - * ratio of time -rt tasks may consume. - * default: 95% + * single value that denotes runtime == period, ie unlimited time. 
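The two knobs above replace the old fixed-point rt_ratio: period and runtime budget are both expressed in microseconds and converted to nanoseconds when copied into a task group (see the sched_init() hunk further down). With the defaults, realtime tasks may consume at most 0.95 s of CPU per 1 s period. Illustrative arithmetic:

	/* defaults introduced by this patch, converted as sched_init() does */
	u64 period_ns  = (u64)sysctl_sched_rt_period  * NSEC_PER_USEC;	/* 1,000,000,000 ns */
	u64 runtime_ns = (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;	/*   950,000,000 ns */

	/* a group whose rt_runtime is 0 admits no realtime tasks at all,
	 * while RUNTIME_INF ((u64)~0ULL) means the group is never throttled */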
*/ -const_debug unsigned int sysctl_sched_rt_ratio = 62259; +#define RUNTIME_INF ((u64)~0ULL) /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu @@ -1893,13 +1906,13 @@ out: return success; } -int fastcall wake_up_process(struct task_struct *p) +int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); } EXPORT_SYMBOL(wake_up_process); -int fastcall wake_up_state(struct task_struct *p, unsigned int state) +int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); } @@ -1986,7 +1999,7 @@ void sched_fork(struct task_struct *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; @@ -3753,7 +3766,7 @@ void scheduler_tick(void) #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) -void fastcall add_preempt_count(int val) +void add_preempt_count(int val) { /* * Underflow? @@ -3769,7 +3782,7 @@ void fastcall add_preempt_count(int val) } EXPORT_SYMBOL(add_preempt_count); -void fastcall sub_preempt_count(int val) +void sub_preempt_count(int val) { /* * Underflow? @@ -4067,7 +4080,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function */ -void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, +void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; @@ -4081,7 +4094,7 @@ EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { __wake_up_common(q, mode, 1, 0, NULL); } @@ -4099,7 +4112,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. */ -void fastcall +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; @@ -4571,6 +4584,15 @@ recheck: return -EPERM; } +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. 
+ */ + if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + return -EPERM; +#endif + retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -7112,7 +7134,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED rt_rq->highest_prio = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP @@ -7123,7 +7145,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; #endif } @@ -7146,7 +7169,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); se->parent = NULL; } +#endif +#ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, int add) @@ -7175,7 +7200,7 @@ void __init sched_init(void) init_defrootdomain(); #endif -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); #endif @@ -7196,7 +7221,10 @@ void __init sched_init(void) &per_cpu(init_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1); - init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ +#endif +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_runtime = + sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), @@ -7303,7 +7331,7 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irq(&tasklist_lock); + read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { /* * Only normalize user tasks: @@ -7329,16 +7357,16 @@ void normalize_rt_tasks(void) continue; } - spin_lock_irqsave(&p->pi_lock, flags); + spin_lock(&p->pi_lock); rq = __task_rq_lock(p); normalize_task(rq, p); __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + spin_unlock(&p->pi_lock); } while_each_thread(g, p); - read_unlock_irq(&tasklist_lock); + read_unlock_irqrestore(&tasklist_lock, flags); } #endif /* CONFIG_MAGIC_SYSRQ */ @@ -7387,9 +7415,9 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED -#ifdef CONFIG_SMP +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP /* * distribute shares of all task groups among their schedulable entities, * to reflect load distribution across cpus. 
@@ -7540,7 +7568,8 @@ static int load_balance_monitor(void *unused) } #endif /* CONFIG_SMP */ -static void free_sched_group(struct task_group *tg) +#ifdef CONFIG_FAIR_GROUP_SCHED +static void free_fair_sched_group(struct task_group *tg) { int i; @@ -7549,49 +7578,27 @@ static void free_sched_group(struct task_group *tg) kfree(tg->cfs_rq[i]); if (tg->se) kfree(tg->se[i]); - if (tg->rt_rq) - kfree(tg->rt_rq[i]); - if (tg->rt_se) - kfree(tg->rt_se[i]); } kfree(tg->cfs_rq); kfree(tg->se); - kfree(tg->rt_rq); - kfree(tg->rt_se); - kfree(tg); } -/* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(void) +static int alloc_fair_sched_group(struct task_group *tg) { - struct task_group *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; - struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; struct rq *rq; int i; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); - if (!tg) - return ERR_PTR(-ENOMEM); - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); if (!tg->cfs_rq) goto err; tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); if (!tg->se) goto err; - tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); - if (!tg->rt_rq) - goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); - if (!tg->rt_se) - goto err; tg->shares = NICE_0_LOAD; - tg->rt_ratio = 0; /* XXX */ for_each_possible_cpu(i) { rq = cpu_rq(i); @@ -7606,6 +7613,79 @@ struct task_group *sched_create_group(void) if (!se) goto err; + init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + } + + return 1; + + err: + return 0; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, + &cpu_rq(cpu)->leaf_cfs_rq_list); +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); +} +#else +static inline void free_fair_sched_group(struct task_group *tg) +{ +} + +static inline int alloc_fair_sched_group(struct task_group *tg) +{ + return 1; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg) +{ + int i; + + for_each_possible_cpu(i) { + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->rt_rq); + kfree(tg->rt_se); +} + +static int alloc_rt_sched_group(struct task_group *tg) +{ + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + struct rq *rq; + int i; + + tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + tg->rt_runtime = 0; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + rt_rq = kmalloc_node(sizeof(struct rt_rq), GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!rt_rq) @@ -7616,20 +7696,75 @@ struct task_group *sched_create_group(void) if (!rt_se) goto err; - init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); } - lock_task_group_list(); + return 1; + + err: + return 0; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, + &cpu_rq(cpu)->leaf_rt_rq_list); +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); +} +#else +static inline void 
free_rt_sched_group(struct task_group *tg) +{ +} + +static inline int alloc_rt_sched_group(struct task_group *tg) +{ + return 1; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ +} +#endif + +static void free_sched_group(struct task_group *tg) +{ + free_fair_sched_group(tg); + free_rt_sched_group(tg); + kfree(tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(void) +{ + struct task_group *tg; + unsigned long flags; + int i; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + if (!alloc_fair_sched_group(tg)) + goto err; + + if (!alloc_rt_sched_group(tg)) + goto err; + + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = tg->cfs_rq[i]; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - rt_rq = tg->rt_rq[i]; - list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); + register_fair_sched_group(tg, i); + register_rt_sched_group(tg, i); } list_add_rcu(&tg->list, &task_groups); - unlock_task_group_list(); + spin_unlock_irqrestore(&task_group_lock, flags); return tg; @@ -7648,21 +7783,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp) /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { - struct cfs_rq *cfs_rq = NULL; - struct rt_rq *rt_rq = NULL; + unsigned long flags; int i; - lock_task_group_list(); + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - rt_rq = tg->rt_rq[i]; - list_del_rcu(&rt_rq->leaf_rt_rq_list); + unregister_fair_sched_group(tg, i); + unregister_rt_sched_group(tg, i); } list_del_rcu(&tg->list); - unlock_task_group_list(); - - BUG_ON(!cfs_rq); + spin_unlock_irqrestore(&task_group_lock, flags); /* wait for possible concurrent references to cfs_rqs complete */ call_rcu(&tg->rcu, free_sched_group_rcu); @@ -7703,6 +7833,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } +#ifdef CONFIG_FAIR_GROUP_SCHED /* rq->lock to be locked by caller */ static void set_se_shares(struct sched_entity *se, unsigned long shares) { @@ -7728,13 +7859,14 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) } } +static DEFINE_MUTEX(shares_mutex); + int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - struct cfs_rq *cfs_rq; - struct rq *rq; + unsigned long flags; - lock_task_group_list(); + mutex_lock(&shares_mutex); if (tg->shares == shares) goto done; @@ -7746,10 +7878,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * load_balance_fair) from referring to this group first, * by taking it off the rq->leaf_cfs_rq_list on each cpu. */ - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - } + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + unregister_fair_sched_group(tg, i); + spin_unlock_irqrestore(&task_group_lock, flags); /* wait for any ongoing reference to this group to finish */ synchronize_sched(); @@ -7769,13 +7901,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * Enable load balance activity on this group, by inserting it back on * each cpu's rq->leaf_cfs_rq_list. 
*/ - for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = tg->cfs_rq[i]; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - } + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + register_fair_sched_group(tg, i); + spin_unlock_irqrestore(&task_group_lock, flags); done: - unlock_task_group_list(); + mutex_unlock(&shares_mutex); return 0; } @@ -7783,35 +7914,84 @@ unsigned long sched_group_shares(struct task_group *tg) { return tg->shares; } +#endif +#ifdef CONFIG_RT_GROUP_SCHED /* - * Ensure the total rt_ratio <= sysctl_sched_rt_ratio + * Ensure that the real time constraints are schedulable. */ -int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) +static DEFINE_MUTEX(rt_constraints_mutex); + +static unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 16; + + runtime *= (1ULL << 16); + div64_64(runtime, period); + return runtime; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { struct task_group *tgi; unsigned long total = 0; + unsigned long global_ratio = + to_ratio(sysctl_sched_rt_period, + sysctl_sched_rt_runtime < 0 ? + RUNTIME_INF : sysctl_sched_rt_runtime); rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) - total += tgi->rt_ratio; - rcu_read_unlock(); + list_for_each_entry_rcu(tgi, &task_groups, list) { + if (tgi == tg) + continue; - if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) - return -EINVAL; + total += to_ratio(period, tgi->rt_runtime); + } + rcu_read_unlock(); - tg->rt_ratio = rt_ratio; - return 0; + return total + to_ratio(period, runtime) < global_ratio; } -unsigned long sched_group_rt_ratio(struct task_group *tg) +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { - return tg->rt_ratio; + u64 rt_runtime, rt_period; + int err = 0; + + rt_period = sysctl_sched_rt_period * NSEC_PER_USEC; + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us == -1) + rt_runtime = rt_period; + + mutex_lock(&rt_constraints_mutex); + if (!__rt_schedulable(tg, rt_period, rt_runtime)) { + err = -EINVAL; + goto unlock; + } + if (rt_runtime_us == -1) + rt_runtime = RUNTIME_INF; + tg->rt_runtime = rt_runtime; + unlock: + mutex_unlock(&rt_constraints_mutex); + + return err; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} +#endif +#endif /* CONFIG_GROUP_SCHED */ -#ifdef CONFIG_FAIR_CGROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) @@ -7857,9 +8037,15 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { +#ifdef CONFIG_RT_GROUP_SCHED + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + return -EINVAL; +#else /* We don't support RT-tasks being in separate groups */ if (tsk->sched_class != &fair_sched_class) return -EINVAL; +#endif return 0; } @@ -7871,6 +8057,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, sched_move_task(tsk); } +#ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { @@ -7883,31 +8070,70 @@ static u64 
cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } +#endif -static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_ratio_val) +#ifdef CONFIG_RT_GROUP_SCHED +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) { - return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); + char buffer[64]; + int retval = 0; + s64 val; + char *end; + + if (!nbytes) + return -EINVAL; + if (nbytes >= sizeof(buffer)) + return -E2BIG; + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + + /* strip newline if necessary */ + if (nbytes && (buffer[nbytes-1] == '\n')) + buffer[nbytes-1] = 0; + val = simple_strtoll(buffer, &end, 0); + if (*end) + return -EINVAL; + + /* Pass to subsystem */ + retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + if (!retval) + retval = nbytes; + return retval; } -static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) +static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { - struct task_group *tg = cgroup_tg(cgrp); + char tmp[64]; + long val = sched_group_rt_runtime(cgroup_tg(cgrp)); + int len = sprintf(tmp, "%ld\n", val); - return (u64) tg->rt_ratio; + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } +#endif static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", .read_uint = cpu_shares_read_uint, .write_uint = cpu_shares_write_uint, }, +#endif +#ifdef CONFIG_RT_GROUP_SCHED { - .name = "rt_ratio", - .read_uint = cpu_rt_ratio_read_uint, - .write_uint = cpu_rt_ratio_write_uint, + .name = "rt_runtime_us", + .read = cpu_rt_runtime_read, + .write = cpu_rt_runtime_write, }, +#endif }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) @@ -7926,7 +8152,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .early_init = 1, }; -#endif /* CONFIG_FAIR_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_CGROUP_CPUACCT diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 274b40d7bef..f54792b175b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return !list_empty(&rt_se->run_list); } -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED -static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { if (!rt_rq->tg) - return SCHED_RT_FRAC; + return RUNTIME_INF; - return rt_rq->tg->rt_ratio; + return rt_rq->tg->rt_runtime; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static void enqueue_rt_entity(struct sched_rt_entity *rt_se); static void dequeue_rt_entity(struct sched_rt_entity *rt_se); -static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) +static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { struct sched_rt_entity *rt_se = rt_rq->rt_se; @@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) } } -static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) +static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { struct sched_rt_entity *rt_se = rt_rq->rt_se; @@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } +static inline int rt_rq_throttled(struct 
rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} + +static int rt_se_boosted(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = group_rt_rq(rt_se); + struct task_struct *p; + + if (rt_rq) + return !!rt_rq->rt_nr_boosted; + + p = rt_task_of(rt_se); + return p->prio != p->normal_prio; +} + #else -static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - return sysctl_sched_rt_ratio; + if (sysctl_sched_rt_runtime == -1) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return NULL; } -static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { } -static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq) @@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } -static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) +static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { - unsigned int rt_ratio = sched_rt_ratio(rt_rq); - u64 period, ratio; + u64 runtime = sched_rt_runtime(rt_rq); - if (rt_ratio == SCHED_RT_FRAC) + if (runtime == RUNTIME_INF) return 0; if (rt_rq->rt_throttled) - return 1; - - period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; - ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; + return rt_rq_throttled(rt_rq); - if (rt_rq->rt_time > ratio) { + if (rt_rq->rt_time > runtime) { struct rq *rq = rq_of_rt_rq(rt_rq); rq->rt_throttled = 1; rt_rq->rt_throttled = 1; - sched_rt_ratio_dequeue(rt_rq); - return 1; + if (rt_rq_throttled(rt_rq)) { + sched_rt_rq_dequeue(rt_rq); + return 1; + } } return 0; @@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq) u64 period; while (rq->clock > rq->rt_period_expire) { - period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; + period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; rq->rt_period_expire += period; for_each_leaf_rt_rq(rt_rq, rq) { - unsigned long rt_ratio = sched_rt_ratio(rt_rq); - u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; + u64 runtime = sched_rt_runtime(rt_rq); - rt_rq->rt_time -= min(rt_rq->rt_time, ratio); - if (rt_rq->rt_throttled) { + rt_rq->rt_time -= min(rt_rq->rt_time, runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { rt_rq->rt_throttled = 0; - sched_rt_ratio_enqueue(rt_rq); + sched_rt_rq_enqueue(rt_rq); } } @@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq) cpuacct_charge(curr, delta_exec); rt_rq->rt_time += delta_exec; - /* - * might make it a tad more accurate: - * - * update_sched_rt_period(rq); - */ - if (sched_rt_ratio_exceeded(rt_rq)) + if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); } @@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { WARN_ON(!rt_prio(rt_se_prio(rt_se))); rt_rq->rt_nr_running++; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_se_prio(rt_se) < rt_rq->highest_prio) rt_rq->highest_prio = 
rt_se_prio(rt_se); #endif @@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rq_of_rt_rq(rt_rq)); #endif +#ifdef CONFIG_RT_GROUP_SCHED + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted++; +#endif } static inline @@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running--; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_rq->rt_nr_running) { struct rt_prio_array *array; @@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ +#ifdef CONFIG_RT_GROUP_SCHED + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted--; + + WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); +#endif } static void enqueue_rt_entity(struct sched_rt_entity *rt_se) @@ -303,7 +329,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); - if (group_rq && group_rq->rt_throttled) + if (group_rq && rt_rq_throttled(group_rq)) return; list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); @@ -496,7 +522,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (unlikely(!rt_rq->rt_nr_running)) return NULL; - if (sched_rt_ratio_exceeded(rt_rq)) + if (rt_rq_throttled(rt_rq)) return NULL; do { diff --git a/kernel/signal.c b/kernel/signal.c index 6a5f97cd337..84917fe507f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -972,7 +972,7 @@ void zap_other_threads(struct task_struct *p) } } -int fastcall __fatal_signal_pending(struct task_struct *tsk) +int __fatal_signal_pending(struct task_struct *tsk) { return sigismember(&tsk->pending.signal, SIGKILL); } @@ -1018,7 +1018,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) } /* - * kill_pgrp_info() sends a signal to a process group: this is what the tty + * __kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) */ @@ -1037,30 +1037,28 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) return success ? 0 : retval; } -int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) -{ - int retval; - - read_lock(&tasklist_lock); - retval = __kill_pgrp_info(sig, info, pgrp); - read_unlock(&tasklist_lock); - - return retval; -} - int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) { - int error; + int error = -ESRCH; struct task_struct *p; rcu_read_lock(); if (unlikely(sig_needs_tasklist(sig))) read_lock(&tasklist_lock); +retry: p = pid_task(pid, PIDTYPE_PID); - error = -ESRCH; - if (p) + if (p) { error = group_send_sig_info(sig, info, p); + if (unlikely(error == -ESRCH)) + /* + * The task was unhashed in between, try again. + * If it is dead, pid_task() will return NULL, + * if we race with de_thread() it will find the + * new leader. 
+ */ + goto retry; + } if (unlikely(sig_needs_tasklist(sig))) read_unlock(&tasklist_lock); @@ -1125,14 +1123,22 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); static int kill_something_info(int sig, struct siginfo *info, int pid) { int ret; - rcu_read_lock(); - if (!pid) { - ret = kill_pgrp_info(sig, info, task_pgrp(current)); - } else if (pid == -1) { + + if (pid > 0) { + rcu_read_lock(); + ret = kill_pid_info(sig, info, find_vpid(pid)); + rcu_read_unlock(); + return ret; + } + + read_lock(&tasklist_lock); + if (pid != -1) { + ret = __kill_pgrp_info(sig, info, + pid ? find_vpid(-pid) : task_pgrp(current)); + } else { int retval = 0, count = 0; struct task_struct * p; - read_lock(&tasklist_lock); for_each_process(p) { if (p->pid > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p); @@ -1141,14 +1147,10 @@ static int kill_something_info(int sig, struct siginfo *info, int pid) retval = err; } } - read_unlock(&tasklist_lock); ret = count ? retval : -ESRCH; - } else if (pid < 0) { - ret = kill_pgrp_info(sig, info, find_vpid(-pid)); - } else { - ret = kill_pid_info(sig, info, find_vpid(pid)); } - rcu_read_unlock(); + read_unlock(&tasklist_lock); + return ret; } @@ -1196,20 +1198,6 @@ send_sig(int sig, struct task_struct *p, int priv) return send_sig_info(sig, __si_special(priv), p); } -/* - * This is the entry point for "process-wide" signals. - * They will go to an appropriate thread in the thread group. - */ -int -send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) -{ - int ret; - read_lock(&tasklist_lock); - ret = group_send_sig_info(sig, info, p); - read_unlock(&tasklist_lock); - return ret; -} - void force_sig(int sig, struct task_struct *p) { @@ -1237,7 +1225,13 @@ force_sigsegv(int sig, struct task_struct *p) int kill_pgrp(struct pid *pid, int sig, int priv) { - return kill_pgrp_info(sig, __si_special(priv), pid); + int ret; + + read_lock(&tasklist_lock); + ret = __kill_pgrp_info(sig, __si_special(priv), pid); + read_unlock(&tasklist_lock); + + return ret; } EXPORT_SYMBOL(kill_pgrp); @@ -1556,11 +1550,6 @@ static inline int may_ptrace_stop(void) { if (!likely(current->ptrace & PT_PTRACED)) return 0; - - if (unlikely(current->parent == current->real_parent && - (current->ptrace & PT_ATTACHED))) - return 0; - /* * Are we in the middle of do_coredump? * If so and our tracer is also part of the coredump stopping @@ -1578,6 +1567,17 @@ static inline int may_ptrace_stop(void) } /* + * Return nonzero if there is a SIGKILL that should be waking us up. + * Called with the siglock held. + */ +static int sigkill_pending(struct task_struct *tsk) +{ + return ((sigismember(&tsk->pending.signal, SIGKILL) || + sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) && + !unlikely(sigismember(&tsk->blocked, SIGKILL))); +} + +/* * This must be called with current->sighand->siglock held. * * This should be the path for all ptrace stops. @@ -1585,11 +1585,31 @@ static inline int may_ptrace_stop(void) * That makes it a way to test a stopped process for * being ptrace-stopped vs being job-control-stopped. * - * If we actually decide not to stop at all because the tracer is gone, - * we leave nostop_code in current->exit_code. + * If we actually decide not to stop at all because the tracer + * is gone, we keep current->exit_code unless clear_code. 
*/ -static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) +static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) { + int killed = 0; + + if (arch_ptrace_stop_needed(exit_code, info)) { + /* + * The arch code has something special to do before a + * ptrace stop. This is allowed to block, e.g. for faults + * on user stack pages. We can't keep the siglock while + * calling arch_ptrace_stop, so we must release it now. + * To preserve proper semantics, we must do this before + * any signal bookkeeping like checking group_stop_count. + * Meanwhile, a SIGKILL could come in before we retake the + * siglock. That must prevent us from sleeping in TASK_TRACED. + * So after regaining the lock, we must check for SIGKILL. + */ + spin_unlock_irq(&current->sighand->siglock); + arch_ptrace_stop(exit_code, info); + spin_lock_irq(&current->sighand->siglock); + killed = sigkill_pending(current); + } + /* * If there is a group stop in progress, * we must participate in the bookkeeping. @@ -1601,22 +1621,23 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) current->exit_code = exit_code; /* Let the debugger run. */ - set_current_state(TASK_TRACED); + __set_current_state(TASK_TRACED); spin_unlock_irq(&current->sighand->siglock); try_to_freeze(); read_lock(&tasklist_lock); - if (may_ptrace_stop()) { + if (!unlikely(killed) && may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); } else { /* * By the time we got the lock, our tracer went away. - * Don't stop here. + * Don't drop the lock yet, another tracer may come. */ + __set_current_state(TASK_RUNNING); + if (clear_code) + current->exit_code = 0; read_unlock(&tasklist_lock); - set_current_state(TASK_RUNNING); - current->exit_code = nostop_code; } /* @@ -1649,7 +1670,7 @@ void ptrace_notify(int exit_code) /* Let the debugger run. */ spin_lock_irq(&current->sighand->siglock); - ptrace_stop(exit_code, 0, &info); + ptrace_stop(exit_code, 1, &info); spin_unlock_irq(&current->sighand->siglock); } @@ -1712,7 +1733,7 @@ static int do_signal_stop(int signr) * stop is always done with the siglock held, * so this check has no races. */ - if (!t->exit_state && + if (!(t->flags & PF_EXITING) && !task_is_stopped_or_traced(t)) { stop_count++; signal_wake_up(t, 0); @@ -1756,7 +1777,7 @@ relock: ptrace_signal_deliver(regs, cookie); /* Let the debugger run. */ - ptrace_stop(signr, signr, info); + ptrace_stop(signr, 0, info); /* We're back. Did the debugger cancel the sig? */ signr = current->exit_code; @@ -1873,6 +1894,48 @@ relock: return signr; } +void exit_signals(struct task_struct *tsk) +{ + int group_stop = 0; + struct task_struct *t; + + if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { + tsk->flags |= PF_EXITING; + return; + } + + spin_lock_irq(&tsk->sighand->siglock); + /* + * From now this task is not visible for group-wide signals, + * see wants_signal(), do_signal_stop(). + */ + tsk->flags |= PF_EXITING; + if (!signal_pending(tsk)) + goto out; + + /* It could be that __group_complete_signal() choose us to + * notify about group-wide signal. Another thread should be + * woken now to take the signal since we will not. 
+ */ + for (t = tsk; (t = next_thread(t)) != tsk; ) + if (!signal_pending(t) && !(t->flags & PF_EXITING)) + recalc_sigpending_and_wake(t); + + if (unlikely(tsk->signal->group_stop_count) && + !--tsk->signal->group_stop_count) { + tsk->signal->flags = SIGNAL_STOP_STOPPED; + group_stop = 1; + } +out: + spin_unlock_irq(&tsk->sighand->siglock); + + if (unlikely(group_stop)) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(tsk, CLD_STOPPED); + read_unlock(&tasklist_lock); + } +} + EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); diff --git a/kernel/softirq.c b/kernel/softirq.c index d7837d45419..5b3aea5f471 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -320,7 +320,7 @@ void irq_exit(void) /* * This function must run with irqs disabled! */ -inline fastcall void raise_softirq_irqoff(unsigned int nr) +inline void raise_softirq_irqoff(unsigned int nr) { __raise_softirq_irqoff(nr); @@ -337,7 +337,7 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr) wakeup_softirqd(); } -void fastcall raise_softirq(unsigned int nr) +void raise_softirq(unsigned int nr) { unsigned long flags; @@ -363,7 +363,7 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL }; static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL }; -void fastcall __tasklet_schedule(struct tasklet_struct *t) +void __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; @@ -376,7 +376,7 @@ void fastcall __tasklet_schedule(struct tasklet_struct *t) EXPORT_SYMBOL(__tasklet_schedule); -void fastcall __tasklet_hi_schedule(struct tasklet_struct *t) +void __tasklet_hi_schedule(struct tasklet_struct *t) { unsigned long flags; diff --git a/kernel/srcu.c b/kernel/srcu.c index 3507cabe963..b0aeeaf22ce 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -74,7 +74,7 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) * severe errors when invoked on an active srcu_struct. That said, it * can be useful as an error check at cleanup time. */ -int srcu_readers_active(struct srcu_struct *sp) +static int srcu_readers_active(struct srcu_struct *sp) { return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); } @@ -255,4 +255,3 @@ EXPORT_SYMBOL_GPL(srcu_read_lock); EXPORT_SYMBOL_GPL(srcu_read_unlock); EXPORT_SYMBOL_GPL(synchronize_srcu); EXPORT_SYMBOL_GPL(srcu_batches_completed); -EXPORT_SYMBOL_GPL(srcu_readers_active); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 51b5ee53571..6f4e0e13f70 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -29,7 +29,6 @@ enum stopmachine_state { static enum stopmachine_state stopmachine_state; static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; -static DECLARE_MUTEX(stopmachine_mutex); static int stopmachine(void *cpu) { @@ -170,6 +169,7 @@ static int do_stop(void *_smdata) struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) { + static DEFINE_MUTEX(stopmachine_mutex); struct stop_machine_data smdata; struct task_struct *p; @@ -177,7 +177,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, smdata.data = data; init_completion(&smdata.done); - down(&stopmachine_mutex); + mutex_lock(&stopmachine_mutex); /* If they don't care which CPU fn runs on, bind to any online one. 
*/ if (cpu == NR_CPUS) @@ -193,7 +193,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, wake_up_process(p); wait_for_completion(&smdata.done); } - up(&stopmachine_mutex); + mutex_unlock(&stopmachine_mutex); return p; } diff --git a/kernel/sys.c b/kernel/sys.c index 53de35fc824..a626116af5d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -916,8 +916,8 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; - int err = -EINVAL; - struct pid_namespace *ns; + struct pid *pgrp; + int err; if (!pid) pid = task_pid_vnr(group_leader); @@ -929,12 +929,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM */ - ns = current->nsproxy->pid_ns; - write_lock_irq(&tasklist_lock); err = -ESRCH; - p = find_task_by_pid_ns(pid, ns); + p = find_task_by_vpid(pid); if (!p) goto out; @@ -942,7 +940,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->real_parent->tgid == group_leader->tgid) { + if (same_thread_group(p->real_parent, group_leader)) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; @@ -959,10 +957,12 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (p->signal->leader) goto out; + pgrp = task_pid(p); if (pgid != pid) { struct task_struct *g; - g = find_task_by_pid_type_ns(PIDTYPE_PGID, pgid, ns); + pgrp = find_vpid(pgid); + g = pid_task(pgrp, PIDTYPE_PGID); if (!g || task_session(g) != task_session(group_leader)) goto out; } @@ -971,13 +971,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (err) goto out; - if (task_pgrp_nr_ns(p, ns) != pgid) { - struct pid *pid; - + if (task_pgrp(p) != pgrp) { detach_pid(p, PIDTYPE_PGID); - pid = find_vpid(pgid); - attach_pid(p, PIDTYPE_PGID, pid); - set_task_pgrp(p, pid_nr(pid)); + attach_pid(p, PIDTYPE_PGID, pgrp); + set_task_pgrp(p, pid_nr(pgrp)); } err = 0; @@ -994,17 +991,14 @@ asmlinkage long sys_getpgid(pid_t pid) else { int retval; struct task_struct *p; - struct pid_namespace *ns; - - ns = current->nsproxy->pid_ns; read_lock(&tasklist_lock); - p = find_task_by_pid_ns(pid, ns); + p = find_task_by_vpid(pid); retval = -ESRCH; if (p) { retval = security_task_getpgid(p); if (!retval) - retval = task_pgrp_nr_ns(p, ns); + retval = task_pgrp_vnr(p); } read_unlock(&tasklist_lock); return retval; @@ -1028,19 +1022,16 @@ asmlinkage long sys_getsid(pid_t pid) else { int retval; struct task_struct *p; - struct pid_namespace *ns; - - ns = current->nsproxy->pid_ns; - read_lock(&tasklist_lock); - p = find_task_by_pid_ns(pid, ns); + rcu_read_lock(); + p = find_task_by_vpid(pid); retval = -ESRCH; if (p) { retval = security_task_getsid(p); if (!retval) - retval = task_session_nr_ns(p, ns); + retval = task_session_vnr(p); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } } @@ -1048,35 +1039,29 @@ asmlinkage long sys_getsid(pid_t pid) asmlinkage long sys_setsid(void) { struct task_struct *group_leader = current->group_leader; - pid_t session; + struct pid *sid = task_pid(group_leader); + pid_t session = pid_vnr(sid); int err = -EPERM; write_lock_irq(&tasklist_lock); - /* Fail if I am already a session leader */ if (group_leader->signal->leader) goto out; - session = group_leader->pid; /* Fail if a process group id already exists that equals the * proposed session id. 
- * - * Don't check if session id == 1 because kernel threads use this - * session id and so the check will always fail and make it so - * init cannot successfully call setsid. */ - if (session > 1 && find_task_by_pid_type_ns(PIDTYPE_PGID, - session, &init_pid_ns)) + if (pid_task(sid, PIDTYPE_PGID)) goto out; group_leader->signal->leader = 1; - __set_special_pids(session, session); + __set_special_pids(sid); spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; spin_unlock(&group_leader->sighand->siglock); - err = task_pgrp_vnr(group_leader); + err = session; out: write_unlock_irq(&tasklist_lock); return err; @@ -1145,16 +1130,16 @@ static int groups_to_user(gid_t __user *grouplist, struct group_info *group_info) { int i; - int count = group_info->ngroups; + unsigned int count = group_info->ngroups; for (i = 0; i < group_info->nblocks; i++) { - int cp_count = min(NGROUPS_PER_BLOCK, count); - int off = i * NGROUPS_PER_BLOCK; - int len = cp_count * sizeof(*grouplist); + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); - if (copy_to_user(grouplist+off, group_info->blocks[i], len)) + if (copy_to_user(grouplist, group_info->blocks[i], len)) return -EFAULT; + grouplist += NGROUPS_PER_BLOCK; count -= cp_count; } return 0; @@ -1165,16 +1150,16 @@ static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) { int i; - int count = group_info->ngroups; + unsigned int count = group_info->ngroups; for (i = 0; i < group_info->nblocks; i++) { - int cp_count = min(NGROUPS_PER_BLOCK, count); - int off = i * NGROUPS_PER_BLOCK; - int len = cp_count * sizeof(*grouplist); + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); - if (copy_from_user(group_info->blocks[i], grouplist+off, len)) + if (copy_from_user(group_info->blocks[i], grouplist, len)) return -EFAULT; + grouplist += NGROUPS_PER_BLOCK; count -= cp_count; } return 0; @@ -1472,7 +1457,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN) + if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) return -EPERM; retval = security_task_setrlimit(resource, &new_rlim); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5e2ad5bf88e..8b7e9541179 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -37,7 +37,6 @@ #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/hugetlb.h> -#include <linux/security.h> #include <linux/initrd.h> #include <linux/times.h> #include <linux/limits.h> @@ -67,14 +66,13 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; extern int sysctl_oom_kill_allocating_task; +extern int sysctl_oom_dump_tasks; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; extern int pid_max; extern int min_free_kbytes; -extern int printk_ratelimit_jiffies; -extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; @@ -313,22 +311,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_period_ms", - .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = 
&proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_ratio", - .data = &sysctl_sched_rt_ratio, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) { .ctl_name = CTL_UNNUMBERED, @@ -350,6 +332,22 @@ static struct ctl_table kern_table[] = { #endif { .ctl_name = CTL_UNNUMBERED, + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "sched_compat_yield", .data = &sysctl_sched_compat_yield, .maxlen = sizeof(unsigned int), @@ -490,14 +488,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - { - .ctl_name = KERN_PRINTK, - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, #ifdef CONFIG_KMOD { .ctl_name = KERN_MODPROBE, @@ -644,6 +634,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#if defined CONFIG_PRINTK + { + .ctl_name = KERN_PRINTK, + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = KERN_PRINTK_RATELIMIT, .procname = "printk_ratelimit", @@ -661,6 +660,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif { .ctl_name = KERN_NGROUPS_MAX, .procname = "ngroups_max", @@ -871,6 +871,14 @@ static struct ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, @@ -970,10 +978,10 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", - .data = &nr_overcommit_huge_pages, - .maxlen = sizeof(nr_overcommit_huge_pages), + .data = &sysctl_overcommit_huge_pages, + .maxlen = sizeof(sysctl_overcommit_huge_pages), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &hugetlb_overcommit_handler, }, #endif { @@ -1203,6 +1211,14 @@ static struct ctl_table fs_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = FS_DENTRY, .procname = "dentry-state", .data = &dentry_stat, @@ -2471,7 +2487,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp pid_t tmp; int r; - tmp = pid_nr_ns(cad_pid, current->nsproxy->pid_ns); + tmp = pid_vnr(cad_pid); r = __do_proc_dointvec(&tmp, table, write, filp, buffer, lenp, ppos, NULL, NULL); diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 006365b69ea..c09350d564f 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -8,10 +8,10 @@ struct trans_ctl_table { int ctl_name; const char *procname; - struct trans_ctl_table *child; + const struct trans_ctl_table *child; }; -static struct trans_ctl_table trans_random_table[] = { 
+static const struct trans_ctl_table trans_random_table[] = { { RANDOM_POOLSIZE, "poolsize" }, { RANDOM_ENTROPY_COUNT, "entropy_avail" }, { RANDOM_READ_THRESH, "read_wakeup_threshold" }, @@ -21,13 +21,13 @@ static struct trans_ctl_table trans_random_table[] = { {} }; -static struct trans_ctl_table trans_pty_table[] = { +static const struct trans_ctl_table trans_pty_table[] = { { PTY_MAX, "max" }, { PTY_NR, "nr" }, {} }; -static struct trans_ctl_table trans_kern_table[] = { +static const struct trans_ctl_table trans_kern_table[] = { { KERN_OSTYPE, "ostype" }, { KERN_OSRELEASE, "osrelease" }, /* KERN_OSREV not used */ @@ -107,7 +107,7 @@ static struct trans_ctl_table trans_kern_table[] = { {} }; -static struct trans_ctl_table trans_vm_table[] = { +static const struct trans_ctl_table trans_vm_table[] = { { VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, { VM_PAGE_CLUSTER, "page-cluster" }, { VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, @@ -139,7 +139,7 @@ static struct trans_ctl_table trans_vm_table[] = { {} }; -static struct trans_ctl_table trans_net_core_table[] = { +static const struct trans_ctl_table trans_net_core_table[] = { { NET_CORE_WMEM_MAX, "wmem_max" }, { NET_CORE_RMEM_MAX, "rmem_max" }, { NET_CORE_WMEM_DEFAULT, "wmem_default" }, @@ -165,14 +165,14 @@ static struct trans_ctl_table trans_net_core_table[] = { {}, }; -static struct trans_ctl_table trans_net_unix_table[] = { +static const struct trans_ctl_table trans_net_unix_table[] = { /* NET_UNIX_DESTROY_DELAY unused */ /* NET_UNIX_DELETE_DELAY unused */ { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, {} }; -static struct trans_ctl_table trans_net_ipv4_route_table[] = { +static const struct trans_ctl_table trans_net_ipv4_route_table[] = { { NET_IPV4_ROUTE_FLUSH, "flush" }, { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" }, { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" }, @@ -195,7 +195,7 @@ static struct trans_ctl_table trans_net_ipv4_route_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { +static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { { NET_IPV4_CONF_FORWARDING, "forwarding" }, { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, @@ -222,14 +222,14 @@ static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv4_conf_table[] = { +static const struct trans_ctl_table trans_net_ipv4_conf_table[] = { { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table }, { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table }, { 0, NULL, trans_net_ipv4_conf_vars_table }, {} }; -static struct trans_ctl_table trans_net_neigh_vars_table[] = { +static const struct trans_ctl_table trans_net_neigh_vars_table[] = { { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, { NET_NEIGH_APP_SOLICIT, "app_solicit" }, @@ -251,13 +251,13 @@ static struct trans_ctl_table trans_net_neigh_vars_table[] = { {} }; -static struct trans_ctl_table trans_net_neigh_table[] = { +static const struct trans_ctl_table trans_net_neigh_table[] = { { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table }, { 0, NULL, trans_net_neigh_vars_table }, {} }; -static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { +static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" }, @@ -294,7 +294,7 @@ static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { {} }; 
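All of the tables being const-ified here share the three-field trans_ctl_table shape declared at the top of kernel/sysctl_check.c: a binary ctl_name, the matching /proc/sys name, and an optional child table that turns the entry into a directory. As a minimal standalone sketch of that nesting, using a hypothetical struct name, ctl_name values and procnames (not entries from the kernel tree):

/* Sketch only: mirrors the trans_ctl_table layout above with made-up
 * entries. A non-NULL child pointer marks a directory entry; an empty
 * initializer ({}) terminates each table. */
struct demo_trans_ctl_table {
	int ctl_name;
	const char *procname;
	const struct demo_trans_ctl_table *child;
};

static const struct demo_trans_ctl_table demo_leaf_table[] = {
	{ 1, "max" },	/* binary name 1 maps to ".../max" */
	{ 2, "nr" },	/* binary name 2 maps to ".../nr"  */
	{}
};

static const struct demo_trans_ctl_table demo_root_table[] = {
	{ 10, "demo", demo_leaf_table },	/* directory with children */
	{}
};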
-static struct trans_ctl_table trans_net_ipv4_table[] = { +static const struct trans_ctl_table trans_net_ipv4_table[] = { { NET_IPV4_FORWARD, "ip_forward" }, { NET_IPV4_DYNADDR, "ip_dynaddr" }, @@ -393,13 +393,13 @@ static struct trans_ctl_table trans_net_ipv4_table[] = { {} }; -static struct trans_ctl_table trans_net_ipx_table[] = { +static const struct trans_ctl_table trans_net_ipx_table[] = { { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, /* NET_IPX_FORWARDING unused */ {} }; -static struct trans_ctl_table trans_net_atalk_table[] = { +static const struct trans_ctl_table trans_net_atalk_table[] = { { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, @@ -407,7 +407,7 @@ static struct trans_ctl_table trans_net_atalk_table[] = { {}, }; -static struct trans_ctl_table trans_net_netrom_table[] = { +static const struct trans_ctl_table trans_net_netrom_table[] = { { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, @@ -423,7 +423,7 @@ static struct trans_ctl_table trans_net_netrom_table[] = { {} }; -static struct trans_ctl_table trans_net_ax25_param_table[] = { +static const struct trans_ctl_table trans_net_ax25_param_table[] = { { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, { NET_AX25_BACKOFF_TYPE, "backoff_type" }, @@ -441,12 +441,12 @@ static struct trans_ctl_table trans_net_ax25_param_table[] = { {} }; -static struct trans_ctl_table trans_net_ax25_table[] = { +static const struct trans_ctl_table trans_net_ax25_table[] = { { 0, NULL, trans_net_ax25_param_table }, {} }; -static struct trans_ctl_table trans_net_bridge_table[] = { +static const struct trans_ctl_table trans_net_bridge_table[] = { { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" }, @@ -455,7 +455,7 @@ static struct trans_ctl_table trans_net_bridge_table[] = { {} }; -static struct trans_ctl_table trans_net_rose_table[] = { +static const struct trans_ctl_table trans_net_rose_table[] = { { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, @@ -469,7 +469,7 @@ static struct trans_ctl_table trans_net_rose_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { +static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { { NET_IPV6_FORWARDING, "forwarding" }, { NET_IPV6_HOP_LIMIT, "hop_limit" }, { NET_IPV6_MTU, "mtu" }, @@ -497,14 +497,14 @@ static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv6_conf_table[] = { +static const struct trans_ctl_table trans_net_ipv6_conf_table[] = { { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table }, { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table }, { 0, NULL, trans_net_ipv6_conf_var_table }, {} }; -static struct trans_ctl_table trans_net_ipv6_route_table[] = { +static const struct trans_ctl_table trans_net_ipv6_route_table[] = { { NET_IPV6_ROUTE_FLUSH, "flush" }, { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, { NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, @@ -518,12 +518,12 @@ static struct 
trans_ctl_table trans_net_ipv6_route_table[] = { {} }; -static struct trans_ctl_table trans_net_ipv6_icmp_table[] = { +static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = { { NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, {} }; -static struct trans_ctl_table trans_net_ipv6_table[] = { +static const struct trans_ctl_table trans_net_ipv6_table[] = { { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table }, { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table }, { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table }, @@ -538,7 +538,7 @@ static struct trans_ctl_table trans_net_ipv6_table[] = { {} }; -static struct trans_ctl_table trans_net_x25_table[] = { +static const struct trans_ctl_table trans_net_x25_table[] = { { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, @@ -548,13 +548,13 @@ static struct trans_ctl_table trans_net_x25_table[] = { {} }; -static struct trans_ctl_table trans_net_tr_table[] = { +static const struct trans_ctl_table trans_net_tr_table[] = { { NET_TR_RIF_TIMEOUT, "rif_timeout" }, {} }; -static struct trans_ctl_table trans_net_decnet_conf_vars[] = { +static const struct trans_ctl_table trans_net_decnet_conf_vars[] = { { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, { NET_DECNET_CONF_DEV_PRIORITY, "priority" }, { NET_DECNET_CONF_DEV_T2, "t2" }, @@ -562,12 +562,12 @@ static struct trans_ctl_table trans_net_decnet_conf_vars[] = { {} }; -static struct trans_ctl_table trans_net_decnet_conf[] = { +static const struct trans_ctl_table trans_net_decnet_conf[] = { { 0, NULL, trans_net_decnet_conf_vars }, {} }; -static struct trans_ctl_table trans_net_decnet_table[] = { +static const struct trans_ctl_table trans_net_decnet_table[] = { { NET_DECNET_CONF, "conf", trans_net_decnet_conf }, { NET_DECNET_NODE_ADDRESS, "node_address" }, { NET_DECNET_NODE_NAME, "node_name" }, @@ -585,7 +585,7 @@ static struct trans_ctl_table trans_net_decnet_table[] = { {} }; -static struct trans_ctl_table trans_net_sctp_table[] = { +static const struct trans_ctl_table trans_net_sctp_table[] = { { NET_SCTP_RTO_INITIAL, "rto_initial" }, { NET_SCTP_RTO_MIN, "rto_min" }, { NET_SCTP_RTO_MAX, "rto_max" }, @@ -606,7 +606,7 @@ static struct trans_ctl_table trans_net_sctp_table[] = { {} }; -static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { +static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { { NET_LLC2_ACK_TIMEOUT, "ack" }, { NET_LLC2_P_TIMEOUT, "p" }, { NET_LLC2_REJ_TIMEOUT, "rej" }, @@ -614,23 +614,23 @@ static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { {} }; -static struct trans_ctl_table trans_net_llc_station_table[] = { +static const struct trans_ctl_table trans_net_llc_station_table[] = { { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, {} }; -static struct trans_ctl_table trans_net_llc_llc2_table[] = { +static const struct trans_ctl_table trans_net_llc_llc2_table[] = { { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table }, {} }; -static struct trans_ctl_table trans_net_llc_table[] = { +static const struct trans_ctl_table trans_net_llc_table[] = { { NET_LLC2, "llc2", trans_net_llc_llc2_table }, { NET_LLC_STATION, "station", trans_net_llc_station_table }, {} }; -static struct trans_ctl_table trans_net_netfilter_table[] = { +static const struct trans_ctl_table trans_net_netfilter_table[] = { { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" }, { 
NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" }, @@ -667,12 +667,12 @@ static struct trans_ctl_table trans_net_netfilter_table[] = { {} }; -static struct trans_ctl_table trans_net_dccp_table[] = { +static const struct trans_ctl_table trans_net_dccp_table[] = { { NET_DCCP_DEFAULT, "default" }, {} }; -static struct trans_ctl_table trans_net_irda_table[] = { +static const struct trans_ctl_table trans_net_irda_table[] = { { NET_IRDA_DISCOVERY, "discovery" }, { NET_IRDA_DEVNAME, "devname" }, { NET_IRDA_DEBUG, "debug" }, @@ -690,7 +690,7 @@ static struct trans_ctl_table trans_net_irda_table[] = { {} }; -static struct trans_ctl_table trans_net_table[] = { +static const struct trans_ctl_table trans_net_table[] = { { NET_CORE, "core", trans_net_core_table }, /* NET_ETHER not used */ /* NET_802 not used */ @@ -716,7 +716,7 @@ static struct trans_ctl_table trans_net_table[] = { {} }; -static struct trans_ctl_table trans_fs_quota_table[] = { +static const struct trans_ctl_table trans_fs_quota_table[] = { { FS_DQ_LOOKUPS, "lookups" }, { FS_DQ_DROPS, "drops" }, { FS_DQ_READS, "reads" }, @@ -729,7 +729,7 @@ static struct trans_ctl_table trans_fs_quota_table[] = { {} }; -static struct trans_ctl_table trans_fs_xfs_table[] = { +static const struct trans_ctl_table trans_fs_xfs_table[] = { { XFS_RESTRICT_CHOWN, "restrict_chown" }, { XFS_SGID_INHERIT, "irix_sgid_inherit" }, { XFS_SYMLINK_MODE, "irix_symlink_mode" }, @@ -750,24 +750,24 @@ static struct trans_ctl_table trans_fs_xfs_table[] = { {} }; -static struct trans_ctl_table trans_fs_ocfs2_nm_table[] = { +static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = { { 1, "hb_ctl_path" }, {} }; -static struct trans_ctl_table trans_fs_ocfs2_table[] = { +static const struct trans_ctl_table trans_fs_ocfs2_table[] = { { 1, "nm", trans_fs_ocfs2_nm_table }, {} }; -static struct trans_ctl_table trans_inotify_table[] = { +static const struct trans_ctl_table trans_inotify_table[] = { { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, { INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, {} }; -static struct trans_ctl_table trans_fs_table[] = { +static const struct trans_ctl_table trans_fs_table[] = { { FS_NRINODE, "inode-nr" }, { FS_STATINODE, "inode-state" }, /* FS_MAXINODE unused */ @@ -793,11 +793,11 @@ static struct trans_ctl_table trans_fs_table[] = { {} }; -static struct trans_ctl_table trans_debug_table[] = { +static const struct trans_ctl_table trans_debug_table[] = { {} }; -static struct trans_ctl_table trans_cdrom_table[] = { +static const struct trans_ctl_table trans_cdrom_table[] = { { DEV_CDROM_INFO, "info" }, { DEV_CDROM_AUTOCLOSE, "autoclose" }, { DEV_CDROM_AUTOEJECT, "autoeject" }, @@ -807,12 +807,12 @@ static struct trans_ctl_table trans_cdrom_table[] = { {} }; -static struct trans_ctl_table trans_ipmi_table[] = { +static const struct trans_ctl_table trans_ipmi_table[] = { { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, {} }; -static struct trans_ctl_table trans_mac_hid_files[] = { +static const struct trans_ctl_table trans_mac_hid_files[] = { /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, @@ -822,35 +822,35 @@ static struct trans_ctl_table trans_mac_hid_files[] = { {} }; -static struct trans_ctl_table trans_raid_table[] = { +static const struct trans_ctl_table trans_raid_table[] = { { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, { 
DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, {} }; -static struct trans_ctl_table trans_scsi_table[] = { +static const struct trans_ctl_table trans_scsi_table[] = { { DEV_SCSI_LOGGING_LEVEL, "logging_level" }, {} }; -static struct trans_ctl_table trans_parport_default_table[] = { +static const struct trans_ctl_table trans_parport_default_table[] = { { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" }, { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" }, {} }; -static struct trans_ctl_table trans_parport_device_table[] = { +static const struct trans_ctl_table trans_parport_device_table[] = { { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" }, {} }; -static struct trans_ctl_table trans_parport_devices_table[] = { +static const struct trans_ctl_table trans_parport_devices_table[] = { { DEV_PARPORT_DEVICES_ACTIVE, "active" }, { 0, NULL, trans_parport_device_table }, {} }; -static struct trans_ctl_table trans_parport_parport_table[] = { +static const struct trans_ctl_table trans_parport_parport_table[] = { { DEV_PARPORT_SPINTIME, "spintime" }, { DEV_PARPORT_BASE_ADDR, "base-addr" }, { DEV_PARPORT_IRQ, "irq" }, @@ -864,13 +864,13 @@ static struct trans_ctl_table trans_parport_parport_table[] = { { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" }, {} }; -static struct trans_ctl_table trans_parport_table[] = { +static const struct trans_ctl_table trans_parport_table[] = { { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table }, { 0, NULL, trans_parport_parport_table }, {} }; -static struct trans_ctl_table trans_dev_table[] = { +static const struct trans_ctl_table trans_dev_table[] = { { DEV_CDROM, "cdrom", trans_cdrom_table }, /* DEV_HWMON unused */ { DEV_PARPORT, "parport", trans_parport_table }, @@ -881,19 +881,19 @@ static struct trans_ctl_table trans_dev_table[] = { {} }; -static struct trans_ctl_table trans_bus_isa_table[] = { +static const struct trans_ctl_table trans_bus_isa_table[] = { { BUS_ISA_MEM_BASE, "membase" }, { BUS_ISA_PORT_BASE, "portbase" }, { BUS_ISA_PORT_SHIFT, "portshift" }, {} }; -static struct trans_ctl_table trans_bus_table[] = { +static const struct trans_ctl_table trans_bus_table[] = { { CTL_BUS_ISA, "isa", trans_bus_isa_table }, {} }; -static struct trans_ctl_table trans_arlan_conf_table0[] = { +static const struct trans_ctl_table trans_arlan_conf_table0[] = { { 1, "spreadingCode" }, { 2, "channelNumber" }, { 3, "scramblingDisable" }, @@ -964,7 +964,7 @@ static struct trans_ctl_table trans_arlan_conf_table0[] = { {} }; -static struct trans_ctl_table trans_arlan_conf_table1[] = { +static const struct trans_ctl_table trans_arlan_conf_table1[] = { { 1, "spreadingCode" }, { 2, "channelNumber" }, { 3, "scramblingDisable" }, @@ -1035,7 +1035,7 @@ static struct trans_ctl_table trans_arlan_conf_table1[] = { {} }; -static struct trans_ctl_table trans_arlan_conf_table2[] = { +static const struct trans_ctl_table trans_arlan_conf_table2[] = { { 1, "spreadingCode" }, { 2, "channelNumber" }, { 3, "scramblingDisable" }, @@ -1106,7 +1106,7 @@ static struct trans_ctl_table trans_arlan_conf_table2[] = { {} }; -static struct trans_ctl_table trans_arlan_conf_table3[] = { +static const struct trans_ctl_table trans_arlan_conf_table3[] = { { 1, "spreadingCode" }, { 2, "channelNumber" }, { 3, "scramblingDisable" }, @@ -1177,7 +1177,7 @@ static struct trans_ctl_table trans_arlan_conf_table3[] = { {} }; -static struct trans_ctl_table trans_arlan_table[] = { +static const struct trans_ctl_table trans_arlan_table[] = { { 1, "arlan0", trans_arlan_conf_table0 }, { 2, "arlan1", trans_arlan_conf_table1 }, { 3, 
"arlan2", trans_arlan_conf_table2 }, @@ -1185,13 +1185,13 @@ static struct trans_ctl_table trans_arlan_table[] = { {} }; -static struct trans_ctl_table trans_s390dbf_table[] = { +static const struct trans_ctl_table trans_s390dbf_table[] = { { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, {} }; -static struct trans_ctl_table trans_sunrpc_table[] = { +static const struct trans_ctl_table trans_sunrpc_table[] = { { CTL_RPCDEBUG, "rpc_debug" }, { CTL_NFSDEBUG, "nfs_debug" }, { CTL_NFSDDEBUG, "nfsd_debug" }, @@ -1203,7 +1203,7 @@ static struct trans_ctl_table trans_sunrpc_table[] = { {} }; -static struct trans_ctl_table trans_pm_table[] = { +static const struct trans_ctl_table trans_pm_table[] = { { 1 /* CTL_PM_SUSPEND */, "suspend" }, { 2 /* CTL_PM_CMODE */, "cmode" }, { 3 /* CTL_PM_P0 */, "p0" }, @@ -1211,13 +1211,13 @@ static struct trans_ctl_table trans_pm_table[] = { {} }; -static struct trans_ctl_table trans_frv_table[] = { +static const struct trans_ctl_table trans_frv_table[] = { { 1, "cache-mode" }, { 2, "pin-cxnr" }, {} }; -static struct trans_ctl_table trans_root_table[] = { +static const struct trans_ctl_table trans_root_table[] = { { CTL_KERN, "kernel", trans_kern_table }, { CTL_VM, "vm", trans_vm_table }, { CTL_NET, "net", trans_net_table }, @@ -1261,15 +1261,14 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) return table; } -static struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table) +static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table) { struct ctl_table *test; - struct trans_ctl_table *ref; - int depth, cur_depth; + const struct trans_ctl_table *ref; + int cur_depth; - depth = sysctl_depth(table); + cur_depth = sysctl_depth(table); - cur_depth = depth; ref = trans_root_table; repeat: test = sysctl_parent(table, cur_depth); @@ -1437,7 +1436,7 @@ static void sysctl_check_leaf(struct nsproxy *namespaces, static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) { - struct trans_ctl_table *ref; + const struct trans_ctl_table *ref; ref = sysctl_binary_lookup(table); if (table->ctl_name && !ref) diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 88cdb109e13..06b6395b45b 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -135,6 +135,12 @@ static int test_jprobe(void) #ifdef CONFIG_KRETPROBES static u32 krph_val; +static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + krph_val = (rand1 / div_factor); + return 0; +} + static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long ret = regs_return_value(regs); @@ -144,13 +150,19 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) printk(KERN_ERR "Kprobe smoke test failed: " "incorrect value in kretprobe handler\n"); } + if (krph_val == 0) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "call to kretprobe entry handler failed\n"); + } - krph_val = (rand1 / div_factor); + krph_val = rand1; return 0; } static struct kretprobe rp = { .handler = return_handler, + .entry_handler = entry_handler, .kp.symbol_name = "kprobe_target" }; @@ -167,7 +179,7 @@ static int test_kretprobe(void) ret = kprobe_target(rand1); unregister_kretprobe(&rp); - if (krph_val == 0) { + if (krph_val != rand1) { printk(KERN_ERR "Kprobe smoke test failed: " "kretprobe handler not called\n"); handler_errors++; diff --git a/kernel/time.c b/kernel/time.c index 4064c0566e7..a5ec013b6c8 
100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -39,6 +39,8 @@ #include <asm/uaccess.h> #include <asm/unistd.h> +#include "timeconst.h" + /* * The timezone where the local system is located. Used as a default by some * programs who obtain this value by using gettimeofday. @@ -93,7 +95,8 @@ asmlinkage long sys_stime(time_t __user *tptr) #endif /* __ARCH_WANT_SYS_TIME */ -asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz) +asmlinkage long sys_gettimeofday(struct timeval __user *tv, + struct timezone __user *tz) { if (likely(tv != NULL)) { struct timeval ktv; @@ -118,7 +121,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us * hard to make the program warp the clock precisely n hours) or * compile in the timezone information into the kernel. Bad, bad.... * - * - TYT, 1992-01-01 + * - TYT, 1992-01-01 * * The best thing to do is to keep the CMOS clock in universal time (UTC) * as real UNIX machines always do it. This avoids all headaches about @@ -240,7 +243,11 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else - return (j * MSEC_PER_SEC) / HZ; +# if BITS_PER_LONG == 32 + return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; +# else + return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; +# endif #endif } EXPORT_SYMBOL(jiffies_to_msecs); @@ -252,7 +259,11 @@ unsigned int inline jiffies_to_usecs(const unsigned long j) #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); #else - return (j * USEC_PER_SEC) / HZ; +# if BITS_PER_LONG == 32 + return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; +# else + return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; +# endif #endif } EXPORT_SYMBOL(jiffies_to_usecs); @@ -267,7 +278,7 @@ EXPORT_SYMBOL(jiffies_to_usecs); * * This function should be only used for timestamps returned by * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because - * it doesn't handle the better resolution of the later. + * it doesn't handle the better resolution of the latter. */ struct timespec timespec_trunc(struct timespec t, unsigned gran) { @@ -315,7 +326,7 @@ EXPORT_SYMBOL_GPL(getnstimeofday); * This algorithm was first published by Gauss (I think). * * WARNING: this function will overflow on 2106-02-07 06:28:16 on - * machines were long is 32-bit! (However, as time_t is signed, we + * machines where long is 32-bit! (However, as time_t is signed, we * will already get problems at other places on 2038-01-19 03:14:08) */ unsigned long @@ -352,7 +363,7 @@ EXPORT_SYMBOL(mktime); * normalize to the timespec storage format * * Note: The tv_nsec part is always in the range of - * 0 <= tv_nsec < NSEC_PER_SEC + * 0 <= tv_nsec < NSEC_PER_SEC * For negative values only the tv_sec field is negative ! */ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) @@ -453,12 +464,13 @@ unsigned long msecs_to_jiffies(const unsigned int m) /* * Generic case - multiply, round and divide. 
But first * check that if we are doing a net multiplication, that - * we wouldnt overflow: + * we wouldn't overflow: */ if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; - return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; + return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) + >> MSEC_TO_HZ_SHR32; #endif } EXPORT_SYMBOL(msecs_to_jiffies); @@ -472,7 +484,8 @@ unsigned long usecs_to_jiffies(const unsigned int u) #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) return u * (HZ / USEC_PER_SEC); #else - return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC; + return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + >> USEC_TO_HZ_SHR32; #endif } EXPORT_SYMBOL(usecs_to_jiffies); @@ -566,7 +579,11 @@ EXPORT_SYMBOL(jiffies_to_timeval); clock_t jiffies_to_clock_t(long x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + return x * (USER_HZ / HZ); +# else return x / (HZ / USER_HZ); +# endif #else u64 tmp = (u64)x * TICK_NSEC; do_div(tmp, (NSEC_PER_SEC / USER_HZ)); @@ -599,7 +616,14 @@ EXPORT_SYMBOL(clock_t_to_jiffies); u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + x *= USER_HZ; + do_div(x, HZ); +# elif HZ > USER_HZ do_div(x, HZ / USER_HZ); +# else + /* Nothing to do */ +# endif #else /* * There are better ways that don't overflow early, @@ -611,7 +635,6 @@ u64 jiffies_64_to_clock_t(u64 x) #endif return x; } - EXPORT_SYMBOL(jiffies_64_to_clock_t); u64 nsec_to_clock_t(u64 x) @@ -646,7 +669,6 @@ u64 get_jiffies_64(void) } while (read_seqretry(&xtime_lock, seq)); return ret; } - EXPORT_SYMBOL(get_jiffies_64); #endif diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 3e59fce6dd4..3d1e3e1a197 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -133,7 +133,7 @@ static void clockevents_do_notify(unsigned long reason, void *dev) } /* - * Called after a notify add to make devices availble which were + * Called after a notify add to make devices available which were * released from the notifier call. */ static void clockevents_notify_released(void) @@ -218,6 +218,8 @@ void clockevents_exchange_device(struct clock_event_device *old, */ void clockevents_notify(unsigned long reason, void *arg) { + struct list_head *node, *tmp; + spin_lock(&clockevents_lock); clockevents_do_notify(reason, arg); @@ -227,13 +229,8 @@ void clockevents_notify(unsigned long reason, void *arg) * Unregister the clock event devices which were * released from the users in the notify chain. 
*/ - while (!list_empty(&clockevents_released)) { - struct clock_event_device *dev; - - dev = list_entry(clockevents_released.next, - struct clock_event_device, list); - list_del(&dev->list); - } + list_for_each_safe(node, tmp, &clockevents_released) + list_del(node); break; default: break; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6e9259a5d50..548c436a776 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -91,7 +91,6 @@ static void clocksource_ratewd(struct clocksource *cs, int64_t delta) cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); clocksource_change_rating(cs, 0); - cs->flags &= ~CLOCK_SOURCE_WATCHDOG; list_del(&cs->wd_list); } @@ -363,15 +362,13 @@ void clocksource_unregister(struct clocksource *cs) static ssize_t sysfs_show_current_clocksources(struct sys_device *dev, char *buf) { - char *curr = buf; + ssize_t count = 0; spin_lock_irq(&clocksource_lock); - curr += sprintf(curr, "%s ", curr_clocksource->name); + count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); spin_unlock_irq(&clocksource_lock); - curr += sprintf(curr, "\n"); - - return curr - buf; + return count; } /** @@ -439,17 +436,20 @@ static ssize_t sysfs_show_available_clocksources(struct sys_device *dev, char *buf) { struct clocksource *src; - char *curr = buf; + ssize_t count = 0; spin_lock_irq(&clocksource_lock); list_for_each_entry(src, &clocksource_list, list) { - curr += sprintf(curr, "%s ", src->name); + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), + "%s ", src->name); } spin_unlock_irq(&clocksource_lock); - curr += sprintf(curr, "\n"); + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); - return curr - buf; + return count; } /* diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index e64efaf957e..c88b5910e7a 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -43,10 +43,6 @@ long time_freq; /* frequency offset (scaled ppm)*/ static long time_reftime; /* time at last adjustment (s) */ long time_adjust; -#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE) -#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \ - (s64)CLOCK_TICK_RATE) - static void ntp_update_frequency(void) { u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 88267f0a847..fa9bb73dbdb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -681,7 +681,7 @@ int tick_check_oneshot_change(int allow_nohz) if (ts->nohz_mode != NOHZ_MODE_INACTIVE) return 0; - if (!timekeeping_is_continuous() || !tick_is_oneshot_available()) + if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cd5dbc4579c..1af9fb050fe 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -201,9 +201,9 @@ static inline s64 __get_nsec_offset(void) { return 0; } #endif /** - * timekeeping_is_continuous - check to see if timekeeping is free running + * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ -int timekeeping_is_continuous(void) +int timekeeping_valid_for_hres(void) { unsigned long seq; int ret; @@ -364,7 +364,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * with losing too many ticks, otherwise we would overadjust and * produce an even larger error. 
The smaller the adjustment the * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adusted + * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl new file mode 100644 index 00000000000..41468035473 --- /dev/null +++ b/kernel/timeconst.pl @@ -0,0 +1,402 @@ +#!/usr/bin/perl +# ----------------------------------------------------------------------- +# +# Copyright 2007 rPath, Inc. - All Rights Reserved +# +# This file is part of the Linux kernel, and is made available under +# the terms of the GNU General Public License version 2 or (at your +# option) any later version; incorporated herein by reference. +# +# ----------------------------------------------------------------------- +# + +# +# Usage: timeconst.pl HZ > timeconst.h +# + +# Precomputed values for systems without Math::BigInt +# Generated by: +# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 +%canned_values = ( + 24 => [ + '0xa6aaaaab','0x2aaaaaa',26, + '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58, + 125,3, + '0xc49ba5e4','0x1fbe76c8b4',37, + '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69, + 3,125, + '0xa2c2aaab','0xaaaa',16, + '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48, + 125000,3, + '0xc9539b89','0x7fffbce4217d',47, + '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79, + 3,125000, + ], 32 => [ + '0xfa000000','0x6000000',27, + '0xfa00000000000000','0x600000000000000',59, + 125,4, + '0x83126e98','0xfdf3b645a',36, + '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68, + 4,125, + '0xf4240000','0x0',17, + '0xf424000000000000','0x0',49, + 31250,1, + '0x8637bd06','0x3fff79c842fa',46, + '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78, + 1,31250, + ], 48 => [ + '0xa6aaaaab','0x6aaaaaa',27, + '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59, + 125,6, + '0xc49ba5e4','0xfdf3b645a',36, + '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68, + 6,125, + '0xa2c2aaab','0x15555',17, + '0xa2c2aaaaaaaaaaab','0x1555555555555',49, + 62500,3, + '0xc9539b89','0x3fffbce4217d',46, + '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78, + 3,62500, + ], 64 => [ + '0xfa000000','0xe000000',28, + '0xfa00000000000000','0xe00000000000000',60, + 125,8, + '0x83126e98','0x7ef9db22d',35, + '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67, + 8,125, + '0xf4240000','0x0',18, + '0xf424000000000000','0x0',50, + 15625,1, + '0x8637bd06','0x1fff79c842fa',45, + '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77, + 1,15625, + ], 100 => [ + '0xa0000000','0x0',28, + '0xa000000000000000','0x0',60, + 10,1, + '0xcccccccd','0x733333333',35, + '0xcccccccccccccccd','0x73333333333333333',67, + 1,10, + '0x9c400000','0x0',18, + '0x9c40000000000000','0x0',50, + 10000,1, + '0xd1b71759','0x1fff2e48e8a7',45, + '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77, + 1,10000, + ], 122 => [ + '0x8325c53f','0xfbcda3a',28, + '0x8325c53ef368eb05','0xfbcda3ac10c9714',60, + 500,61, + '0xf9db22d1','0x7fbe76c8b',35, + '0xf9db22d0e560418a','0x7fbe76c8b43958106',67, + 61,500, + '0x8012e2a0','0x3ef36',18, + '0x8012e29f79b47583','0x3ef368eb04325',50, + 500000,61, + '0xffda4053','0x1ffffbce4217',45, + '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77, + 61,500000, + ], 128 => [ + '0xfa000000','0x1e000000',29, + '0xfa00000000000000','0x1e00000000000000',61, + 125,16, + '0x83126e98','0x3f7ced916',34, + '0x83126e978d4fdf3c','0x3f7ced916872b020c',66, 
+ 16,125, + '0xf4240000','0x40000',19, + '0xf424000000000000','0x4000000000000',51, + 15625,2, + '0x8637bd06','0xfffbce4217d',44, + '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76, + 2,15625, + ], 200 => [ + '0xa0000000','0x0',29, + '0xa000000000000000','0x0',61, + 5,1, + '0xcccccccd','0x333333333',34, + '0xcccccccccccccccd','0x33333333333333333',66, + 1,5, + '0x9c400000','0x0',19, + '0x9c40000000000000','0x0',51, + 5000,1, + '0xd1b71759','0xfff2e48e8a7',44, + '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76, + 1,5000, + ], 250 => [ + '0x80000000','0x0',29, + '0x8000000000000000','0x0',61, + 4,1, + '0x80000000','0x180000000',33, + '0x8000000000000000','0x18000000000000000',65, + 1,4, + '0xfa000000','0x0',20, + '0xfa00000000000000','0x0',52, + 4000,1, + '0x83126e98','0x7ff7ced9168',43, + '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75, + 1,4000, + ], 256 => [ + '0xfa000000','0x3e000000',30, + '0xfa00000000000000','0x3e00000000000000',62, + 125,32, + '0x83126e98','0x1fbe76c8b',33, + '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65, + 32,125, + '0xf4240000','0xc0000',20, + '0xf424000000000000','0xc000000000000',52, + 15625,4, + '0x8637bd06','0x7ffde7210be',43, + '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75, + 4,15625, + ], 300 => [ + '0xd5555556','0x2aaaaaaa',30, + '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62, + 10,3, + '0x9999999a','0x1cccccccc',33, + '0x999999999999999a','0x1cccccccccccccccc',65, + 3,10, + '0xd0555556','0xaaaaa',20, + '0xd055555555555556','0xaaaaaaaaaaaaa',52, + 10000,3, + '0x9d495183','0x7ffcb923a29',43, + '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75, + 3,10000, + ], 512 => [ + '0xfa000000','0x7e000000',31, + '0xfa00000000000000','0x7e00000000000000',63, + 125,64, + '0x83126e98','0xfdf3b645',32, + '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64, + 64,125, + '0xf4240000','0x1c0000',21, + '0xf424000000000000','0x1c000000000000',53, + 15625,8, + '0x8637bd06','0x3ffef39085f',42, + '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74, + 8,15625, + ], 1000 => [ + '0x80000000','0x0',31, + '0x8000000000000000','0x0',63, + 1,1, + '0x80000000','0x0',31, + '0x8000000000000000','0x0',63, + 1,1, + '0xfa000000','0x0',22, + '0xfa00000000000000','0x0',54, + 1000,1, + '0x83126e98','0x1ff7ced9168',41, + '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73, + 1,1000, + ], 1024 => [ + '0xfa000000','0xfe000000',32, + '0xfa00000000000000','0xfe00000000000000',64, + 125,128, + '0x83126e98','0x7ef9db22',31, + '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63, + 128,125, + '0xf4240000','0x3c0000',22, + '0xf424000000000000','0x3c000000000000',54, + 15625,16, + '0x8637bd06','0x1fff79c842f',41, + '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73, + 16,15625, + ], 1200 => [ + '0xd5555556','0xd5555555',32, + '0xd555555555555556','0xd555555555555555',64, + 5,6, + '0x9999999a','0x66666666',31, + '0x999999999999999a','0x6666666666666666',63, + 6,5, + '0xd0555556','0x2aaaaa',22, + '0xd055555555555556','0x2aaaaaaaaaaaaa',54, + 2500,3, + '0x9d495183','0x1ffcb923a29',41, + '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73, + 3,2500, + ] +); + +$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; + +sub bint($) +{ + my($x) = @_; + return Math::BigInt->new($x); +} + +# +# Constants for division by reciprocal multiplication. 
+# (bits, numerator, denominator) +# +sub fmul($$$) +{ + my ($b,$n,$d) = @_; + + $n = bint($n); + $d = bint($d); + + return scalar (($n << $b)+$d-bint(1))/$d; +} + +sub fadj($$$) +{ + my($b,$n,$d) = @_; + + $n = bint($n); + $d = bint($d); + + $d = $d/bgcd($n, $d); + return scalar (($d-bint(1)) << $b)/$d; +} + +sub fmuls($$$) { + my($b,$n,$d) = @_; + my($s,$m); + my($thres) = bint(1) << ($b-1); + + $n = bint($n); + $d = bint($d); + + for ($s = 0; 1; $s++) { + $m = fmul($s,$n,$d); + return $s if ($m >= $thres); + } + return 0; +} + +# Provides mul, adj, and shr factors for a specific +# (bit, time, hz) combination +sub muladj($$$) { + my($b, $t, $hz) = @_; + my $s = fmuls($b, $t, $hz); + my $m = fmul($s, $t, $hz); + my $a = fadj($s, $t, $hz); + return ($m->as_hex(), $a->as_hex(), $s); +} + +# Provides numerator, denominator values +sub numden($$) { + my($n, $d) = @_; + my $g = bgcd($n, $d); + return ($n/$g, $d/$g); +} + +# All values for a specific (time, hz) combo +sub conversions($$) { + my ($t, $hz) = @_; + my @val = (); + + # HZ_TO_xx + push(@val, muladj(32, $t, $hz)); + push(@val, muladj(64, $t, $hz)); + push(@val, numden($t, $hz)); + + # xx_TO_HZ + push(@val, muladj(32, $hz, $t)); + push(@val, muladj(64, $hz, $t)); + push(@val, numden($hz, $t)); + + return @val; +} + +sub compute_values($) { + my($hz) = @_; + my @val = (); + my $s, $m, $a, $g; + + if (!$has_bigint) { + die "$0: HZ == $hz not canned and ". + "Math::BigInt not available\n"; + } + + # MSEC conversions + push(@val, conversions(1000, $hz)); + + # USEC conversions + push(@val, conversions(1000000, $hz)); + + return @val; +} + +sub output($@) +{ + my($hz, @val) = @_; + my $pfx, $bit, $suf, $s, $m, $a; + + print "/* Automatically generated by kernel/timeconst.pl */\n"; + print "/* Conversion constants for HZ == $hz */\n"; + print "\n"; + print "#ifndef KERNEL_TIMECONST_H\n"; + print "#define KERNEL_TIMECONST_H\n"; + print "\n"; + + print "#include <linux/param.h>\n"; + + print "\n"; + print "#if HZ != $hz\n"; + print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; + print "#endif\n"; + print "\n"; + + foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', + 'HZ_TO_USEC','USEC_TO_HZ') { + foreach $bit (32, 64) { + foreach $suf ('MUL', 'ADJ', 'SHR') { + printf "#define %-23s %s\n", + "${pfx}_$suf$bit", shift(@val); + } + } + foreach $suf ('NUM', 'DEN') { + printf "#define %-23s %s\n", + "${pfx}_$suf", shift(@val); + } + } + + print "\n"; + print "#endif /* KERNEL_TIMECONST_H */\n"; +} + +($hz) = @ARGV; + +# Use this to generate the %canned_values structure +if ($hz eq '--can') { + shift(@ARGV); + @hzlist = sort {$a <=> $b} (@ARGV); + + print "# Precomputed values for systems without Math::BigInt\n"; + print "# Generated by:\n"; + print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; + print "\%canned_values = (\n"; + my $pf = "\t"; + foreach $hz (@hzlist) { + my @values = compute_values($hz); + print "$pf$hz => [\n"; + while (scalar(@values)) { + my $bit; + foreach $bit (32, 64) { + my $m = shift(@values); + my $a = shift(@values); + my $s = shift(@values); + print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n"; + } + my $n = shift(@values); + my $d = shift(@values); + print "\t\t",$n,',',$d,",\n"; + } + print "\t]"; + $pf = ', '; + } + print "\n);\n"; +} else { + $hz += 0; # Force to number + if ($hz < 1) { + die "Usage: $0 HZ\n"; + } + + @val = @{$canned_values{$hz}}; + if (!defined(@val)) { + @val = compute_values($hz); + } + output($hz, @val); +} +exit 0; diff --git a/kernel/timer.c b/kernel/timer.c index 9fbb472b8cf..99b00a25f88 
100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -327,7 +327,7 @@ static void timer_stats_account_timer(struct timer_list *timer) {} * init_timer() must be done to a timer prior calling *any* of the * other timer functions. */ -void fastcall init_timer(struct timer_list *timer) +void init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); @@ -339,7 +339,7 @@ void fastcall init_timer(struct timer_list *timer) } EXPORT_SYMBOL(init_timer); -void fastcall init_timer_deferrable(struct timer_list *timer) +void init_timer_deferrable(struct timer_list *timer) { init_timer(timer); timer_set_deferrable(timer); @@ -818,12 +818,14 @@ unsigned long next_timer_interrupt(void) #ifndef CONFIG_VIRT_CPU_ACCOUNTING void account_process_tick(struct task_struct *p, int user_tick) { + cputime_t one_jiffy = jiffies_to_cputime(1); + if (user_tick) { - account_user_time(p, jiffies_to_cputime(1)); - account_user_time_scaled(p, jiffies_to_cputime(1)); + account_user_time(p, one_jiffy); + account_user_time_scaled(p, cputime_to_scaled(one_jiffy)); } else { - account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); - account_system_time_scaled(p, jiffies_to_cputime(1)); + account_system_time(p, HARDIRQ_OFFSET, one_jiffy); + account_system_time_scaled(p, cputime_to_scaled(one_jiffy)); } } #endif @@ -977,7 +979,7 @@ asmlinkage long sys_getppid(void) int pid; rcu_read_lock(); - pid = task_tgid_nr_ns(current->real_parent, current->nsproxy->pid_ns); + pid = task_tgid_vnr(current->real_parent); rcu_read_unlock(); return pid; @@ -1040,7 +1042,7 @@ static void process_timeout(unsigned long __data) * * In all cases the return value is guaranteed to be non-negative. */ -fastcall signed long __sched schedule_timeout(signed long timeout) +signed long __sched schedule_timeout(signed long timeout) { struct timer_list timer; unsigned long expire; diff --git a/kernel/user.c b/kernel/user.c index bc1c48d35cb..7132022a040 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -17,6 +17,14 @@ #include <linux/module.h> #include <linux/user_namespace.h> +struct user_namespace init_user_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .root_user = &root_user, +}; +EXPORT_SYMBOL_GPL(init_user_ns); + /* * UID task count cache, to get fast user lookup in "alloc_uid" * when changing user ID's (ie setuid() and friends). 
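Note on the kernel/user.c hunk above: the init_user_ns object, with its embedded kref preinitialised to a count of 2, now lives in user.c, and the matching removal from kernel/user_namespace.c appears further down in this diff, so the initial namespace stays available even where the namespace-cloning code becomes conditional. As a rough, self-contained sketch of how that kref is meant to be used, the helpers below take and drop a reference on a user namespace. The names pin_user_ns()/unpin_user_ns() are illustrative stand-ins rather than the in-tree inlines; only kref_get(), kref_put() and the free_user_ns() destructor (kept in kernel/user_namespace.c later in this diff) are taken as given.

/*
 * Illustrative sketch, not part of the patch: taking and dropping a
 * reference on a struct user_namespace through the kref initialised
 * in the init_user_ns definition above.
 */
#include <linux/kref.h>
#include <linux/user_namespace.h>

/* Defined in kernel/user_namespace.c; see the hunk below that drops its #ifdef. */
extern void free_user_ns(struct kref *kref);

static inline struct user_namespace *pin_user_ns(struct user_namespace *ns)
{
	if (ns)
		kref_get(&ns->kref);	/* one more user; init_user_ns starts at 2 */
	return ns;
}

static inline void unpin_user_ns(struct user_namespace *ns)
{
	if (ns)
		kref_put(&ns->kref, free_user_ns);	/* destructor runs on the final put */
}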
@@ -49,7 +57,7 @@ struct user_struct root_user = { .uid_keyring = &root_user_keyring, .session_keyring = &root_session_keyring, #endif -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED .tg = &init_task_group, #endif }; @@ -82,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) return NULL; } -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED static void sched_destroy_user(struct user_struct *up) { @@ -105,15 +113,15 @@ static void sched_switch_user(struct task_struct *p) sched_move_task(p); } -#else /* CONFIG_FAIR_USER_SCHED */ +#else /* CONFIG_USER_SCHED */ static void sched_destroy_user(struct user_struct *up) { } static int sched_create_user(struct user_struct *up) { return 0; } static void sched_switch_user(struct task_struct *p) { } -#endif /* CONFIG_FAIR_USER_SCHED */ +#endif /* CONFIG_USER_SCHED */ -#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) +#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ static DEFINE_MUTEX(uids_mutex); @@ -129,6 +137,7 @@ static inline void uids_mutex_unlock(void) } /* uid directory attributes */ +#ifdef CONFIG_FAIR_GROUP_SCHED static ssize_t cpu_shares_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -155,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj, static struct kobj_attribute cpu_share_attr = __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static ssize_t cpu_rt_runtime_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + + return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); +} + +static ssize_t cpu_rt_runtime_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + unsigned long rt_runtime; + int rc; + + sscanf(buf, "%lu", &rt_runtime); + + rc = sched_group_set_rt_runtime(up->tg, rt_runtime); + + return (rc ? rc : size); +} + +static struct kobj_attribute cpu_rt_runtime_attr = + __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); +#endif /* default attributes per uid directory */ static struct attribute *uids_attributes[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED &cpu_share_attr.attr, +#endif +#ifdef CONFIG_RT_GROUP_SCHED + &cpu_rt_runtime_attr.attr, +#endif NULL }; @@ -261,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags) schedule_work(&up->work); } -#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ +#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ int uids_sysfs_init(void) { return 0; } static inline int uids_user_create(struct user_struct *up) { return 0; } @@ -365,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - /* This case is not possible when CONFIG_FAIR_USER_SCHED + /* This case is not possible when CONFIG_USER_SCHED * is defined, since we serialize alloc_uid() using * uids_mutex. Hence no need to call * sched_destroy_user() or remove_user_sysfs_dir(). 
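Note on the hunks above: with CONFIG_RT_GROUP_SCHED enabled, each per-uid directory under /sys/kernel/uids/ (the directory represented by uids_kset) gains a cpu_rt_runtime file alongside cpu_share, backed by cpu_rt_runtime_show() and cpu_rt_runtime_store(), which recover the user_struct with container_of() on the embedded kobject. A small userspace sketch of reading the new attribute follows; the per-uid directory being named after the numeric uid, and the file existing at all, are assumptions that depend on the kernel configuration.

/* Userspace sketch: read the per-uid cpu_rt_runtime attribute added above.
 * Assumes /sys/kernel/uids/<uid>/ entries named after the numeric uid and a
 * kernel built with the relevant scheduler and sysfs options. */
#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/kernel/uids/0/cpu_rt_runtime", "r");

	if (!f) {
		perror("cpu_rt_runtime");	/* absent when the option is off */
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("uid 0 rt runtime: %s", buf);	/* value from cpu_rt_runtime_show() */
	fclose(f);
	return 0;
}

Writing a decimal value back through the same file reaches cpu_rt_runtime_store(), which hands it to sched_group_set_rt_runtime() and returns any error as the result of the write.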
@@ -427,6 +471,7 @@ void switch_uid(struct user_struct *new_user) suid_keys(current); } +#ifdef CONFIG_USER_NS void release_uids(struct user_namespace *ns) { int i; @@ -451,6 +496,7 @@ void release_uids(struct user_namespace *ns) free_uid(ns->root_user); } +#endif static int __init uid_cache_init(void) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 7af90fc4f0f..4c9006275df 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -10,17 +10,6 @@ #include <linux/nsproxy.h> #include <linux/user_namespace.h> -struct user_namespace init_user_ns = { - .kref = { - .refcount = ATOMIC_INIT(2), - }, - .root_user = &root_user, -}; - -EXPORT_SYMBOL_GPL(init_user_ns); - -#ifdef CONFIG_USER_NS - /* * Clone a new ns copying an original user ns, setting refcount to 1 * @old_ns: namespace to clone @@ -84,5 +73,3 @@ void free_user_ns(struct kref *kref) release_uids(ns); kfree(ns); } - -#endif /* CONFIG_USER_NS */ diff --git a/kernel/wait.c b/kernel/wait.c index f9876888a56..c275c56cf2d 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -18,7 +18,7 @@ void init_waitqueue_head(wait_queue_head_t *q) EXPORT_SYMBOL(init_waitqueue_head); -void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -29,7 +29,7 @@ void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(add_wait_queue); -void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -40,7 +40,7 @@ void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(add_wait_queue_exclusive); -void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -63,7 +63,7 @@ EXPORT_SYMBOL(remove_wait_queue); * stops them from bleeding out - it would still allow subsequent * loads to move into the critical region). */ -void fastcall +void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; @@ -82,7 +82,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait); -void fastcall +void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) { unsigned long flags; @@ -101,7 +101,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); -void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) { unsigned long flags; @@ -157,7 +157,7 @@ EXPORT_SYMBOL(wake_bit_function); * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are * permitted return codes. Nonzero return codes halt waiting and return. 
*/ -int __sched fastcall +int __sched __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, int (*action)(void *), unsigned mode) { @@ -173,7 +173,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, } EXPORT_SYMBOL(__wait_on_bit); -int __sched fastcall out_of_line_wait_on_bit(void *word, int bit, +int __sched out_of_line_wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); @@ -183,7 +183,7 @@ int __sched fastcall out_of_line_wait_on_bit(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit); -int __sched fastcall +int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, int (*action)(void *), unsigned mode) { @@ -201,7 +201,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, } EXPORT_SYMBOL(__wait_on_bit_lock); -int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit, +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode) { wait_queue_head_t *wq = bit_waitqueue(word, bit); @@ -211,7 +211,7 @@ int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); -void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) { struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); if (waitqueue_active(wq)) @@ -236,13 +236,13 @@ EXPORT_SYMBOL(__wake_up_bit); * may need to use a less regular barrier, such fs/inode.c's smp_mb(), * because spin_unlock() does not guarantee a memory barrier. */ -void fastcall wake_up_bit(void *word, int bit) +void wake_up_bit(void *word, int bit) { __wake_up_bit(bit_waitqueue(word, bit), word, bit); } EXPORT_SYMBOL(wake_up_bit); -fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit) +wait_queue_head_t *bit_waitqueue(void *word, int bit) { const int shift = BITS_PER_LONG == 32 ? 5 : 6; const struct zone *zone = page_zone(virt_to_page(word)); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 52db48e7f6e..ff06611655a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -161,7 +161,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. */ -int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) +int queue_work(struct workqueue_struct *wq, struct work_struct *work) { int ret = 0; @@ -175,7 +175,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) } EXPORT_SYMBOL_GPL(queue_work); -void delayed_work_timer_fn(unsigned long __data) +static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); @@ -192,7 +192,7 @@ void delayed_work_timer_fn(unsigned long __data) * * Returns 0 if @work was already on a queue, non-zero otherwise. */ -int fastcall queue_delayed_work(struct workqueue_struct *wq, +int queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { timer_stats_timer_set_start_info(&dwork->timer); @@ -388,7 +388,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * This function used to run the workqueues itself. Now we just wait for the * helper threads to do it. 
*/ -void fastcall flush_workqueue(struct workqueue_struct *wq) +void flush_workqueue(struct workqueue_struct *wq) { const cpumask_t *cpu_map = wq_cpu_map(wq); int cpu; @@ -546,7 +546,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; * * This puts a job in the kernel-global workqueue. */ -int fastcall schedule_work(struct work_struct *work) +int schedule_work(struct work_struct *work) { return queue_work(keventd_wq, work); } @@ -560,7 +560,7 @@ EXPORT_SYMBOL(schedule_work); * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ -int fastcall schedule_delayed_work(struct delayed_work *dwork, +int schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { timer_stats_timer_set_start_info(&dwork->timer);
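Closing note on the kernel/time.c and timeconst.pl material above: the open-coded conversions of the form (j * MSEC_PER_SEC) / HZ are replaced by multiply-and-shift expressions whose constants come from the generated timeconst.h. The standalone program below checks that identity in userspace for HZ == 250, using the values quoted in the canned table above (HZ_TO_MSEC: multiplier 0x80000000, shift 29; MSEC_TO_HZ: multiplier 0x80000000, adjustment 0x180000000, shift 33). It illustrates the arithmetic only and is not kernel code; the reference expressions mirror the ones being replaced in jiffies_to_msecs() and msecs_to_jiffies().

/* Demo of the reciprocal-multiplication constants emitted by timeconst.pl,
 * using the canned HZ == 250 values quoted above; builds with any C compiler. */
#include <stdint.h>
#include <stdio.h>

#define HZ			250u
#define MSEC_PER_SEC		1000u
#define HZ_TO_MSEC_MUL32	0x80000000u	/* from the "250 => [" entry */
#define HZ_TO_MSEC_SHR32	29
#define MSEC_TO_HZ_MUL32	0x80000000u
#define MSEC_TO_HZ_ADJ32	0x180000000ull	/* makes the division round up */
#define MSEC_TO_HZ_SHR32	33

int main(void)
{
	uint32_t v;

	for (v = 0; v < 1000000; v++) {
		/* jiffies -> msec: multiply-and-shift vs. plain division */
		uint32_t fast_ms = (uint32_t)(((uint64_t)HZ_TO_MSEC_MUL32 * v) >> HZ_TO_MSEC_SHR32);
		uint32_t ref_ms  = (uint32_t)((uint64_t)v * MSEC_PER_SEC / HZ);

		/* msec -> jiffies: rounded up, like (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC */
		uint32_t fast_j = (uint32_t)(((uint64_t)MSEC_TO_HZ_MUL32 * v + MSEC_TO_HZ_ADJ32)
					     >> MSEC_TO_HZ_SHR32);
		uint32_t ref_j  = (uint32_t)(((uint64_t)v * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC);

		if (fast_ms != ref_ms || fast_j != ref_j) {
			printf("mismatch at %u\n", v);
			return 1;
		}
	}
	printf("multiply-and-shift matches plain division for HZ == %u\n", HZ);
	return 0;
}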