aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore1
-rw-r--r--kernel/Makefile16
-rw-r--r--kernel/audit.c12
-rw-r--r--kernel/audit_tree.c28
-rw-r--r--kernel/auditfilter.c15
-rw-r--r--kernel/auditsc.c28
-rw-r--r--kernel/cgroup.c318
-rw-r--r--kernel/compat.c43
-rw-r--r--kernel/cpu.c2
-rw-r--r--kernel/cpuset.c413
-rw-r--r--kernel/exit.c358
-rw-r--r--kernel/fork.c45
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/futex_compat.c2
-rw-r--r--kernel/hrtimer.c103
-rw-r--r--kernel/irq/chip.c46
-rw-r--r--kernel/irq/handle.c4
-rw-r--r--kernel/itimer.c2
-rw-r--r--kernel/kallsyms.c11
-rw-r--r--kernel/kexec.c18
-rw-r--r--kernel/kmod.c5
-rw-r--r--kernel/kprobes.c9
-rw-r--r--kernel/marker.c677
-rw-r--r--kernel/module.c38
-rw-r--r--kernel/mutex-debug.c2
-rw-r--r--kernel/mutex.c29
-rw-r--r--kernel/notifier.c1
-rw-r--r--kernel/nsproxy.c1
-rw-r--r--kernel/panic.c5
-rw-r--r--kernel/params.c22
-rw-r--r--kernel/pid.c209
-rw-r--r--kernel/pid_namespace.c197
-rw-r--r--kernel/posix-cpu-timers.c8
-rw-r--r--kernel/posix-timers.c27
-rw-r--r--kernel/power/Kconfig9
-rw-r--r--kernel/printk.c52
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/ptrace.c22
-rw-r--r--kernel/rcupdate.c5
-rw-r--r--kernel/relay.c24
-rw-r--r--kernel/res_counter.c134
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/rtmutex-debug.c12
-rw-r--r--kernel/rtmutex.c5
-rw-r--r--kernel/rtmutex_common.h2
-rw-r--r--kernel/sched.c510
-rw-r--r--kernel/sched_rt.c102
-rw-r--r--kernel/signal.c179
-rw-r--r--kernel/softirq.c8
-rw-r--r--kernel/srcu.c3
-rw-r--r--kernel/stop_machine.c6
-rw-r--r--kernel/sys.c79
-rw-r--r--kernel/sysctl.c78
-rw-r--r--kernel/sysctl_check.c151
-rw-r--r--kernel/test_kprobes.c16
-rw-r--r--kernel/time.c46
-rw-r--r--kernel/time/clockevents.c13
-rw-r--r--kernel/time/clocksource.c20
-rw-r--r--kernel/time/ntp.c4
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/timekeeping.c6
-rw-r--r--kernel/timeconst.pl402
-rw-r--r--kernel/timer.c18
-rw-r--r--kernel/user.c60
-rw-r--r--kernel/user_namespace.c13
-rw-r--r--kernel/wait.c26
-rw-r--r--kernel/workqueue.c12
67 files changed, 3106 insertions, 1613 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index f2ab70073bd..ab4f1090f43 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -3,3 +3,4 @@
#
config_data.h
config_data.gz
+timeconst.h
diff --git a/kernel/Makefile b/kernel/Makefile
index 135a1b94344..6c584c55a6e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -4,12 +4,12 @@
obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
exit.o itimer.o time.o softirq.o resource.o \
- sysctl.o capability.o ptrace.o timer.o user.o user_namespace.o \
+ sysctl.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o \
- utsname.o notifier.o ksysfs.o pm_qos_params.o
+ notifier.o ksysfs.o pm_qos_params.o
obj-$(CONFIG_SYSCTL) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -42,7 +42,11 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_UTS_NS) += utsname.o
+obj-$(CONFIG_USER_NS) += user_namespace.o
+obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -87,3 +91,11 @@ quiet_cmd_ikconfiggz = IKCFG $@
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call if_changed,ikconfiggz)
+
+$(obj)/time.o: $(obj)/timeconst.h
+
+quiet_cmd_timeconst = TIMEC $@
+ cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@
+targets += timeconst.h
+$(obj)/timeconst.h: $(src)/timeconst.pl FORCE
+ $(call if_changed,timeconst)
diff --git a/kernel/audit.c b/kernel/audit.c
index c8555b18021..2eeea9a1424 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1312,26 +1312,26 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
/* This is a helper-function to print the escaped d_path */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
- struct dentry *dentry, struct vfsmount *vfsmnt)
+ struct path *path)
{
- char *p, *path;
+ char *p, *pathname;
if (prefix)
audit_log_format(ab, " %s", prefix);
/* We will allow 11 spaces for ' (deleted)' to be appended */
- path = kmalloc(PATH_MAX+11, ab->gfp_mask);
- if (!path) {
+ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
+ if (!pathname) {
audit_log_format(ab, "<no memory>");
return;
}
- p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
+ p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
audit_log_format(ab, "<too long>");
} else
audit_log_untrustedstring(ab, p);
- kfree(path);
+ kfree(pathname);
}
/**
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f4fcf58f20f..9ef5e0aacc3 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -549,8 +549,8 @@ void audit_trim_trees(void)
if (err)
goto skip_it;
- root_mnt = collect_mounts(nd.mnt, nd.dentry);
- path_release(&nd);
+ root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
+ path_put(&nd.path);
if (!root_mnt)
goto skip_it;
@@ -583,17 +583,17 @@ skip_it:
static int is_under(struct vfsmount *mnt, struct dentry *dentry,
struct nameidata *nd)
{
- if (mnt != nd->mnt) {
+ if (mnt != nd->path.mnt) {
for (;;) {
if (mnt->mnt_parent == mnt)
return 0;
- if (mnt->mnt_parent == nd->mnt)
+ if (mnt->mnt_parent == nd->path.mnt)
break;
mnt = mnt->mnt_parent;
}
dentry = mnt->mnt_mountpoint;
}
- return is_subdir(dentry, nd->dentry);
+ return is_subdir(dentry, nd->path.dentry);
}
int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
@@ -641,8 +641,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
err = path_lookup(tree->pathname, 0, &nd);
if (err)
goto Err;
- mnt = collect_mounts(nd.mnt, nd.dentry);
- path_release(&nd);
+ mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
+ path_put(&nd.path);
if (!mnt) {
err = -ENOMEM;
goto Err;
@@ -701,8 +701,8 @@ int audit_tag_tree(char *old, char *new)
err = path_lookup(new, 0, &nd);
if (err)
return err;
- tagged = collect_mounts(nd.mnt, nd.dentry);
- path_release(&nd);
+ tagged = collect_mounts(nd.path.mnt, nd.path.dentry);
+ path_put(&nd.path);
if (!tagged)
return -ENOMEM;
@@ -711,9 +711,9 @@ int audit_tag_tree(char *old, char *new)
drop_collected_mounts(tagged);
return err;
}
- mnt = mntget(nd.mnt);
- dentry = dget(nd.dentry);
- path_release(&nd);
+ mnt = mntget(nd.path.mnt);
+ dentry = dget(nd.path.dentry);
+ path_put(&nd.path);
if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
follow_up(&mnt, &dentry);
@@ -744,13 +744,13 @@ int audit_tag_tree(char *old, char *new)
spin_lock(&vfsmount_lock);
if (!is_under(mnt, dentry, &nd)) {
spin_unlock(&vfsmount_lock);
- path_release(&nd);
+ path_put(&nd.path);
put_tree(tree);
mutex_lock(&audit_filter_mutex);
continue;
}
spin_unlock(&vfsmount_lock);
- path_release(&nd);
+ path_put(&nd.path);
list_for_each_entry(p, &list, mnt_list) {
failed = tag_chunk(p->mnt_root->d_inode, tree);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6f19fd477aa..2f2914b7cc3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -169,8 +169,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp)
inotify_init_watch(&parent->wdata);
/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
get_inotify_watch(&parent->wdata);
- wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
- AUDIT_IN_WATCH);
+ wd = inotify_add_watch(audit_ih, &parent->wdata,
+ ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
if (wd < 0) {
audit_free_parent(&parent->wdata);
return ERR_PTR(wd);
@@ -1161,11 +1161,11 @@ static int audit_get_nd(char *path, struct nameidata **ndp,
static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
{
if (ndp) {
- path_release(ndp);
+ path_put(&ndp->path);
kfree(ndp);
}
if (ndw) {
- path_release(ndw);
+ path_put(&ndw->path);
kfree(ndw);
}
}
@@ -1214,8 +1214,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
/* update watch filter fields */
if (ndw) {
- watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
- watch->ino = ndw->dentry->d_inode->i_ino;
+ watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
+ watch->ino = ndw->path.dentry->d_inode->i_ino;
}
/* The audit_filter_mutex must not be held during inotify calls because
@@ -1225,7 +1225,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
*/
mutex_unlock(&audit_filter_mutex);
- if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
+ if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
+ &i_watch) < 0) {
parent = audit_init_parent(ndp);
if (IS_ERR(parent)) {
/* caller expects mutex locked */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c06ecf38d7..ac6d9b23b01 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -208,8 +208,7 @@ struct audit_context {
int name_count;
struct audit_names names[AUDIT_NAMES];
char * filterkey; /* key for rule that triggered record */
- struct dentry * pwd;
- struct vfsmount * pwdmnt;
+ struct path pwd;
struct audit_context *previous; /* For nested syscalls */
struct audit_aux_data *aux;
struct audit_aux_data *aux_pids;
@@ -786,12 +785,9 @@ static inline void audit_free_names(struct audit_context *context)
__putname(context->names[i].name);
}
context->name_count = 0;
- if (context->pwd)
- dput(context->pwd);
- if (context->pwdmnt)
- mntput(context->pwdmnt);
- context->pwd = NULL;
- context->pwdmnt = NULL;
+ path_put(&context->pwd);
+ context->pwd.dentry = NULL;
+ context->pwd.mnt = NULL;
}
static inline void audit_free_aux(struct audit_context *context)
@@ -930,8 +926,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
if ((vma->vm_flags & VM_EXECUTABLE) &&
vma->vm_file) {
audit_log_d_path(ab, "exe=",
- vma->vm_file->f_path.dentry,
- vma->vm_file->f_path.mnt);
+ &vma->vm_file->f_path);
break;
}
vma = vma->vm_next;
@@ -1341,10 +1336,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->target_sid, context->target_comm))
call_panic = 1;
- if (context->pwd && context->pwdmnt) {
+ if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
- audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
+ audit_log_d_path(ab, "cwd=", &context->pwd);
audit_log_end(ab);
}
}
@@ -1367,8 +1362,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
case 0:
/* name was specified as a relative path and the
* directory component is the cwd */
- audit_log_d_path(ab, " name=", context->pwd,
- context->pwdmnt);
+ audit_log_d_path(ab, " name=", &context->pwd);
break;
default:
/* log the name's directory component */
@@ -1695,10 +1689,10 @@ void __audit_getname(const char *name)
context->names[context->name_count].ino = (unsigned long)-1;
context->names[context->name_count].osid = 0;
++context->name_count;
- if (!context->pwd) {
+ if (!context->pwd.dentry) {
read_lock(&current->fs->lock);
- context->pwd = dget(current->fs->pwd);
- context->pwdmnt = mntget(current->fs->pwdmnt);
+ context->pwd = current->fs->pwd;
+ path_get(&current->fs->pwd);
read_unlock(&current->fs->lock);
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1a3c23936d4..4766bb65e4d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -141,7 +141,7 @@ enum {
ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};
-inline int cgroup_is_releasable(const struct cgroup *cgrp)
+static int cgroup_is_releasable(const struct cgroup *cgrp)
{
const int bits =
(1 << CGRP_RELEASABLE) |
@@ -149,7 +149,7 @@ inline int cgroup_is_releasable(const struct cgroup *cgrp)
return (cgrp->flags & bits) == bits;
}
-inline int notify_on_release(const struct cgroup *cgrp)
+static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
@@ -489,7 +489,7 @@ static struct css_set *find_css_set(
* Any task can increment and decrement the count field without lock.
* So in general, code holding cgroup_mutex can't rely on the count
* field not changing. However, if the count goes to zero, then only
- * attach_task() can increment it again. Because a count of zero
+ * cgroup_attach_task() can increment it again. Because a count of zero
* means that no tasks are currently attached, therefore there is no
* way a task attached to that cgroup can fork (the other way to
* increment the count). So code holding cgroup_mutex can safely
@@ -520,17 +520,17 @@ static struct css_set *find_css_set(
* The task_lock() exception
*
* The need for this exception arises from the action of
- * attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
* another. It does so using cgroup_mutexe, however there are
* several performance critical places that need to reference
* task->cgroup without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
- * in attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
* P.S. One more locking exception. RCU is used to guard the
- * update of a tasks cgroup pointer by attach_task()
+ * update of a tasks cgroup pointer by cgroup_attach_task()
*/
/**
@@ -586,11 +586,27 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
return inode;
}
+/*
+ * Call subsys's pre_destroy handler.
+ * This is called before css refcnt check.
+ */
+
+static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+{
+ struct cgroup_subsys *ss;
+ for_each_subsys(cgrp->root, ss)
+ if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
+ ss->pre_destroy(ss, cgrp);
+ return;
+}
+
+
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
if (S_ISDIR(inode->i_mode)) {
struct cgroup *cgrp = dentry->d_fsdata;
+ struct cgroup_subsys *ss;
BUG_ON(!(cgroup_is_removed(cgrp)));
/* It's possible for external users to be holding css
* reference counts on a cgroup; css_put() needs to
@@ -599,6 +615,23 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
* queue the cgroup to be handled by the release
* agent */
synchronize_rcu();
+
+ mutex_lock(&cgroup_mutex);
+ /*
+ * Release the subsystem state objects.
+ */
+ for_each_subsys(cgrp->root, ss) {
+ if (cgrp->subsys[ss->subsys_id])
+ ss->destroy(ss, cgrp);
+ }
+
+ cgrp->root->number_of_cgroups--;
+ mutex_unlock(&cgroup_mutex);
+
+ /* Drop the active superblock reference that we took when we
+ * created the cgroup */
+ deactivate_super(cgrp->root->sb);
+
kfree(cgrp);
}
iput(inode);
@@ -1161,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp,
* Call holding cgroup_mutex. May take task_lock of
* the task 'pid' during call.
*/
-static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
int retval = 0;
struct cgroup_subsys *ss;
@@ -1181,9 +1214,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
retval = ss->can_attach(ss, cgrp, tsk);
- if (retval) {
+ if (retval)
return retval;
- }
}
}
@@ -1192,9 +1224,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
* based on its final set of cgroups
*/
newcg = find_css_set(cg, cgrp);
- if (!newcg) {
+ if (!newcg)
return -ENOMEM;
- }
task_lock(tsk);
if (tsk->flags & PF_EXITING) {
@@ -1214,9 +1245,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
- if (ss->attach) {
+ if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, tsk);
- }
}
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
@@ -1239,7 +1269,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
if (pid) {
rcu_read_lock();
- tsk = find_task_by_pid(pid);
+ tsk = find_task_by_vpid(pid);
if (!tsk || tsk->flags & PF_EXITING) {
rcu_read_unlock();
return -ESRCH;
@@ -1257,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
get_task_struct(tsk);
}
- ret = attach_task(cgrp, tsk);
+ ret = cgroup_attach_task(cgrp, tsk);
put_task_struct(tsk);
return ret;
}
@@ -1329,9 +1359,14 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
goto out1;
}
buffer[nbytes] = 0; /* nul-terminate */
+ strstrip(buffer); /* strip -just- trailing whitespace */
mutex_lock(&cgroup_mutex);
+ /*
+ * This was already checked for in cgroup_file_write(), but
+ * check again now we're holding cgroup_mutex.
+ */
if (cgroup_is_removed(cgrp)) {
retval = -ENODEV;
goto out2;
@@ -1349,24 +1384,9 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
break;
case FILE_RELEASE_AGENT:
- {
- struct cgroupfs_root *root = cgrp->root;
- /* Strip trailing newline */
- if (nbytes && (buffer[nbytes-1] == '\n')) {
- buffer[nbytes-1] = 0;
- }
- if (nbytes < sizeof(root->release_agent_path)) {
- /* We never write anything other than '\0'
- * into the last char of release_agent_path,
- * so it always remains a NUL-terminated
- * string */
- strncpy(root->release_agent_path, buffer, nbytes);
- root->release_agent_path[nbytes] = 0;
- } else {
- retval = -ENOSPC;
- }
+ BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ strcpy(cgrp->root->release_agent_path, buffer);
break;
- }
default:
retval = -EINVAL;
goto out2;
@@ -1387,7 +1407,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- if (!cft)
+ if (!cft || cgroup_is_removed(cgrp))
return -ENODEV;
if (cft->write)
return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1457,7 +1477,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
- if (!cft)
+ if (!cft || cgroup_is_removed(cgrp))
return -ENODEV;
if (cft->read)
@@ -1675,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
it->task = cg->tasks.next;
}
+/*
+ * To reduce the fork() overhead for systems that are not actually
+ * using their cgroups capability, we don't maintain the lists running
+ * through each css_set to its tasks until we see the list actually
+ * used - in other words after the first call to cgroup_iter_start().
+ *
+ * The tasklist_lock is not held here, as do_each_thread() and
+ * while_each_thread() are protected by RCU.
+ */
+void cgroup_enable_task_cg_lists(void)
+{
+ struct task_struct *p, *g;
+ write_lock(&css_set_lock);
+ use_task_css_set_links = 1;
+ do_each_thread(g, p) {
+ task_lock(p);
+ if (list_empty(&p->cg_list))
+ list_add(&p->cg_list, &p->cgroups->tasks);
+ task_unlock(p);
+ } while_each_thread(g, p);
+ write_unlock(&css_set_lock);
+}
+
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
{
/*
@@ -1682,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
* we need to enable the list linking each css_set to its
* tasks, and fix up all existing tasks.
*/
- if (!use_task_css_set_links) {
- struct task_struct *p, *g;
- write_lock(&css_set_lock);
- use_task_css_set_links = 1;
- do_each_thread(g, p) {
- task_lock(p);
- if (list_empty(&p->cg_list))
- list_add(&p->cg_list, &p->cgroups->tasks);
- task_unlock(p);
- } while_each_thread(g, p);
- write_unlock(&css_set_lock);
- }
+ if (!use_task_css_set_links)
+ cgroup_enable_task_cg_lists();
+
read_lock(&css_set_lock);
it->cg_link = &cgrp->css_sets;
cgroup_advance_iter(cgrp, it);
@@ -1726,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
read_unlock(&css_set_lock);
}
+static inline int started_after_time(struct task_struct *t1,
+ struct timespec *time,
+ struct task_struct *t2)
+{
+ int start_diff = timespec_compare(&t1->start_time, time);
+ if (start_diff > 0) {
+ return 1;
+ } else if (start_diff < 0) {
+ return 0;
+ } else {
+ /*
+ * Arbitrarily, if two processes started at the same
+ * time, we'll say that the lower pointer value
+ * started first. Note that t2 may have exited by now
+ * so this may not be a valid pointer any longer, but
+ * that's fine - it still serves to distinguish
+ * between two tasks started (effectively) simultaneously.
+ */
+ return t1 > t2;
+ }
+}
+
+/*
+ * This function is a callback from heap_insert() and is used to order
+ * the heap.
+ * In this case we order the heap in descending task start time.
+ */
+static inline int started_after(void *p1, void *p2)
+{
+ struct task_struct *t1 = p1;
+ struct task_struct *t2 = p2;
+ return started_after_time(t1, &t2->start_time, t2);
+}
+
+/**
+ * cgroup_scan_tasks - iterate though all the tasks in a cgroup
+ * @scan: struct cgroup_scanner containing arguments for the scan
+ *
+ * Arguments include pointers to callback functions test_task() and
+ * process_task().
+ * Iterate through all the tasks in a cgroup, calling test_task() for each,
+ * and if it returns true, call process_task() for it also.
+ * The test_task pointer may be NULL, meaning always true (select all tasks).
+ * Effectively duplicates cgroup_iter_{start,next,end}()
+ * but does not lock css_set_lock for the call to process_task().
+ * The struct cgroup_scanner may be embedded in any structure of the caller's
+ * creation.
+ * It is guaranteed that process_task() will act on every task that
+ * is a member of the cgroup for the duration of this call. This
+ * function may or may not call process_task() for tasks that exit
+ * or move to a different cgroup during the call, or are forked or
+ * move into the cgroup during the call.
+ *
+ * Note that test_task() may be called with locks held, and may in some
+ * situations be called multiple times for the same task, so it should
+ * be cheap.
+ * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
+ * pre-allocated and will be used for heap operations (and its "gt" member will
+ * be overwritten), else a temporary heap will be used (allocation of which
+ * may cause this function to fail).
+ */
+int cgroup_scan_tasks(struct cgroup_scanner *scan)
+{
+ int retval, i;
+ struct cgroup_iter it;
+ struct task_struct *p, *dropped;
+ /* Never dereference latest_task, since it's not refcounted */
+ struct task_struct *latest_task = NULL;
+ struct ptr_heap tmp_heap;
+ struct ptr_heap *heap;
+ struct timespec latest_time = { 0, 0 };
+
+ if (scan->heap) {
+ /* The caller supplied our heap and pre-allocated its memory */
+ heap = scan->heap;
+ heap->gt = &started_after;
+ } else {
+ /* We need to allocate our own heap memory */
+ heap = &tmp_heap;
+ retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+ if (retval)
+ /* cannot allocate the heap */
+ return retval;
+ }
+
+ again:
+ /*
+ * Scan tasks in the cgroup, using the scanner's "test_task" callback
+ * to determine which are of interest, and using the scanner's
+ * "process_task" callback to process any of them that need an update.
+ * Since we don't want to hold any locks during the task updates,
+ * gather tasks to be processed in a heap structure.
+ * The heap is sorted by descending task start time.
+ * If the statically-sized heap fills up, we overflow tasks that
+ * started later, and in future iterations only consider tasks that
+ * started after the latest task in the previous pass. This
+ * guarantees forward progress and that we don't miss any tasks.
+ */
+ heap->size = 0;
+ cgroup_iter_start(scan->cg, &it);
+ while ((p = cgroup_iter_next(scan->cg, &it))) {
+ /*
+ * Only affect tasks that qualify per the caller's callback,
+ * if he provided one
+ */
+ if (scan->test_task && !scan->test_task(p, scan))
+ continue;
+ /*
+ * Only process tasks that started after the last task
+ * we processed
+ */
+ if (!started_after_time(p, &latest_time, latest_task))
+ continue;
+ dropped = heap_insert(heap, p);
+ if (dropped == NULL) {
+ /*
+ * The new task was inserted; the heap wasn't
+ * previously full
+ */
+ get_task_struct(p);
+ } else if (dropped != p) {
+ /*
+ * The new task was inserted, and pushed out a
+ * different task
+ */
+ get_task_struct(p);
+ put_task_struct(dropped);
+ }
+ /*
+ * Else the new task was newer than anything already in
+ * the heap and wasn't inserted
+ */
+ }
+ cgroup_iter_end(scan->cg, &it);
+
+ if (heap->size) {
+ for (i = 0; i < heap->size; i++) {
+ struct task_struct *p = heap->ptrs[i];
+ if (i == 0) {
+ latest_time = p->start_time;
+ latest_task = p;
+ }
+ /* Process the task per the caller's callback */
+ scan->process_task(p, scan);
+ put_task_struct(p);
+ }
+ /*
+ * If we had to process any tasks at all, scan again
+ * in case some of them were in the middle of forking
+ * children that didn't get processed.
+ * Not the most efficient way to do it, but it avoids
+ * having to take callback_mutex in the fork path
+ */
+ goto again;
+ }
+ if (heap == &tmp_heap)
+ heap_free(&tmp_heap);
+ return 0;
+}
+
/*
* Stuff for reading the 'tasks' file.
*
@@ -1761,7 +1955,7 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
while ((tsk = cgroup_iter_next(cgrp, &it))) {
if (unlikely(n == npids))
break;
- pidarray[n++] = task_pid_nr(tsk);
+ pidarray[n++] = task_pid_vnr(tsk);
}
cgroup_iter_end(cgrp, &it);
return n;
@@ -2126,9 +2320,8 @@ static inline int cgroup_has_css_refs(struct cgroup *cgrp)
* matter, since it can only happen if the cgroup
* has been deleted and hence no longer needs the
* release agent to be called anyway. */
- if (css && atomic_read(&css->refcnt)) {
+ if (css && atomic_read(&css->refcnt))
return 1;
- }
}
return 0;
}
@@ -2138,7 +2331,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
- struct cgroup_subsys *ss;
struct super_block *sb;
struct cgroupfs_root *root;
@@ -2157,17 +2349,19 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
parent = cgrp->parent;
root = cgrp->root;
sb = root->sb;
+ /*
+ * Call pre_destroy handlers of subsys
+ */
+ cgroup_call_pre_destroy(cgrp);
+ /*
+ * Notify subsyses that rmdir() request comes.
+ */
if (cgroup_has_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
- for_each_subsys(root, ss) {
- if (cgrp->subsys[ss->subsys_id])
- ss->destroy(ss, cgrp);
- }
-
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
@@ -2182,15 +2376,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
cgroup_d_remove_dir(d);
dput(d);
- root->number_of_cgroups--;
set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
mutex_unlock(&cgroup_mutex);
- /* Drop the active superblock reference that we took when we
- * created the cgroup */
- deactivate_super(sb);
return 0;
}
@@ -2324,7 +2514,7 @@ out:
* - Used for /proc/<pid>/cgroup.
* - No need to task_lock(tsk) on this tsk->cgroup reference, as it
* doesn't really matter if tsk->cgroup changes after we read it,
- * and we take cgroup_mutex, keeping attach_task() from changing it
+ * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
* anyway. No need to check that tsk->cgroup != NULL, thanks to
* the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
* cgroup to top_cgroup.
@@ -2435,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = {
* A pointer to the shared css_set was automatically copied in
* fork.c by dup_task_struct(). However, we ignore that copy, since
* it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. attach_task() might
+ * might no longer be a valid cgroup pointer. cgroup_attach_task() might
* have already changed current->cgroups, allowing the previously
* referenced cgroup group to be removed and freed.
*
@@ -2514,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child)
* attach us to a different cgroup, decrementing the count on
* the first cgroup that we never incremented. But in this case,
* top_cgroup isn't going away, and either task has PF_EXITING set,
- * which wards off any attach_task() attempts, or task is a failed
- * fork, never visible to attach_task.
+ * which wards off any cgroup_attach_task() attempts, or task is a failed
+ * fork, never visible to cgroup_attach_task.
*
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
@@ -2655,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
}
/* All seems fine. Finish by moving the task into the new cgroup */
- ret = attach_task(child, tsk);
+ ret = cgroup_attach_task(child, tsk);
mutex_unlock(&cgroup_mutex);
out_release:
diff --git a/kernel/compat.c b/kernel/compat.c
index 42a1ed4b61b..5f0e201bcfd 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -40,10 +40,35 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
+static long compat_nanosleep_restart(struct restart_block *restart)
+{
+ struct compat_timespec __user *rmtp;
+ struct timespec rmt;
+ mm_segment_t oldfs;
+ long ret;
+
+ rmtp = (struct compat_timespec __user *)(restart->arg1);
+ restart->arg1 = (unsigned long)&rmt;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = hrtimer_nanosleep_restart(restart);
+ set_fs(oldfs);
+
+ if (ret) {
+ restart->arg1 = (unsigned long)rmtp;
+
+ if (rmtp && put_compat_timespec(&rmt, rmtp))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
struct compat_timespec __user *rmtp)
{
struct timespec tu, rmt;
+ mm_segment_t oldfs;
long ret;
if (get_compat_timespec(&tu, rqtp))
@@ -52,11 +77,21 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
if (!timespec_valid(&tu))
return -EINVAL;
- ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL,
- CLOCK_MONOTONIC);
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = hrtimer_nanosleep(&tu,
+ rmtp ? (struct timespec __user *)&rmt : NULL,
+ HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ set_fs(oldfs);
+
+ if (ret) {
+ struct restart_block *restart
+ = &current_thread_info()->restart_block;
+
+ restart->fn = compat_nanosleep_restart;
+ restart->arg1 = (unsigned long)rmtp;
- if (ret && rmtp) {
- if (put_compat_timespec(&rmt, rmtp))
+ if (rmtp && put_compat_timespec(&rmt, rmtp))
return -EFAULT;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e0d3a4f56ec..2eff3f63abe 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -389,7 +389,7 @@ int disable_nonboot_cpus(void)
return error;
}
-void enable_nonboot_cpus(void)
+void __ref enable_nonboot_cpus(void)
{
int cpu, error;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d81..3e296ed81d4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -38,7 +38,6 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
-#include <linux/prio_heap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
@@ -56,6 +55,8 @@
#include <asm/atomic.h>
#include <linux/mutex.h>
#include <linux/kfifo.h>
+#include <linux/workqueue.h>
+#include <linux/cgroup.h>
/*
* Tracks how many cpusets are currently defined in system.
@@ -64,7 +65,7 @@
*/
int number_of_cpusets __read_mostly;
-/* Retrieve the cpuset from a cgroup */
+/* Forward declare cgroup structures */
struct cgroup_subsys cpuset_subsys;
struct cpuset;
@@ -96,6 +97,9 @@ struct cpuset {
/* partition number for rebuild_sched_domains() */
int pn;
+
+ /* used for walking a cpuset heirarchy */
+ struct list_head stack_list;
};
/* Retrieve the cpuset for a cgroup */
@@ -111,7 +115,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
return container_of(task_subsys_state(task, cpuset_subsys_id),
struct cpuset, css);
}
-
+struct cpuset_hotplug_scanner {
+ struct cgroup_scanner scan;
+ struct cgroup *to;
+};
/* bits in struct cpuset flags field */
typedef enum {
@@ -160,17 +167,17 @@ static inline int is_spread_slab(const struct cpuset *cs)
* number, and avoid having to lock and reload mems_allowed unless
* the cpuset they're using changes generation.
*
- * A single, global generation is needed because attach_task() could
+ * A single, global generation is needed because cpuset_attach_task() could
* reattach a task to a different cpuset, which must not have its
* generation numbers aliased with those of that tasks previous cpuset.
*
* Generations are needed for mems_allowed because one task cannot
- * modify anothers memory placement. So we must enable every task,
+ * modify another's memory placement. So we must enable every task,
* on every visit to __alloc_pages(), to efficiently check whether
* its current->cpuset->mems_allowed has changed, requiring an update
* of its current->mems_allowed.
*
- * Since cpuset_mems_generation is guarded by manage_mutex,
+ * Since writes to cpuset_mems_generation are guarded by the cgroup lock
* there is no need to mark it atomic.
*/
static int cpuset_mems_generation;
@@ -182,17 +189,20 @@ static struct cpuset top_cpuset = {
};
/*
- * We have two global cpuset mutexes below. They can nest.
- * It is ok to first take manage_mutex, then nest callback_mutex. We also
- * require taking task_lock() when dereferencing a tasks cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global mutexes guarding cpuset structures. The first
+ * is the main control groups cgroup_mutex, accessed via
+ * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
+ * callback_mutex, below. They can nest. It is ok to first take
+ * cgroup_mutex, then nest callback_mutex. We also require taking
+ * task_lock() when dereferencing a task's cpuset pointer. See "The
+ * task_lock() exception", at the end of this comment.
*
* A task must hold both mutexes to modify cpusets. If a task
- * holds manage_mutex, then it blocks others wanting that mutex,
+ * holds cgroup_mutex, then it blocks others wanting that mutex,
* ensuring that it is the only task able to also acquire callback_mutex
* and be able to modify cpusets. It can perform various checks on
* the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding manage_mutex. While it is
+ * also allocate memory while just holding cgroup_mutex. While it is
* performing these checks, various callback routines can briefly
* acquire callback_mutex to query cpusets. Once it is ready to make
* the changes, it takes callback_mutex, blocking everyone else.
@@ -208,60 +218,16 @@ static struct cpuset top_cpuset = {
* The task_struct fields mems_allowed and mems_generation may only
* be accessed in the context of that task, so require no locks.
*
- * Any task can increment and decrement the count field without lock.
- * So in general, code holding manage_mutex or callback_mutex can't rely
- * on the count field not changing. However, if the count goes to
- * zero, then only attach_task(), which holds both mutexes, can
- * increment it again. Because a count of zero means that no tasks
- * are currently attached, therefore there is no way a task attached
- * to that cpuset can fork (the other way to increment the count).
- * So code holding manage_mutex or callback_mutex can safely assume that
- * if the count is zero, it will stay zero. Similarly, if a task
- * holds manage_mutex or callback_mutex on a cpuset with zero count, it
- * knows that the cpuset won't be removed, as cpuset_rmdir() needs
- * both of those mutexes.
- *
* The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds manage_mutex across the entire operation,
+ * the cpuset hierarchy holds cgroup_mutex across the entire operation,
* single threading all such cpuset modifications across the system.
*
* The cpuset_common_file_read() handlers only hold callback_mutex across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
- * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
- * (usually) take either mutex. These are the two most performance
- * critical pieces of code here. The exception occurs on cpuset_exit(),
- * when a task in a notify_on_release cpuset exits. Then manage_mutex
- * is taken, and if the cpuset count is zero, a usermode call made
- * to /sbin/cpuset_release_agent with the name of the cpuset (path
- * relative to the root of cpuset file system) as the argument.
- *
- * A cpuset can only be deleted if both its 'count' of using tasks
- * is zero, and its list of 'children' cpusets is empty. Since all
- * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init), therefore, top_cpuset
- * always has either children cpusets and/or using tasks. So we don't
- * need a special hack to ensure that top_cpuset cannot be deleted.
- *
- * The above "Tale of Two Semaphores" would be complete, but for:
- *
- * The task_lock() exception
- *
- * The need for this exception arises from the action of attach_task(),
- * which overwrites one tasks cpuset pointer with another. It does
- * so using both mutexes, however there are several performance
- * critical places that need to reference task->cpuset without the
- * expense of grabbing a system global mutex. Therefore except as
- * noted below, when dereferencing or, as in attach_task(), modifying
- * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
- * (task->alloc_lock) already in the task_struct routinely used for
- * such matters.
- *
- * P.S. One more locking exception. RCU is used to guard the
- * update of a tasks cpuset pointer by attach_task() and the
- * access of task->cpuset->mems_generation via that pointer in
- * the routine cpuset_update_task_memory_state().
+ * Accessing a task's cpuset should be done in accordance with the
+ * guidelines for accessing subsystem state in kernel/cgroup.c
*/
static DEFINE_MUTEX(callback_mutex);
@@ -354,15 +320,14 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
* Do not call this routine if in_interrupt().
*
* Call without callback_mutex or task_lock() held. May be
- * called with or without manage_mutex held. Thanks in part to
- * 'the_top_cpuset_hack', the tasks cpuset pointer will never
+ * called with or without cgroup_mutex held. Thanks in part to
+ * 'the_top_cpuset_hack', the task's cpuset pointer will never
* be NULL. This routine also might acquire callback_mutex and
* current->mm->mmap_sem during call.
*
* Reading current->cpuset->mems_generation doesn't need task_lock
* to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset by attach_task(),
- * using RCU.
+ * from concurrent freeing of current->cpuset using RCU.
*
* The rcu_dereference() is technically probably not needed,
* as I don't actually mind if I see a new cpuset pointer but
@@ -424,7 +389,7 @@ void cpuset_update_task_memory_state(void)
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding manage_mutex.
+ * are only set if the other's are set. Call holding cgroup_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -442,7 +407,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * manage_mutex held.
+ * cgroup_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -476,7 +441,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
if (!is_cpuset_subset(trial, par))
return -EACCES;
- /* If either I or some sibling (!= me) is exclusive, we can't overlap */
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap
+ */
list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
c = cgroup_cs(cont);
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
@@ -732,22 +700,50 @@ static inline int started_after(void *p1, void *p2)
return started_after_time(t1, &t2->start_time, t2);
}
-/*
- * Call with manage_mutex held. May take callback_mutex during call.
+/**
+ * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
+ * @tsk: task to test
+ * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
+ *
+ * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Called for each task in a cgroup by cgroup_scan_tasks().
+ * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
+ * words, if its mask is not equal to its cpuset's mask).
+ */
+int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
+{
+ return !cpus_equal(tsk->cpus_allowed,
+ (cgroup_cs(scan->cg))->cpus_allowed);
+}
+
+/**
+ * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
+ * @tsk: task to test
+ * @scan: struct cgroup_scanner containing the cgroup of the task
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup whose
+ * cpus_allowed mask needs to be changed.
+ *
+ * We don't need to re-check for the cgroup/cpuset membership, since we're
+ * holding cgroup_lock() at this point.
*/
+void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
+{
+ set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed);
+}
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @cs: the cpuset to consider
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
static int update_cpumask(struct cpuset *cs, char *buf)
{
struct cpuset trialcs;
- int retval, i;
- int is_load_balanced;
- struct cgroup_iter it;
- struct cgroup *cgrp = cs->css.cgroup;
- struct task_struct *p, *dropped;
- /* Never dereference latest_task, since it's not refcounted */
- struct task_struct *latest_task = NULL;
+ struct cgroup_scanner scan;
struct ptr_heap heap;
- struct timespec latest_time = { 0, 0 };
+ int retval;
+ int is_load_balanced;
/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
if (cs == &top_cpuset)
@@ -756,7 +752,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
trialcs = *cs;
/*
- * An empty cpus_allowed is ok iff there are no tasks in the cpuset.
+ * An empty cpus_allowed is ok only if the cpuset has no tasks.
* Since cpulist_parse() fails on an empty mask, we special case
* that parsing. The validate_change() call ensures that cpusets
* with tasks have cpus.
@@ -777,6 +773,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
/* Nothing to do if the cpus didn't change */
if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
return 0;
+
retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
if (retval)
return retval;
@@ -787,62 +784,19 @@ static int update_cpumask(struct cpuset *cs, char *buf)
cs->cpus_allowed = trialcs.cpus_allowed;
mutex_unlock(&callback_mutex);
- again:
/*
* Scan tasks in the cpuset, and update the cpumasks of any
- * that need an update. Since we can't call set_cpus_allowed()
- * while holding tasklist_lock, gather tasks to be processed
- * in a heap structure. If the statically-sized heap fills up,
- * overflow tasks that started later, and in future iterations
- * only consider tasks that started after the latest task in
- * the previous pass. This guarantees forward progress and
- * that we don't miss any tasks
+ * that need an update.
*/
- heap.size = 0;
- cgroup_iter_start(cgrp, &it);
- while ((p = cgroup_iter_next(cgrp, &it))) {
- /* Only affect tasks that don't have the right cpus_allowed */
- if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
- continue;
- /*
- * Only process tasks that started after the last task
- * we processed
- */
- if (!started_after_time(p, &latest_time, latest_task))
- continue;
- dropped = heap_insert(&heap, p);
- if (dropped == NULL) {
- get_task_struct(p);
- } else if (dropped != p) {
- get_task_struct(p);
- put_task_struct(dropped);
- }
- }
- cgroup_iter_end(cgrp, &it);
- if (heap.size) {
- for (i = 0; i < heap.size; i++) {
- struct task_struct *p = heap.ptrs[i];
- if (i == 0) {
- latest_time = p->start_time;
- latest_task = p;
- }
- set_cpus_allowed(p, cs->cpus_allowed);
- put_task_struct(p);
- }
- /*
- * If we had to process any tasks at all, scan again
- * in case some of them were in the middle of forking
- * children that didn't notice the new cpumask
- * restriction. Not the most efficient way to do it,
- * but it avoids having to take callback_mutex in the
- * fork path
- */
- goto again;
- }
+ scan.cg = cs->css.cgroup;
+ scan.test_task = cpuset_test_cpumask;
+ scan.process_task = cpuset_change_cpumask;
+ scan.heap = &heap;
+ cgroup_scan_tasks(&scan);
heap_free(&heap);
+
if (is_load_balanced)
rebuild_sched_domains();
-
return 0;
}
@@ -854,11 +808,11 @@ static int update_cpumask(struct cpuset *cs, char *buf)
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding manage_mutex, so our current->cpuset won't change
- * during this call, as manage_mutex holds off any attach_task()
+ * Call holding cgroup_mutex, so current's cpuset won't change
+ * during this call, as manage_mutex holds off any cpuset_attach()
* calls. Therefore we don't need to take task_lock around the
* call to guarantee_online_mems(), as we know no one is changing
- * our tasks cpuset.
+ * our task's cpuset.
*
* Hold callback_mutex around the two modifications of our tasks
* mems_allowed to synchronize with cpuset_mems_allowed().
@@ -903,7 +857,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
* the cpuset is marked 'memory_migrate', migrate the tasks
* pages to the new memory.
*
- * Call with manage_mutex held. May take callback_mutex during call.
+ * Call with cgroup_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1016,7 +970,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
* tasklist_lock. Forks can happen again now - the mpol_copy()
* cpuset_being_rebound check will catch such forks, and rebind
* their vma mempolicies too. Because we still hold the global
- * cpuset manage_mutex, we know that no other rebind effort will
+ * cgroup_mutex, we know that no other rebind effort will
* be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1031,7 +985,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
mmput(mm);
}
- /* We're done rebinding vma's to this cpusets new mems_allowed. */
+ /* We're done rebinding vmas to this cpuset's new mems_allowed. */
kfree(mmarray);
cpuset_being_rebound = NULL;
retval = 0;
@@ -1045,7 +999,7 @@ int current_cpuset_is_being_rebound(void)
}
/*
- * Call with manage_mutex held.
+ * Call with cgroup_mutex held.
*/
static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -1066,7 +1020,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
* cs: the cpuset to update
* buf: the buffer where we read the 0 or 1
*
- * Call with manage_mutex held.
+ * Call with cgroup_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -1200,6 +1154,7 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
+/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss,
struct cgroup *cont, struct task_struct *tsk)
{
@@ -1547,7 +1502,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
* If this becomes a problem for some users who wish to
* allow that scenario, then cpuset_post_clone() could be
* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup.
+ * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
+ * held.
*/
static void cpuset_post_clone(struct cgroup_subsys *ss,
struct cgroup *cgroup)
@@ -1571,11 +1527,8 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
/*
* cpuset_create - create a cpuset
- * parent: cpuset that will be parent of the new cpuset.
- * name: name of the new cpuset. Will be strcpy'ed.
- * mode: mode to set on new inode
- *
- * Must be called with the mutex on the parent inode held
+ * ss: cpuset cgroup subsystem
+ * cont: control group that the new cpuset will be part of
*/
static struct cgroup_subsys_state *cpuset_create(
@@ -1687,53 +1640,140 @@ int __init cpuset_init(void)
return 0;
}
+/**
+ * cpuset_do_move_task - move a given task to another cpuset
+ * @tsk: pointer to task_struct the task to move
+ * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
+ *
+ * Called by cgroup_scan_tasks() for each task in a cgroup.
+ * Return nonzero to stop the walk through the tasks.
+ */
+void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
+{
+ struct cpuset_hotplug_scanner *chsp;
+
+ chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
+ cgroup_attach_task(chsp->to, tsk);
+}
+
+/**
+ * move_member_tasks_to_cpuset - move tasks from one cpuset to another
+ * @from: cpuset in which the tasks currently reside
+ * @to: cpuset to which the tasks will be moved
+ *
+ * Called with cgroup_mutex held
+ * callback_mutex must not be held, as cpuset_attach() will take it.
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+ struct cpuset_hotplug_scanner scan;
+
+ scan.scan.cg = from->css.cgroup;
+ scan.scan.test_task = NULL; /* select all tasks in cgroup */
+ scan.scan.process_task = cpuset_do_move_task;
+ scan.scan.heap = NULL;
+ scan.to = to->css.cgroup;
+
+ if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+ printk(KERN_ERR "move_member_tasks_to_cpuset: "
+ "cgroup_scan_tasks failed\n");
+}
+
/*
* If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes. Cpusets that were already empty of
- * CPUs or nodes are left empty.
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
+ *
+ * Called with cgroup_mutex held
+ * callback_mutex must not be held, as cpuset_attach() will take it.
+ */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+ struct cpuset *parent;
+
+ /*
+ * The cgroup's css_sets list is in use if there are tasks
+ * in the cpuset; the list is empty if there are none;
+ * the cs->css.refcnt seems always 0.
+ */
+ if (list_empty(&cs->css.cgroup->css_sets))
+ return;
+
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, so can't be empty).
+ */
+ parent = cs->parent;
+ while (cpus_empty(parent->cpus_allowed) ||
+ nodes_empty(parent->mems_allowed))
+ parent = parent->parent;
+
+ move_member_tasks_to_cpuset(cs, parent);
+}
+
+/*
+ * Walk the specified cpuset subtree and look for empty cpusets.
+ * The tasks of such cpuset must be moved to a parent cpuset.
*
- * This routine is intentionally inefficient in a couple of regards.
- * It will check all cpusets in a subtree even if the top cpuset of
- * the subtree has no offline CPUs or nodes. It checks both CPUs and
- * nodes, even though the caller could have been coded to know that
- * only one of CPUs or nodes needed to be checked on a given call.
- * This was done to minimize text size rather than cpu cycles.
+ * Called with cgroup_mutex held. We take callback_mutex to modify
+ * cpus_allowed and mems_allowed.
*
- * Call with both manage_mutex and callback_mutex held.
+ * This walk processes the tree from top to bottom, completing one layer
+ * before dropping down to the next. It always processes a node before
+ * any of its children.
*
- * Recursive, on depth of cpuset subtree.
+ * For now, since we lack memory hot unplug, we'll never see a cpuset
+ * that has tasks along with an empty 'mems'. But if we did see such
+ * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
*/
-
-static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+static void scan_for_empty_cpusets(const struct cpuset *root)
{
+ struct cpuset *cp; /* scans cpusets being updated */
+ struct cpuset *child; /* scans child cpusets of cp */
+ struct list_head queue;
struct cgroup *cont;
- struct cpuset *c;
- /* Each of our child cpusets mems must be online */
- list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
- c = cgroup_cs(cont);
- guarantee_online_cpus_mems_in_subtree(c);
- if (!cpus_empty(c->cpus_allowed))
- guarantee_online_cpus(c, &c->cpus_allowed);
- if (!nodes_empty(c->mems_allowed))
- guarantee_online_mems(c, &c->mems_allowed);
+ INIT_LIST_HEAD(&queue);
+
+ list_add_tail((struct list_head *)&root->stack_list, &queue);
+
+ while (!list_empty(&queue)) {
+ cp = container_of(queue.next, struct cpuset, stack_list);
+ list_del(queue.next);
+ list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+ child = cgroup_cs(cont);
+ list_add_tail(&child->stack_list, &queue);
+ }
+ cont = cp->css.cgroup;
+
+ /* Continue past cpusets with all cpus, mems online */
+ if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
+ nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
+ continue;
+
+ /* Remove offline cpus and mems from this cpuset. */
+ mutex_lock(&callback_mutex);
+ cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
+ nodes_and(cp->mems_allowed, cp->mems_allowed,
+ node_states[N_HIGH_MEMORY]);
+ mutex_unlock(&callback_mutex);
+
+ /* Move tasks from the empty cpuset to a parent */
+ if (cpus_empty(cp->cpus_allowed) ||
+ nodes_empty(cp->mems_allowed))
+ remove_tasks_in_empty_cpuset(cp);
}
}
/*
* The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
* cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug
- * event.
- *
- * To ensure that we don't remove a CPU or node from the top cpuset
- * that is currently in use by a child cpuset (which would violate
- * the rule that cpusets must be subsets of their parent), we first
- * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ * track what's online after any CPU or memory node hotplug or unplug event.
*
* Since there are two callers of this routine, one for CPU hotplug
* events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1784,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
static void common_cpu_mem_hotplug_unplug(void)
{
cgroup_lock();
- mutex_lock(&callback_mutex);
- guarantee_online_cpus_mems_in_subtree(&top_cpuset);
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+ scan_for_empty_cpusets(&top_cpuset);
- mutex_unlock(&callback_mutex);
cgroup_unlock();
}
@@ -1826,7 +1864,7 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
/**
* cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
+ * Must be called with callback_mutex held.
**/
cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
{
@@ -2163,10 +2201,8 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take manage_mutex, keeping attach_task() from changing it
- * anyway. No need to check that tsk->cpuset != NULL, thanks to
- * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
- * cpuset to top_cpuset.
+ * and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ * anyway.
*/
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
@@ -2219,13 +2255,14 @@ const struct file_operations proc_cpuset_operations = {
#endif /* CONFIG_PROC_PID_CPUSET */
/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */
-char *cpuset_task_status_allowed(struct task_struct *task, char *buffer)
-{
- buffer += sprintf(buffer, "Cpus_allowed:\t");
- buffer += cpumask_scnprintf(buffer, PAGE_SIZE, task->cpus_allowed);
- buffer += sprintf(buffer, "\n");
- buffer += sprintf(buffer, "Mems_allowed:\t");
- buffer += nodemask_scnprintf(buffer, PAGE_SIZE, task->mems_allowed);
- buffer += sprintf(buffer, "\n");
- return buffer;
+void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
+{
+ seq_printf(m, "Cpus_allowed:\t");
+ m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count,
+ task->cpus_allowed);
+ seq_printf(m, "\n");
+ seq_printf(m, "Mems_allowed:\t");
+ m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count,
+ task->mems_allowed);
+ seq_printf(m, "\n");
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 9d3d0f0b27d..506a957b665 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -293,26 +293,27 @@ static void reparent_to_kthreadd(void)
switch_uid(INIT_USER);
}
-void __set_special_pids(pid_t session, pid_t pgrp)
+void __set_special_pids(struct pid *pid)
{
struct task_struct *curr = current->group_leader;
+ pid_t nr = pid_nr(pid);
- if (task_session_nr(curr) != session) {
+ if (task_session(curr) != pid) {
detach_pid(curr, PIDTYPE_SID);
- set_task_session(curr, session);
- attach_pid(curr, PIDTYPE_SID, find_pid(session));
+ attach_pid(curr, PIDTYPE_SID, pid);
+ set_task_session(curr, nr);
}
- if (task_pgrp_nr(curr) != pgrp) {
+ if (task_pgrp(curr) != pid) {
detach_pid(curr, PIDTYPE_PGID);
- set_task_pgrp(curr, pgrp);
- attach_pid(curr, PIDTYPE_PGID, find_pid(pgrp));
+ attach_pid(curr, PIDTYPE_PGID, pid);
+ set_task_pgrp(curr, nr);
}
}
-static void set_special_pids(pid_t session, pid_t pgrp)
+static void set_special_pids(struct pid *pid)
{
write_lock_irq(&tasklist_lock);
- __set_special_pids(session, pgrp);
+ __set_special_pids(pid);
write_unlock_irq(&tasklist_lock);
}
@@ -383,7 +384,11 @@ void daemonize(const char *name, ...)
*/
current->flags |= PF_NOFREEZE;
- set_special_pids(1, 1);
+ if (current->nsproxy != &init_nsproxy) {
+ get_nsproxy(&init_nsproxy);
+ switch_task_namespaces(current, &init_nsproxy);
+ }
+ set_special_pids(&init_struct_pid);
proc_clear_tty(current);
/* Block and flush all signals */
@@ -398,11 +403,6 @@ void daemonize(const char *name, ...)
current->fs = fs;
atomic_inc(&fs->count);
- if (current->nsproxy != init_task.nsproxy) {
- get_nsproxy(init_task.nsproxy);
- switch_task_namespaces(current, init_task.nsproxy);
- }
-
exit_files(current);
current->files = init_task.files;
atomic_inc(&current->files->count);
@@ -458,7 +458,7 @@ struct files_struct *get_files_struct(struct task_struct *task)
return files;
}
-void fastcall put_files_struct(struct files_struct *files)
+void put_files_struct(struct files_struct *files)
{
struct fdtable *fdt;
@@ -512,14 +512,10 @@ static void __put_fs_struct(struct fs_struct *fs)
{
/* No need to hold fs->lock if we are killing it */
if (atomic_dec_and_test(&fs->count)) {
- dput(fs->root);
- mntput(fs->rootmnt);
- dput(fs->pwd);
- mntput(fs->pwdmnt);
- if (fs->altroot) {
- dput(fs->altroot);
- mntput(fs->altrootmnt);
- }
+ path_put(&fs->root);
+ path_put(&fs->pwd);
+ if (fs->altroot.dentry)
+ path_put(&fs->altroot);
kmem_cache_free(fs_cachep, fs);
}
}
@@ -745,24 +741,6 @@ static void exit_notify(struct task_struct *tsk)
struct task_struct *t;
struct pid *pgrp;
- if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT)
- && !thread_group_empty(tsk)) {
- /*
- * This occurs when there was a race between our exit
- * syscall and a group signal choosing us as the one to
- * wake up. It could be that we are the only thread
- * alerted to check for pending signals, but another thread
- * should be woken now to take the signal since we will not.
- * Now we'll wake all the threads in the group just to make
- * sure someone gets all the pending signals.
- */
- spin_lock_irq(&tsk->sighand->siglock);
- for (t = next_thread(tsk); t != tsk; t = next_thread(t))
- if (!signal_pending(t) && !(t->flags & PF_EXITING))
- recalc_sigpending_and_wake(t);
- spin_unlock_irq(&tsk->sighand->siglock);
- }
-
/*
* This does two things:
*
@@ -905,7 +883,7 @@ static inline void exit_child_reaper(struct task_struct *tsk)
zap_pid_ns_processes(tsk->nsproxy->pid_ns);
}
-fastcall NORET_TYPE void do_exit(long code)
+NORET_TYPE void do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
@@ -947,7 +925,7 @@ fastcall NORET_TYPE void do_exit(long code)
schedule();
}
- tsk->flags |= PF_EXITING;
+ exit_signals(tsk); /* sets PF_EXITING */
/*
* tsk->flags are checked in the futex code to protect against
* an exiting task cleaning up the robust pi futexes.
@@ -1108,20 +1086,23 @@ asmlinkage void sys_exit_group(int error_code)
do_group_exit((error_code & 0xff) << 8);
}
-static int eligible_child(pid_t pid, int options, struct task_struct *p)
+static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
+{
+ struct pid *pid = NULL;
+ if (type == PIDTYPE_PID)
+ pid = task->pids[type].pid;
+ else if (type < PIDTYPE_MAX)
+ pid = task->group_leader->pids[type].pid;
+ return pid;
+}
+
+static int eligible_child(enum pid_type type, struct pid *pid, int options,
+ struct task_struct *p)
{
int err;
- struct pid_namespace *ns;
- ns = current->nsproxy->pid_ns;
- if (pid > 0) {
- if (task_pid_nr_ns(p, ns) != pid)
- return 0;
- } else if (!pid) {
- if (task_pgrp_nr_ns(p, ns) != task_pgrp_vnr(current))
- return 0;
- } else if (pid != -1) {
- if (task_pgrp_nr_ns(p, ns) != -pid)
+ if (type < PIDTYPE_MAX) {
+ if (task_pid_type(p, type) != pid)
return 0;
}
@@ -1140,18 +1121,16 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p)
if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
&& !(options & __WALL))
return 0;
- /*
- * Do not consider thread group leaders that are
- * in a non-empty thread group:
- */
- if (delay_group_leader(p))
- return 2;
err = security_task_wait(p);
- if (err)
- return err;
+ if (likely(!err))
+ return 1;
- return 1;
+ if (type != PIDTYPE_PID)
+ return 0;
+ /* This child was explicitly requested, abort */
+ read_unlock(&tasklist_lock);
+ return err;
}
static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1191,20 +1170,13 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
{
unsigned long state;
int retval, status, traced;
- struct pid_namespace *ns;
-
- ns = current->nsproxy->pid_ns;
+ pid_t pid = task_pid_vnr(p);
if (unlikely(noreap)) {
- pid_t pid = task_pid_nr_ns(p, ns);
uid_t uid = p->uid;
int exit_code = p->exit_code;
int why, status;
- if (unlikely(p->exit_state != EXIT_ZOMBIE))
- return 0;
- if (unlikely(p->exit_signal == -1 && p->ptrace == 0))
- return 0;
get_task_struct(p);
read_unlock(&tasklist_lock);
if ((exit_code & 0x7f) == 0) {
@@ -1315,11 +1287,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
retval = put_user(status, &infop->si_status);
}
if (!retval && infop)
- retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid);
+ retval = put_user(pid, &infop->si_pid);
if (!retval && infop)
retval = put_user(p->uid, &infop->si_uid);
if (!retval)
- retval = task_pid_nr_ns(p, ns);
+ retval = pid;
if (traced) {
write_lock_irq(&tasklist_lock);
@@ -1351,21 +1323,38 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
+static int wait_task_stopped(struct task_struct *p,
int noreap, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
- int retval, exit_code;
+ int retval, exit_code, why;
+ uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
- if (!p->exit_code)
- return 0;
- if (delayed_group_leader && !(p->ptrace & PT_PTRACED) &&
- p->signal->group_stop_count > 0)
+ exit_code = 0;
+ spin_lock_irq(&p->sighand->siglock);
+
+ if (unlikely(!task_is_stopped_or_traced(p)))
+ goto unlock_sig;
+
+ if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0)
/*
* A group stop is in progress and this is the group leader.
* We won't report until all threads have stopped.
*/
+ goto unlock_sig;
+
+ exit_code = p->exit_code;
+ if (!exit_code)
+ goto unlock_sig;
+
+ if (!noreap)
+ p->exit_code = 0;
+
+ uid = p->uid;
+unlock_sig:
+ spin_unlock_irq(&p->sighand->siglock);
+ if (!exit_code)
return 0;
/*
@@ -1375,65 +1364,15 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
* keep holding onto the tasklist_lock while we call getrusage and
* possibly take page faults for user memory.
*/
- pid = task_pid_nr_ns(p, current->nsproxy->pid_ns);
get_task_struct(p);
+ pid = task_pid_vnr(p);
+ why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
- if (unlikely(noreap)) {
- uid_t uid = p->uid;
- int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
-
- exit_code = p->exit_code;
- if (unlikely(!exit_code) || unlikely(p->exit_state))
- goto bail_ref;
+ if (unlikely(noreap))
return wait_noreap_copyout(p, pid, uid,
why, exit_code,
infop, ru);
- }
-
- write_lock_irq(&tasklist_lock);
-
- /*
- * This uses xchg to be atomic with the thread resuming and setting
- * it. It must also be done with the write lock held to prevent a
- * race with the EXIT_ZOMBIE case.
- */
- exit_code = xchg(&p->exit_code, 0);
- if (unlikely(p->exit_state)) {
- /*
- * The task resumed and then died. Let the next iteration
- * catch it in EXIT_ZOMBIE. Note that exit_code might
- * already be zero here if it resumed and did _exit(0).
- * The task itself is dead and won't touch exit_code again;
- * other processors in this function are locked out.
- */
- p->exit_code = exit_code;
- exit_code = 0;
- }
- if (unlikely(exit_code == 0)) {
- /*
- * Another thread in this function got to it first, or it
- * resumed, or it resumed and then died.
- */
- write_unlock_irq(&tasklist_lock);
-bail_ref:
- put_task_struct(p);
- /*
- * We are returning to the wait loop without having successfully
- * removed the process and having released the lock. We cannot
- * continue, since the "p" task pointer is potentially stale.
- *
- * Return -EAGAIN, and do_wait() will restart the loop from the
- * beginning. Do _not_ re-acquire the lock.
- */
- return -EAGAIN;
- }
-
- /* move to end of parent's list to avoid starvation */
- remove_parent(p);
- add_parent(p);
-
- write_unlock_irq(&tasklist_lock);
retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
if (!retval && stat_addr)
@@ -1443,15 +1382,13 @@ bail_ref:
if (!retval && infop)
retval = put_user(0, &infop->si_errno);
if (!retval && infop)
- retval = put_user((short)((p->ptrace & PT_PTRACED)
- ? CLD_TRAPPED : CLD_STOPPED),
- &infop->si_code);
+ retval = put_user(why, &infop->si_code);
if (!retval && infop)
retval = put_user(exit_code, &infop->si_status);
if (!retval && infop)
retval = put_user(pid, &infop->si_pid);
if (!retval && infop)
- retval = put_user(p->uid, &infop->si_uid);
+ retval = put_user(uid, &infop->si_uid);
if (!retval)
retval = pid;
put_task_struct(p);
@@ -1473,7 +1410,6 @@ static int wait_task_continued(struct task_struct *p, int noreap,
int retval;
pid_t pid;
uid_t uid;
- struct pid_namespace *ns;
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
return 0;
@@ -1488,8 +1424,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
spin_unlock_irq(&p->sighand->siglock);
- ns = current->nsproxy->pid_ns;
- pid = task_pid_nr_ns(p, ns);
+ pid = task_pid_vnr(p);
uid = p->uid;
get_task_struct(p);
read_unlock(&tasklist_lock);
@@ -1500,7 +1435,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
if (!retval && stat_addr)
retval = put_user(0xffff, stat_addr);
if (!retval)
- retval = task_pid_nr_ns(p, ns);
+ retval = pid;
} else {
retval = wait_noreap_copyout(p, pid, uid,
CLD_CONTINUED, SIGCONT,
@@ -1511,103 +1446,63 @@ static int wait_task_continued(struct task_struct *p, int noreap,
return retval;
}
-
-static inline int my_ptrace_child(struct task_struct *p)
-{
- if (!(p->ptrace & PT_PTRACED))
- return 0;
- if (!(p->ptrace & PT_ATTACHED))
- return 1;
- /*
- * This child was PTRACE_ATTACH'd. We should be seeing it only if
- * we are the attacher. If we are the real parent, this is a race
- * inside ptrace_attach. It is waiting for the tasklist_lock,
- * which we have to switch the parent links, but has already set
- * the flags in p->ptrace.
- */
- return (p->parent != p->real_parent);
-}
-
-static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
- int __user *stat_addr, struct rusage __user *ru)
+static long do_wait(enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop, int __user *stat_addr,
+ struct rusage __user *ru)
{
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
int flag, retval;
- int allowed, denied;
add_wait_queue(&current->signal->wait_chldexit,&wait);
repeat:
+ /* If there is nothing that can match our critier just get out */
+ retval = -ECHILD;
+ if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
+ goto end;
+
/*
* We will set this flag if we see any child that might later
* match our criteria, even if we are not able to reap it yet.
*/
- flag = 0;
- allowed = denied = 0;
+ flag = retval = 0;
current->state = TASK_INTERRUPTIBLE;
read_lock(&tasklist_lock);
tsk = current;
do {
struct task_struct *p;
- int ret;
list_for_each_entry(p, &tsk->children, sibling) {
- ret = eligible_child(pid, options, p);
+ int ret = eligible_child(type, pid, options, p);
if (!ret)
continue;
if (unlikely(ret < 0)) {
- denied = ret;
- continue;
- }
- allowed = 1;
-
- if (task_is_stopped_or_traced(p)) {
+ retval = ret;
+ } else if (task_is_stopped_or_traced(p)) {
/*
* It's stopped now, so it might later
* continue, exit, or stop again.
- *
- * When we hit the race with PTRACE_ATTACH, we
- * will not report this child. But the race
- * means it has not yet been moved to our
- * ptrace_children list, so we need to set the
- * flag here to avoid a spurious ECHILD when
- * the race happens with the only child.
*/
flag = 1;
+ if (!(p->ptrace & PT_PTRACED) &&
+ !(options & WUNTRACED))
+ continue;
- if (!my_ptrace_child(p)) {
- if (task_is_traced(p))
- continue;
- if (!(options & WUNTRACED))
- continue;
- }
-
- retval = wait_task_stopped(p, ret == 2,
+ retval = wait_task_stopped(p,
(options & WNOWAIT), infop,
stat_addr, ru);
- if (retval == -EAGAIN)
- goto repeat;
- if (retval != 0) /* He released the lock. */
- goto end;
- } else if (p->exit_state == EXIT_DEAD) {
- continue;
- } else if (p->exit_state == EXIT_ZOMBIE) {
+ } else if (p->exit_state == EXIT_ZOMBIE &&
+ !delay_group_leader(p)) {
/*
- * Eligible but we cannot release it yet:
+ * We don't reap group leaders with subthreads.
*/
- if (ret == 2)
- goto check_continued;
if (!likely(options & WEXITED))
continue;
retval = wait_task_zombie(p,
(options & WNOWAIT), infop,
stat_addr, ru);
- /* He released the lock. */
- if (retval != 0)
- goto end;
- } else {
-check_continued:
+ } else if (p->exit_state != EXIT_DEAD) {
/*
* It's running now, so it might later
* exit, stop, or stop and then continue.
@@ -1618,17 +1513,20 @@ check_continued:
retval = wait_task_continued(p,
(options & WNOWAIT), infop,
stat_addr, ru);
- if (retval != 0) /* He released the lock. */
- goto end;
}
+ if (retval != 0) /* tasklist_lock released */
+ goto end;
}
if (!flag) {
list_for_each_entry(p, &tsk->ptrace_children,
- ptrace_list) {
- if (!eligible_child(pid, options, p))
+ ptrace_list) {
+ flag = eligible_child(type, pid, options, p);
+ if (!flag)
continue;
- flag = 1;
- break;
+ if (likely(flag > 0))
+ break;
+ retval = flag;
+ goto end;
}
}
if (options & __WNOTHREAD)
@@ -1636,10 +1534,9 @@ check_continued:
tsk = next_thread(tsk);
BUG_ON(tsk->signal != current->signal);
} while (tsk != current);
-
read_unlock(&tasklist_lock);
+
if (flag) {
- retval = 0;
if (options & WNOHANG)
goto end;
retval = -ERESTARTSYS;
@@ -1649,14 +1546,12 @@ check_continued:
goto repeat;
}
retval = -ECHILD;
- if (unlikely(denied) && !allowed)
- retval = denied;
end:
current->state = TASK_RUNNING;
remove_wait_queue(&current->signal->wait_chldexit,&wait);
if (infop) {
if (retval > 0)
- retval = 0;
+ retval = 0;
else {
/*
* For a WNOHANG return, clear out all the fields
@@ -1680,10 +1575,12 @@ end:
return retval;
}
-asmlinkage long sys_waitid(int which, pid_t pid,
+asmlinkage long sys_waitid(int which, pid_t upid,
struct siginfo __user *infop, int options,
struct rusage __user *ru)
{
+ struct pid *pid = NULL;
+ enum pid_type type;
long ret;
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
@@ -1693,37 +1590,58 @@ asmlinkage long sys_waitid(int which, pid_t pid,
switch (which) {
case P_ALL:
- pid = -1;
+ type = PIDTYPE_MAX;
break;
case P_PID:
- if (pid <= 0)
+ type = PIDTYPE_PID;
+ if (upid <= 0)
return -EINVAL;
break;
case P_PGID:
- if (pid <= 0)
+ type = PIDTYPE_PGID;
+ if (upid <= 0)
return -EINVAL;
- pid = -pid;
break;
default:
return -EINVAL;
}
- ret = do_wait(pid, options, infop, NULL, ru);
+ if (type < PIDTYPE_MAX)
+ pid = find_get_pid(upid);
+ ret = do_wait(type, pid, options, infop, NULL, ru);
+ put_pid(pid);
/* avoid REGPARM breakage on x86: */
prevent_tail_call(ret);
return ret;
}
-asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
+asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
int options, struct rusage __user *ru)
{
+ struct pid *pid = NULL;
+ enum pid_type type;
long ret;
if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
- ret = do_wait(pid, options | WEXITED, NULL, stat_addr, ru);
+
+ if (upid == -1)
+ type = PIDTYPE_MAX;
+ else if (upid < 0) {
+ type = PIDTYPE_PGID;
+ pid = find_get_pid(-upid);
+ } else if (upid == 0) {
+ type = PIDTYPE_PGID;
+ pid = get_pid(task_pgrp(current));
+ } else /* upid > 0 */ {
+ type = PIDTYPE_PID;
+ pid = find_get_pid(upid);
+ }
+
+ ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru);
+ put_pid(pid);
/* avoid REGPARM breakage on x86: */
prevent_tail_call(ret);
diff --git a/kernel/fork.c b/kernel/fork.c
index 2b55b74cd99..dd249c37b3a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
+#include <linux/memcontrol.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/acct.h>
@@ -340,7 +341,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
#include <linux/init_task.h>
-static struct mm_struct * mm_init(struct mm_struct * mm)
+static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
{
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
@@ -357,11 +358,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
mm->ioctx_list = NULL;
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
+ mm_init_cgroup(mm, p);
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
return mm;
}
+
+ mm_free_cgroup(mm);
free_mm(mm);
return NULL;
}
@@ -376,7 +380,7 @@ struct mm_struct * mm_alloc(void)
mm = allocate_mm();
if (mm) {
memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm);
+ mm = mm_init(mm, current);
}
return mm;
}
@@ -386,10 +390,11 @@ struct mm_struct * mm_alloc(void)
* is dropped: either by a lazy thread or by
* mmput. Free the page directory and the mm.
*/
-void fastcall __mmdrop(struct mm_struct *mm)
+void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
mm_free_pgd(mm);
+ mm_free_cgroup(mm);
destroy_context(mm);
free_mm(mm);
}
@@ -511,7 +516,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
mm->token_priority = 0;
mm->last_interval = 0;
- if (!mm_init(mm))
+ if (!mm_init(mm, tsk))
goto fail_nomem;
if (init_new_context(tsk, mm))
@@ -595,16 +600,16 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
rwlock_init(&fs->lock);
fs->umask = old->umask;
read_lock(&old->lock);
- fs->rootmnt = mntget(old->rootmnt);
- fs->root = dget(old->root);
- fs->pwdmnt = mntget(old->pwdmnt);
- fs->pwd = dget(old->pwd);
- if (old->altroot) {
- fs->altrootmnt = mntget(old->altrootmnt);
- fs->altroot = dget(old->altroot);
+ fs->root = old->root;
+ path_get(&old->root);
+ fs->pwd = old->pwd;
+ path_get(&old->pwd);
+ if (old->altroot.dentry) {
+ fs->altroot = old->altroot;
+ path_get(&old->altroot);
} else {
- fs->altrootmnt = NULL;
- fs->altroot = NULL;
+ fs->altroot.mnt = NULL;
+ fs->altroot.dentry = NULL;
}
read_unlock(&old->lock);
}
@@ -904,7 +909,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->it_real_incr.tv64 = 0;
sig->real_timer.function = it_real_fn;
- sig->tsk = tsk;
sig->it_virt_expires = cputime_zero;
sig->it_virt_incr = cputime_zero;
@@ -1333,6 +1337,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (clone_flags & CLONE_NEWPID)
p->nsproxy->pid_ns->child_reaper = p;
+ p->signal->leader_pid = pid;
p->signal->tty = current->signal->tty;
set_task_pgrp(p, task_pgrp_nr(current));
set_task_session(p, task_session_nr(current));
@@ -1399,7 +1404,7 @@ fork_out:
return ERR_PTR(retval);
}
-noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
+noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
{
memset(regs, 0, sizeof(struct pt_regs));
return regs;
@@ -1483,13 +1488,7 @@ long do_fork(unsigned long clone_flags,
if (!IS_ERR(p)) {
struct completion vfork;
- /*
- * this is enough to call pid_nr_ns here, but this if
- * improves optimisation of regular fork()
- */
- nr = (clone_flags & CLONE_NEWPID) ?
- task_pid_nr_ns(p, current->nsproxy->pid_ns) :
- task_pid_vnr(p);
+ nr = task_pid_vnr(p);
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
@@ -1510,7 +1509,7 @@ long do_fork(unsigned long clone_flags,
if (!(clone_flags & CLONE_STOPPED))
wake_up_new_task(p, clone_flags);
else
- p->state = TASK_STOPPED;
+ __set_task_state(p, TASK_STOPPED);
if (unlikely (trace)) {
current->ptrace_message = nr;
diff --git a/kernel/futex.c b/kernel/futex.c
index a6baaec44b8..221f2128a43 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2116,7 +2116,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
t = timespec_to_ktime(ts);
if (cmd == FUTEX_WAIT)
- t = ktime_add(ktime_get(), t);
+ t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
/*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 133d558db45..7d5e4b016f3 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -176,7 +176,7 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
t = timespec_to_ktime(ts);
if (cmd == FUTEX_WAIT)
- t = ktime_add(ktime_get(), t);
+ t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 668f3967eb3..98bee013f71 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -326,6 +326,23 @@ u64 ktime_divns(const ktime_t kt, s64 div)
#endif /* BITS_PER_LONG >= 64 */
/*
+ * Add two ktime values and do a safety check for overflow:
+ */
+ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
+{
+ ktime_t res = ktime_add(lhs, rhs);
+
+ /*
+ * We use KTIME_SEC_MAX here, the maximum timeout which we can
+ * return to user space in a timespec:
+ */
+ if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64)
+ res = ktime_set(KTIME_SEC_MAX, 0);
+
+ return res;
+}
+
+/*
* Check, whether the timer is on the callback pending list
*/
static inline int hrtimer_cb_pending(const struct hrtimer *timer)
@@ -425,6 +442,8 @@ static int hrtimer_reprogram(struct hrtimer *timer,
ktime_t expires = ktime_sub(timer->expires, base->offset);
int res;
+ WARN_ON_ONCE(timer->expires.tv64 < 0);
+
/*
* When the callback is running, we do not reprogram the clock event
* device. The timer callback is either running on a different CPU or
@@ -435,6 +454,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
if (hrtimer_callback_running(timer))
return 0;
+ /*
+ * CLOCK_REALTIME timer might be requested with an absolute
+ * expiry time which is less than base->offset. Nothing wrong
+ * about that, just avoid to call into the tick code, which
+ * has now objections against negative expiry values.
+ */
+ if (expires.tv64 < 0)
+ return -ETIME;
+
if (expires.tv64 >= expires_next->tv64)
return 0;
@@ -682,13 +710,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
*/
orun++;
}
- timer->expires = ktime_add(timer->expires, interval);
- /*
- * Make sure, that the result did not wrap with a very large
- * interval.
- */
- if (timer->expires.tv64 < 0)
- timer->expires = ktime_set(KTIME_SEC_MAX, 0);
+ timer->expires = ktime_add_safe(timer->expires, interval);
return orun;
}
@@ -839,7 +861,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
new_base = switch_hrtimer_base(timer, base);
if (mode == HRTIMER_MODE_REL) {
- tim = ktime_add(tim, new_base->get_time());
+ tim = ktime_add_safe(tim, new_base->get_time());
/*
* CONFIG_TIME_LOW_RES is a temporary way for architectures
* to signal that they simply return xtime in
@@ -848,16 +870,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
* timeouts. This will go away with the GTOD framework.
*/
#ifdef CONFIG_TIME_LOW_RES
- tim = ktime_add(tim, base->resolution);
+ tim = ktime_add_safe(tim, base->resolution);
#endif
- /*
- * Careful here: User space might have asked for a
- * very long sleep, so the add above might result in a
- * negative number, which enqueues the timer in front
- * of the queue.
- */
- if (tim.tv64 < 0)
- tim.tv64 = KTIME_MAX;
}
timer->expires = tim;
@@ -1319,13 +1333,26 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
return t->task == NULL;
}
+static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
+{
+ struct timespec rmt;
+ ktime_t rem;
+
+ rem = ktime_sub(timer->expires, timer->base->get_time());
+ if (rem.tv64 <= 0)
+ return 0;
+ rmt = ktime_to_timespec(rem);
+
+ if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+ return -EFAULT;
+
+ return 1;
+}
+
long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
struct hrtimer_sleeper t;
- struct timespec *rmtp;
- ktime_t time;
-
- restart->fn = do_no_restart_syscall;
+ struct timespec __user *rmtp;
hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS);
t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
@@ -1333,26 +1360,22 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
if (do_nanosleep(&t, HRTIMER_MODE_ABS))
return 0;
- rmtp = (struct timespec *)restart->arg1;
+ rmtp = (struct timespec __user *)restart->arg1;
if (rmtp) {
- time = ktime_sub(t.timer.expires, t.timer.base->get_time());
- if (time.tv64 <= 0)
- return 0;
- *rmtp = ktime_to_timespec(time);
+ int ret = update_rmtp(&t.timer, rmtp);
+ if (ret <= 0)
+ return ret;
}
- restart->fn = hrtimer_nanosleep_restart;
-
/* The other values in restart are already filled in */
return -ERESTART_RESTARTBLOCK;
}
-long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp,
+long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
const enum hrtimer_mode mode, const clockid_t clockid)
{
struct restart_block *restart;
struct hrtimer_sleeper t;
- ktime_t rem;
hrtimer_init(&t.timer, clockid, mode);
t.timer.expires = timespec_to_ktime(*rqtp);
@@ -1364,10 +1387,9 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp,
return -ERESTARTNOHAND;
if (rmtp) {
- rem = ktime_sub(t.timer.expires, t.timer.base->get_time());
- if (rem.tv64 <= 0)
- return 0;
- *rmtp = ktime_to_timespec(rem);
+ int ret = update_rmtp(&t.timer, rmtp);
+ if (ret <= 0)
+ return ret;
}
restart = &current_thread_info()->restart_block;
@@ -1383,8 +1405,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec *rmtp,
asmlinkage long
sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
{
- struct timespec tu, rmt;
- int ret;
+ struct timespec tu;
if (copy_from_user(&tu, rqtp, sizeof(tu)))
return -EFAULT;
@@ -1392,15 +1413,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
if (!timespec_valid(&tu))
return -EINVAL;
- ret = hrtimer_nanosleep(&tu, rmtp ? &rmt : NULL, HRTIMER_MODE_REL,
- CLOCK_MONOTONIC);
-
- if (ret && rmtp) {
- if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
- return -EFAULT;
- }
-
- return ret;
+ return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
/*
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 44019ce30a1..cc54c627635 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -286,7 +286,7 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
* Note: The caller is expected to handle the ack, clear, mask and
* unmask issues if necessary.
*/
-void fastcall
+void
handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
struct irqaction *action;
@@ -327,7 +327,7 @@ out_unlock:
* it after the associated handler has acknowledged the device, so the
* interrupt line is back to inactive.
*/
-void fastcall
+void
handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
unsigned int cpu = smp_processor_id();
@@ -375,7 +375,7 @@ out_unlock:
* for modern forms of interrupt handlers, which handle the flow
* details in hardware, transparently.
*/
-void fastcall
+void
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
{
unsigned int cpu = smp_processor_id();
@@ -434,7 +434,7 @@ out:
* the handler was running. If all pending interrupts are handled, the
* loop is left.
*/
-void fastcall
+void
handle_edge_irq(unsigned int irq, struct irq_desc *desc)
{
const unsigned int cpu = smp_processor_id();
@@ -505,7 +505,7 @@ out_unlock:
*
* Per CPU interrupts on SMP machines without locking requirements
*/
-void fastcall
+void
handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
{
irqreturn_t action_ret;
@@ -589,3 +589,39 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
set_irq_chip(irq, chip);
__set_irq_handler(irq, handle, 0, name);
}
+
+void __init set_irq_noprobe(unsigned int irq)
+{
+ struct irq_desc *desc;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS) {
+ printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
+
+ return;
+ }
+
+ desc = irq_desc + irq;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->status |= IRQ_NOPROBE;
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+void __init set_irq_probe(unsigned int irq)
+{
+ struct irq_desc *desc;
+ unsigned long flags;
+
+ if (irq >= NR_IRQS) {
+ printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
+
+ return;
+ }
+
+ desc = irq_desc + irq;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->status &= ~IRQ_NOPROBE;
+ spin_unlock_irqrestore(&desc->lock, flags);
+}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index dc335ad2752..5fa6198e913 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -25,7 +25,7 @@
*
* Handles spurious and unhandled IRQ's. It also prints a debugmessage.
*/
-void fastcall
+void
handle_bad_irq(unsigned int irq, struct irq_desc *desc)
{
print_irq_desc(irq, desc);
@@ -163,7 +163,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
* This is the original x86 implementation which is used for every
* interrupt type.
*/
-fastcall unsigned int __do_IRQ(unsigned int irq)
+unsigned int __do_IRQ(unsigned int irq)
{
struct irq_desc *desc = irq_desc + irq;
struct irqaction *action;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 2fab344dbf5..ab982747d9b 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -132,7 +132,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
struct signal_struct *sig =
container_of(timer, struct signal_struct, real_timer);
- send_group_sig_info(SIGALRM, SEND_SIG_PRIV, sig->tsk);
+ kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
return HRTIMER_NORESTART;
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7dadc71ce51..f091d13def0 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -53,14 +53,6 @@ static inline int is_kernel_inittext(unsigned long addr)
return 0;
}
-static inline int is_kernel_extratext(unsigned long addr)
-{
- if (addr >= (unsigned long)_sextratext
- && addr <= (unsigned long)_eextratext)
- return 1;
- return 0;
-}
-
static inline int is_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
@@ -80,8 +72,7 @@ static int is_ksym_addr(unsigned long addr)
if (all_var)
return is_kernel(addr);
- return is_kernel_text(addr) || is_kernel_inittext(addr) ||
- is_kernel_extratext(addr);
+ return is_kernel_text(addr) || is_kernel_inittext(addr);
}
/* expand a compressed symbol data into the resulting uncompressed string,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9a26eec9eb0..06a0e277565 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1361,8 +1361,8 @@ unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
static int __init crash_save_vmcoreinfo_init(void)
{
- vmcoreinfo_append_str("OSRELEASE=%s\n", init_uts_ns.name.release);
- vmcoreinfo_append_str("PAGESIZE=%ld\n", PAGE_SIZE);
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
VMCOREINFO_SYMBOL(init_uts_ns);
VMCOREINFO_SYMBOL(node_online_map);
@@ -1376,15 +1376,15 @@ static int __init crash_save_vmcoreinfo_init(void)
#ifdef CONFIG_SPARSEMEM
VMCOREINFO_SYMBOL(mem_section);
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_SIZE(mem_section);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
#endif
- VMCOREINFO_SIZE(page);
- VMCOREINFO_SIZE(pglist_data);
- VMCOREINFO_SIZE(zone);
- VMCOREINFO_SIZE(free_area);
- VMCOREINFO_SIZE(list_head);
- VMCOREINFO_TYPEDEF_SIZE(nodemask_t);
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
VMCOREINFO_OFFSET(page, flags);
VMCOREINFO_OFFSET(page, _count);
VMCOREINFO_OFFSET(page, mapping);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb7df2a28bd..22be3ff3f36 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -173,10 +173,7 @@ static int ____call_usermodehelper(void *data)
*/
set_user_nice(current, 0);
- retval = -EPERM;
- if (current->fs->root)
- retval = kernel_execve(sub_info->path,
- sub_info->argv, sub_info->envp);
+ retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
/* Exec failed? */
sub_info->retval = retval;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d0493eafea3..7a86e643233 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -699,6 +699,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
struct kretprobe_instance, uflist);
ri->rp = rp;
ri->task = current;
+
+ if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ spin_unlock_irqrestore(&kretprobe_lock, flags);
+ return 0;
+ }
+
arch_prepare_kretprobe(ri, regs);
/* XXX(hch): why is there no hlist_move_head? */
@@ -745,7 +751,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
INIT_HLIST_HEAD(&rp->used_instances);
INIT_HLIST_HEAD(&rp->free_instances);
for (i = 0; i < rp->maxactive; i++) {
- inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
+ inst = kmalloc(sizeof(struct kretprobe_instance) +
+ rp->data_size, GFP_KERNEL);
if (inst == NULL) {
free_rp_inst(rp);
return -ENOMEM;
diff --git a/kernel/marker.c b/kernel/marker.c
index 5323cfaedbc..c4c2cd8b61f 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -27,35 +27,42 @@
extern struct marker __start___markers[];
extern struct marker __stop___markers[];
+/* Set to 1 to enable marker debug output */
+const int marker_debug;
+
/*
* markers_mutex nests inside module_mutex. Markers mutex protects the builtin
- * and module markers, the hash table and deferred_sync.
+ * and module markers and the hash table.
*/
static DEFINE_MUTEX(markers_mutex);
/*
- * Marker deferred synchronization.
- * Upon marker probe_unregister, we delay call to synchronize_sched() to
- * accelerate mass unregistration (only when there is no more reference to a
- * given module do we call synchronize_sched()). However, we need to make sure
- * every critical region has ended before we re-arm a marker that has been
- * unregistered and then registered back with a different probe data.
- */
-static int deferred_sync;
-
-/*
* Marker hash table, containing the active markers.
* Protected by module_mutex.
*/
#define MARKER_HASH_BITS 6
#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+/*
+ * Note about RCU :
+ * It is used to make sure every handler has finished using its private data
+ * between two consecutive operation (add or remove) on a given marker. It is
+ * also used to delay the free of multiple probes array until a quiescent state
+ * is reached.
+ * marker entries modifications are protected by the markers_mutex.
+ */
struct marker_entry {
struct hlist_node hlist;
char *format;
- marker_probe_func *probe;
- void *private;
+ void (*call)(const struct marker *mdata, /* Probe wrapper */
+ void *call_private, const char *fmt, ...);
+ struct marker_probe_closure single;
+ struct marker_probe_closure *multi;
int refcount; /* Number of times armed. 0 if disarmed. */
+ struct rcu_head rcu;
+ void *oldptr;
+ char rcu_pending:1;
+ char ptype:1;
char name[0]; /* Contains name'\0'format'\0' */
};
@@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
/**
* __mark_empty_function - Empty probe callback
- * @mdata: pointer of type const struct marker
+ * @probe_private: probe private data
+ * @call_private: call site private data
* @fmt: format string
* @...: variable argument list
*
@@ -72,13 +80,267 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
* though the function pointer change and the marker enabling are two distinct
* operations that modifies the execution flow of preemptible code.
*/
-void __mark_empty_function(const struct marker *mdata, void *private,
- const char *fmt, ...)
+void __mark_empty_function(void *probe_private, void *call_private,
+ const char *fmt, va_list *args)
{
}
EXPORT_SYMBOL_GPL(__mark_empty_function);
/*
+ * marker_probe_cb Callback that prepares the variable argument list for probes.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @fmt: format string
+ * @...: Variable argument list.
+ *
+ * Since we do not use "typical" pointer based RCU in the 1 argument case, we
+ * need to put a full smp_rmb() in this branch. This is why we do not use
+ * rcu_dereference() for the pointer read.
+ */
+void marker_probe_cb(const struct marker *mdata, void *call_private,
+ const char *fmt, ...)
+{
+ va_list args;
+ char ptype;
+
+ /*
+ * disabling preemption to make sure the teardown of the callbacks can
+ * be done correctly when they are in modules and they insure RCU read
+ * coherency.
+ */
+ preempt_disable();
+ ptype = ACCESS_ONCE(mdata->ptype);
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependant,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = ACCESS_ONCE(mdata->single.func);
+ /* Must read the ptr before private data. They are not data
+ * dependant, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ va_start(args, fmt);
+ func(mdata->single.probe_private, call_private, fmt, &args);
+ va_end(args);
+ } else {
+ struct marker_probe_closure *multi;
+ int i;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must insure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ multi = ACCESS_ONCE(mdata->multi);
+ for (i = 0; multi[i].func; i++) {
+ va_start(args, fmt);
+ multi[i].func(multi[i].probe_private, call_private, fmt,
+ &args);
+ va_end(args);
+ }
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb);
+
+/*
+ * marker_probe_cb Callback that does not prepare the variable argument list.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @fmt: format string
+ * @...: Variable argument list.
+ *
+ * Should be connected to markers "MARK_NOARGS".
+ */
+void marker_probe_cb_noarg(const struct marker *mdata,
+ void *call_private, const char *fmt, ...)
+{
+ va_list args; /* not initialized */
+ char ptype;
+
+ preempt_disable();
+ ptype = ACCESS_ONCE(mdata->ptype);
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependant,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = ACCESS_ONCE(mdata->single.func);
+ /* Must read the ptr before private data. They are not data
+ * dependant, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func(mdata->single.probe_private, call_private, fmt, &args);
+ } else {
+ struct marker_probe_closure *multi;
+ int i;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must insure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ multi = ACCESS_ONCE(mdata->multi);
+ for (i = 0; multi[i].func; i++)
+ multi[i].func(multi[i].probe_private, call_private, fmt,
+ &args);
+ }
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
+
+static void free_old_closure(struct rcu_head *head)
+{
+ struct marker_entry *entry = container_of(head,
+ struct marker_entry, rcu);
+ kfree(entry->oldptr);
+ /* Make sure we free the data before setting the pending flag to 0 */
+ smp_wmb();
+ entry->rcu_pending = 0;
+}
+
+static void debug_print_probes(struct marker_entry *entry)
+{
+ int i;
+
+ if (!marker_debug)
+ return;
+
+ if (!entry->ptype) {
+ printk(KERN_DEBUG "Single probe : %p %p\n",
+ entry->single.func,
+ entry->single.probe_private);
+ } else {
+ for (i = 0; entry->multi[i].func; i++)
+ printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
+ entry->multi[i].func,
+ entry->multi[i].probe_private);
+ }
+}
+
+static struct marker_probe_closure *
+marker_entry_add_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0;
+ struct marker_probe_closure *old, *new;
+
+ WARN_ON(!probe);
+
+ debug_print_probes(entry);
+ old = entry->multi;
+ if (!entry->ptype) {
+ if (entry->single.func == probe &&
+ entry->single.probe_private == probe_private)
+ return ERR_PTR(-EBUSY);
+ if (entry->single.func == __mark_empty_function) {
+ /* 0 -> 1 probes */
+ entry->single.func = probe;
+ entry->single.probe_private = probe_private;
+ entry->refcount = 1;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* 1 -> 2 probes */
+ nr_probes = 1;
+ old = NULL;
+ }
+ } else {
+ /* (N -> N+1), (N != 0, 1) probes */
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+ if (old[nr_probes].func == probe
+ && old[nr_probes].probe_private
+ == probe_private)
+ return ERR_PTR(-EBUSY);
+ }
+ /* + 2 : one for new probe, one for NULL func */
+ new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
+ GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (!old)
+ new[0] = entry->single;
+ else
+ memcpy(new, old,
+ nr_probes * sizeof(struct marker_probe_closure));
+ new[nr_probes].func = probe;
+ new[nr_probes].probe_private = probe_private;
+ entry->refcount = nr_probes + 1;
+ entry->multi = new;
+ entry->ptype = 1;
+ debug_print_probes(entry);
+ return old;
+}
+
+static struct marker_probe_closure *
+marker_entry_remove_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0, nr_del = 0, i;
+ struct marker_probe_closure *old, *new;
+
+ old = entry->multi;
+
+ debug_print_probes(entry);
+ if (!entry->ptype) {
+ /* 0 -> N is an error */
+ WARN_ON(entry->single.func == __mark_empty_function);
+ /* 1 -> 0 probes */
+ WARN_ON(probe && entry->single.func != probe);
+ WARN_ON(entry->single.probe_private != probe_private);
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* (N -> M), (N > 1, M >= 0) probes */
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if ((!probe || old[nr_probes].func == probe)
+ && old[nr_probes].probe_private
+ == probe_private)
+ nr_del++;
+ }
+ }
+
+ if (nr_probes - nr_del == 0) {
+ /* N -> 0, (N > 1) */
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ } else if (nr_probes - nr_del == 1) {
+ /* N -> 1, (N > 1) */
+ for (i = 0; old[i].func; i++)
+ if ((probe && old[i].func != probe) ||
+ old[i].probe_private != probe_private)
+ entry->single = old[i];
+ entry->refcount = 1;
+ entry->ptype = 0;
+ } else {
+ int j = 0;
+ /* N -> M, (N > 1, M > 1) */
+ /* + 1 for NULL */
+ new = kzalloc((nr_probes - nr_del + 1)
+ * sizeof(struct marker_probe_closure), GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; old[i].func; i++)
+ if ((probe && old[i].func != probe) ||
+ old[i].probe_private != probe_private)
+ new[j++] = old[i];
+ entry->refcount = nr_probes - nr_del;
+ entry->ptype = 1;
+ entry->multi = new;
+ }
+ debug_print_probes(entry);
+ return old;
+}
+
+/*
* Get marker if the marker is present in the marker hash table.
* Must be called with markers_mutex held.
* Returns NULL if not present.
@@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name)
* Add the marker to the marker hash table. Must be called with markers_mutex
* held.
*/
-static int add_marker(const char *name, const char *format,
- marker_probe_func *probe, void *private)
+static struct marker_entry *add_marker(const char *name, const char *format)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format,
hlist_for_each_entry(e, node, head, hlist) {
if (!strcmp(name, e->name)) {
printk(KERN_NOTICE
- "Marker %s busy, probe %p already installed\n",
- name, e->probe);
- return -EBUSY; /* Already there */
+ "Marker %s busy\n", name);
+ return ERR_PTR(-EBUSY); /* Already there */
}
}
/*
@@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format,
e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
GFP_KERNEL);
if (!e)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
memcpy(&e->name[0], name, name_len);
if (format) {
e->format = &e->name[name_len];
memcpy(e->format, format, format_len);
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
trace_mark(core_marker_format, "name %s format %s",
e->name, e->format);
- } else
+ } else {
e->format = NULL;
- e->probe = probe;
- e->private = private;
+ e->call = marker_probe_cb;
+ }
+ e->single.func = __mark_empty_function;
+ e->single.probe_private = NULL;
+ e->multi = NULL;
+ e->ptype = 0;
e->refcount = 0;
+ e->rcu_pending = 0;
hlist_add_head(&e->hlist, head);
- return 0;
+ return e;
}
/*
* Remove the marker from the marker hash table. Must be called with mutex_lock
* held.
*/
-static void *remove_marker(const char *name)
+static int remove_marker(const char *name)
{
struct hlist_head *head;
struct hlist_node *node;
struct marker_entry *e;
int found = 0;
size_t len = strlen(name) + 1;
- void *private = NULL;
u32 hash = jhash(name, len-1, 0);
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
@@ -167,12 +435,16 @@ static void *remove_marker(const char *name)
break;
}
}
- if (found) {
- private = e->private;
- hlist_del(&e->hlist);
- kfree(e);
- }
- return private;
+ if (!found)
+ return -ENOENT;
+ if (e->single.func != __mark_empty_function)
+ return -EBUSY;
+ hlist_del(&e->hlist);
+ /* Make sure the call_rcu has been executed */
+ if (e->rcu_pending)
+ rcu_barrier();
+ kfree(e);
+ return 0;
}
/*
@@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
size_t name_len = strlen((*entry)->name) + 1;
size_t format_len = strlen(format) + 1;
+
e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
GFP_KERNEL);
if (!e)
@@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
memcpy(&e->name[0], (*entry)->name, name_len);
e->format = &e->name[name_len];
memcpy(e->format, format, format_len);
- e->probe = (*entry)->probe;
- e->private = (*entry)->private;
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
+ e->single = (*entry)->single;
+ e->multi = (*entry)->multi;
+ e->ptype = (*entry)->ptype;
e->refcount = (*entry)->refcount;
+ e->rcu_pending = 0;
hlist_add_before(&e->hlist, &(*entry)->hlist);
hlist_del(&(*entry)->hlist);
+ /* Make sure the call_rcu has been executed */
+ if ((*entry)->rcu_pending)
+ rcu_barrier();
kfree(*entry);
*entry = e;
trace_mark(core_marker_format, "name %s format %s",
@@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
/*
* Sets the probe callback corresponding to one marker.
*/
-static int set_marker(struct marker_entry **entry, struct marker *elem)
+static int set_marker(struct marker_entry **entry, struct marker *elem,
+ int active)
{
int ret;
WARN_ON(strcmp((*entry)->name, elem->name) != 0);
@@ -226,9 +509,43 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
if (ret)
return ret;
}
- elem->call = (*entry)->probe;
- elem->private = (*entry)->private;
- elem->state = 1;
+
+ /*
+ * probe_cb setup (statically known) is done here. It is
+ * asynchronous with the rest of execution, therefore we only
+ * pass from a "safe" callback (with argument) to an "unsafe"
+ * callback (does not set arguments).
+ */
+ elem->call = (*entry)->call;
+ /*
+ * Sanity check :
+ * We only update the single probe private data when the ptr is
+ * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
+ */
+ WARN_ON(elem->single.func != __mark_empty_function
+ && elem->single.probe_private
+ != (*entry)->single.probe_private &&
+ !elem->ptype);
+ elem->single.probe_private = (*entry)->single.probe_private;
+ /*
+ * Make sure the private data is valid when we update the
+ * single probe ptr.
+ */
+ smp_wmb();
+ elem->single.func = (*entry)->single.func;
+ /*
+ * We also make sure that the new probe callbacks array is consistent
+ * before setting a pointer to it.
+ */
+ rcu_assign_pointer(elem->multi, (*entry)->multi);
+ /*
+ * Update the function or multi probe array pointer before setting the
+ * ptype.
+ */
+ smp_wmb();
+ elem->ptype = (*entry)->ptype;
+ elem->state = active;
+
return 0;
}
@@ -240,8 +557,12 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
*/
static void disable_marker(struct marker *elem)
{
+ /* leave "call" as is. It is known statically. */
elem->state = 0;
- elem->call = __mark_empty_function;
+ elem->single.func = __mark_empty_function;
+ /* Update the function before setting the ptype */
+ smp_wmb();
+ elem->ptype = 0; /* single probe */
/*
* Leave the private data and id there, because removal is racy and
* should be done only after a synchronize_sched(). These are never used
@@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem)
* marker_update_probe_range - Update a probe range
* @begin: beginning of the range
* @end: end of the range
- * @probe_module: module address of the probe being updated
- * @refcount: number of references left to the given probe_module (out)
*
* Updates the probe callback corresponding to a range of markers.
*/
void marker_update_probe_range(struct marker *begin,
- struct marker *end, struct module *probe_module,
- int *refcount)
+ struct marker *end)
{
struct marker *iter;
struct marker_entry *mark_entry;
@@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin,
mutex_lock(&markers_mutex);
for (iter = begin; iter < end; iter++) {
mark_entry = get_marker(iter->name);
- if (mark_entry && mark_entry->refcount) {
- set_marker(&mark_entry, iter);
+ if (mark_entry) {
+ set_marker(&mark_entry, iter,
+ !!mark_entry->refcount);
/*
* ignore error, continue
*/
- if (probe_module)
- if (probe_module ==
- __module_text_address((unsigned long)mark_entry->probe))
- (*refcount)++;
} else {
disable_marker(iter);
}
@@ -289,20 +604,27 @@ void marker_update_probe_range(struct marker *begin,
* Issues a synchronize_sched() when no reference to the module passed
* as parameter is found in the probes so the probe module can be
* safely unloaded from now on.
+ *
+ * Internal callback only changed before the first probe is connected to it.
+ * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
+ * transitions. All other transitions will leave the old private data valid.
+ * This makes the non-atomicity of the callback/private data updates valid.
+ *
+ * "special case" updates :
+ * 0 -> 1 callback
+ * 1 -> 0 callback
+ * 1 -> 2 callbacks
+ * 2 -> 1 callbacks
+ * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
+ * Site effect : marker_set_format may delete the marker entry (creating a
+ * replacement).
*/
-static void marker_update_probes(struct module *probe_module)
+static void marker_update_probes(void)
{
- int refcount = 0;
-
/* Core kernel markers */
- marker_update_probe_range(__start___markers,
- __stop___markers, probe_module, &refcount);
+ marker_update_probe_range(__start___markers, __stop___markers);
/* Markers in modules. */
- module_update_markers(probe_module, &refcount);
- if (probe_module && refcount == 0) {
- synchronize_sched();
- deferred_sync = 0;
- }
+ module_update_markers();
}
/**
@@ -310,33 +632,49 @@ static void marker_update_probes(struct module *probe_module)
* @name: marker name
* @format: format string
* @probe: probe handler
- * @private: probe private data
+ * @probe_private: probe private data
*
* private data must be a valid allocated memory address, or NULL.
* Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
*/
int marker_probe_register(const char *name, const char *format,
- marker_probe_func *probe, void *private)
+ marker_probe_func *probe, void *probe_private)
{
struct marker_entry *entry;
int ret = 0;
+ struct marker_probe_closure *old;
mutex_lock(&markers_mutex);
entry = get_marker(name);
- if (entry && entry->refcount) {
- ret = -EBUSY;
- goto end;
- }
- if (deferred_sync) {
- synchronize_sched();
- deferred_sync = 0;
+ if (!entry) {
+ entry = add_marker(name, format);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ goto end;
+ }
}
- ret = add_marker(name, format, probe, private);
- if (ret)
+ /*
+ * If we detect that a call_rcu is pending for this marker,
+ * make sure it's executed now.
+ */
+ if (entry->rcu_pending)
+ rcu_barrier();
+ old = marker_entry_add_probe(entry, probe, probe_private);
+ if (IS_ERR(old)) {
+ ret = PTR_ERR(old);
goto end;
+ }
mutex_unlock(&markers_mutex);
- marker_update_probes(NULL);
- return ret;
+ marker_update_probes(); /* may update entry */
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ WARN_ON(!entry);
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu(&entry->rcu, free_old_closure);
end:
mutex_unlock(&markers_mutex);
return ret;
@@ -346,171 +684,166 @@ EXPORT_SYMBOL_GPL(marker_probe_register);
/**
* marker_probe_unregister - Disconnect a probe from a marker
* @name: marker name
+ * @probe: probe function pointer
+ * @probe_private: probe private data
*
* Returns the private data given to marker_probe_register, or an ERR_PTR().
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which insures that every preempt disabled section
+ * have finished.
*/
-void *marker_probe_unregister(const char *name)
+int marker_probe_unregister(const char *name,
+ marker_probe_func *probe, void *probe_private)
{
- struct module *probe_module;
struct marker_entry *entry;
- void *private;
+ struct marker_probe_closure *old;
+ int ret = 0;
mutex_lock(&markers_mutex);
entry = get_marker(name);
if (!entry) {
- private = ERR_PTR(-ENOENT);
+ ret = -ENOENT;
goto end;
}
- entry->refcount = 0;
- /* In what module is the probe handler ? */
- probe_module = __module_text_address((unsigned long)entry->probe);
- private = remove_marker(name);
- deferred_sync = 1;
+ if (entry->rcu_pending)
+ rcu_barrier();
+ old = marker_entry_remove_probe(entry, probe, probe_private);
mutex_unlock(&markers_mutex);
- marker_update_probes(probe_module);
- return private;
+ marker_update_probes(); /* may update entry */
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu(&entry->rcu, free_old_closure);
+ remove_marker(name); /* Ignore busy error message */
end:
mutex_unlock(&markers_mutex);
- return private;
+ return ret;
}
EXPORT_SYMBOL_GPL(marker_probe_unregister);
-/**
- * marker_probe_unregister_private_data - Disconnect a probe from a marker
- * @private: probe private data
- *
- * Unregister a marker by providing the registered private data.
- * Returns the private data given to marker_probe_register, or an ERR_PTR().
- */
-void *marker_probe_unregister_private_data(void *private)
+static struct marker_entry *
+get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
{
- struct module *probe_module;
- struct hlist_head *head;
- struct hlist_node *node;
struct marker_entry *entry;
- int found = 0;
unsigned int i;
+ struct hlist_head *head;
+ struct hlist_node *node;
- mutex_lock(&markers_mutex);
for (i = 0; i < MARKER_TABLE_SIZE; i++) {
head = &marker_table[i];
hlist_for_each_entry(entry, node, head, hlist) {
- if (entry->private == private) {
- found = 1;
- goto iter_end;
+ if (!entry->ptype) {
+ if (entry->single.func == probe
+ && entry->single.probe_private
+ == probe_private)
+ return entry;
+ } else {
+ struct marker_probe_closure *closure;
+ closure = entry->multi;
+ for (i = 0; closure[i].func; i++) {
+ if (closure[i].func == probe &&
+ closure[i].probe_private
+ == probe_private)
+ return entry;
+ }
}
}
}
-iter_end:
- if (!found) {
- private = ERR_PTR(-ENOENT);
- goto end;
- }
- entry->refcount = 0;
- /* In what module is the probe handler ? */
- probe_module = __module_text_address((unsigned long)entry->probe);
- private = remove_marker(entry->name);
- deferred_sync = 1;
- mutex_unlock(&markers_mutex);
- marker_update_probes(probe_module);
- return private;
-end:
- mutex_unlock(&markers_mutex);
- return private;
+ return NULL;
}
-EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
/**
- * marker_arm - Arm a marker
- * @name: marker name
+ * marker_probe_unregister_private_data - Disconnect a probe from a marker
+ * @probe: probe function
+ * @probe_private: probe private data
*
- * Activate a marker. It keeps a reference count of the number of
- * arming/disarming done.
- * Returns 0 if ok, error value on error.
+ * Unregister a probe by providing the registered private data.
+ * Only removes the first marker found in hash table.
+ * Return 0 on success or error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which insures that every preempt disabled section
+ * have finished.
*/
-int marker_arm(const char *name)
+int marker_probe_unregister_private_data(marker_probe_func *probe,
+ void *probe_private)
{
struct marker_entry *entry;
int ret = 0;
+ struct marker_probe_closure *old;
mutex_lock(&markers_mutex);
- entry = get_marker(name);
+ entry = get_marker_from_private_data(probe, probe_private);
if (!entry) {
ret = -ENOENT;
goto end;
}
- /*
- * Only need to update probes when refcount passes from 0 to 1.
- */
- if (entry->refcount++)
- goto end;
-end:
+ if (entry->rcu_pending)
+ rcu_barrier();
+ old = marker_entry_remove_probe(entry, NULL, probe_private);
mutex_unlock(&markers_mutex);
- marker_update_probes(NULL);
- return ret;
-}
-EXPORT_SYMBOL_GPL(marker_arm);
-
-/**
- * marker_disarm - Disarm a marker
- * @name: marker name
- *
- * Disarm a marker. It keeps a reference count of the number of arming/disarming
- * done.
- * Returns 0 if ok, error value on error.
- */
-int marker_disarm(const char *name)
-{
- struct marker_entry *entry;
- int ret = 0;
-
+ marker_update_probes(); /* may update entry */
mutex_lock(&markers_mutex);
- entry = get_marker(name);
- if (!entry) {
- ret = -ENOENT;
- goto end;
- }
- /*
- * Only permit decrement refcount if higher than 0.
- * Do probe update only on 1 -> 0 transition.
- */
- if (entry->refcount) {
- if (--entry->refcount)
- goto end;
- } else {
- ret = -EPERM;
- goto end;
- }
+ entry = get_marker_from_private_data(probe, probe_private);
+ WARN_ON(!entry);
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+ call_rcu(&entry->rcu, free_old_closure);
+ remove_marker(entry->name); /* Ignore busy error message */
end:
mutex_unlock(&markers_mutex);
- marker_update_probes(NULL);
return ret;
}
-EXPORT_SYMBOL_GPL(marker_disarm);
+EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
/**
* marker_get_private_data - Get a marker's probe private data
* @name: marker name
+ * @probe: probe to match
+ * @num: get the nth matching probe's private data
*
+ * Returns the nth private data pointer (starting from 0) matching, or an
+ * ERR_PTR.
* Returns the private data pointer, or an ERR_PTR.
* The private data pointer should _only_ be dereferenced if the caller is the
* owner of the data, or its content could vanish. This is mostly used to
* confirm that a caller is the owner of a registered probe.
*/
-void *marker_get_private_data(const char *name)
+void *marker_get_private_data(const char *name, marker_probe_func *probe,
+ int num)
{
struct hlist_head *head;
struct hlist_node *node;
struct marker_entry *e;
size_t name_len = strlen(name) + 1;
u32 hash = jhash(name, name_len-1, 0);
- int found = 0;
+ int i;
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
hlist_for_each_entry(e, node, head, hlist) {
if (!strcmp(name, e->name)) {
- found = 1;
- return e->private;
+ if (!e->ptype) {
+ if (num == 0 && e->single.func == probe)
+ return e->single.probe_private;
+ else
+ break;
+ } else {
+ struct marker_probe_closure *closure;
+ int match = 0;
+ closure = e->multi;
+ for (i = 0; closure[i].func; i++) {
+ if (closure[i].func != probe)
+ continue;
+ if (match++ == num)
+ return closure[i].probe_private;
+ }
+ }
}
}
return ERR_PTR(-ENOENT);
diff --git a/kernel/module.c b/kernel/module.c
index bd60278ee70..92595bad381 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -46,6 +46,7 @@
#include <asm/semaphore.h>
#include <asm/cacheflush.h>
#include <linux/license.h>
+#include <asm/sections.h>
#if 0
#define DEBUGP printk
@@ -290,7 +291,7 @@ static unsigned long __find_symbol(const char *name,
}
}
DEBUGP("Failed to find symbol %s\n", name);
- return 0;
+ return -ENOENT;
}
/* Search for module by name: must hold module_mutex. */
@@ -343,9 +344,6 @@ static inline unsigned int block_size(int val)
return val;
}
-/* Created by linker magic */
-extern char __per_cpu_start[], __per_cpu_end[];
-
static void *percpu_modalloc(unsigned long size, unsigned long align,
const char *name)
{
@@ -783,7 +781,7 @@ void __symbol_put(const char *symbol)
const unsigned long *crc;
preempt_disable();
- if (!__find_symbol(symbol, &owner, &crc, 1))
+ if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1)))
BUG();
module_put(owner);
preempt_enable();
@@ -929,7 +927,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
const unsigned long *crc;
struct module *owner;
- if (!__find_symbol("struct_module", &owner, &crc, 1))
+ if (IS_ERR_VALUE(__find_symbol("struct_module",
+ &owner, &crc, 1)))
BUG();
return check_version(sechdrs, versindex, "struct_module", mod,
crc);
@@ -978,12 +977,12 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
ret = __find_symbol(name, &owner, &crc,
!(mod->taints & TAINT_PROPRIETARY_MODULE));
- if (ret) {
+ if (!IS_ERR_VALUE(ret)) {
/* use_module can fail due to OOM,
or module initialization or unloading */
if (!check_version(sechdrs, versindex, name, mod, crc) ||
!use_module(mod, owner))
- ret = 0;
+ ret = -EINVAL;
}
return ret;
}
@@ -1371,7 +1370,9 @@ void *__symbol_get(const char *symbol)
preempt_disable();
value = __find_symbol(symbol, &owner, &crc, 1);
- if (value && strong_try_module_get(owner) != 0)
+ if (IS_ERR_VALUE(value))
+ value = 0;
+ else if (strong_try_module_get(owner))
value = 0;
preempt_enable();
@@ -1391,14 +1392,16 @@ static int verify_export_symbols(struct module *mod)
const unsigned long *crc;
for (i = 0; i < mod->num_syms; i++)
- if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) {
+ if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name,
+ &owner, &crc, 1))) {
name = mod->syms[i].name;
ret = -ENOEXEC;
goto dup;
}
for (i = 0; i < mod->num_gpl_syms; i++)
- if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) {
+ if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name,
+ &owner, &crc, 1))) {
name = mod->gpl_syms[i].name;
ret = -ENOEXEC;
goto dup;
@@ -1448,7 +1451,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
strtab + sym[i].st_name, mod);
/* Ok if resolved. */
- if (sym[i].st_value != 0)
+ if (!IS_ERR_VALUE(sym[i].st_value))
break;
/* Ok if weak. */
if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
@@ -2035,7 +2038,7 @@ static struct module *load_module(void __user *umod,
#ifdef CONFIG_MARKERS
if (!mod->taints)
marker_update_probe_range(mod->markers,
- mod->markers + mod->num_markers, NULL, NULL);
+ mod->markers + mod->num_markers);
#endif
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
@@ -2250,7 +2253,7 @@ static const char *get_ksymbol(struct module *mod,
/* For kallsyms to ask for address resolution. NULL means not found. Careful
* not to lock to avoid deadlock on oopses, simply disable preemption. */
-char *module_address_lookup(unsigned long addr,
+const char *module_address_lookup(unsigned long addr,
unsigned long *size,
unsigned long *offset,
char **modname,
@@ -2275,7 +2278,7 @@ char *module_address_lookup(unsigned long addr,
ret = namebuf;
}
preempt_enable();
- return (char *)ret;
+ return ret;
}
int lookup_module_symbol_name(unsigned long addr, char *symname)
@@ -2561,7 +2564,7 @@ EXPORT_SYMBOL(struct_module);
#endif
#ifdef CONFIG_MARKERS
-void module_update_markers(struct module *probe_module, int *refcount)
+void module_update_markers(void)
{
struct module *mod;
@@ -2569,8 +2572,7 @@ void module_update_markers(struct module *probe_module, int *refcount)
list_for_each_entry(mod, &modules, list)
if (!mod->taints)
marker_update_probe_range(mod->markers,
- mod->markers + mod->num_markers,
- probe_module, refcount);
+ mod->markers + mod->num_markers);
mutex_unlock(&module_mutex);
}
#endif
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index d17436cdea1..3aaa06c561d 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -107,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name,
* use of the mutex is forbidden. The mutex must not be locked when
* this function is called.
*/
-void fastcall mutex_destroy(struct mutex *lock)
+void mutex_destroy(struct mutex *lock)
{
DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
lock->magic = NULL;
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d9ec9b66625..d046a345d36 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -58,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init);
* We also put the fastpath first in the kernel image, to make sure the
* branch is predicted by the CPU as default-untaken.
*/
-static void fastcall noinline __sched
+static void noinline __sched
__mutex_lock_slowpath(atomic_t *lock_count);
/***
@@ -82,7 +82,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
*
* This function is similar to (but not equivalent to) down().
*/
-void inline fastcall __sched mutex_lock(struct mutex *lock)
+void inline __sched mutex_lock(struct mutex *lock)
{
might_sleep();
/*
@@ -95,8 +95,7 @@ void inline fastcall __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
-static void fastcall noinline __sched
-__mutex_unlock_slowpath(atomic_t *lock_count);
+static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
/***
* mutex_unlock - release the mutex
@@ -109,7 +108,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count);
*
* This function is similar to (but not equivalent to) up().
*/
-void fastcall __sched mutex_unlock(struct mutex *lock)
+void __sched mutex_unlock(struct mutex *lock)
{
/*
* The unlocking fastpath is the 0->1 transition from 'locked'
@@ -234,7 +233,7 @@ EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
/*
* Release the lock, slowpath:
*/
-static fastcall inline void
+static inline void
__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -271,7 +270,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
/*
* Release the lock, slowpath:
*/
-static fastcall noinline void
+static noinline void
__mutex_unlock_slowpath(atomic_t *lock_count)
{
__mutex_unlock_common_slowpath(lock_count, 1);
@@ -282,10 +281,10 @@ __mutex_unlock_slowpath(atomic_t *lock_count)
* Here come the less common (and hence less performance-critical) APIs:
* mutex_lock_interruptible() and mutex_trylock().
*/
-static int fastcall noinline __sched
+static noinline int __sched
__mutex_lock_killable_slowpath(atomic_t *lock_count);
-static noinline int fastcall __sched
+static noinline int __sched
__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
/***
@@ -299,7 +298,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
*
* This function is similar to (but not equivalent to) down_interruptible().
*/
-int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
+int __sched mutex_lock_interruptible(struct mutex *lock)
{
might_sleep();
return __mutex_fastpath_lock_retval
@@ -308,7 +307,7 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock_interruptible);
-int fastcall __sched mutex_lock_killable(struct mutex *lock)
+int __sched mutex_lock_killable(struct mutex *lock)
{
might_sleep();
return __mutex_fastpath_lock_retval
@@ -316,7 +315,7 @@ int fastcall __sched mutex_lock_killable(struct mutex *lock)
}
EXPORT_SYMBOL(mutex_lock_killable);
-static void fastcall noinline __sched
+static noinline void __sched
__mutex_lock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -324,7 +323,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
}
-static int fastcall noinline __sched
+static noinline int __sched
__mutex_lock_killable_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -332,7 +331,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_);
}
-static noinline int fastcall __sched
+static noinline int __sched
__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
@@ -381,7 +380,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
* This function must not be used in interrupt context. The
* mutex must be released by the same task that acquired it.
*/
-int fastcall __sched mutex_trylock(struct mutex *lock)
+int __sched mutex_trylock(struct mutex *lock)
{
return __mutex_fastpath_trylock(&lock->count,
__mutex_trylock_slowpath);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4253f472f06..643360d1bb1 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -4,6 +4,7 @@
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
+#include <linux/reboot.h>
/*
* Notifier list for kernel code which wants to be called
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 79f871bc0ef..f5d332cf8c6 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -21,6 +21,7 @@
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
+#include <linux/ipc_namespace.h>
static struct kmem_cache *nsproxy_cachep;
diff --git a/kernel/panic.c b/kernel/panic.c
index d9e90cfe329..24af9f8bac9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -161,7 +161,7 @@ const char *print_tainted(void)
{
static char buf[20];
if (tainted) {
- snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c",
+ snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c",
tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -169,7 +169,8 @@ const char *print_tainted(void)
tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
tainted & TAINT_BAD_PAGE ? 'B' : ' ',
tainted & TAINT_USER ? 'U' : ' ',
- tainted & TAINT_DIE ? 'D' : ' ');
+ tainted & TAINT_DIE ? 'D' : ' ',
+ tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ');
}
else
snprintf(buf, sizeof(buf), "Not tainted");
diff --git a/kernel/params.c b/kernel/params.c
index 42fe5e6126c..afc46a23eb6 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -180,12 +180,12 @@ int parse_args(const char *name,
#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \
int param_set_##name(const char *val, struct kernel_param *kp) \
{ \
- char *endp; \
tmptype l; \
+ int ret; \
\
if (!val) return -EINVAL; \
- l = strtolfn(val, &endp, 0); \
- if (endp == val || ((type)l != l)) \
+ ret = strtolfn(val, 0, &l); \
+ if (ret == -EINVAL || ((type)l != l)) \
return -EINVAL; \
*((type *)kp->arg) = l; \
return 0; \
@@ -195,13 +195,13 @@ int parse_args(const char *name,
return sprintf(buffer, format, *((type *)kp->arg)); \
}
-STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, simple_strtoul);
-STANDARD_PARAM_DEF(short, short, "%hi", long, simple_strtol);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, simple_strtoul);
-STANDARD_PARAM_DEF(int, int, "%i", long, simple_strtol);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, simple_strtoul);
-STANDARD_PARAM_DEF(long, long, "%li", long, simple_strtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, simple_strtoul);
+STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
int param_set_charp(const char *val, struct kernel_param *kp)
{
@@ -272,7 +272,7 @@ static int param_array(const char *name,
unsigned int min, unsigned int max,
void *elem, int elemsize,
int (*set)(const char *, struct kernel_param *kp),
- int *num)
+ unsigned int *num)
{
int ret;
struct kernel_param kp;
diff --git a/kernel/pid.c b/kernel/pid.c
index f815455431b..477691576b3 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,7 +41,6 @@
static struct hlist_head *pid_hash;
static int pidhash_shift;
struct pid init_struct_pid = INIT_STRUCT_PID;
-static struct kmem_cache *pid_ns_cachep;
int pid_max = PID_MAX_DEFAULT;
@@ -112,7 +111,7 @@ EXPORT_SYMBOL(is_container_init);
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
+static void free_pidmap(struct pid_namespace *pid_ns, int pid)
{
struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
int offset = pid & BITS_PER_PAGE_MASK;
@@ -181,7 +180,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
return -1;
}
-static int next_pidmap(struct pid_namespace *pid_ns, int last)
+int next_pidmap(struct pid_namespace *pid_ns, int last)
{
int offset;
struct pidmap *map, *end;
@@ -199,7 +198,7 @@ static int next_pidmap(struct pid_namespace *pid_ns, int last)
return -1;
}
-fastcall void put_pid(struct pid *pid)
+void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
@@ -221,7 +220,7 @@ static void delayed_put_pid(struct rcu_head *rhp)
put_pid(pid);
}
-fastcall void free_pid(struct pid *pid)
+void free_pid(struct pid *pid)
{
/* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
@@ -287,7 +286,7 @@ out_free:
goto out;
}
-struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns)
+struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
struct hlist_node *elem;
struct upid *pnr;
@@ -317,7 +316,7 @@ EXPORT_SYMBOL_GPL(find_pid);
/*
* attach_pid() must be called with the tasklist_lock write-held.
*/
-int fastcall attach_pid(struct task_struct *task, enum pid_type type,
+int attach_pid(struct task_struct *task, enum pid_type type,
struct pid *pid)
{
struct pid_link *link;
@@ -329,7 +328,7 @@ int fastcall attach_pid(struct task_struct *task, enum pid_type type,
return 0;
}
-void fastcall detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct task_struct *task, enum pid_type type)
{
struct pid_link *link;
struct pid *pid;
@@ -349,7 +348,7 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type)
}
/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
-void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
+void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
new->pids[type].pid = old->pids[type].pid;
@@ -357,7 +356,7 @@ void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
old->pids[type].pid = NULL;
}
-struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
+struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result = NULL;
if (pid) {
@@ -368,6 +367,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
}
return result;
}
+EXPORT_SYMBOL(pid_task);
/*
* Must be called under rcu_read_lock() or with tasklist_lock read-held.
@@ -408,7 +408,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
return pid;
}
-struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
+struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
struct task_struct *result;
rcu_read_lock();
@@ -443,6 +443,12 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
return nr;
}
+pid_t pid_vnr(struct pid *pid)
+{
+ return pid_nr_ns(pid, current->nsproxy->pid_ns);
+}
+EXPORT_SYMBOL_GPL(pid_vnr);
+
pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return pid_nr_ns(task_pid(tsk), ns);
@@ -487,180 +493,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
}
EXPORT_SYMBOL_GPL(find_get_pid);
-struct pid_cache {
- int nr_ids;
- char name[16];
- struct kmem_cache *cachep;
- struct list_head list;
-};
-
-static LIST_HEAD(pid_caches_lh);
-static DEFINE_MUTEX(pid_caches_mutex);
-
-/*
- * creates the kmem cache to allocate pids from.
- * @nr_ids: the number of numerical ids this pid will have to carry
- */
-
-static struct kmem_cache *create_pid_cachep(int nr_ids)
-{
- struct pid_cache *pcache;
- struct kmem_cache *cachep;
-
- mutex_lock(&pid_caches_mutex);
- list_for_each_entry (pcache, &pid_caches_lh, list)
- if (pcache->nr_ids == nr_ids)
- goto out;
-
- pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
- if (pcache == NULL)
- goto err_alloc;
-
- snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
- cachep = kmem_cache_create(pcache->name,
- sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (cachep == NULL)
- goto err_cachep;
-
- pcache->nr_ids = nr_ids;
- pcache->cachep = cachep;
- list_add(&pcache->list, &pid_caches_lh);
-out:
- mutex_unlock(&pid_caches_mutex);
- return pcache->cachep;
-
-err_cachep:
- kfree(pcache);
-err_alloc:
- mutex_unlock(&pid_caches_mutex);
- return NULL;
-}
-
-#ifdef CONFIG_PID_NS
-static struct pid_namespace *create_pid_namespace(int level)
-{
- struct pid_namespace *ns;
- int i;
-
- ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
- if (ns == NULL)
- goto out;
-
- ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!ns->pidmap[0].page)
- goto out_free;
-
- ns->pid_cachep = create_pid_cachep(level + 1);
- if (ns->pid_cachep == NULL)
- goto out_free_map;
-
- kref_init(&ns->kref);
- ns->last_pid = 0;
- ns->child_reaper = NULL;
- ns->level = level;
-
- set_bit(0, ns->pidmap[0].page);
- atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
- for (i = 1; i < PIDMAP_ENTRIES; i++) {
- ns->pidmap[i].page = 0;
- atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
- }
-
- return ns;
-
-out_free_map:
- kfree(ns->pidmap[0].page);
-out_free:
- kmem_cache_free(pid_ns_cachep, ns);
-out:
- return ERR_PTR(-ENOMEM);
-}
-
-static void destroy_pid_namespace(struct pid_namespace *ns)
-{
- int i;
-
- for (i = 0; i < PIDMAP_ENTRIES; i++)
- kfree(ns->pidmap[i].page);
- kmem_cache_free(pid_ns_cachep, ns);
-}
-
-struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
-{
- struct pid_namespace *new_ns;
-
- BUG_ON(!old_ns);
- new_ns = get_pid_ns(old_ns);
- if (!(flags & CLONE_NEWPID))
- goto out;
-
- new_ns = ERR_PTR(-EINVAL);
- if (flags & CLONE_THREAD)
- goto out_put;
-
- new_ns = create_pid_namespace(old_ns->level + 1);
- if (!IS_ERR(new_ns))
- new_ns->parent = get_pid_ns(old_ns);
-
-out_put:
- put_pid_ns(old_ns);
-out:
- return new_ns;
-}
-
-void free_pid_ns(struct kref *kref)
-{
- struct pid_namespace *ns, *parent;
-
- ns = container_of(kref, struct pid_namespace, kref);
-
- parent = ns->parent;
- destroy_pid_namespace(ns);
-
- if (parent != NULL)
- put_pid_ns(parent);
-}
-#endif /* CONFIG_PID_NS */
-
-void zap_pid_ns_processes(struct pid_namespace *pid_ns)
-{
- int nr;
- int rc;
-
- /*
- * The last thread in the cgroup-init thread group is terminating.
- * Find remaining pid_ts in the namespace, signal and wait for them
- * to exit.
- *
- * Note: This signals each threads in the namespace - even those that
- * belong to the same thread group, To avoid this, we would have
- * to walk the entire tasklist looking a processes in this
- * namespace, but that could be unnecessarily expensive if the
- * pid namespace has just a few processes. Or we need to
- * maintain a tasklist for each pid namespace.
- *
- */
- read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
- kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
- nr = next_pidmap(pid_ns, nr);
- }
- read_unlock(&tasklist_lock);
-
- do {
- clear_thread_flag(TIF_SIGPENDING);
- rc = sys_wait4(-1, NULL, __WALL, NULL);
- } while (rc != -ECHILD);
-
-
- /* Child reaper for the pid namespace is going away */
- pid_ns->child_reaper = NULL;
- return;
-}
-
/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -693,9 +525,6 @@ void __init pidmap_init(void)
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
- init_pid_ns.pid_cachep = create_pid_cachep(1);
- if (init_pid_ns.pid_cachep == NULL)
- panic("Can't create pid_1 cachep\n");
-
- pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ init_pid_ns.pid_cachep = KMEM_CACHE(pid,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC);
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
new file mode 100644
index 00000000000..6d792b66d85
--- /dev/null
+++ b/kernel/pid_namespace.c
@@ -0,0 +1,197 @@
+/*
+ * Pid namespaces
+ *
+ * Authors:
+ * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
+ * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
+ * Many thanks to Oleg Nesterov for comments and help
+ *
+ */
+
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
+#include <linux/syscalls.h>
+#include <linux/err.h>
+
+#define BITS_PER_PAGE (PAGE_SIZE*8)
+
+struct pid_cache {
+ int nr_ids;
+ char name[16];
+ struct kmem_cache *cachep;
+ struct list_head list;
+};
+
+static LIST_HEAD(pid_caches_lh);
+static DEFINE_MUTEX(pid_caches_mutex);
+static struct kmem_cache *pid_ns_cachep;
+
+/*
+ * creates the kmem cache to allocate pids from.
+ * @nr_ids: the number of numerical ids this pid will have to carry
+ */
+
+static struct kmem_cache *create_pid_cachep(int nr_ids)
+{
+ struct pid_cache *pcache;
+ struct kmem_cache *cachep;
+
+ mutex_lock(&pid_caches_mutex);
+ list_for_each_entry(pcache, &pid_caches_lh, list)
+ if (pcache->nr_ids == nr_ids)
+ goto out;
+
+ pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
+ if (pcache == NULL)
+ goto err_alloc;
+
+ snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
+ cachep = kmem_cache_create(pcache->name,
+ sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (cachep == NULL)
+ goto err_cachep;
+
+ pcache->nr_ids = nr_ids;
+ pcache->cachep = cachep;
+ list_add(&pcache->list, &pid_caches_lh);
+out:
+ mutex_unlock(&pid_caches_mutex);
+ return pcache->cachep;
+
+err_cachep:
+ kfree(pcache);
+err_alloc:
+ mutex_unlock(&pid_caches_mutex);
+ return NULL;
+}
+
+static struct pid_namespace *create_pid_namespace(int level)
+{
+ struct pid_namespace *ns;
+ int i;
+
+ ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL);
+ if (ns == NULL)
+ goto out;
+
+ ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!ns->pidmap[0].page)
+ goto out_free;
+
+ ns->pid_cachep = create_pid_cachep(level + 1);
+ if (ns->pid_cachep == NULL)
+ goto out_free_map;
+
+ kref_init(&ns->kref);
+ ns->last_pid = 0;
+ ns->child_reaper = NULL;
+ ns->level = level;
+
+ set_bit(0, ns->pidmap[0].page);
+ atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
+
+ for (i = 1; i < PIDMAP_ENTRIES; i++) {
+ ns->pidmap[i].page = 0;
+ atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
+ }
+
+ return ns;
+
+out_free_map:
+ kfree(ns->pidmap[0].page);
+out_free:
+ kmem_cache_free(pid_ns_cachep, ns);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+static void destroy_pid_namespace(struct pid_namespace *ns)
+{
+ int i;
+
+ for (i = 0; i < PIDMAP_ENTRIES; i++)
+ kfree(ns->pidmap[i].page);
+ kmem_cache_free(pid_ns_cachep, ns);
+}
+
+struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
+{
+ struct pid_namespace *new_ns;
+
+ BUG_ON(!old_ns);
+ new_ns = get_pid_ns(old_ns);
+ if (!(flags & CLONE_NEWPID))
+ goto out;
+
+ new_ns = ERR_PTR(-EINVAL);
+ if (flags & CLONE_THREAD)
+ goto out_put;
+
+ new_ns = create_pid_namespace(old_ns->level + 1);
+ if (!IS_ERR(new_ns))
+ new_ns->parent = get_pid_ns(old_ns);
+
+out_put:
+ put_pid_ns(old_ns);
+out:
+ return new_ns;
+}
+
+void free_pid_ns(struct kref *kref)
+{
+ struct pid_namespace *ns, *parent;
+
+ ns = container_of(kref, struct pid_namespace, kref);
+
+ parent = ns->parent;
+ destroy_pid_namespace(ns);
+
+ if (parent != NULL)
+ put_pid_ns(parent);
+}
+
+void zap_pid_ns_processes(struct pid_namespace *pid_ns)
+{
+ int nr;
+ int rc;
+
+ /*
+ * The last thread in the cgroup-init thread group is terminating.
+ * Find remaining pid_ts in the namespace, signal and wait for them
+ * to exit.
+ *
+ * Note: This signals each threads in the namespace - even those that
+ * belong to the same thread group, To avoid this, we would have
+ * to walk the entire tasklist looking a processes in this
+ * namespace, but that could be unnecessarily expensive if the
+ * pid namespace has just a few processes. Or we need to
+ * maintain a tasklist for each pid namespace.
+ *
+ */
+ read_lock(&tasklist_lock);
+ nr = next_pidmap(pid_ns, 1);
+ while (nr > 0) {
+ kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
+ nr = next_pidmap(pid_ns, nr);
+ }
+ read_unlock(&tasklist_lock);
+
+ do {
+ clear_thread_flag(TIF_SIGPENDING);
+ rc = sys_wait4(-1, NULL, __WALL, NULL);
+ } while (rc != -ECHILD);
+
+
+ /* Child reaper for the pid namespace is going away */
+ pid_ns->child_reaper = NULL;
+ return;
+}
+
+static __init int pid_namespaces_init(void)
+{
+ pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ return 0;
+}
+
+__initcall(pid_namespaces_init);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 0b7c82ac467..2eae91f954c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -20,7 +20,7 @@ static int check_clock(const clockid_t which_clock)
return 0;
read_lock(&tasklist_lock);
- p = find_task_by_pid(pid);
+ p = find_task_by_vpid(pid);
if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
same_thread_group(p, current) : thread_group_leader(p))) {
error = -EINVAL;
@@ -305,7 +305,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
*/
struct task_struct *p;
rcu_read_lock();
- p = find_task_by_pid(pid);
+ p = find_task_by_vpid(pid);
if (p) {
if (CPUCLOCK_PERTHREAD(which_clock)) {
if (same_thread_group(p, current)) {
@@ -354,7 +354,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
if (pid == 0) {
p = current;
} else {
- p = find_task_by_pid(pid);
+ p = find_task_by_vpid(pid);
if (p && !same_thread_group(p, current))
p = NULL;
}
@@ -362,7 +362,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
if (pid == 0) {
p = current->group_leader;
} else {
- p = find_task_by_pid(pid);
+ p = find_task_by_vpid(pid);
if (p && !thread_group_leader(p))
p = NULL;
}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 122d5c787fe..a9b04203a66 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -404,7 +404,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
struct task_struct *rtn = current->group_leader;
if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
- (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
+ (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
!same_thread_group(rtn, current) ||
(event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
return NULL;
@@ -767,9 +767,11 @@ common_timer_set(struct k_itimer *timr, int flags,
/* SIGEV_NONE timers are not queued ! See common_timer_get */
if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
/* Setup correct expiry time for relative timers */
- if (mode == HRTIMER_MODE_REL)
- timer->expires = ktime_add(timer->expires,
- timer->base->get_time());
+ if (mode == HRTIMER_MODE_REL) {
+ timer->expires =
+ ktime_add_safe(timer->expires,
+ timer->base->get_time());
+ }
return 0;
}
@@ -982,20 +984,9 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
static int common_nsleep(const clockid_t which_clock, int flags,
struct timespec *tsave, struct timespec __user *rmtp)
{
- struct timespec rmt;
- int ret;
-
- ret = hrtimer_nanosleep(tsave, rmtp ? &rmt : NULL,
- flags & TIMER_ABSTIME ?
- HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
- which_clock);
-
- if (ret && rmtp) {
- if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
- return -EFAULT;
- }
-
- return ret;
+ return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
+ which_clock);
}
asmlinkage long
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ef9b802738a..79833170bb9 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -74,8 +74,8 @@ config PM_TRACE_RTC
RTC across reboots, so that you can debug a machine that just hangs
during suspend (or more commonly, during resume).
- To use this debugging feature you should attempt to suspend the machine,
- then reboot it, then run
+ To use this debugging feature you should attempt to suspend the
+ machine, reboot it and then run
dmesg -s 1000000 | grep 'hash matches'
@@ -123,7 +123,10 @@ config HIBERNATION
called "hibernation" in user interfaces. STD checkpoints the
system and powers it off; and restores that checkpoint on reboot.
- You can suspend your machine with 'echo disk > /sys/power/state'.
+ You can suspend your machine with 'echo disk > /sys/power/state'
+ after placing resume=/dev/swappartition on the kernel command line
+ in your bootloader's configuration file.
+
Alternatively, you can use the additional userland tools available
from <http://suspend.sf.net>.
diff --git a/kernel/printk.c b/kernel/printk.c
index 29ae1e99cde..bee36100f11 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,7 +32,6 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
-#include <linux/jiffies.h>
#include <asm/uaccess.h>
@@ -93,16 +92,16 @@ static int console_locked, console_suspended;
*/
static DEFINE_SPINLOCK(logbuf_lock);
-#define LOG_BUF_MASK (log_buf_len-1)
+#define LOG_BUF_MASK (log_buf_len-1)
#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
/*
* The indices into log_buf are not constrained to log_buf_len - they
* must be masked before subscripting
*/
-static unsigned long log_start; /* Index into log_buf: next char to be read by syslog() */
-static unsigned long con_start; /* Index into log_buf: next char to be sent to consoles */
-static unsigned long log_end; /* Index into log_buf: most-recently-written-char + 1 */
+static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */
+static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */
+static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
/*
* Array of consoles built from command line options (console=)
@@ -128,17 +127,17 @@ static int console_may_schedule;
static char __log_buf[__LOG_BUF_LEN];
static char *log_buf = __log_buf;
static int log_buf_len = __LOG_BUF_LEN;
-static unsigned long logged_chars; /* Number of chars produced since last read+clear operation */
+static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
static int __init log_buf_len_setup(char *str)
{
- unsigned long size = memparse(str, &str);
+ unsigned size = memparse(str, &str);
unsigned long flags;
if (size)
size = roundup_pow_of_two(size);
if (size > log_buf_len) {
- unsigned long start, dest_idx, offset;
+ unsigned start, dest_idx, offset;
char *new_log_buf;
new_log_buf = alloc_bootmem(size);
@@ -295,7 +294,7 @@ int log_buf_read(int idx)
*/
int do_syslog(int type, char __user *buf, int len)
{
- unsigned long i, j, limit, count;
+ unsigned i, j, limit, count;
int do_clear = 0;
char c;
int error = 0;
@@ -436,7 +435,7 @@ asmlinkage long sys_syslog(int type, char __user *buf, int len)
/*
* Call the console drivers on a range of log_buf
*/
-static void __call_console_drivers(unsigned long start, unsigned long end)
+static void __call_console_drivers(unsigned start, unsigned end)
{
struct console *con;
@@ -463,8 +462,8 @@ early_param("ignore_loglevel", ignore_loglevel_setup);
/*
* Write out chars from start to end - 1 inclusive
*/
-static void _call_console_drivers(unsigned long start,
- unsigned long end, int msg_log_level)
+static void _call_console_drivers(unsigned start,
+ unsigned end, int msg_log_level)
{
if ((msg_log_level < console_loglevel || ignore_loglevel) &&
console_drivers && start != end) {
@@ -484,12 +483,12 @@ static void _call_console_drivers(unsigned long start,
* log_buf[start] to log_buf[end - 1].
* The console_sem must be held.
*/
-static void call_console_drivers(unsigned long start, unsigned long end)
+static void call_console_drivers(unsigned start, unsigned end)
{
- unsigned long cur_index, start_print;
+ unsigned cur_index, start_print;
static int msg_level = -1;
- BUG_ON(((long)(start - end)) > 0);
+ BUG_ON(((int)(start - end)) > 0);
cur_index = start;
start_print = start;
@@ -567,19 +566,6 @@ static int printk_time = 0;
#endif
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
-static int __init printk_time_setup(char *str)
-{
- if (*str)
- return 0;
- printk_time = 1;
- printk(KERN_NOTICE "The 'time' option is deprecated and "
- "is scheduled for removal in early 2008\n");
- printk(KERN_NOTICE "Use 'printk.time=<value>' instead\n");
- return 1;
-}
-
-__setup("time", printk_time_setup);
-
/* Check if we have any console registered that can be called early in boot. */
static int have_callable_console(void)
{
@@ -790,7 +776,7 @@ asmlinkage long sys_syslog(int type, char __user *buf, int len)
return -ENOSYS;
}
-static void call_console_drivers(unsigned long start, unsigned long end)
+static void call_console_drivers(unsigned start, unsigned end)
{
}
@@ -983,8 +969,8 @@ void wake_up_klogd(void)
void release_console_sem(void)
{
unsigned long flags;
- unsigned long _con_start, _log_end;
- unsigned long wake_klogd = 0;
+ unsigned _con_start, _log_end;
+ unsigned wake_klogd = 0;
if (console_suspended) {
up(&secondary_console_sem);
@@ -1265,6 +1251,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
return;
}
+#if defined CONFIG_PRINTK
/*
* printk rate limiting, lifted from the networking subsystem.
*
@@ -1275,7 +1262,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
{
static DEFINE_SPINLOCK(ratelimit_lock);
- static unsigned long toks = 10 * 5 * HZ;
+ static unsigned toks = 10 * 5 * HZ;
static unsigned long last_msg;
static int missed;
unsigned long flags;
@@ -1334,3 +1321,4 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
return false;
}
EXPORT_SYMBOL(printk_timed_ratelimit);
+#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index e64c2da11c0..3b7a1b05512 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -20,7 +20,6 @@
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
-#include <linux/profile.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <asm/sections.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b0d4ab4dfd3..fdb34e86f92 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -20,6 +20,7 @@
#include <linux/signal.h>
#include <linux/audit.h>
#include <linux/pid_namespace.h>
+#include <linux/syscalls.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>
@@ -53,7 +54,7 @@ void ptrace_untrace(struct task_struct *child)
spin_lock(&child->sighand->siglock);
if (task_is_traced(child)) {
if (child->signal->flags & SIGNAL_STOP_STOPPED) {
- child->state = TASK_STOPPED;
+ __set_task_state(child, TASK_STOPPED);
} else {
signal_wake_up(child, 1);
}
@@ -98,23 +99,23 @@ int ptrace_check_attach(struct task_struct *child, int kill)
* be changed by us so it's not changing right after this.
*/
read_lock(&tasklist_lock);
- if ((child->ptrace & PT_PTRACED) && child->parent == current &&
- (!(child->ptrace & PT_ATTACHED) || child->real_parent != current)
- && child->signal != NULL) {
+ if ((child->ptrace & PT_PTRACED) && child->parent == current) {
ret = 0;
+ /*
+ * child->sighand can't be NULL, release_task()
+ * does ptrace_unlink() before __exit_signal().
+ */
spin_lock_irq(&child->sighand->siglock);
- if (task_is_stopped(child)) {
+ if (task_is_stopped(child))
child->state = TASK_TRACED;
- } else if (!task_is_traced(child) && !kill) {
+ else if (!task_is_traced(child) && !kill)
ret = -ESRCH;
- }
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
- if (!ret && !kill) {
+ if (!ret && !kill)
wait_task_inactive(child);
- }
/* All systems go.. */
return ret;
@@ -201,8 +202,7 @@ repeat:
goto bad;
/* Go */
- task->ptrace |= PT_PTRACED | ((task->real_parent != current)
- ? PT_ATTACHED : 0);
+ task->ptrace |= PT_PTRACED;
if (capable(CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 760dfc233a0..c09605f8d16 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count;
static DEFINE_MUTEX(rcu_barrier_mutex);
static struct completion rcu_barrier_completion;
-/* Because of FASTCALL declaration of complete, we use this wrapper */
+/*
+ * Awaken the corresponding synchronize_rcu() instance now that a
+ * grace period has elapsed.
+ */
static void wakeme_after_rcu(struct rcu_head *head)
{
struct rcu_synchronize *rcu;
diff --git a/kernel/relay.c b/kernel/relay.c
index 7c0373322f1..d080b9d161a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -37,37 +37,31 @@ static void relay_file_mmap_close(struct vm_area_struct *vma)
}
/*
- * nopage() vm_op implementation for relay file mapping.
+ * fault() vm_op implementation for relay file mapping.
*/
-static struct page *relay_buf_nopage(struct vm_area_struct *vma,
- unsigned long address,
- int *type)
+static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page;
struct rchan_buf *buf = vma->vm_private_data;
- unsigned long offset = address - vma->vm_start;
+ pgoff_t pgoff = vmf->pgoff;
- if (address > vma->vm_end)
- return NOPAGE_SIGBUS; /* Disallow mremap */
if (!buf)
- return NOPAGE_OOM;
+ return VM_FAULT_OOM;
- page = vmalloc_to_page(buf->start + offset);
+ page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT));
if (!page)
- return NOPAGE_OOM;
+ return VM_FAULT_SIGBUS;
get_page(page);
+ vmf->page = page;
- if (type)
- *type = VM_FAULT_MINOR;
-
- return page;
+ return 0;
}
/*
* vm_ops for relay file mappings.
*/
static struct vm_operations_struct relay_file_mmap_ops = {
- .nopage = relay_buf_nopage,
+ .fault = relay_buf_fault,
.close = relay_file_mmap_close,
};
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
new file mode 100644
index 00000000000..16cbec2d5d6
--- /dev/null
+++ b/kernel/res_counter.c
@@ -0,0 +1,134 @@
+/*
+ * resource cgroups
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/parser.h>
+#include <linux/fs.h>
+#include <linux/res_counter.h>
+#include <linux/uaccess.h>
+
+void res_counter_init(struct res_counter *counter)
+{
+ spin_lock_init(&counter->lock);
+ counter->limit = (unsigned long long)LLONG_MAX;
+}
+
+int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
+{
+ if (counter->usage + val > counter->limit) {
+ counter->failcnt++;
+ return -ENOMEM;
+ }
+
+ counter->usage += val;
+ return 0;
+}
+
+int res_counter_charge(struct res_counter *counter, unsigned long val)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&counter->lock, flags);
+ ret = res_counter_charge_locked(counter, val);
+ spin_unlock_irqrestore(&counter->lock, flags);
+ return ret;
+}
+
+void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
+{
+ if (WARN_ON(counter->usage < val))
+ val = counter->usage;
+
+ counter->usage -= val;
+}
+
+void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&counter->lock, flags);
+ res_counter_uncharge_locked(counter, val);
+ spin_unlock_irqrestore(&counter->lock, flags);
+}
+
+
+static inline unsigned long long *
+res_counter_member(struct res_counter *counter, int member)
+{
+ switch (member) {
+ case RES_USAGE:
+ return &counter->usage;
+ case RES_LIMIT:
+ return &counter->limit;
+ case RES_FAILCNT:
+ return &counter->failcnt;
+ };
+
+ BUG();
+ return NULL;
+}
+
+ssize_t res_counter_read(struct res_counter *counter, int member,
+ const char __user *userbuf, size_t nbytes, loff_t *pos,
+ int (*read_strategy)(unsigned long long val, char *st_buf))
+{
+ unsigned long long *val;
+ char buf[64], *s;
+
+ s = buf;
+ val = res_counter_member(counter, member);
+ if (read_strategy)
+ s += read_strategy(*val, s);
+ else
+ s += sprintf(s, "%llu\n", *val);
+ return simple_read_from_buffer((void __user *)userbuf, nbytes,
+ pos, buf, s - buf);
+}
+
+ssize_t res_counter_write(struct res_counter *counter, int member,
+ const char __user *userbuf, size_t nbytes, loff_t *pos,
+ int (*write_strategy)(char *st_buf, unsigned long long *val))
+{
+ int ret;
+ char *buf, *end;
+ unsigned long flags;
+ unsigned long long tmp, *val;
+
+ buf = kmalloc(nbytes + 1, GFP_KERNEL);
+ ret = -ENOMEM;
+ if (buf == NULL)
+ goto out;
+
+ buf[nbytes] = '\0';
+ ret = -EFAULT;
+ if (copy_from_user(buf, userbuf, nbytes))
+ goto out_free;
+
+ ret = -EINVAL;
+
+ if (write_strategy) {
+ if (write_strategy(buf, &tmp)) {
+ goto out_free;
+ }
+ } else {
+ tmp = simple_strtoull(buf, &end, 10);
+ if (*end != '\0')
+ goto out_free;
+ }
+ spin_lock_irqsave(&counter->lock, flags);
+ val = res_counter_member(counter, member);
+ *val = tmp;
+ spin_unlock_irqrestore(&counter->lock, flags);
+ ret = nbytes;
+out_free:
+ kfree(buf);
+out:
+ return ret;
+}
diff --git a/kernel/resource.c b/kernel/resource.c
index 2eb553d9b51..82aea814d40 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -228,7 +228,7 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
-#ifdef CONFIG_MEMORY_HOTPLUG
+#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
/*
* Finds the lowest memory reosurce exists within [res->start.res->end)
* the caller must specify res->start, res->end, res->flags.
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 56d73cb8826..5fcb4fe645e 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -130,7 +130,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
task = rt_mutex_owner(act_waiter->lock);
if (task && task != current) {
- act_waiter->deadlock_task_pid = task->pid;
+ act_waiter->deadlock_task_pid = get_pid(task_pid(task));
act_waiter->deadlock_lock = lock;
}
}
@@ -142,9 +142,12 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
if (!waiter->deadlock_lock || !rt_trace_on)
return;
- task = find_task_by_pid(waiter->deadlock_task_pid);
- if (!task)
+ rcu_read_lock();
+ task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID);
+ if (!task) {
+ rcu_read_unlock();
return;
+ }
TRACE_OFF_NOLOCK();
@@ -173,6 +176,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
current->comm, task_pid_nr(current));
dump_stack();
debug_show_all_locks();
+ rcu_read_unlock();
printk("[ turning off deadlock detection."
"Please report this trace. ]\n\n");
@@ -203,10 +207,12 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
memset(waiter, 0x11, sizeof(*waiter));
plist_node_init(&waiter->list_entry, MAX_PRIO);
plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
+ waiter->deadlock_task_pid = NULL;
}
void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
{
+ put_pid(waiter->deadlock_task_pid);
TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
TRACE_WARN_ON(waiter->task);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 0deef71ff8d..6522ae5b14a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(state);
/* Setup the timer, when timeout != NULL */
- if (unlikely(timeout))
+ if (unlikely(timeout)) {
hrtimer_start(&timeout->timer, timeout->timer.expires,
HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&timeout->timer))
+ timeout->task = NULL;
+ }
for (;;) {
/* Try to acquire the lock: */
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 2d3b83593ca..e124bf5800e 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -51,7 +51,7 @@ struct rt_mutex_waiter {
struct rt_mutex *lock;
#ifdef CONFIG_DEBUG_RT_MUTEXES
unsigned long ip;
- pid_t deadlock_task_pid;
+ struct pid *deadlock_task_pid;
struct rt_mutex *deadlock_lock;
#endif
};
diff --git a/kernel/sched.c b/kernel/sched.c
index 9474b23c28b..f28f19e65b5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -155,7 +155,7 @@ struct rt_prio_array {
struct list_head queue[MAX_RT_PRIO];
};
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
#include <linux/cgroup.h>
@@ -165,19 +165,16 @@ static LIST_HEAD(task_groups);
/* task group related information */
struct task_group {
-#ifdef CONFIG_FAIR_CGROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
struct cgroup_subsys_state css;
#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
- struct sched_rt_entity **rt_se;
- struct rt_rq **rt_rq;
-
- unsigned int rt_ratio;
-
/*
* shares assigned to a task group governs how much of cpu bandwidth
* is allocated to the group. The more shares a group has, the more is
@@ -213,33 +210,46 @@ struct task_group {
*
*/
unsigned long shares;
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity **rt_se;
+ struct rt_rq **rt_rq;
+
+ u64 rt_runtime;
+#endif
struct rcu_head rcu;
struct list_head list;
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
-
static struct sched_entity *init_sched_entity_p[NR_CPUS];
static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
+static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
static struct rt_rq *init_rt_rq_p[NR_CPUS];
+#endif
-/* task_group_mutex serializes add/remove of task groups and also changes to
+/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
*/
-static DEFINE_MUTEX(task_group_mutex);
+static DEFINE_SPINLOCK(task_group_lock);
/* doms_cur_mutex serializes access to doms_cur[] array */
static DEFINE_MUTEX(doms_cur_mutex);
+#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/* kernel thread that runs rebalance_shares() periodically */
static struct task_struct *lb_monitor_task;
@@ -248,35 +258,40 @@ static int load_balance_monitor(void *unused);
static void set_se_shares(struct sched_entity *se, unsigned long shares);
+#ifdef CONFIG_USER_SCHED
+# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
+#else
+# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
+#endif
+
+#define MIN_GROUP_SHARES 2
+
+static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+#endif
+
/* Default task group.
* Every task in system belong to this group at bootup.
*/
struct task_group init_task_group = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
.se = init_sched_entity_p,
.cfs_rq = init_cfs_rq_p,
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
.rt_se = init_sched_rt_entity_p,
.rt_rq = init_rt_rq_p,
-};
-
-#ifdef CONFIG_FAIR_USER_SCHED
-# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
-#else
-# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif
-
-#define MIN_GROUP_SHARES 2
-
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+};
/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
struct task_group *tg;
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
tg = p->user->tg;
-#elif defined(CONFIG_FAIR_CGROUP_SCHED)
+#elif defined(CONFIG_CGROUP_SCHED)
tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
struct task_group, css);
#else
@@ -288,21 +303,15 @@ static inline struct task_group *task_group(struct task_struct *p)
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
+#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
p->se.parent = task_group(p)->se[cpu];
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
p->rt.rt_rq = task_group(p)->rt_rq[cpu];
p->rt.parent = task_group(p)->rt_se[cpu];
-}
-
-static inline void lock_task_group_list(void)
-{
- mutex_lock(&task_group_mutex);
-}
-
-static inline void unlock_task_group_list(void)
-{
- mutex_unlock(&task_group_mutex);
+#endif
}
static inline void lock_doms_cur(void)
@@ -318,12 +327,10 @@ static inline void unlock_doms_cur(void)
#else
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline void lock_task_group_list(void) { }
-static inline void unlock_task_group_list(void) { }
static inline void lock_doms_cur(void) { }
static inline void unlock_doms_cur(void) { }
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_GROUP_SCHED */
/* CFS-related fields in a runqueue */
struct cfs_rq {
@@ -363,7 +370,7 @@ struct cfs_rq {
struct rt_rq {
struct rt_prio_array active;
unsigned long rt_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
int highest_prio; /* highest queued rt task prio */
#endif
#ifdef CONFIG_SMP
@@ -373,7 +380,9 @@ struct rt_rq {
int rt_throttled;
u64 rt_time;
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
+ unsigned long rt_nr_boosted;
+
struct rq *rq;
struct list_head leaf_rt_rq_list;
struct task_group *tg;
@@ -447,6 +456,8 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
@@ -652,19 +663,21 @@ const_debug unsigned int sysctl_sched_features =
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
* default: 1s
*/
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+unsigned int sysctl_sched_rt_period = 1000000;
-#define SCHED_RT_FRAC_SHIFT 16
-#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
/*
- * ratio of time -rt tasks may consume.
- * default: 95%
+ * single value that denotes runtime == period, ie unlimited time.
*/
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+#define RUNTIME_INF ((u64)~0ULL)
/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -1893,13 +1906,13 @@ out:
return success;
}
-int fastcall wake_up_process(struct task_struct *p)
+int wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_ALL, 0);
}
EXPORT_SYMBOL(wake_up_process);
-int fastcall wake_up_state(struct task_struct *p, unsigned int state)
+int wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0);
}
@@ -1986,7 +1999,7 @@ void sched_fork(struct task_struct *p, int clone_flags)
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
-void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
+void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
unsigned long flags;
struct rq *rq;
@@ -3753,7 +3766,7 @@ void scheduler_tick(void)
#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
-void fastcall add_preempt_count(int val)
+void add_preempt_count(int val)
{
/*
* Underflow?
@@ -3769,7 +3782,7 @@ void fastcall add_preempt_count(int val)
}
EXPORT_SYMBOL(add_preempt_count);
-void fastcall sub_preempt_count(int val)
+void sub_preempt_count(int val)
{
/*
* Underflow?
@@ -4067,7 +4080,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
*/
-void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
+void __wake_up(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
unsigned long flags;
@@ -4081,7 +4094,7 @@ EXPORT_SYMBOL(__wake_up);
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
-void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
{
__wake_up_common(q, mode, 1, 0, NULL);
}
@@ -4099,7 +4112,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
*
* On UP it can prevent extra preemption.
*/
-void fastcall
+void
__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
unsigned long flags;
@@ -4571,6 +4584,15 @@ recheck:
return -EPERM;
}
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Do not allow realtime tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
+ return -EPERM;
+#endif
+
retval = security_task_setscheduler(p, policy, param);
if (retval)
return retval;
@@ -7112,7 +7134,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
rt_rq->highest_prio = MAX_RT_PRIO;
#endif
#ifdef CONFIG_SMP
@@ -7123,7 +7145,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
#endif
}
@@ -7146,7 +7169,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
se->parent = NULL;
}
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
int cpu, int add)
@@ -7175,7 +7200,7 @@ void __init sched_init(void)
init_defrootdomain();
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
list_add(&init_task_group.list, &task_groups);
#endif
@@ -7196,7 +7221,10 @@ void __init sched_init(void)
&per_cpu(init_cfs_rq, i),
&per_cpu(init_sched_entity, i), i, 1);
- init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ init_task_group.rt_runtime =
+ sysctl_sched_rt_runtime * NSEC_PER_USEC;
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(rq, &init_task_group,
&per_cpu(init_rt_rq, i),
@@ -7303,7 +7331,7 @@ void normalize_rt_tasks(void)
unsigned long flags;
struct rq *rq;
- read_lock_irq(&tasklist_lock);
+ read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) {
/*
* Only normalize user tasks:
@@ -7329,16 +7357,16 @@ void normalize_rt_tasks(void)
continue;
}
- spin_lock_irqsave(&p->pi_lock, flags);
+ spin_lock(&p->pi_lock);
rq = __task_rq_lock(p);
normalize_task(rq, p);
__task_rq_unlock(rq);
- spin_unlock_irqrestore(&p->pi_lock, flags);
+ spin_unlock(&p->pi_lock);
} while_each_thread(g, p);
- read_unlock_irq(&tasklist_lock);
+ read_unlock_irqrestore(&tasklist_lock, flags);
}
#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7387,9 +7415,9 @@ void set_curr_task(int cpu, struct task_struct *p)
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
-#ifdef CONFIG_SMP
+#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
/*
* distribute shares of all task groups among their schedulable entities,
* to reflect load distribution across cpus.
@@ -7540,7 +7568,8 @@ static int load_balance_monitor(void *unused)
}
#endif /* CONFIG_SMP */
-static void free_sched_group(struct task_group *tg)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -7549,49 +7578,27 @@ static void free_sched_group(struct task_group *tg)
kfree(tg->cfs_rq[i]);
if (tg->se)
kfree(tg->se[i]);
- if (tg->rt_rq)
- kfree(tg->rt_rq[i]);
- if (tg->rt_se)
- kfree(tg->rt_se[i]);
}
kfree(tg->cfs_rq);
kfree(tg->se);
- kfree(tg->rt_rq);
- kfree(tg->rt_se);
- kfree(tg);
}
-/* allocate runqueue etc for a new task group */
-struct task_group *sched_create_group(void)
+static int alloc_fair_sched_group(struct task_group *tg)
{
- struct task_group *tg;
struct cfs_rq *cfs_rq;
struct sched_entity *se;
- struct rt_rq *rt_rq;
- struct sched_rt_entity *rt_se;
struct rq *rq;
int i;
- tg = kzalloc(sizeof(*tg), GFP_KERNEL);
- if (!tg)
- return ERR_PTR(-ENOMEM);
-
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
if (!tg->cfs_rq)
goto err;
tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
if (!tg->se)
goto err;
- tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
- if (!tg->rt_rq)
- goto err;
- tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
- if (!tg->rt_se)
- goto err;
tg->shares = NICE_0_LOAD;
- tg->rt_ratio = 0; /* XXX */
for_each_possible_cpu(i) {
rq = cpu_rq(i);
@@ -7606,6 +7613,79 @@ struct task_group *sched_create_group(void)
if (!se)
goto err;
+ init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
+ }
+
+ return 1;
+
+ err:
+ return 0;
+}
+
+static inline void register_fair_sched_group(struct task_group *tg, int cpu)
+{
+ list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
+ &cpu_rq(cpu)->leaf_cfs_rq_list);
+}
+
+static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+ list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+}
+#else
+static inline void free_fair_sched_group(struct task_group *tg)
+{
+}
+
+static inline int alloc_fair_sched_group(struct task_group *tg)
+{
+ return 1;
+}
+
+static inline void register_fair_sched_group(struct task_group *tg, int cpu)
+{
+}
+
+static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (tg->rt_rq)
+ kfree(tg->rt_rq[i]);
+ if (tg->rt_se)
+ kfree(tg->rt_se[i]);
+ }
+
+ kfree(tg->rt_rq);
+ kfree(tg->rt_se);
+}
+
+static int alloc_rt_sched_group(struct task_group *tg)
+{
+ struct rt_rq *rt_rq;
+ struct sched_rt_entity *rt_se;
+ struct rq *rq;
+ int i;
+
+ tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
+ if (!tg->rt_rq)
+ goto err;
+ tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
+ if (!tg->rt_se)
+ goto err;
+
+ tg->rt_runtime = 0;
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
rt_rq = kmalloc_node(sizeof(struct rt_rq),
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
if (!rt_rq)
@@ -7616,20 +7696,75 @@ struct task_group *sched_create_group(void)
if (!rt_se)
goto err;
- init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
}
- lock_task_group_list();
+ return 1;
+
+ err:
+ return 0;
+}
+
+static inline void register_rt_sched_group(struct task_group *tg, int cpu)
+{
+ list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
+ &cpu_rq(cpu)->leaf_rt_rq_list);
+}
+
+static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
+{
+ list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
+}
+#else
+static inline void free_rt_sched_group(struct task_group *tg)
+{
+}
+
+static inline int alloc_rt_sched_group(struct task_group *tg)
+{
+ return 1;
+}
+
+static inline void register_rt_sched_group(struct task_group *tg, int cpu)
+{
+}
+
+static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
+{
+}
+#endif
+
+static void free_sched_group(struct task_group *tg)
+{
+ free_fair_sched_group(tg);
+ free_rt_sched_group(tg);
+ kfree(tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(void)
+{
+ struct task_group *tg;
+ unsigned long flags;
+ int i;
+
+ tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ if (!alloc_fair_sched_group(tg))
+ goto err;
+
+ if (!alloc_rt_sched_group(tg))
+ goto err;
+
+ spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
- cfs_rq = tg->cfs_rq[i];
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
- rt_rq = tg->rt_rq[i];
- list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
+ register_fair_sched_group(tg, i);
+ register_rt_sched_group(tg, i);
}
list_add_rcu(&tg->list, &task_groups);
- unlock_task_group_list();
+ spin_unlock_irqrestore(&task_group_lock, flags);
return tg;
@@ -7648,21 +7783,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
- struct cfs_rq *cfs_rq = NULL;
- struct rt_rq *rt_rq = NULL;
+ unsigned long flags;
int i;
- lock_task_group_list();
+ spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i) {
- cfs_rq = tg->cfs_rq[i];
- list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
- rt_rq = tg->rt_rq[i];
- list_del_rcu(&rt_rq->leaf_rt_rq_list);
+ unregister_fair_sched_group(tg, i);
+ unregister_rt_sched_group(tg, i);
}
list_del_rcu(&tg->list);
- unlock_task_group_list();
-
- BUG_ON(!cfs_rq);
+ spin_unlock_irqrestore(&task_group_lock, flags);
/* wait for possible concurrent references to cfs_rqs complete */
call_rcu(&tg->rcu, free_sched_group_rcu);
@@ -7703,6 +7833,7 @@ void sched_move_task(struct task_struct *tsk)
task_rq_unlock(rq, &flags);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
/* rq->lock to be locked by caller */
static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
@@ -7728,13 +7859,14 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
}
}
+static DEFINE_MUTEX(shares_mutex);
+
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
- struct cfs_rq *cfs_rq;
- struct rq *rq;
+ unsigned long flags;
- lock_task_group_list();
+ mutex_lock(&shares_mutex);
if (tg->shares == shares)
goto done;
@@ -7746,10 +7878,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* load_balance_fair) from referring to this group first,
* by taking it off the rq->leaf_cfs_rq_list on each cpu.
*/
- for_each_possible_cpu(i) {
- cfs_rq = tg->cfs_rq[i];
- list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
- }
+ spin_lock_irqsave(&task_group_lock, flags);
+ for_each_possible_cpu(i)
+ unregister_fair_sched_group(tg, i);
+ spin_unlock_irqrestore(&task_group_lock, flags);
/* wait for any ongoing reference to this group to finish */
synchronize_sched();
@@ -7769,13 +7901,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* Enable load balance activity on this group, by inserting it back on
* each cpu's rq->leaf_cfs_rq_list.
*/
- for_each_possible_cpu(i) {
- rq = cpu_rq(i);
- cfs_rq = tg->cfs_rq[i];
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
- }
+ spin_lock_irqsave(&task_group_lock, flags);
+ for_each_possible_cpu(i)
+ register_fair_sched_group(tg, i);
+ spin_unlock_irqrestore(&task_group_lock, flags);
done:
- unlock_task_group_list();
+ mutex_unlock(&shares_mutex);
return 0;
}
@@ -7783,35 +7914,84 @@ unsigned long sched_group_shares(struct task_group *tg)
{
return tg->shares;
}
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
/*
- * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
+ * Ensure that the real time constraints are schedulable.
*/
-int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+static unsigned long to_ratio(u64 period, u64 runtime)
+{
+ if (runtime == RUNTIME_INF)
+ return 1ULL << 16;
+
+ runtime *= (1ULL << 16);
+ div64_64(runtime, period);
+ return runtime;
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
struct task_group *tgi;
unsigned long total = 0;
+ unsigned long global_ratio =
+ to_ratio(sysctl_sched_rt_period,
+ sysctl_sched_rt_runtime < 0 ?
+ RUNTIME_INF : sysctl_sched_rt_runtime);
rcu_read_lock();
- list_for_each_entry_rcu(tgi, &task_groups, list)
- total += tgi->rt_ratio;
- rcu_read_unlock();
+ list_for_each_entry_rcu(tgi, &task_groups, list) {
+ if (tgi == tg)
+ continue;
- if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
- return -EINVAL;
+ total += to_ratio(period, tgi->rt_runtime);
+ }
+ rcu_read_unlock();
- tg->rt_ratio = rt_ratio;
- return 0;
+ return total + to_ratio(period, runtime) < global_ratio;
}
-unsigned long sched_group_rt_ratio(struct task_group *tg)
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
- return tg->rt_ratio;
+ u64 rt_runtime, rt_period;
+ int err = 0;
+
+ rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
+ rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+ if (rt_runtime_us == -1)
+ rt_runtime = rt_period;
+
+ mutex_lock(&rt_constraints_mutex);
+ if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
+ err = -EINVAL;
+ goto unlock;
+ }
+ if (rt_runtime_us == -1)
+ rt_runtime = RUNTIME_INF;
+ tg->rt_runtime = rt_runtime;
+ unlock:
+ mutex_unlock(&rt_constraints_mutex);
+
+ return err;
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+long sched_group_rt_runtime(struct task_group *tg)
+{
+ u64 rt_runtime_us;
+
+ if (tg->rt_runtime == RUNTIME_INF)
+ return -1;
+
+ rt_runtime_us = tg->rt_runtime;
+ do_div(rt_runtime_us, NSEC_PER_USEC);
+ return rt_runtime_us;
+}
+#endif
+#endif /* CONFIG_GROUP_SCHED */
-#ifdef CONFIG_FAIR_CGROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
/* return corresponding task_group object of a cgroup */
static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
@@ -7857,9 +8037,15 @@ static int
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk)
{
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
+ return -EINVAL;
+#else
/* We don't support RT-tasks being in separate groups */
if (tsk->sched_class != &fair_sched_class)
return -EINVAL;
+#endif
return 0;
}
@@ -7871,6 +8057,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
sched_move_task(tsk);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
{
@@ -7883,31 +8070,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
return (u64) tg->shares;
}
+#endif
-static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
- u64 rt_ratio_val)
+#ifdef CONFIG_RT_GROUP_SCHED
+static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *unused_ppos)
{
- return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
+ char buffer[64];
+ int retval = 0;
+ s64 val;
+ char *end;
+
+ if (!nbytes)
+ return -EINVAL;
+ if (nbytes >= sizeof(buffer))
+ return -E2BIG;
+ if (copy_from_user(buffer, userbuf, nbytes))
+ return -EFAULT;
+
+ buffer[nbytes] = 0; /* nul-terminate */
+
+ /* strip newline if necessary */
+ if (nbytes && (buffer[nbytes-1] == '\n'))
+ buffer[nbytes-1] = 0;
+ val = simple_strtoll(buffer, &end, 0);
+ if (*end)
+ return -EINVAL;
+
+ /* Pass to subsystem */
+ retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+ if (!retval)
+ retval = nbytes;
+ return retval;
}
-static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ char __user *buf, size_t nbytes,
+ loff_t *ppos)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ char tmp[64];
+ long val = sched_group_rt_runtime(cgroup_tg(cgrp));
+ int len = sprintf(tmp, "%ld\n", val);
- return (u64) tg->rt_ratio;
+ return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
+#endif
static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
.read_uint = cpu_shares_read_uint,
.write_uint = cpu_shares_write_uint,
},
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
{
- .name = "rt_ratio",
- .read_uint = cpu_rt_ratio_read_uint,
- .write_uint = cpu_rt_ratio_write_uint,
+ .name = "rt_runtime_us",
+ .read = cpu_rt_runtime_read,
+ .write = cpu_rt_runtime_write,
},
+#endif
};
static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -7926,7 +8152,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.early_init = 1,
};
-#endif /* CONFIG_FAIR_CGROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_CGROUP_CPUACCT
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 274b40d7bef..f54792b175b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
return !list_empty(&rt_se->run_list);
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
if (!rt_rq->tg)
- return SCHED_RT_FRAC;
+ return RUNTIME_INF;
- return rt_rq->tg->rt_ratio;
+ return rt_rq->tg->rt_runtime;
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
-static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
struct sched_rt_entity *rt_se = rt_rq->rt_se;
@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
}
}
-static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
struct sched_rt_entity *rt_se = rt_rq->rt_se;
@@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
dequeue_rt_entity(rt_se);
}
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+ struct task_struct *p;
+
+ if (rt_rq)
+ return !!rt_rq->rt_nr_boosted;
+
+ p = rt_task_of(rt_se);
+ return p->prio != p->normal_prio;
+}
+
#else
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
- return sysctl_sched_rt_ratio;
+ if (sysctl_sched_rt_runtime == -1)
+ return RUNTIME_INF;
+
+ return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
return NULL;
}
-static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
}
-static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled;
+}
#endif
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq)
@@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio;
}
-static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
- unsigned int rt_ratio = sched_rt_ratio(rt_rq);
- u64 period, ratio;
+ u64 runtime = sched_rt_runtime(rt_rq);
- if (rt_ratio == SCHED_RT_FRAC)
+ if (runtime == RUNTIME_INF)
return 0;
if (rt_rq->rt_throttled)
- return 1;
-
- period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
- ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+ return rt_rq_throttled(rt_rq);
- if (rt_rq->rt_time > ratio) {
+ if (rt_rq->rt_time > runtime) {
struct rq *rq = rq_of_rt_rq(rt_rq);
rq->rt_throttled = 1;
rt_rq->rt_throttled = 1;
- sched_rt_ratio_dequeue(rt_rq);
- return 1;
+ if (rt_rq_throttled(rt_rq)) {
+ sched_rt_rq_dequeue(rt_rq);
+ return 1;
+ }
}
return 0;
@@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq)
u64 period;
while (rq->clock > rq->rt_period_expire) {
- period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+ period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
rq->rt_period_expire += period;
for_each_leaf_rt_rq(rt_rq, rq) {
- unsigned long rt_ratio = sched_rt_ratio(rt_rq);
- u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+ u64 runtime = sched_rt_runtime(rt_rq);
- rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
- if (rt_rq->rt_throttled) {
+ rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
+ if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
rt_rq->rt_throttled = 0;
- sched_rt_ratio_enqueue(rt_rq);
+ sched_rt_rq_enqueue(rt_rq);
}
}
@@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq)
cpuacct_charge(curr, delta_exec);
rt_rq->rt_time += delta_exec;
- /*
- * might make it a tad more accurate:
- *
- * update_sched_rt_period(rq);
- */
- if (sched_rt_ratio_exceeded(rt_rq))
+ if (sched_rt_runtime_exceeded(rt_rq))
resched_task(curr);
}
@@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
rt_rq->rt_nr_running++;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if (rt_se_prio(rt_se) < rt_rq->highest_prio)
rt_rq->highest_prio = rt_se_prio(rt_se);
#endif
@@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted++;
+#endif
}
static inline
@@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running--;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if (rt_rq->rt_nr_running) {
struct rt_prio_array *array;
@@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif /* CONFIG_SMP */
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted--;
+
+ WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+#endif
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +329,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se);
- if (group_rq && group_rq->rt_throttled)
+ if (group_rq && rt_rq_throttled(group_rq))
return;
list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -496,7 +522,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
if (unlikely(!rt_rq->rt_nr_running))
return NULL;
- if (sched_rt_ratio_exceeded(rt_rq))
+ if (rt_rq_throttled(rt_rq))
return NULL;
do {
diff --git a/kernel/signal.c b/kernel/signal.c
index 6a5f97cd337..84917fe507f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -972,7 +972,7 @@ void zap_other_threads(struct task_struct *p)
}
}
-int fastcall __fatal_signal_pending(struct task_struct *tsk)
+int __fatal_signal_pending(struct task_struct *tsk)
{
return sigismember(&tsk->pending.signal, SIGKILL);
}
@@ -1018,7 +1018,7 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
}
/*
- * kill_pgrp_info() sends a signal to a process group: this is what the tty
+ * __kill_pgrp_info() sends a signal to a process group: this is what the tty
* control characters do (^C, ^Z etc)
*/
@@ -1037,30 +1037,28 @@ int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
return success ? 0 : retval;
}
-int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
-{
- int retval;
-
- read_lock(&tasklist_lock);
- retval = __kill_pgrp_info(sig, info, pgrp);
- read_unlock(&tasklist_lock);
-
- return retval;
-}
-
int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
{
- int error;
+ int error = -ESRCH;
struct task_struct *p;
rcu_read_lock();
if (unlikely(sig_needs_tasklist(sig)))
read_lock(&tasklist_lock);
+retry:
p = pid_task(pid, PIDTYPE_PID);
- error = -ESRCH;
- if (p)
+ if (p) {
error = group_send_sig_info(sig, info, p);
+ if (unlikely(error == -ESRCH))
+ /*
+ * The task was unhashed in between, try again.
+ * If it is dead, pid_task() will return NULL,
+ * if we race with de_thread() it will find the
+ * new leader.
+ */
+ goto retry;
+ }
if (unlikely(sig_needs_tasklist(sig)))
read_unlock(&tasklist_lock);
@@ -1125,14 +1123,22 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
static int kill_something_info(int sig, struct siginfo *info, int pid)
{
int ret;
- rcu_read_lock();
- if (!pid) {
- ret = kill_pgrp_info(sig, info, task_pgrp(current));
- } else if (pid == -1) {
+
+ if (pid > 0) {
+ rcu_read_lock();
+ ret = kill_pid_info(sig, info, find_vpid(pid));
+ rcu_read_unlock();
+ return ret;
+ }
+
+ read_lock(&tasklist_lock);
+ if (pid != -1) {
+ ret = __kill_pgrp_info(sig, info,
+ pid ? find_vpid(-pid) : task_pgrp(current));
+ } else {
int retval = 0, count = 0;
struct task_struct * p;
- read_lock(&tasklist_lock);
for_each_process(p) {
if (p->pid > 1 && !same_thread_group(p, current)) {
int err = group_send_sig_info(sig, info, p);
@@ -1141,14 +1147,10 @@ static int kill_something_info(int sig, struct siginfo *info, int pid)
retval = err;
}
}
- read_unlock(&tasklist_lock);
ret = count ? retval : -ESRCH;
- } else if (pid < 0) {
- ret = kill_pgrp_info(sig, info, find_vpid(-pid));
- } else {
- ret = kill_pid_info(sig, info, find_vpid(pid));
}
- rcu_read_unlock();
+ read_unlock(&tasklist_lock);
+
return ret;
}
@@ -1196,20 +1198,6 @@ send_sig(int sig, struct task_struct *p, int priv)
return send_sig_info(sig, __si_special(priv), p);
}
-/*
- * This is the entry point for "process-wide" signals.
- * They will go to an appropriate thread in the thread group.
- */
-int
-send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p)
-{
- int ret;
- read_lock(&tasklist_lock);
- ret = group_send_sig_info(sig, info, p);
- read_unlock(&tasklist_lock);
- return ret;
-}
-
void
force_sig(int sig, struct task_struct *p)
{
@@ -1237,7 +1225,13 @@ force_sigsegv(int sig, struct task_struct *p)
int kill_pgrp(struct pid *pid, int sig, int priv)
{
- return kill_pgrp_info(sig, __si_special(priv), pid);
+ int ret;
+
+ read_lock(&tasklist_lock);
+ ret = __kill_pgrp_info(sig, __si_special(priv), pid);
+ read_unlock(&tasklist_lock);
+
+ return ret;
}
EXPORT_SYMBOL(kill_pgrp);
@@ -1556,11 +1550,6 @@ static inline int may_ptrace_stop(void)
{
if (!likely(current->ptrace & PT_PTRACED))
return 0;
-
- if (unlikely(current->parent == current->real_parent &&
- (current->ptrace & PT_ATTACHED)))
- return 0;
-
/*
* Are we in the middle of do_coredump?
* If so and our tracer is also part of the coredump stopping
@@ -1578,6 +1567,17 @@ static inline int may_ptrace_stop(void)
}
/*
+ * Return nonzero if there is a SIGKILL that should be waking us up.
+ * Called with the siglock held.
+ */
+static int sigkill_pending(struct task_struct *tsk)
+{
+ return ((sigismember(&tsk->pending.signal, SIGKILL) ||
+ sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) &&
+ !unlikely(sigismember(&tsk->blocked, SIGKILL)));
+}
+
+/*
* This must be called with current->sighand->siglock held.
*
* This should be the path for all ptrace stops.
@@ -1585,11 +1585,31 @@ static inline int may_ptrace_stop(void)
* That makes it a way to test a stopped process for
* being ptrace-stopped vs being job-control-stopped.
*
- * If we actually decide not to stop at all because the tracer is gone,
- * we leave nostop_code in current->exit_code.
+ * If we actually decide not to stop at all because the tracer
+ * is gone, we keep current->exit_code unless clear_code.
*/
-static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
+static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
{
+ int killed = 0;
+
+ if (arch_ptrace_stop_needed(exit_code, info)) {
+ /*
+ * The arch code has something special to do before a
+ * ptrace stop. This is allowed to block, e.g. for faults
+ * on user stack pages. We can't keep the siglock while
+ * calling arch_ptrace_stop, so we must release it now.
+ * To preserve proper semantics, we must do this before
+ * any signal bookkeeping like checking group_stop_count.
+ * Meanwhile, a SIGKILL could come in before we retake the
+ * siglock. That must prevent us from sleeping in TASK_TRACED.
+ * So after regaining the lock, we must check for SIGKILL.
+ */
+ spin_unlock_irq(&current->sighand->siglock);
+ arch_ptrace_stop(exit_code, info);
+ spin_lock_irq(&current->sighand->siglock);
+ killed = sigkill_pending(current);
+ }
+
/*
* If there is a group stop in progress,
* we must participate in the bookkeeping.
@@ -1601,22 +1621,23 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
current->exit_code = exit_code;
/* Let the debugger run. */
- set_current_state(TASK_TRACED);
+ __set_current_state(TASK_TRACED);
spin_unlock_irq(&current->sighand->siglock);
try_to_freeze();
read_lock(&tasklist_lock);
- if (may_ptrace_stop()) {
+ if (!unlikely(killed) && may_ptrace_stop()) {
do_notify_parent_cldstop(current, CLD_TRAPPED);
read_unlock(&tasklist_lock);
schedule();
} else {
/*
* By the time we got the lock, our tracer went away.
- * Don't stop here.
+ * Don't drop the lock yet, another tracer may come.
*/
+ __set_current_state(TASK_RUNNING);
+ if (clear_code)
+ current->exit_code = 0;
read_unlock(&tasklist_lock);
- set_current_state(TASK_RUNNING);
- current->exit_code = nostop_code;
}
/*
@@ -1649,7 +1670,7 @@ void ptrace_notify(int exit_code)
/* Let the debugger run. */
spin_lock_irq(&current->sighand->siglock);
- ptrace_stop(exit_code, 0, &info);
+ ptrace_stop(exit_code, 1, &info);
spin_unlock_irq(&current->sighand->siglock);
}
@@ -1712,7 +1733,7 @@ static int do_signal_stop(int signr)
* stop is always done with the siglock held,
* so this check has no races.
*/
- if (!t->exit_state &&
+ if (!(t->flags & PF_EXITING) &&
!task_is_stopped_or_traced(t)) {
stop_count++;
signal_wake_up(t, 0);
@@ -1756,7 +1777,7 @@ relock:
ptrace_signal_deliver(regs, cookie);
/* Let the debugger run. */
- ptrace_stop(signr, signr, info);
+ ptrace_stop(signr, 0, info);
/* We're back. Did the debugger cancel the sig? */
signr = current->exit_code;
@@ -1873,6 +1894,48 @@ relock:
return signr;
}
+void exit_signals(struct task_struct *tsk)
+{
+ int group_stop = 0;
+ struct task_struct *t;
+
+ if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
+ tsk->flags |= PF_EXITING;
+ return;
+ }
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ /*
+ * From now this task is not visible for group-wide signals,
+ * see wants_signal(), do_signal_stop().
+ */
+ tsk->flags |= PF_EXITING;
+ if (!signal_pending(tsk))
+ goto out;
+
+ /* It could be that __group_complete_signal() choose us to
+ * notify about group-wide signal. Another thread should be
+ * woken now to take the signal since we will not.
+ */
+ for (t = tsk; (t = next_thread(t)) != tsk; )
+ if (!signal_pending(t) && !(t->flags & PF_EXITING))
+ recalc_sigpending_and_wake(t);
+
+ if (unlikely(tsk->signal->group_stop_count) &&
+ !--tsk->signal->group_stop_count) {
+ tsk->signal->flags = SIGNAL_STOP_STOPPED;
+ group_stop = 1;
+ }
+out:
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (unlikely(group_stop)) {
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(tsk, CLD_STOPPED);
+ read_unlock(&tasklist_lock);
+ }
+}
+
EXPORT_SYMBOL(recalc_sigpending);
EXPORT_SYMBOL_GPL(dequeue_signal);
EXPORT_SYMBOL(flush_signals);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7837d45419..5b3aea5f471 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -320,7 +320,7 @@ void irq_exit(void)
/*
* This function must run with irqs disabled!
*/
-inline fastcall void raise_softirq_irqoff(unsigned int nr)
+inline void raise_softirq_irqoff(unsigned int nr)
{
__raise_softirq_irqoff(nr);
@@ -337,7 +337,7 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr)
wakeup_softirqd();
}
-void fastcall raise_softirq(unsigned int nr)
+void raise_softirq(unsigned int nr)
{
unsigned long flags;
@@ -363,7 +363,7 @@ struct tasklet_head
static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
-void fastcall __tasklet_schedule(struct tasklet_struct *t)
+void __tasklet_schedule(struct tasklet_struct *t)
{
unsigned long flags;
@@ -376,7 +376,7 @@ void fastcall __tasklet_schedule(struct tasklet_struct *t)
EXPORT_SYMBOL(__tasklet_schedule);
-void fastcall __tasklet_hi_schedule(struct tasklet_struct *t)
+void __tasklet_hi_schedule(struct tasklet_struct *t)
{
unsigned long flags;
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 3507cabe963..b0aeeaf22ce 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -74,7 +74,7 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
* severe errors when invoked on an active srcu_struct. That said, it
* can be useful as an error check at cleanup time.
*/
-int srcu_readers_active(struct srcu_struct *sp)
+static int srcu_readers_active(struct srcu_struct *sp)
{
return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1);
}
@@ -255,4 +255,3 @@ EXPORT_SYMBOL_GPL(srcu_read_lock);
EXPORT_SYMBOL_GPL(srcu_read_unlock);
EXPORT_SYMBOL_GPL(synchronize_srcu);
EXPORT_SYMBOL_GPL(srcu_batches_completed);
-EXPORT_SYMBOL_GPL(srcu_readers_active);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 51b5ee53571..6f4e0e13f70 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -29,7 +29,6 @@ enum stopmachine_state {
static enum stopmachine_state stopmachine_state;
static unsigned int stopmachine_num_threads;
static atomic_t stopmachine_thread_ack;
-static DECLARE_MUTEX(stopmachine_mutex);
static int stopmachine(void *cpu)
{
@@ -170,6 +169,7 @@ static int do_stop(void *_smdata)
struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
unsigned int cpu)
{
+ static DEFINE_MUTEX(stopmachine_mutex);
struct stop_machine_data smdata;
struct task_struct *p;
@@ -177,7 +177,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
smdata.data = data;
init_completion(&smdata.done);
- down(&stopmachine_mutex);
+ mutex_lock(&stopmachine_mutex);
/* If they don't care which CPU fn runs on, bind to any online one. */
if (cpu == NR_CPUS)
@@ -193,7 +193,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
wake_up_process(p);
wait_for_completion(&smdata.done);
}
- up(&stopmachine_mutex);
+ mutex_unlock(&stopmachine_mutex);
return p;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 53de35fc824..a626116af5d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -916,8 +916,8 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
{
struct task_struct *p;
struct task_struct *group_leader = current->group_leader;
- int err = -EINVAL;
- struct pid_namespace *ns;
+ struct pid *pgrp;
+ int err;
if (!pid)
pid = task_pid_vnr(group_leader);
@@ -929,12 +929,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
/* From this point forward we keep holding onto the tasklist lock
* so that our parent does not change from under us. -DaveM
*/
- ns = current->nsproxy->pid_ns;
-
write_lock_irq(&tasklist_lock);
err = -ESRCH;
- p = find_task_by_pid_ns(pid, ns);
+ p = find_task_by_vpid(pid);
if (!p)
goto out;
@@ -942,7 +940,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
if (!thread_group_leader(p))
goto out;
- if (p->real_parent->tgid == group_leader->tgid) {
+ if (same_thread_group(p->real_parent, group_leader)) {
err = -EPERM;
if (task_session(p) != task_session(group_leader))
goto out;
@@ -959,10 +957,12 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
if (p->signal->leader)
goto out;
+ pgrp = task_pid(p);
if (pgid != pid) {
struct task_struct *g;
- g = find_task_by_pid_type_ns(PIDTYPE_PGID, pgid, ns);
+ pgrp = find_vpid(pgid);
+ g = pid_task(pgrp, PIDTYPE_PGID);
if (!g || task_session(g) != task_session(group_leader))
goto out;
}
@@ -971,13 +971,10 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
if (err)
goto out;
- if (task_pgrp_nr_ns(p, ns) != pgid) {
- struct pid *pid;
-
+ if (task_pgrp(p) != pgrp) {
detach_pid(p, PIDTYPE_PGID);
- pid = find_vpid(pgid);
- attach_pid(p, PIDTYPE_PGID, pid);
- set_task_pgrp(p, pid_nr(pid));
+ attach_pid(p, PIDTYPE_PGID, pgrp);
+ set_task_pgrp(p, pid_nr(pgrp));
}
err = 0;
@@ -994,17 +991,14 @@ asmlinkage long sys_getpgid(pid_t pid)
else {
int retval;
struct task_struct *p;
- struct pid_namespace *ns;
-
- ns = current->nsproxy->pid_ns;
read_lock(&tasklist_lock);
- p = find_task_by_pid_ns(pid, ns);
+ p = find_task_by_vpid(pid);
retval = -ESRCH;
if (p) {
retval = security_task_getpgid(p);
if (!retval)
- retval = task_pgrp_nr_ns(p, ns);
+ retval = task_pgrp_vnr(p);
}
read_unlock(&tasklist_lock);
return retval;
@@ -1028,19 +1022,16 @@ asmlinkage long sys_getsid(pid_t pid)
else {
int retval;
struct task_struct *p;
- struct pid_namespace *ns;
-
- ns = current->nsproxy->pid_ns;
- read_lock(&tasklist_lock);
- p = find_task_by_pid_ns(pid, ns);
+ rcu_read_lock();
+ p = find_task_by_vpid(pid);
retval = -ESRCH;
if (p) {
retval = security_task_getsid(p);
if (!retval)
- retval = task_session_nr_ns(p, ns);
+ retval = task_session_vnr(p);
}
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return retval;
}
}
@@ -1048,35 +1039,29 @@ asmlinkage long sys_getsid(pid_t pid)
asmlinkage long sys_setsid(void)
{
struct task_struct *group_leader = current->group_leader;
- pid_t session;
+ struct pid *sid = task_pid(group_leader);
+ pid_t session = pid_vnr(sid);
int err = -EPERM;
write_lock_irq(&tasklist_lock);
-
/* Fail if I am already a session leader */
if (group_leader->signal->leader)
goto out;
- session = group_leader->pid;
/* Fail if a process group id already exists that equals the
* proposed session id.
- *
- * Don't check if session id == 1 because kernel threads use this
- * session id and so the check will always fail and make it so
- * init cannot successfully call setsid.
*/
- if (session > 1 && find_task_by_pid_type_ns(PIDTYPE_PGID,
- session, &init_pid_ns))
+ if (pid_task(sid, PIDTYPE_PGID))
goto out;
group_leader->signal->leader = 1;
- __set_special_pids(session, session);
+ __set_special_pids(sid);
spin_lock(&group_leader->sighand->siglock);
group_leader->signal->tty = NULL;
spin_unlock(&group_leader->sighand->siglock);
- err = task_pgrp_vnr(group_leader);
+ err = session;
out:
write_unlock_irq(&tasklist_lock);
return err;
@@ -1145,16 +1130,16 @@ static int groups_to_user(gid_t __user *grouplist,
struct group_info *group_info)
{
int i;
- int count = group_info->ngroups;
+ unsigned int count = group_info->ngroups;
for (i = 0; i < group_info->nblocks; i++) {
- int cp_count = min(NGROUPS_PER_BLOCK, count);
- int off = i * NGROUPS_PER_BLOCK;
- int len = cp_count * sizeof(*grouplist);
+ unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
+ unsigned int len = cp_count * sizeof(*grouplist);
- if (copy_to_user(grouplist+off, group_info->blocks[i], len))
+ if (copy_to_user(grouplist, group_info->blocks[i], len))
return -EFAULT;
+ grouplist += NGROUPS_PER_BLOCK;
count -= cp_count;
}
return 0;
@@ -1165,16 +1150,16 @@ static int groups_from_user(struct group_info *group_info,
gid_t __user *grouplist)
{
int i;
- int count = group_info->ngroups;
+ unsigned int count = group_info->ngroups;
for (i = 0; i < group_info->nblocks; i++) {
- int cp_count = min(NGROUPS_PER_BLOCK, count);
- int off = i * NGROUPS_PER_BLOCK;
- int len = cp_count * sizeof(*grouplist);
+ unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
+ unsigned int len = cp_count * sizeof(*grouplist);
- if (copy_from_user(group_info->blocks[i], grouplist+off, len))
+ if (copy_from_user(group_info->blocks[i], grouplist, len))
return -EFAULT;
+ grouplist += NGROUPS_PER_BLOCK;
count -= cp_count;
}
return 0;
@@ -1472,7 +1457,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
+ if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
return -EPERM;
retval = security_task_setrlimit(resource, &new_rlim);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5e2ad5bf88e..8b7e9541179 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,7 +37,6 @@
#include <linux/highuid.h>
#include <linux/writeback.h>
#include <linux/hugetlb.h>
-#include <linux/security.h>
#include <linux/initrd.h>
#include <linux/times.h>
#include <linux/limits.h>
@@ -67,14 +66,13 @@ extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern int sysctl_panic_on_oom;
extern int sysctl_oom_kill_allocating_task;
+extern int sysctl_oom_dump_tasks;
extern int max_threads;
extern int core_uses_pid;
extern int suid_dumpable;
extern char core_pattern[];
extern int pid_max;
extern int min_free_kbytes;
-extern int printk_ratelimit_jiffies;
-extern int printk_ratelimit_burst;
extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
@@ -313,22 +311,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "sched_rt_period_ms",
- .data = &sysctl_sched_rt_period,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "sched_rt_ratio",
- .data = &sysctl_sched_rt_ratio,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
{
.ctl_name = CTL_UNNUMBERED,
@@ -350,6 +332,22 @@ static struct ctl_table kern_table[] = {
#endif
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_rt_period_us",
+ .data = &sysctl_sched_rt_period,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_rt_runtime_us",
+ .data = &sysctl_sched_rt_runtime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "sched_compat_yield",
.data = &sysctl_sched_compat_yield,
.maxlen = sizeof(unsigned int),
@@ -490,14 +488,6 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
- {
- .ctl_name = KERN_PRINTK,
- .procname = "printk",
- .data = &console_loglevel,
- .maxlen = 4*sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
#ifdef CONFIG_KMOD
{
.ctl_name = KERN_MODPROBE,
@@ -644,6 +634,15 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#if defined CONFIG_PRINTK
+ {
+ .ctl_name = KERN_PRINTK,
+ .procname = "printk",
+ .data = &console_loglevel,
+ .maxlen = 4*sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{
.ctl_name = KERN_PRINTK_RATELIMIT,
.procname = "printk_ratelimit",
@@ -661,6 +660,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#endif
{
.ctl_name = KERN_NGROUPS_MAX,
.procname = "ngroups_max",
@@ -871,6 +871,14 @@ static struct ctl_table vm_table[] = {
.proc_handler = &proc_dointvec,
},
{
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "oom_dump_tasks",
+ .data = &sysctl_oom_dump_tasks,
+ .maxlen = sizeof(sysctl_oom_dump_tasks),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
.ctl_name = VM_OVERCOMMIT_RATIO,
.procname = "overcommit_ratio",
.data = &sysctl_overcommit_ratio,
@@ -970,10 +978,10 @@ static struct ctl_table vm_table[] = {
{
.ctl_name = CTL_UNNUMBERED,
.procname = "nr_overcommit_hugepages",
- .data = &nr_overcommit_huge_pages,
- .maxlen = sizeof(nr_overcommit_huge_pages),
+ .data = &sysctl_overcommit_huge_pages,
+ .maxlen = sizeof(sysctl_overcommit_huge_pages),
.mode = 0644,
- .proc_handler = &proc_doulongvec_minmax,
+ .proc_handler = &hugetlb_overcommit_handler,
},
#endif
{
@@ -1203,6 +1211,14 @@ static struct ctl_table fs_table[] = {
.proc_handler = &proc_dointvec,
},
{
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nr_open",
+ .data = &sysctl_nr_open,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
.ctl_name = FS_DENTRY,
.procname = "dentry-state",
.data = &dentry_stat,
@@ -2471,7 +2487,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
pid_t tmp;
int r;
- tmp = pid_nr_ns(cad_pid, current->nsproxy->pid_ns);
+ tmp = pid_vnr(cad_pid);
r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
lenp, ppos, NULL, NULL);
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 006365b69ea..c09350d564f 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -8,10 +8,10 @@
struct trans_ctl_table {
int ctl_name;
const char *procname;
- struct trans_ctl_table *child;
+ const struct trans_ctl_table *child;
};
-static struct trans_ctl_table trans_random_table[] = {
+static const struct trans_ctl_table trans_random_table[] = {
{ RANDOM_POOLSIZE, "poolsize" },
{ RANDOM_ENTROPY_COUNT, "entropy_avail" },
{ RANDOM_READ_THRESH, "read_wakeup_threshold" },
@@ -21,13 +21,13 @@ static struct trans_ctl_table trans_random_table[] = {
{}
};
-static struct trans_ctl_table trans_pty_table[] = {
+static const struct trans_ctl_table trans_pty_table[] = {
{ PTY_MAX, "max" },
{ PTY_NR, "nr" },
{}
};
-static struct trans_ctl_table trans_kern_table[] = {
+static const struct trans_ctl_table trans_kern_table[] = {
{ KERN_OSTYPE, "ostype" },
{ KERN_OSRELEASE, "osrelease" },
/* KERN_OSREV not used */
@@ -107,7 +107,7 @@ static struct trans_ctl_table trans_kern_table[] = {
{}
};
-static struct trans_ctl_table trans_vm_table[] = {
+static const struct trans_ctl_table trans_vm_table[] = {
{ VM_OVERCOMMIT_MEMORY, "overcommit_memory" },
{ VM_PAGE_CLUSTER, "page-cluster" },
{ VM_DIRTY_BACKGROUND, "dirty_background_ratio" },
@@ -139,7 +139,7 @@ static struct trans_ctl_table trans_vm_table[] = {
{}
};
-static struct trans_ctl_table trans_net_core_table[] = {
+static const struct trans_ctl_table trans_net_core_table[] = {
{ NET_CORE_WMEM_MAX, "wmem_max" },
{ NET_CORE_RMEM_MAX, "rmem_max" },
{ NET_CORE_WMEM_DEFAULT, "wmem_default" },
@@ -165,14 +165,14 @@ static struct trans_ctl_table trans_net_core_table[] = {
{},
};
-static struct trans_ctl_table trans_net_unix_table[] = {
+static const struct trans_ctl_table trans_net_unix_table[] = {
/* NET_UNIX_DESTROY_DELAY unused */
/* NET_UNIX_DELETE_DELAY unused */
{ NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" },
{}
};
-static struct trans_ctl_table trans_net_ipv4_route_table[] = {
+static const struct trans_ctl_table trans_net_ipv4_route_table[] = {
{ NET_IPV4_ROUTE_FLUSH, "flush" },
{ NET_IPV4_ROUTE_MIN_DELAY, "min_delay" },
{ NET_IPV4_ROUTE_MAX_DELAY, "max_delay" },
@@ -195,7 +195,7 @@ static struct trans_ctl_table trans_net_ipv4_route_table[] = {
{}
};
-static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
+static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
{ NET_IPV4_CONF_FORWARDING, "forwarding" },
{ NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" },
@@ -222,14 +222,14 @@ static struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
{}
};
-static struct trans_ctl_table trans_net_ipv4_conf_table[] = {
+static const struct trans_ctl_table trans_net_ipv4_conf_table[] = {
{ NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table },
{ NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table },
{ 0, NULL, trans_net_ipv4_conf_vars_table },
{}
};
-static struct trans_ctl_table trans_net_neigh_vars_table[] = {
+static const struct trans_ctl_table trans_net_neigh_vars_table[] = {
{ NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
{ NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
{ NET_NEIGH_APP_SOLICIT, "app_solicit" },
@@ -251,13 +251,13 @@ static struct trans_ctl_table trans_net_neigh_vars_table[] = {
{}
};
-static struct trans_ctl_table trans_net_neigh_table[] = {
+static const struct trans_ctl_table trans_net_neigh_table[] = {
{ NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table },
{ 0, NULL, trans_net_neigh_vars_table },
{}
};
-static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
+static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
{ NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" },
{ NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" },
@@ -294,7 +294,7 @@ static struct trans_ctl_table trans_net_ipv4_netfilter_table[] = {
{}
};
-static struct trans_ctl_table trans_net_ipv4_table[] = {
+static const struct trans_ctl_table trans_net_ipv4_table[] = {
{ NET_IPV4_FORWARD, "ip_forward" },
{ NET_IPV4_DYNADDR, "ip_dynaddr" },
@@ -393,13 +393,13 @@ static struct trans_ctl_table trans_net_ipv4_table[] = {
{}
};
-static struct trans_ctl_table trans_net_ipx_table[] = {
+static const struct trans_ctl_table trans_net_ipx_table[] = {
{ NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" },
/* NET_IPX_FORWARDING unused */
{}
};
-static struct trans_ctl_table trans_net_atalk_table[] = {
+static const struct trans_ctl_table trans_net_atalk_table[] = {
{ NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" },
{ NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" },
{ NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" },
@@ -407,7 +407,7 @@ static struct trans_ctl_table trans_net_atalk_table[] = {
{},
};
-static struct trans_ctl_table trans_net_netrom_table[] = {
+static const struct trans_ctl_table trans_net_netrom_table[] = {
{ NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" },
{ NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" },
{ NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" },
@@ -423,7 +423,7 @@ static struct trans_ctl_table trans_net_netrom_table[] = {
{}
};
-static struct trans_ctl_table trans_net_ax25_param_table[] = {
+static const struct trans_ctl_table trans_net_ax25_param_table[] = {
{ NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
{ NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
{ NET_AX25_BACKOFF_TYPE, "backoff_type" },
@@ -441,12 +441,12 @@ static struct trans_ctl_table trans_net_ax25_param_table[] = {
{}
};
-static struct trans_ctl_table trans_net_ax25_table[] = {
+static const struct trans_ctl_table trans_net_ax25_table[] = {
{ 0, NULL, trans_net_ax25_param_table },
{}
};
-static struct trans_ctl_table trans_net_bridge_table[] = {
+static const struct trans_ctl_table trans_net_bridge_table[] = {
{ NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
{ NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
{ NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" },
@@ -455,7 +455,7 @@ static struct trans_ctl_table trans_net_bridge_table[] = {
{}
};
-static struct trans_ctl_table trans_net_rose_table[] = {
+static const struct trans_ctl_table trans_net_rose_table[] = {
{ NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
{ NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
{ NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
@@ -469,7 +469,7 @@ static struct trans_ctl_table trans_net_rose_table[] = {
{}
};
-static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
+static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
{ NET_IPV6_FORWARDING, "forwarding" },
{ NET_IPV6_HOP_LIMIT, "hop_limit" },
{ NET_IPV6_MTU, "mtu" },
@@ -497,14 +497,14 @@ static struct trans_ctl_table trans_net_ipv6_conf_var_table[] = {
{}
};
-static struct trans_ctl_table trans_net_ipv6_conf_table[] = {
+static const struct trans_ctl_table trans_net_ipv6_conf_table[] = {
{ NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table },
{ NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table },
{ 0, NULL, trans_net_ipv6_conf_var_table },
{}
};
-static struct trans_ctl_table trans_net_ipv6_route_table[] = {
+static const struct trans_ctl_table trans_net_ipv6_route_table[] = {
{ NET_IPV6_ROUTE_FLUSH, "flush" },
{ NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" },
{ NET_IPV6_ROUTE_MAX_SIZE, "max_size" },
@@ -518,12 +518,12 @@ static struct trans_ctl_table trans_net_ipv6_route_table[] = {
{}
};
-static struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
+static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = {
{ NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
{}
};
-static struct trans_ctl_table trans_net_ipv6_table[] = {
+static const struct trans_ctl_table trans_net_ipv6_table[] = {
{ NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table },
{ NET_IPV6_NEIGH, "neigh", trans_net_neigh_table },
{ NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table },
@@ -538,7 +538,7 @@ static struct trans_ctl_table trans_net_ipv6_table[] = {
{}
};
-static struct trans_ctl_table trans_net_x25_table[] = {
+static const struct trans_ctl_table trans_net_x25_table[] = {
{ NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
{ NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
{ NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
@@ -548,13 +548,13 @@ static struct trans_ctl_table trans_net_x25_table[] = {
{}
};
-static struct trans_ctl_table trans_net_tr_table[] = {
+static const struct trans_ctl_table trans_net_tr_table[] = {
{ NET_TR_RIF_TIMEOUT, "rif_timeout" },
{}
};
-static struct trans_ctl_table trans_net_decnet_conf_vars[] = {
+static const struct trans_ctl_table trans_net_decnet_conf_vars[] = {
{ NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
{ NET_DECNET_CONF_DEV_PRIORITY, "priority" },
{ NET_DECNET_CONF_DEV_T2, "t2" },
@@ -562,12 +562,12 @@ static struct trans_ctl_table trans_net_decnet_conf_vars[] = {
{}
};
-static struct trans_ctl_table trans_net_decnet_conf[] = {
+static const struct trans_ctl_table trans_net_decnet_conf[] = {
{ 0, NULL, trans_net_decnet_conf_vars },
{}
};
-static struct trans_ctl_table trans_net_decnet_table[] = {
+static const struct trans_ctl_table trans_net_decnet_table[] = {
{ NET_DECNET_CONF, "conf", trans_net_decnet_conf },
{ NET_DECNET_NODE_ADDRESS, "node_address" },
{ NET_DECNET_NODE_NAME, "node_name" },
@@ -585,7 +585,7 @@ static struct trans_ctl_table trans_net_decnet_table[] = {
{}
};
-static struct trans_ctl_table trans_net_sctp_table[] = {
+static const struct trans_ctl_table trans_net_sctp_table[] = {
{ NET_SCTP_RTO_INITIAL, "rto_initial" },
{ NET_SCTP_RTO_MIN, "rto_min" },
{ NET_SCTP_RTO_MAX, "rto_max" },
@@ -606,7 +606,7 @@ static struct trans_ctl_table trans_net_sctp_table[] = {
{}
};
-static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
+static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
{ NET_LLC2_ACK_TIMEOUT, "ack" },
{ NET_LLC2_P_TIMEOUT, "p" },
{ NET_LLC2_REJ_TIMEOUT, "rej" },
@@ -614,23 +614,23 @@ static struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = {
{}
};
-static struct trans_ctl_table trans_net_llc_station_table[] = {
+static const struct trans_ctl_table trans_net_llc_station_table[] = {
{ NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
{}
};
-static struct trans_ctl_table trans_net_llc_llc2_table[] = {
+static const struct trans_ctl_table trans_net_llc_llc2_table[] = {
{ NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table },
{}
};
-static struct trans_ctl_table trans_net_llc_table[] = {
+static const struct trans_ctl_table trans_net_llc_table[] = {
{ NET_LLC2, "llc2", trans_net_llc_llc2_table },
{ NET_LLC_STATION, "station", trans_net_llc_station_table },
{}
};
-static struct trans_ctl_table trans_net_netfilter_table[] = {
+static const struct trans_ctl_table trans_net_netfilter_table[] = {
{ NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
{ NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" },
{ NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" },
@@ -667,12 +667,12 @@ static struct trans_ctl_table trans_net_netfilter_table[] = {
{}
};
-static struct trans_ctl_table trans_net_dccp_table[] = {
+static const struct trans_ctl_table trans_net_dccp_table[] = {
{ NET_DCCP_DEFAULT, "default" },
{}
};
-static struct trans_ctl_table trans_net_irda_table[] = {
+static const struct trans_ctl_table trans_net_irda_table[] = {
{ NET_IRDA_DISCOVERY, "discovery" },
{ NET_IRDA_DEVNAME, "devname" },
{ NET_IRDA_DEBUG, "debug" },
@@ -690,7 +690,7 @@ static struct trans_ctl_table trans_net_irda_table[] = {
{}
};
-static struct trans_ctl_table trans_net_table[] = {
+static const struct trans_ctl_table trans_net_table[] = {
{ NET_CORE, "core", trans_net_core_table },
/* NET_ETHER not used */
/* NET_802 not used */
@@ -716,7 +716,7 @@ static struct trans_ctl_table trans_net_table[] = {
{}
};
-static struct trans_ctl_table trans_fs_quota_table[] = {
+static const struct trans_ctl_table trans_fs_quota_table[] = {
{ FS_DQ_LOOKUPS, "lookups" },
{ FS_DQ_DROPS, "drops" },
{ FS_DQ_READS, "reads" },
@@ -729,7 +729,7 @@ static struct trans_ctl_table trans_fs_quota_table[] = {
{}
};
-static struct trans_ctl_table trans_fs_xfs_table[] = {
+static const struct trans_ctl_table trans_fs_xfs_table[] = {
{ XFS_RESTRICT_CHOWN, "restrict_chown" },
{ XFS_SGID_INHERIT, "irix_sgid_inherit" },
{ XFS_SYMLINK_MODE, "irix_symlink_mode" },
@@ -750,24 +750,24 @@ static struct trans_ctl_table trans_fs_xfs_table[] = {
{}
};
-static struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
+static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = {
{ 1, "hb_ctl_path" },
{}
};
-static struct trans_ctl_table trans_fs_ocfs2_table[] = {
+static const struct trans_ctl_table trans_fs_ocfs2_table[] = {
{ 1, "nm", trans_fs_ocfs2_nm_table },
{}
};
-static struct trans_ctl_table trans_inotify_table[] = {
+static const struct trans_ctl_table trans_inotify_table[] = {
{ INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
{ INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
{ INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
{}
};
-static struct trans_ctl_table trans_fs_table[] = {
+static const struct trans_ctl_table trans_fs_table[] = {
{ FS_NRINODE, "inode-nr" },
{ FS_STATINODE, "inode-state" },
/* FS_MAXINODE unused */
@@ -793,11 +793,11 @@ static struct trans_ctl_table trans_fs_table[] = {
{}
};
-static struct trans_ctl_table trans_debug_table[] = {
+static const struct trans_ctl_table trans_debug_table[] = {
{}
};
-static struct trans_ctl_table trans_cdrom_table[] = {
+static const struct trans_ctl_table trans_cdrom_table[] = {
{ DEV_CDROM_INFO, "info" },
{ DEV_CDROM_AUTOCLOSE, "autoclose" },
{ DEV_CDROM_AUTOEJECT, "autoeject" },
@@ -807,12 +807,12 @@ static struct trans_ctl_table trans_cdrom_table[] = {
{}
};
-static struct trans_ctl_table trans_ipmi_table[] = {
+static const struct trans_ctl_table trans_ipmi_table[] = {
{ DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
{}
};
-static struct trans_ctl_table trans_mac_hid_files[] = {
+static const struct trans_ctl_table trans_mac_hid_files[] = {
/* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
/* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
{ DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
@@ -822,35 +822,35 @@ static struct trans_ctl_table trans_mac_hid_files[] = {
{}
};
-static struct trans_ctl_table trans_raid_table[] = {
+static const struct trans_ctl_table trans_raid_table[] = {
{ DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
{ DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
{}
};
-static struct trans_ctl_table trans_scsi_table[] = {
+static const struct trans_ctl_table trans_scsi_table[] = {
{ DEV_SCSI_LOGGING_LEVEL, "logging_level" },
{}
};
-static struct trans_ctl_table trans_parport_default_table[] = {
+static const struct trans_ctl_table trans_parport_default_table[] = {
{ DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" },
{ DEV_PARPORT_DEFAULT_SPINTIME, "spintime" },
{}
};
-static struct trans_ctl_table trans_parport_device_table[] = {
+static const struct trans_ctl_table trans_parport_device_table[] = {
{ DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" },
{}
};
-static struct trans_ctl_table trans_parport_devices_table[] = {
+static const struct trans_ctl_table trans_parport_devices_table[] = {
{ DEV_PARPORT_DEVICES_ACTIVE, "active" },
{ 0, NULL, trans_parport_device_table },
{}
};
-static struct trans_ctl_table trans_parport_parport_table[] = {
+static const struct trans_ctl_table trans_parport_parport_table[] = {
{ DEV_PARPORT_SPINTIME, "spintime" },
{ DEV_PARPORT_BASE_ADDR, "base-addr" },
{ DEV_PARPORT_IRQ, "irq" },
@@ -864,13 +864,13 @@ static struct trans_ctl_table trans_parport_parport_table[] = {
{ DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" },
{}
};
-static struct trans_ctl_table trans_parport_table[] = {
+static const struct trans_ctl_table trans_parport_table[] = {
{ DEV_PARPORT_DEFAULT, "default", trans_parport_default_table },
{ 0, NULL, trans_parport_parport_table },
{}
};
-static struct trans_ctl_table trans_dev_table[] = {
+static const struct trans_ctl_table trans_dev_table[] = {
{ DEV_CDROM, "cdrom", trans_cdrom_table },
/* DEV_HWMON unused */
{ DEV_PARPORT, "parport", trans_parport_table },
@@ -881,19 +881,19 @@ static struct trans_ctl_table trans_dev_table[] = {
{}
};
-static struct trans_ctl_table trans_bus_isa_table[] = {
+static const struct trans_ctl_table trans_bus_isa_table[] = {
{ BUS_ISA_MEM_BASE, "membase" },
{ BUS_ISA_PORT_BASE, "portbase" },
{ BUS_ISA_PORT_SHIFT, "portshift" },
{}
};
-static struct trans_ctl_table trans_bus_table[] = {
+static const struct trans_ctl_table trans_bus_table[] = {
{ CTL_BUS_ISA, "isa", trans_bus_isa_table },
{}
};
-static struct trans_ctl_table trans_arlan_conf_table0[] = {
+static const struct trans_ctl_table trans_arlan_conf_table0[] = {
{ 1, "spreadingCode" },
{ 2, "channelNumber" },
{ 3, "scramblingDisable" },
@@ -964,7 +964,7 @@ static struct trans_ctl_table trans_arlan_conf_table0[] = {
{}
};
-static struct trans_ctl_table trans_arlan_conf_table1[] = {
+static const struct trans_ctl_table trans_arlan_conf_table1[] = {
{ 1, "spreadingCode" },
{ 2, "channelNumber" },
{ 3, "scramblingDisable" },
@@ -1035,7 +1035,7 @@ static struct trans_ctl_table trans_arlan_conf_table1[] = {
{}
};
-static struct trans_ctl_table trans_arlan_conf_table2[] = {
+static const struct trans_ctl_table trans_arlan_conf_table2[] = {
{ 1, "spreadingCode" },
{ 2, "channelNumber" },
{ 3, "scramblingDisable" },
@@ -1106,7 +1106,7 @@ static struct trans_ctl_table trans_arlan_conf_table2[] = {
{}
};
-static struct trans_ctl_table trans_arlan_conf_table3[] = {
+static const struct trans_ctl_table trans_arlan_conf_table3[] = {
{ 1, "spreadingCode" },
{ 2, "channelNumber" },
{ 3, "scramblingDisable" },
@@ -1177,7 +1177,7 @@ static struct trans_ctl_table trans_arlan_conf_table3[] = {
{}
};
-static struct trans_ctl_table trans_arlan_table[] = {
+static const struct trans_ctl_table trans_arlan_table[] = {
{ 1, "arlan0", trans_arlan_conf_table0 },
{ 2, "arlan1", trans_arlan_conf_table1 },
{ 3, "arlan2", trans_arlan_conf_table2 },
@@ -1185,13 +1185,13 @@ static struct trans_ctl_table trans_arlan_table[] = {
{}
};
-static struct trans_ctl_table trans_s390dbf_table[] = {
+static const struct trans_ctl_table trans_s390dbf_table[] = {
{ 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
{ 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
{}
};
-static struct trans_ctl_table trans_sunrpc_table[] = {
+static const struct trans_ctl_table trans_sunrpc_table[] = {
{ CTL_RPCDEBUG, "rpc_debug" },
{ CTL_NFSDEBUG, "nfs_debug" },
{ CTL_NFSDDEBUG, "nfsd_debug" },
@@ -1203,7 +1203,7 @@ static struct trans_ctl_table trans_sunrpc_table[] = {
{}
};
-static struct trans_ctl_table trans_pm_table[] = {
+static const struct trans_ctl_table trans_pm_table[] = {
{ 1 /* CTL_PM_SUSPEND */, "suspend" },
{ 2 /* CTL_PM_CMODE */, "cmode" },
{ 3 /* CTL_PM_P0 */, "p0" },
@@ -1211,13 +1211,13 @@ static struct trans_ctl_table trans_pm_table[] = {
{}
};
-static struct trans_ctl_table trans_frv_table[] = {
+static const struct trans_ctl_table trans_frv_table[] = {
{ 1, "cache-mode" },
{ 2, "pin-cxnr" },
{}
};
-static struct trans_ctl_table trans_root_table[] = {
+static const struct trans_ctl_table trans_root_table[] = {
{ CTL_KERN, "kernel", trans_kern_table },
{ CTL_VM, "vm", trans_vm_table },
{ CTL_NET, "net", trans_net_table },
@@ -1261,15 +1261,14 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
return table;
}
-static struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
+static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table)
{
struct ctl_table *test;
- struct trans_ctl_table *ref;
- int depth, cur_depth;
+ const struct trans_ctl_table *ref;
+ int cur_depth;
- depth = sysctl_depth(table);
+ cur_depth = sysctl_depth(table);
- cur_depth = depth;
ref = trans_root_table;
repeat:
test = sysctl_parent(table, cur_depth);
@@ -1437,7 +1436,7 @@ static void sysctl_check_leaf(struct nsproxy *namespaces,
static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
{
- struct trans_ctl_table *ref;
+ const struct trans_ctl_table *ref;
ref = sysctl_binary_lookup(table);
if (table->ctl_name && !ref)
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 88cdb109e13..06b6395b45b 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -135,6 +135,12 @@ static int test_jprobe(void)
#ifdef CONFIG_KRETPROBES
static u32 krph_val;
+static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
+{
+ krph_val = (rand1 / div_factor);
+ return 0;
+}
+
static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long ret = regs_return_value(regs);
@@ -144,13 +150,19 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
printk(KERN_ERR "Kprobe smoke test failed: "
"incorrect value in kretprobe handler\n");
}
+ if (krph_val == 0) {
+ handler_errors++;
+ printk(KERN_ERR "Kprobe smoke test failed: "
+ "call to kretprobe entry handler failed\n");
+ }
- krph_val = (rand1 / div_factor);
+ krph_val = rand1;
return 0;
}
static struct kretprobe rp = {
.handler = return_handler,
+ .entry_handler = entry_handler,
.kp.symbol_name = "kprobe_target"
};
@@ -167,7 +179,7 @@ static int test_kretprobe(void)
ret = kprobe_target(rand1);
unregister_kretprobe(&rp);
- if (krph_val == 0) {
+ if (krph_val != rand1) {
printk(KERN_ERR "Kprobe smoke test failed: "
"kretprobe handler not called\n");
handler_errors++;
diff --git a/kernel/time.c b/kernel/time.c
index 4064c0566e7..a5ec013b6c8 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -39,6 +39,8 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
+#include "timeconst.h"
+
/*
* The timezone where the local system is located. Used as a default by some
* programs who obtain this value by using gettimeofday.
@@ -93,7 +95,8 @@ asmlinkage long sys_stime(time_t __user *tptr)
#endif /* __ARCH_WANT_SYS_TIME */
-asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz)
+asmlinkage long sys_gettimeofday(struct timeval __user *tv,
+ struct timezone __user *tz)
{
if (likely(tv != NULL)) {
struct timeval ktv;
@@ -118,7 +121,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us
* hard to make the program warp the clock precisely n hours) or
* compile in the timezone information into the kernel. Bad, bad....
*
- * - TYT, 1992-01-01
+ * - TYT, 1992-01-01
*
* The best thing to do is to keep the CMOS clock in universal time (UTC)
* as real UNIX machines always do it. This avoids all headaches about
@@ -240,7 +243,11 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
- return (j * MSEC_PER_SEC) / HZ;
+# if BITS_PER_LONG == 32
+ return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
+# else
+ return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
+# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_msecs);
@@ -252,7 +259,11 @@ unsigned int inline jiffies_to_usecs(const unsigned long j)
#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
#else
- return (j * USEC_PER_SEC) / HZ;
+# if BITS_PER_LONG == 32
+ return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
+# else
+ return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
+# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_usecs);
@@ -267,7 +278,7 @@ EXPORT_SYMBOL(jiffies_to_usecs);
*
* This function should be only used for timestamps returned by
* current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
- * it doesn't handle the better resolution of the later.
+ * it doesn't handle the better resolution of the latter.
*/
struct timespec timespec_trunc(struct timespec t, unsigned gran)
{
@@ -315,7 +326,7 @@ EXPORT_SYMBOL_GPL(getnstimeofday);
* This algorithm was first published by Gauss (I think).
*
* WARNING: this function will overflow on 2106-02-07 06:28:16 on
- * machines were long is 32-bit! (However, as time_t is signed, we
+ * machines where long is 32-bit! (However, as time_t is signed, we
* will already get problems at other places on 2038-01-19 03:14:08)
*/
unsigned long
@@ -352,7 +363,7 @@ EXPORT_SYMBOL(mktime);
* normalize to the timespec storage format
*
* Note: The tv_nsec part is always in the range of
- * 0 <= tv_nsec < NSEC_PER_SEC
+ * 0 <= tv_nsec < NSEC_PER_SEC
* For negative values only the tv_sec field is negative !
*/
void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
@@ -453,12 +464,13 @@ unsigned long msecs_to_jiffies(const unsigned int m)
/*
* Generic case - multiply, round and divide. But first
* check that if we are doing a net multiplication, that
- * we wouldnt overflow:
+ * we wouldn't overflow:
*/
if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
return MAX_JIFFY_OFFSET;
- return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC;
+ return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
+ >> MSEC_TO_HZ_SHR32;
#endif
}
EXPORT_SYMBOL(msecs_to_jiffies);
@@ -472,7 +484,8 @@ unsigned long usecs_to_jiffies(const unsigned int u)
#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
return u * (HZ / USEC_PER_SEC);
#else
- return (u * HZ + USEC_PER_SEC - 1) / USEC_PER_SEC;
+ return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
+ >> USEC_TO_HZ_SHR32;
#endif
}
EXPORT_SYMBOL(usecs_to_jiffies);
@@ -566,7 +579,11 @@ EXPORT_SYMBOL(jiffies_to_timeval);
clock_t jiffies_to_clock_t(long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
+# if HZ < USER_HZ
+ return x * (USER_HZ / HZ);
+# else
return x / (HZ / USER_HZ);
+# endif
#else
u64 tmp = (u64)x * TICK_NSEC;
do_div(tmp, (NSEC_PER_SEC / USER_HZ));
@@ -599,7 +616,14 @@ EXPORT_SYMBOL(clock_t_to_jiffies);
u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
+# if HZ < USER_HZ
+ x *= USER_HZ;
+ do_div(x, HZ);
+# elif HZ > USER_HZ
do_div(x, HZ / USER_HZ);
+# else
+ /* Nothing to do */
+# endif
#else
/*
* There are better ways that don't overflow early,
@@ -611,7 +635,6 @@ u64 jiffies_64_to_clock_t(u64 x)
#endif
return x;
}
-
EXPORT_SYMBOL(jiffies_64_to_clock_t);
u64 nsec_to_clock_t(u64 x)
@@ -646,7 +669,6 @@ u64 get_jiffies_64(void)
} while (read_seqretry(&xtime_lock, seq));
return ret;
}
-
EXPORT_SYMBOL(get_jiffies_64);
#endif
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3e59fce6dd4..3d1e3e1a197 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -133,7 +133,7 @@ static void clockevents_do_notify(unsigned long reason, void *dev)
}
/*
- * Called after a notify add to make devices availble which were
+ * Called after a notify add to make devices available which were
* released from the notifier call.
*/
static void clockevents_notify_released(void)
@@ -218,6 +218,8 @@ void clockevents_exchange_device(struct clock_event_device *old,
*/
void clockevents_notify(unsigned long reason, void *arg)
{
+ struct list_head *node, *tmp;
+
spin_lock(&clockevents_lock);
clockevents_do_notify(reason, arg);
@@ -227,13 +229,8 @@ void clockevents_notify(unsigned long reason, void *arg)
* Unregister the clock event devices which were
* released from the users in the notify chain.
*/
- while (!list_empty(&clockevents_released)) {
- struct clock_event_device *dev;
-
- dev = list_entry(clockevents_released.next,
- struct clock_event_device, list);
- list_del(&dev->list);
- }
+ list_for_each_safe(node, tmp, &clockevents_released)
+ list_del(node);
break;
default:
break;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6e9259a5d50..548c436a776 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -91,7 +91,6 @@ static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
cs->name, delta);
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
clocksource_change_rating(cs, 0);
- cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
list_del(&cs->wd_list);
}
@@ -363,15 +362,13 @@ void clocksource_unregister(struct clocksource *cs)
static ssize_t
sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
{
- char *curr = buf;
+ ssize_t count = 0;
spin_lock_irq(&clocksource_lock);
- curr += sprintf(curr, "%s ", curr_clocksource->name);
+ count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
spin_unlock_irq(&clocksource_lock);
- curr += sprintf(curr, "\n");
-
- return curr - buf;
+ return count;
}
/**
@@ -439,17 +436,20 @@ static ssize_t
sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
{
struct clocksource *src;
- char *curr = buf;
+ ssize_t count = 0;
spin_lock_irq(&clocksource_lock);
list_for_each_entry(src, &clocksource_list, list) {
- curr += sprintf(curr, "%s ", src->name);
+ count += snprintf(buf + count,
+ max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
+ "%s ", src->name);
}
spin_unlock_irq(&clocksource_lock);
- curr += sprintf(curr, "\n");
+ count += snprintf(buf + count,
+ max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
- return curr - buf;
+ return count;
}
/*
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index e64efaf957e..c88b5910e7a 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -43,10 +43,6 @@ long time_freq; /* frequency offset (scaled ppm)*/
static long time_reftime; /* time at last adjustment (s) */
long time_adjust;
-#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE)
-#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \
- (s64)CLOCK_TICK_RATE)
-
static void ntp_update_frequency(void)
{
u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 88267f0a847..fa9bb73dbdb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -681,7 +681,7 @@ int tick_check_oneshot_change(int allow_nohz)
if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
return 0;
- if (!timekeeping_is_continuous() || !tick_is_oneshot_available())
+ if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
return 0;
if (!allow_nohz)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cd5dbc4579c..1af9fb050fe 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -201,9 +201,9 @@ static inline s64 __get_nsec_offset(void) { return 0; }
#endif
/**
- * timekeeping_is_continuous - check to see if timekeeping is free running
+ * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
*/
-int timekeeping_is_continuous(void)
+int timekeeping_valid_for_hres(void)
{
unsigned long seq;
int ret;
@@ -364,7 +364,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
* with losing too many ticks, otherwise we would overadjust and
* produce an even larger error. The smaller the adjustment the
* faster we try to adjust for it, as lost ticks can do less harm
- * here. This is tuned so that an error of about 1 msec is adusted
+ * here. This is tuned so that an error of about 1 msec is adjusted
* within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
*/
error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
new file mode 100644
index 00000000000..41468035473
--- /dev/null
+++ b/kernel/timeconst.pl
@@ -0,0 +1,402 @@
+#!/usr/bin/perl
+# -----------------------------------------------------------------------
+#
+# Copyright 2007 rPath, Inc. - All Rights Reserved
+#
+# This file is part of the Linux kernel, and is made available under
+# the terms of the GNU General Public License version 2 or (at your
+# option) any later version; incorporated herein by reference.
+#
+# -----------------------------------------------------------------------
+#
+
+#
+# Usage: timeconst.pl HZ > timeconst.h
+#
+
+# Precomputed values for systems without Math::BigInt
+# Generated by:
+# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
+%canned_values = (
+ 24 => [
+ '0xa6aaaaab','0x2aaaaaa',26,
+ '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58,
+ 125,3,
+ '0xc49ba5e4','0x1fbe76c8b4',37,
+ '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69,
+ 3,125,
+ '0xa2c2aaab','0xaaaa',16,
+ '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48,
+ 125000,3,
+ '0xc9539b89','0x7fffbce4217d',47,
+ '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79,
+ 3,125000,
+ ], 32 => [
+ '0xfa000000','0x6000000',27,
+ '0xfa00000000000000','0x600000000000000',59,
+ 125,4,
+ '0x83126e98','0xfdf3b645a',36,
+ '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68,
+ 4,125,
+ '0xf4240000','0x0',17,
+ '0xf424000000000000','0x0',49,
+ 31250,1,
+ '0x8637bd06','0x3fff79c842fa',46,
+ '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78,
+ 1,31250,
+ ], 48 => [
+ '0xa6aaaaab','0x6aaaaaa',27,
+ '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59,
+ 125,6,
+ '0xc49ba5e4','0xfdf3b645a',36,
+ '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68,
+ 6,125,
+ '0xa2c2aaab','0x15555',17,
+ '0xa2c2aaaaaaaaaaab','0x1555555555555',49,
+ 62500,3,
+ '0xc9539b89','0x3fffbce4217d',46,
+ '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78,
+ 3,62500,
+ ], 64 => [
+ '0xfa000000','0xe000000',28,
+ '0xfa00000000000000','0xe00000000000000',60,
+ 125,8,
+ '0x83126e98','0x7ef9db22d',35,
+ '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67,
+ 8,125,
+ '0xf4240000','0x0',18,
+ '0xf424000000000000','0x0',50,
+ 15625,1,
+ '0x8637bd06','0x1fff79c842fa',45,
+ '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77,
+ 1,15625,
+ ], 100 => [
+ '0xa0000000','0x0',28,
+ '0xa000000000000000','0x0',60,
+ 10,1,
+ '0xcccccccd','0x733333333',35,
+ '0xcccccccccccccccd','0x73333333333333333',67,
+ 1,10,
+ '0x9c400000','0x0',18,
+ '0x9c40000000000000','0x0',50,
+ 10000,1,
+ '0xd1b71759','0x1fff2e48e8a7',45,
+ '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77,
+ 1,10000,
+ ], 122 => [
+ '0x8325c53f','0xfbcda3a',28,
+ '0x8325c53ef368eb05','0xfbcda3ac10c9714',60,
+ 500,61,
+ '0xf9db22d1','0x7fbe76c8b',35,
+ '0xf9db22d0e560418a','0x7fbe76c8b43958106',67,
+ 61,500,
+ '0x8012e2a0','0x3ef36',18,
+ '0x8012e29f79b47583','0x3ef368eb04325',50,
+ 500000,61,
+ '0xffda4053','0x1ffffbce4217',45,
+ '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77,
+ 61,500000,
+ ], 128 => [
+ '0xfa000000','0x1e000000',29,
+ '0xfa00000000000000','0x1e00000000000000',61,
+ 125,16,
+ '0x83126e98','0x3f7ced916',34,
+ '0x83126e978d4fdf3c','0x3f7ced916872b020c',66,
+ 16,125,
+ '0xf4240000','0x40000',19,
+ '0xf424000000000000','0x4000000000000',51,
+ 15625,2,
+ '0x8637bd06','0xfffbce4217d',44,
+ '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76,
+ 2,15625,
+ ], 200 => [
+ '0xa0000000','0x0',29,
+ '0xa000000000000000','0x0',61,
+ 5,1,
+ '0xcccccccd','0x333333333',34,
+ '0xcccccccccccccccd','0x33333333333333333',66,
+ 1,5,
+ '0x9c400000','0x0',19,
+ '0x9c40000000000000','0x0',51,
+ 5000,1,
+ '0xd1b71759','0xfff2e48e8a7',44,
+ '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76,
+ 1,5000,
+ ], 250 => [
+ '0x80000000','0x0',29,
+ '0x8000000000000000','0x0',61,
+ 4,1,
+ '0x80000000','0x180000000',33,
+ '0x8000000000000000','0x18000000000000000',65,
+ 1,4,
+ '0xfa000000','0x0',20,
+ '0xfa00000000000000','0x0',52,
+ 4000,1,
+ '0x83126e98','0x7ff7ced9168',43,
+ '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75,
+ 1,4000,
+ ], 256 => [
+ '0xfa000000','0x3e000000',30,
+ '0xfa00000000000000','0x3e00000000000000',62,
+ 125,32,
+ '0x83126e98','0x1fbe76c8b',33,
+ '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65,
+ 32,125,
+ '0xf4240000','0xc0000',20,
+ '0xf424000000000000','0xc000000000000',52,
+ 15625,4,
+ '0x8637bd06','0x7ffde7210be',43,
+ '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75,
+ 4,15625,
+ ], 300 => [
+ '0xd5555556','0x2aaaaaaa',30,
+ '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62,
+ 10,3,
+ '0x9999999a','0x1cccccccc',33,
+ '0x999999999999999a','0x1cccccccccccccccc',65,
+ 3,10,
+ '0xd0555556','0xaaaaa',20,
+ '0xd055555555555556','0xaaaaaaaaaaaaa',52,
+ 10000,3,
+ '0x9d495183','0x7ffcb923a29',43,
+ '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75,
+ 3,10000,
+ ], 512 => [
+ '0xfa000000','0x7e000000',31,
+ '0xfa00000000000000','0x7e00000000000000',63,
+ 125,64,
+ '0x83126e98','0xfdf3b645',32,
+ '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64,
+ 64,125,
+ '0xf4240000','0x1c0000',21,
+ '0xf424000000000000','0x1c000000000000',53,
+ 15625,8,
+ '0x8637bd06','0x3ffef39085f',42,
+ '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74,
+ 8,15625,
+ ], 1000 => [
+ '0x80000000','0x0',31,
+ '0x8000000000000000','0x0',63,
+ 1,1,
+ '0x80000000','0x0',31,
+ '0x8000000000000000','0x0',63,
+ 1,1,
+ '0xfa000000','0x0',22,
+ '0xfa00000000000000','0x0',54,
+ 1000,1,
+ '0x83126e98','0x1ff7ced9168',41,
+ '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73,
+ 1,1000,
+ ], 1024 => [
+ '0xfa000000','0xfe000000',32,
+ '0xfa00000000000000','0xfe00000000000000',64,
+ 125,128,
+ '0x83126e98','0x7ef9db22',31,
+ '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63,
+ 128,125,
+ '0xf4240000','0x3c0000',22,
+ '0xf424000000000000','0x3c000000000000',54,
+ 15625,16,
+ '0x8637bd06','0x1fff79c842f',41,
+ '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73,
+ 16,15625,
+ ], 1200 => [
+ '0xd5555556','0xd5555555',32,
+ '0xd555555555555556','0xd555555555555555',64,
+ 5,6,
+ '0x9999999a','0x66666666',31,
+ '0x999999999999999a','0x6666666666666666',63,
+ 6,5,
+ '0xd0555556','0x2aaaaa',22,
+ '0xd055555555555556','0x2aaaaaaaaaaaaa',54,
+ 2500,3,
+ '0x9d495183','0x1ffcb923a29',41,
+ '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73,
+ 3,2500,
+ ]
+);
+
+$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
+
+sub bint($)
+{
+ my($x) = @_;
+ return Math::BigInt->new($x);
+}
+
+#
+# Constants for division by reciprocal multiplication.
+# (bits, numerator, denominator)
+#
+sub fmul($$$)
+{
+ my ($b,$n,$d) = @_;
+
+ $n = bint($n);
+ $d = bint($d);
+
+ return scalar (($n << $b)+$d-bint(1))/$d;
+}
+
+sub fadj($$$)
+{
+ my($b,$n,$d) = @_;
+
+ $n = bint($n);
+ $d = bint($d);
+
+ $d = $d/bgcd($n, $d);
+ return scalar (($d-bint(1)) << $b)/$d;
+}
+
+sub fmuls($$$) {
+ my($b,$n,$d) = @_;
+ my($s,$m);
+ my($thres) = bint(1) << ($b-1);
+
+ $n = bint($n);
+ $d = bint($d);
+
+ for ($s = 0; 1; $s++) {
+ $m = fmul($s,$n,$d);
+ return $s if ($m >= $thres);
+ }
+ return 0;
+}
+
+# Provides mul, adj, and shr factors for a specific
+# (bit, time, hz) combination
+sub muladj($$$) {
+ my($b, $t, $hz) = @_;
+ my $s = fmuls($b, $t, $hz);
+ my $m = fmul($s, $t, $hz);
+ my $a = fadj($s, $t, $hz);
+ return ($m->as_hex(), $a->as_hex(), $s);
+}
+
+# Provides numerator, denominator values
+sub numden($$) {
+ my($n, $d) = @_;
+ my $g = bgcd($n, $d);
+ return ($n/$g, $d/$g);
+}
+
+# All values for a specific (time, hz) combo
+sub conversions($$) {
+ my ($t, $hz) = @_;
+ my @val = ();
+
+ # HZ_TO_xx
+ push(@val, muladj(32, $t, $hz));
+ push(@val, muladj(64, $t, $hz));
+ push(@val, numden($t, $hz));
+
+ # xx_TO_HZ
+ push(@val, muladj(32, $hz, $t));
+ push(@val, muladj(64, $hz, $t));
+ push(@val, numden($hz, $t));
+
+ return @val;
+}
+
+sub compute_values($) {
+ my($hz) = @_;
+ my @val = ();
+ my $s, $m, $a, $g;
+
+ if (!$has_bigint) {
+ die "$0: HZ == $hz not canned and ".
+ "Math::BigInt not available\n";
+ }
+
+ # MSEC conversions
+ push(@val, conversions(1000, $hz));
+
+ # USEC conversions
+ push(@val, conversions(1000000, $hz));
+
+ return @val;
+}
+
+sub output($@)
+{
+ my($hz, @val) = @_;
+ my $pfx, $bit, $suf, $s, $m, $a;
+
+ print "/* Automatically generated by kernel/timeconst.pl */\n";
+ print "/* Conversion constants for HZ == $hz */\n";
+ print "\n";
+ print "#ifndef KERNEL_TIMECONST_H\n";
+ print "#define KERNEL_TIMECONST_H\n";
+ print "\n";
+
+ print "#include <linux/param.h>\n";
+
+ print "\n";
+ print "#if HZ != $hz\n";
+ print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
+ print "#endif\n";
+ print "\n";
+
+ foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
+ 'HZ_TO_USEC','USEC_TO_HZ') {
+ foreach $bit (32, 64) {
+ foreach $suf ('MUL', 'ADJ', 'SHR') {
+ printf "#define %-23s %s\n",
+ "${pfx}_$suf$bit", shift(@val);
+ }
+ }
+ foreach $suf ('NUM', 'DEN') {
+ printf "#define %-23s %s\n",
+ "${pfx}_$suf", shift(@val);
+ }
+ }
+
+ print "\n";
+ print "#endif /* KERNEL_TIMECONST_H */\n";
+}
+
+($hz) = @ARGV;
+
+# Use this to generate the %canned_values structure
+if ($hz eq '--can') {
+ shift(@ARGV);
+ @hzlist = sort {$a <=> $b} (@ARGV);
+
+ print "# Precomputed values for systems without Math::BigInt\n";
+ print "# Generated by:\n";
+ print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
+ print "\%canned_values = (\n";
+ my $pf = "\t";
+ foreach $hz (@hzlist) {
+ my @values = compute_values($hz);
+ print "$pf$hz => [\n";
+ while (scalar(@values)) {
+ my $bit;
+ foreach $bit (32, 64) {
+ my $m = shift(@values);
+ my $a = shift(@values);
+ my $s = shift(@values);
+ print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n";
+ }
+ my $n = shift(@values);
+ my $d = shift(@values);
+ print "\t\t",$n,',',$d,",\n";
+ }
+ print "\t]";
+ $pf = ', ';
+ }
+ print "\n);\n";
+} else {
+ $hz += 0; # Force to number
+ if ($hz < 1) {
+ die "Usage: $0 HZ\n";
+ }
+
+ @val = @{$canned_values{$hz}};
+ if (!defined(@val)) {
+ @val = compute_values($hz);
+ }
+ output($hz, @val);
+}
+exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 9fbb472b8cf..99b00a25f88 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -327,7 +327,7 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
* init_timer() must be done to a timer prior calling *any* of the
* other timer functions.
*/
-void fastcall init_timer(struct timer_list *timer)
+void init_timer(struct timer_list *timer)
{
timer->entry.next = NULL;
timer->base = __raw_get_cpu_var(tvec_bases);
@@ -339,7 +339,7 @@ void fastcall init_timer(struct timer_list *timer)
}
EXPORT_SYMBOL(init_timer);
-void fastcall init_timer_deferrable(struct timer_list *timer)
+void init_timer_deferrable(struct timer_list *timer)
{
init_timer(timer);
timer_set_deferrable(timer);
@@ -818,12 +818,14 @@ unsigned long next_timer_interrupt(void)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
void account_process_tick(struct task_struct *p, int user_tick)
{
+ cputime_t one_jiffy = jiffies_to_cputime(1);
+
if (user_tick) {
- account_user_time(p, jiffies_to_cputime(1));
- account_user_time_scaled(p, jiffies_to_cputime(1));
+ account_user_time(p, one_jiffy);
+ account_user_time_scaled(p, cputime_to_scaled(one_jiffy));
} else {
- account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
- account_system_time_scaled(p, jiffies_to_cputime(1));
+ account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
+ account_system_time_scaled(p, cputime_to_scaled(one_jiffy));
}
}
#endif
@@ -977,7 +979,7 @@ asmlinkage long sys_getppid(void)
int pid;
rcu_read_lock();
- pid = task_tgid_nr_ns(current->real_parent, current->nsproxy->pid_ns);
+ pid = task_tgid_vnr(current->real_parent);
rcu_read_unlock();
return pid;
@@ -1040,7 +1042,7 @@ static void process_timeout(unsigned long __data)
*
* In all cases the return value is guaranteed to be non-negative.
*/
-fastcall signed long __sched schedule_timeout(signed long timeout)
+signed long __sched schedule_timeout(signed long timeout)
{
struct timer_list timer;
unsigned long expire;
diff --git a/kernel/user.c b/kernel/user.c
index bc1c48d35cb..7132022a040 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,6 +17,14 @@
#include <linux/module.h>
#include <linux/user_namespace.h>
+struct user_namespace init_user_ns = {
+ .kref = {
+ .refcount = ATOMIC_INIT(2),
+ },
+ .root_user = &root_user,
+};
+EXPORT_SYMBOL_GPL(init_user_ns);
+
/*
* UID task count cache, to get fast user lookup in "alloc_uid"
* when changing user ID's (ie setuid() and friends).
@@ -49,7 +57,7 @@ struct user_struct root_user = {
.uid_keyring = &root_user_keyring,
.session_keyring = &root_session_keyring,
#endif
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
.tg = &init_task_group,
#endif
};
@@ -82,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
return NULL;
}
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
static void sched_destroy_user(struct user_struct *up)
{
@@ -105,15 +113,15 @@ static void sched_switch_user(struct task_struct *p)
sched_move_task(p);
}
-#else /* CONFIG_FAIR_USER_SCHED */
+#else /* CONFIG_USER_SCHED */
static void sched_destroy_user(struct user_struct *up) { }
static int sched_create_user(struct user_struct *up) { return 0; }
static void sched_switch_user(struct task_struct *p) { }
-#endif /* CONFIG_FAIR_USER_SCHED */
+#endif /* CONFIG_USER_SCHED */
-#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
+#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
static DEFINE_MUTEX(uids_mutex);
@@ -129,6 +137,7 @@ static inline void uids_mutex_unlock(void)
}
/* uid directory attributes */
+#ifdef CONFIG_FAIR_GROUP_SCHED
static ssize_t cpu_shares_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
@@ -155,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj,
static struct kobj_attribute cpu_share_attr =
__ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+
+ return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
+}
+
+static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t size)
+{
+ struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+ unsigned long rt_runtime;
+ int rc;
+
+ sscanf(buf, "%lu", &rt_runtime);
+
+ rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
+
+ return (rc ? rc : size);
+}
+
+static struct kobj_attribute cpu_rt_runtime_attr =
+ __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
+#endif
/* default attributes per uid directory */
static struct attribute *uids_attributes[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
&cpu_share_attr.attr,
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ &cpu_rt_runtime_attr.attr,
+#endif
NULL
};
@@ -261,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
schedule_work(&up->work);
}
-#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
+#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
int uids_sysfs_init(void) { return 0; }
static inline int uids_user_create(struct user_struct *up) { return 0; }
@@ -365,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
- /* This case is not possible when CONFIG_FAIR_USER_SCHED
+ /* This case is not possible when CONFIG_USER_SCHED
* is defined, since we serialize alloc_uid() using
* uids_mutex. Hence no need to call
* sched_destroy_user() or remove_user_sysfs_dir().
@@ -427,6 +471,7 @@ void switch_uid(struct user_struct *new_user)
suid_keys(current);
}
+#ifdef CONFIG_USER_NS
void release_uids(struct user_namespace *ns)
{
int i;
@@ -451,6 +496,7 @@ void release_uids(struct user_namespace *ns)
free_uid(ns->root_user);
}
+#endif
static int __init uid_cache_init(void)
{
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 7af90fc4f0f..4c9006275df 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -10,17 +10,6 @@
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
-struct user_namespace init_user_ns = {
- .kref = {
- .refcount = ATOMIC_INIT(2),
- },
- .root_user = &root_user,
-};
-
-EXPORT_SYMBOL_GPL(init_user_ns);
-
-#ifdef CONFIG_USER_NS
-
/*
* Clone a new ns copying an original user ns, setting refcount to 1
* @old_ns: namespace to clone
@@ -84,5 +73,3 @@ void free_user_ns(struct kref *kref)
release_uids(ns);
kfree(ns);
}
-
-#endif /* CONFIG_USER_NS */
diff --git a/kernel/wait.c b/kernel/wait.c
index f9876888a56..c275c56cf2d 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -18,7 +18,7 @@ void init_waitqueue_head(wait_queue_head_t *q)
EXPORT_SYMBOL(init_waitqueue_head);
-void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
@@ -29,7 +29,7 @@ void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
}
EXPORT_SYMBOL(add_wait_queue);
-void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
+void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
@@ -40,7 +40,7 @@ void fastcall add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
-void fastcall remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
+void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
@@ -63,7 +63,7 @@ EXPORT_SYMBOL(remove_wait_queue);
* stops them from bleeding out - it would still allow subsequent
* loads to move into the critical region).
*/
-void fastcall
+void
prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
@@ -82,7 +82,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait);
-void fastcall
+void
prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
@@ -101,7 +101,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
-void fastcall finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
+void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(wake_bit_function);
* waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
* permitted return codes. Nonzero return codes halt waiting and return.
*/
-int __sched fastcall
+int __sched
__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
int (*action)(void *), unsigned mode)
{
@@ -173,7 +173,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
}
EXPORT_SYMBOL(__wait_on_bit);
-int __sched fastcall out_of_line_wait_on_bit(void *word, int bit,
+int __sched out_of_line_wait_on_bit(void *word, int bit,
int (*action)(void *), unsigned mode)
{
wait_queue_head_t *wq = bit_waitqueue(word, bit);
@@ -183,7 +183,7 @@ int __sched fastcall out_of_line_wait_on_bit(void *word, int bit,
}
EXPORT_SYMBOL(out_of_line_wait_on_bit);
-int __sched fastcall
+int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
int (*action)(void *), unsigned mode)
{
@@ -201,7 +201,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
}
EXPORT_SYMBOL(__wait_on_bit_lock);
-int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit,
+int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
int (*action)(void *), unsigned mode)
{
wait_queue_head_t *wq = bit_waitqueue(word, bit);
@@ -211,7 +211,7 @@ int __sched fastcall out_of_line_wait_on_bit_lock(void *word, int bit,
}
EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
-void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
+void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
{
struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
if (waitqueue_active(wq))
@@ -236,13 +236,13 @@ EXPORT_SYMBOL(__wake_up_bit);
* may need to use a less regular barrier, such fs/inode.c's smp_mb(),
* because spin_unlock() does not guarantee a memory barrier.
*/
-void fastcall wake_up_bit(void *word, int bit)
+void wake_up_bit(void *word, int bit)
{
__wake_up_bit(bit_waitqueue(word, bit), word, bit);
}
EXPORT_SYMBOL(wake_up_bit);
-fastcall wait_queue_head_t *bit_waitqueue(void *word, int bit)
+wait_queue_head_t *bit_waitqueue(void *word, int bit)
{
const int shift = BITS_PER_LONG == 32 ? 5 : 6;
const struct zone *zone = page_zone(virt_to_page(word));
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 52db48e7f6e..ff06611655a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -161,7 +161,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
* We queue the work to the CPU it was submitted, but there is no
* guarantee that it will be processed by that CPU.
*/
-int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
+int queue_work(struct workqueue_struct *wq, struct work_struct *work)
{
int ret = 0;
@@ -175,7 +175,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
}
EXPORT_SYMBOL_GPL(queue_work);
-void delayed_work_timer_fn(unsigned long __data)
+static void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
@@ -192,7 +192,7 @@ void delayed_work_timer_fn(unsigned long __data)
*
* Returns 0 if @work was already on a queue, non-zero otherwise.
*/
-int fastcall queue_delayed_work(struct workqueue_struct *wq,
+int queue_delayed_work(struct workqueue_struct *wq,
struct delayed_work *dwork, unsigned long delay)
{
timer_stats_timer_set_start_info(&dwork->timer);
@@ -388,7 +388,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
* This function used to run the workqueues itself. Now we just wait for the
* helper threads to do it.
*/
-void fastcall flush_workqueue(struct workqueue_struct *wq)
+void flush_workqueue(struct workqueue_struct *wq)
{
const cpumask_t *cpu_map = wq_cpu_map(wq);
int cpu;
@@ -546,7 +546,7 @@ static struct workqueue_struct *keventd_wq __read_mostly;
*
* This puts a job in the kernel-global workqueue.
*/
-int fastcall schedule_work(struct work_struct *work)
+int schedule_work(struct work_struct *work)
{
return queue_work(keventd_wq, work);
}
@@ -560,7 +560,7 @@ EXPORT_SYMBOL(schedule_work);
* After waiting for a given time this puts a job in the kernel-global
* workqueue.
*/
-int fastcall schedule_delayed_work(struct delayed_work *dwork,
+int schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
timer_stats_timer_set_start_info(&dwork->timer);