aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.hz2
-rw-r--r--kernel/acct.c2
-rw-r--r--kernel/audit.c13
-rw-r--r--kernel/auditfilter.c10
-rw-r--r--kernel/auditsc.c21
-rw-r--r--kernel/capability.c21
-rw-r--r--kernel/cgroup.c44
-rw-r--r--kernel/cpu.c31
-rw-r--r--kernel/cpuset.c418
-rw-r--r--kernel/dma-coherent.c17
-rw-r--r--kernel/exit.c106
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/hrtimer.c95
-rw-r--r--kernel/irq/manage.c12
-rw-r--r--kernel/irq/proc.c96
-rw-r--r--kernel/kexec.c74
-rw-r--r--kernel/kgdb.c107
-rw-r--r--kernel/lockdep.c309
-rw-r--r--kernel/lockdep_internals.h19
-rw-r--r--kernel/lockdep_proc.c48
-rw-r--r--kernel/marker.c12
-rw-r--r--kernel/module.c4
-rw-r--r--kernel/mutex.c1
-rw-r--r--kernel/nsproxy.c1
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/pm_qos_params.c41
-rw-r--r--kernel/posix-timers.c21
-rw-r--r--kernel/power/disk.c13
-rw-r--r--kernel/power/main.c5
-rw-r--r--kernel/power/swap.c1
-rw-r--r--kernel/printk.c24
-rw-r--r--kernel/ptrace.c5
-rw-r--r--kernel/rcuclassic.c337
-rw-r--r--kernel/rcupdate.c1
-rw-r--r--kernel/rcupreempt.c8
-rw-r--r--kernel/rcupreempt_trace.c7
-rw-r--r--kernel/relay.c12
-rw-r--r--kernel/resource.c158
-rw-r--r--kernel/sched.c586
-rw-r--r--kernel/sched_clock.c224
-rw-r--r--kernel/sched_debug.c2
-rw-r--r--kernel/sched_fair.c235
-rw-r--r--kernel/sched_features.h3
-rw-r--r--kernel/sched_idletask.c6
-rw-r--r--kernel/sched_rt.c81
-rw-r--r--kernel/semaphore.c4
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/smp.c68
-rw-r--r--kernel/softlockup.c3
-rw-r--r--kernel/spinlock.c12
-rw-r--r--kernel/stop_machine.c1
-rw-r--r--kernel/sys.c14
-rw-r--r--kernel/sysctl.c20
-rw-r--r--kernel/time/clockevents.c15
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/tick-broadcast.c99
-rw-r--r--kernel/time/tick-common.c15
-rw-r--r--kernel/time/tick-internal.h11
-rw-r--r--kernel/time/tick-oneshot.c44
-rw-r--r--kernel/time/tick-sched.c35
-rw-r--r--kernel/trace/trace_sysprof.c2
-rw-r--r--kernel/user.c4
-rw-r--r--kernel/user_namespace.c1
-rw-r--r--kernel/utsname.c1
-rw-r--r--kernel/utsname_sysctl.c1
-rw-r--r--kernel/workqueue.c37
66 files changed, 2285 insertions, 1351 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 382dd5a8b2d..94fabd534b0 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
default 1000 if HZ_1000
config SCHED_HRTICK
- def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS
+ def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
diff --git a/kernel/acct.c b/kernel/acct.c
index dd68b905941..f6006a60df5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -548,7 +548,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
#endif
spin_lock_irq(&current->sighand->siglock);
- tty = current->signal->tty;
+ tty = current->signal->tty; /* Safe as we hold the siglock */
ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
diff --git a/kernel/audit.c b/kernel/audit.c
index e092f1c0ce3..4414e93d875 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (status_get->mask & AUDIT_STATUS_ENABLED) {
err = audit_set_enabled(status_get->enabled,
loginuid, sessionid, sid);
- if (err < 0) return err;
+ if (err < 0)
+ return err;
}
if (status_get->mask & AUDIT_STATUS_FAILURE) {
err = audit_set_failure(status_get->failure,
loginuid, sessionid, sid);
- if (err < 0) return err;
+ if (err < 0)
+ return err;
}
if (status_get->mask & AUDIT_STATUS_PID) {
int new_pid = status_get->pid;
@@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_pid = new_pid;
audit_nlk_pid = NETLINK_CB(skb).pid;
}
- if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+ if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
err = audit_set_rate_limit(status_get->rate_limit,
loginuid, sessionid, sid);
+ if (err < 0)
+ return err;
+ }
if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
err = audit_set_backlog_limit(status_get->backlog_limit,
loginuid, sessionid, sid);
@@ -1366,7 +1371,7 @@ int audit_string_contains_control(const char *string, size_t len)
{
const unsigned char *p;
for (p = string; p < (const unsigned char *)string + len && *p; p++) {
- if (*p == '"' || *p < 0x21 || *p > 0x7f)
+ if (*p == '"' || *p < 0x21 || *p > 0x7e)
return 1;
}
return 0;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 98c50cc671b..b7d354e2b0e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent,
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_KERNEL,
AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, "auid=%u ses=%u",
+ audit_get_loginuid(current),
+ audit_get_sessionid(current));
audit_log_format(ab,
- "op=updated rules specifying path=");
+ " op=updated rules specifying path=");
audit_log_untrustedstring(ab, owatch->path);
audit_log_format(ab, " with dev=%u ino=%lu\n",
dev, ino);
@@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_KERNEL,
AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "op=remove rule path=");
+ audit_log_format(ab, "auid=%u ses=%u",
+ audit_get_loginuid(current),
+ audit_get_sessionid(current));
+ audit_log_format(ab, " op=remove rule path=");
audit_log_untrustedstring(ab, w->path);
if (r->filterkey) {
audit_log_format(ab, " key=");
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4699950e65b..cf5bc2f5f9c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,7 +243,11 @@ static inline int open_arg(int flags, int mask)
static int audit_match_perm(struct audit_context *ctx, int mask)
{
- unsigned n = ctx->major;
+ unsigned n;
+ if (unlikely(!ctx))
+ return 0;
+ n = ctx->major;
+
switch (audit_classify_syscall(ctx->arch, n)) {
case 0: /* native */
if ((mask & AUDIT_PERM_WRITE) &&
@@ -284,6 +288,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
{
unsigned index = which & ~S_IFMT;
mode_t mode = which & S_IFMT;
+
+ if (unlikely(!ctx))
+ return 0;
+
if (index >= ctx->name_count)
return 0;
if (ctx->names[index].ino == -1)
@@ -610,7 +618,7 @@ static int audit_filter_rules(struct task_struct *tsk,
if (!result)
return 0;
}
- if (rule->filterkey)
+ if (rule->filterkey && ctx)
ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
switch (rule->action) {
case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
@@ -1196,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
- mutex_lock(&tty_mutex);
- read_lock(&tasklist_lock);
+ spin_lock_irq(&tsk->sighand->siglock);
if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
tty = tsk->signal->tty->name;
else
tty = "(none)";
- read_unlock(&tasklist_lock);
+ spin_unlock_irq(&tsk->sighand->siglock);
+
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
" ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1222,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->egid, context->sgid, context->fsgid, tty,
tsk->sessionid);
- mutex_unlock(&tty_mutex);
audit_log_task_info(ab, tsk);
if (context->filterkey) {
@@ -2375,7 +2382,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
struct audit_context *ctx = tsk->audit_context;
if (audit_pid && t->tgid == audit_pid) {
- if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
+ if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
audit_sig_pid = tsk->pid;
if (tsk->loginuid != -1)
audit_sig_uid = tsk->loginuid;
diff --git a/kernel/capability.c b/kernel/capability.c
index 0101e847603..33e51e78c2d 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -486,17 +486,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
return ret;
}
-int __capable(struct task_struct *t, int cap)
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+int capable(int cap)
{
- if (security_capable(t, cap) == 0) {
- t->flags |= PF_SUPERPRIV;
+ if (has_capability(current, cap)) {
+ current->flags |= PF_SUPERPRIV;
return 1;
}
return 0;
}
-
-int capable(int cap)
-{
- return __capable(current, cap);
-}
EXPORT_SYMBOL(capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 657f8f8d93a..a0123d75ec9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -355,6 +355,17 @@ static struct css_set *find_existing_css_set(
return NULL;
}
+static void free_cg_links(struct list_head *tmp)
+{
+ struct cg_cgroup_link *link;
+ struct cg_cgroup_link *saved_link;
+
+ list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
+ list_del(&link->cgrp_link_list);
+ kfree(link);
+ }
+}
+
/*
* allocate_cg_links() allocates "count" cg_cgroup_link structures
* and chains them on tmp through their cgrp_link_list fields. Returns 0 on
@@ -363,17 +374,12 @@ static struct css_set *find_existing_css_set(
static int allocate_cg_links(int count, struct list_head *tmp)
{
struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
int i;
INIT_LIST_HEAD(tmp);
for (i = 0; i < count; i++) {
link = kmalloc(sizeof(*link), GFP_KERNEL);
if (!link) {
- list_for_each_entry_safe(link, saved_link, tmp,
- cgrp_link_list) {
- list_del(&link->cgrp_link_list);
- kfree(link);
- }
+ free_cg_links(tmp);
return -ENOMEM;
}
list_add(&link->cgrp_link_list, tmp);
@@ -381,17 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp)
return 0;
}
-static void free_cg_links(struct list_head *tmp)
-{
- struct cg_cgroup_link *link;
- struct cg_cgroup_link *saved_link;
-
- list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
- list_del(&link->cgrp_link_list);
- kfree(link);
- }
-}
-
/*
* find_css_set() takes an existing cgroup group and a
* cgroup object, and returns a css_set object that's
@@ -956,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
struct super_block *sb;
struct cgroupfs_root *root;
struct list_head tmp_cg_links;
- INIT_LIST_HEAD(&tmp_cg_links);
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
@@ -1424,14 +1418,17 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
if (buffer == NULL)
return -ENOMEM;
}
- if (nbytes && copy_from_user(buffer, userbuf, nbytes))
- return -EFAULT;
+ if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
+ retval = -EFAULT;
+ goto out;
+ }
buffer[nbytes] = 0; /* nul-terminate */
strstrip(buffer);
retval = cft->write_string(cgrp, cft, buffer);
if (!retval)
retval = nbytes;
+out:
if (buffer != local_buffer)
kfree(buffer);
return retval;
@@ -2371,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
-static inline int cgroup_has_css_refs(struct cgroup *cgrp)
+static int cgroup_has_css_refs(struct cgroup *cgrp)
{
/* Check the reference count on each subsystem. Since we
* already established that there are no tasks in the
@@ -2741,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
*/
void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
{
- struct cgroup *oldcgrp, *newcgrp;
+ struct cgroup *oldcgrp, *newcgrp = NULL;
if (need_mm_owner_callback) {
int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
oldcgrp = task_cgroup(old, ss->subsys_id);
- newcgrp = task_cgroup(new, ss->subsys_id);
+ if (new)
+ newcgrp = task_cgroup(new, ss->subsys_id);
if (oldcgrp == newcgrp)
continue;
if (ss->mm_owner_changed)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e202a68d1cc..86d49045dae 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
struct take_cpu_down_param *param = _param;
int err;
- raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
- param->hcpu);
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
return err;
+ raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+ param->hcpu);
+
/* Force idle task to run as soon as we yield: it should
immediately notice cpu is offline and die quickly. */
sched_idle_next();
@@ -349,6 +350,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out_notify;
BUG_ON(!cpu_online(cpu));
+ cpu_set(cpu, cpu_active_map);
+
/* Now call notifier in preparation. */
raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
@@ -367,7 +370,7 @@ int __cpuinit cpu_up(unsigned int cpu)
if (!cpu_isset(cpu, cpu_possible_map)) {
printk(KERN_ERR "can't online cpu %d because it is not "
"configured as may-hotadd at boot time\n", cpu);
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390)
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
printk(KERN_ERR "please check additional_cpus= boot "
"parameter\n");
#endif
@@ -383,9 +386,6 @@ int __cpuinit cpu_up(unsigned int cpu)
err = _cpu_up(cpu, 0);
- if (cpu_online(cpu))
- cpu_set(cpu, cpu_active_map);
-
out:
cpu_maps_update_done();
return err;
@@ -454,6 +454,25 @@ out:
}
#endif /* CONFIG_PM_SLEEP_SMP */
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ */
+void notify_cpu_starting(unsigned int cpu)
+{
+ unsigned long val = CPU_STARTING;
+
+#ifdef CONFIG_PM_SLEEP_SMP
+ if (cpu_isset(cpu, frozen_cpus))
+ val = CPU_STARTING_FROZEN;
+#endif /* CONFIG_PM_SLEEP_SMP */
+ raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+}
+
#endif /* CONFIG_SMP */
/*
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 91cf85b36dd..eab7bd6628e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* 2006 Rework by Paul Menage to use generic cgroups
+ * 2008 Rework of the scheduler domains and CPU hotplug handling
+ * by Max Krasnyansky
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
@@ -54,7 +56,6 @@
#include <asm/uaccess.h>
#include <asm/atomic.h>
#include <linux/mutex.h>
-#include <linux/kfifo.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
@@ -237,9 +238,11 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(callback_mutex);
-/* This is ugly, but preserves the userspace API for existing cpuset
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead */
+ * silently switch it to mount "cgroup" instead
+ */
static int cpuset_get_sb(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data, struct vfsmount *mnt)
@@ -474,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
}
/*
- * Helper routine for rebuild_sched_domains().
+ * Helper routine for generate_sched_domains().
* Do cpusets a, b have overlapping cpus_allowed masks?
*/
-
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -486,34 +488,48 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
- if (!dattr)
- return;
if (dattr->relax_domain_level < c->relax_domain_level)
dattr->relax_domain_level = c->relax_domain_level;
return;
}
+static void
+update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+ LIST_HEAD(q);
+
+ list_add(&c->stack_list, &q);
+ while (!list_empty(&q)) {
+ struct cpuset *cp;
+ struct cgroup *cont;
+ struct cpuset *child;
+
+ cp = list_first_entry(&q, struct cpuset, stack_list);
+ list_del(q.next);
+
+ if (cpus_empty(cp->cpus_allowed))
+ continue;
+
+ if (is_sched_load_balance(cp))
+ update_domain_attr(dattr, cp);
+
+ list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+ child = cgroup_cs(cont);
+ list_add_tail(&child->stack_list, &q);
+ }
+ }
+}
+
/*
- * rebuild_sched_domains()
- *
- * This routine will be called to rebuild the scheduler's dynamic
- * sched domains:
- * - if the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes,
- * - or if the 'cpus' allowed changes in any cpuset which has that
- * flag enabled,
- * - or if the 'sched_relax_domain_level' of any cpuset which has
- * that flag enabled and with non-empty 'cpus' changes,
- * - or if any cpuset with non-empty 'cpus' is removed,
- * - or if a cpu gets offlined.
- *
- * This routine builds a partial partition of the systems CPUs
- * (the set of non-overlappping cpumask_t's in the array 'part'
- * below), and passes that partial partition to the kernel/sched.c
- * partition_sched_domains() routine, which will rebuild the
- * schedulers load balancing domains (sched domains) as specified
- * by that partial partition. A 'partial partition' is a set of
- * non-overlapping subsets whose union is a subset of that set.
+ * generate_sched_domains()
+ *
+ * This function builds a partial partition of the systems CPUs
+ * A 'partial partition' is a set of non-overlapping subsets whose
+ * union is a subset of that set.
+ * The output of this function needs to be passed to kernel/sched.c
+ * partition_sched_domains() routine, which will rebuild the scheduler's
+ * load balancing domains (sched domains) as specified by that partial
+ * partition.
*
* See "What is sched_load_balance" in Documentation/cpusets.txt
* for a background explanation of this.
@@ -523,16 +539,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Call with cgroup_mutex held. May take callback_mutex during
- * call due to the kfifo_alloc() and kmalloc() calls. May nest
- * a call to the get_online_cpus()/put_online_cpus() pair.
- * Must not be called holding callback_mutex, because we must not
- * call get_online_cpus() while holding callback_mutex. Elsewhere
- * the kernel nests callback_mutex inside get_online_cpus() calls.
- * So the reverse nesting would risk an ABBA deadlock.
+ * Must be called with cgroup_lock held.
*
* The three key local variables below are:
- * q - a kfifo queue of cpuset pointers, used to implement a
+ * q - a linked-list queue of cpuset pointers, used to implement a
* top-down scan of all cpusets. This scan loads a pointer
* to each cpuset marked is_sched_load_balance into the
* array 'csa'. For our purposes, rebuilding the schedulers
@@ -564,10 +574,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
* element of the partition (one sched domain) to be passed to
* partition_sched_domains().
*/
-
-void rebuild_sched_domains(void)
+static int generate_sched_domains(cpumask_t **domains,
+ struct sched_domain_attr **attributes)
{
- struct kfifo *q; /* queue of cpusets to be scanned */
+ LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */
@@ -577,49 +587,58 @@ void rebuild_sched_domains(void)
int ndoms; /* number of sched domains in result */
int nslot; /* next empty doms[] cpumask_t slot */
- q = NULL;
- csa = NULL;
+ ndoms = 0;
doms = NULL;
dattr = NULL;
+ csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
- ndoms = 1;
doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
if (!doms)
- goto rebuild;
+ goto done;
+
dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
if (dattr) {
*dattr = SD_ATTR_INIT;
- update_domain_attr(dattr, &top_cpuset);
+ update_domain_attr_tree(dattr, &top_cpuset);
}
*doms = top_cpuset.cpus_allowed;
- goto rebuild;
- }
- q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
- if (IS_ERR(q))
+ ndoms = 1;
goto done;
+ }
+
csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
if (!csa)
goto done;
csn = 0;
- cp = &top_cpuset;
- __kfifo_put(q, (void *)&cp, sizeof(cp));
- while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
+ list_add(&top_cpuset.stack_list, &q);
+ while (!list_empty(&q)) {
struct cgroup *cont;
struct cpuset *child; /* scans child cpusets of cp */
+ cp = list_first_entry(&q, struct cpuset, stack_list);
+ list_del(q.next);
+
if (cpus_empty(cp->cpus_allowed))
continue;
- if (is_sched_load_balance(cp))
+ /*
+ * All child cpusets contain a subset of the parent's cpus, so
+ * just skip them, and then we call update_domain_attr_tree()
+ * to calc relax_domain_level of the corresponding sched
+ * domain.
+ */
+ if (is_sched_load_balance(cp)) {
csa[csn++] = cp;
+ continue;
+ }
list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
child = cgroup_cs(cont);
- __kfifo_put(q, (void *)&child, sizeof(cp));
+ list_add_tail(&child->stack_list, &q);
}
}
@@ -650,63 +669,141 @@ restart:
}
}
- /* Convert <csn, csa> to <ndoms, doms> */
+ /*
+ * Now we know how many domains to create.
+ * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+ */
doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
- if (!doms)
- goto rebuild;
+ if (!doms) {
+ ndoms = 0;
+ goto done;
+ }
+
+ /*
+ * The rest of the code, including the scheduler, can deal with
+ * dattr==NULL case. No need to abort if alloc fails.
+ */
dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
for (nslot = 0, i = 0; i < csn; i++) {
struct cpuset *a = csa[i];
+ cpumask_t *dp;
int apn = a->pn;
- if (apn >= 0) {
- cpumask_t *dp = doms + nslot;
-
- if (nslot == ndoms) {
- static int warnings = 10;
- if (warnings) {
- printk(KERN_WARNING
- "rebuild_sched_domains confused:"
- " nslot %d, ndoms %d, csn %d, i %d,"
- " apn %d\n",
- nslot, ndoms, csn, i, apn);
- warnings--;
- }
- continue;
+ if (apn < 0) {
+ /* Skip completed partitions */
+ continue;
+ }
+
+ dp = doms + nslot;
+
+ if (nslot == ndoms) {
+ static int warnings = 10;
+ if (warnings) {
+ printk(KERN_WARNING
+ "rebuild_sched_domains confused:"
+ " nslot %d, ndoms %d, csn %d, i %d,"
+ " apn %d\n",
+ nslot, ndoms, csn, i, apn);
+ warnings--;
}
+ continue;
+ }
- cpus_clear(*dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
- for (j = i; j < csn; j++) {
- struct cpuset *b = csa[j];
-
- if (apn == b->pn) {
- cpus_or(*dp, *dp, b->cpus_allowed);
- b->pn = -1;
- if (dattr)
- update_domain_attr(dattr
- + nslot, b);
- }
+ cpus_clear(*dp);
+ if (dattr)
+ *(dattr + nslot) = SD_ATTR_INIT;
+ for (j = i; j < csn; j++) {
+ struct cpuset *b = csa[j];
+
+ if (apn == b->pn) {
+ cpus_or(*dp, *dp, b->cpus_allowed);
+ if (dattr)
+ update_domain_attr_tree(dattr + nslot, b);
+
+ /* Done with this partition */
+ b->pn = -1;
}
- nslot++;
}
+ nslot++;
}
BUG_ON(nslot != ndoms);
-rebuild:
- /* Have scheduler rebuild sched domains */
+done:
+ kfree(csa);
+
+ *domains = doms;
+ *attributes = dattr;
+ return ndoms;
+}
+
+/*
+ * Rebuild scheduler domains.
+ *
+ * Call with neither cgroup_mutex held nor within get_online_cpus().
+ * Takes both cgroup_mutex and get_online_cpus().
+ *
+ * Cannot be directly called from cpuset code handling changes
+ * to the cpuset pseudo-filesystem, because it cannot be called
+ * from code that already holds cgroup_mutex.
+ */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+ struct sched_domain_attr *attr;
+ cpumask_t *doms;
+ int ndoms;
+
get_online_cpus();
- partition_sched_domains(ndoms, doms, dattr);
+
+ /* Generate domain masks and attrs */
+ cgroup_lock();
+ ndoms = generate_sched_domains(&doms, &attr);
+ cgroup_unlock();
+
+ /* Have scheduler rebuild the domains */
+ partition_sched_domains(ndoms, doms, attr);
+
put_online_cpus();
+}
-done:
- if (q && !IS_ERR(q))
- kfifo_free(q);
- kfree(csa);
- /* Don't kfree(doms) -- partition_sched_domains() does that. */
- /* Don't kfree(dattr) -- partition_sched_domains() does that. */
+static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
+
+/*
+ * Rebuild scheduler domains, asynchronously via workqueue.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * The rebuild_sched_domains() and partition_sched_domains()
+ * routines must nest cgroup_lock() inside get_online_cpus(),
+ * but such cpuset changes as these must nest that locking the
+ * other way, holding cgroup_lock() for much of the code.
+ *
+ * So in order to avoid an ABBA deadlock, the cpuset code handling
+ * these user changes delegates the actual sched domain rebuilding
+ * to a separate workqueue thread, which ends up processing the
+ * above do_rebuild_sched_domains() function.
+ */
+static void async_rebuild_sched_domains(void)
+{
+ schedule_work(&rebuild_sched_domains_work);
+}
+
+/*
+ * Accomplishes the same scheduler domain rebuild as the above
+ * async_rebuild_sched_domains(), however it directly calls the
+ * rebuild routine synchronously rather than calling it via an
+ * asynchronous work thread.
+ *
+ * This can only be called from code that is not holding
+ * cgroup_mutex (not nested in a cgroup_lock() call.)
+ */
+void rebuild_sched_domains(void)
+{
+ do_rebuild_sched_domains(NULL);
}
/**
@@ -746,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
* Called with cgroup_mutex held
*
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each.
*
- * Return 0 if successful, -errno if not.
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
*/
-static int update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
{
struct cgroup_scanner scan;
- struct ptr_heap heap;
- int retval;
-
- /*
- * cgroup_scan_tasks() will initialize heap->gt for us.
- * heap_init() is still needed here for we should not change
- * cs->cpus_allowed when heap_init() fails.
- */
- retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
- if (retval)
- return retval;
scan.cg = cs->css.cgroup;
scan.test_task = cpuset_test_cpumask;
scan.process_task = cpuset_change_cpumask;
- scan.heap = &heap;
- retval = cgroup_scan_tasks(&scan);
-
- heap_free(&heap);
- return retval;
+ scan.heap = heap;
+ cgroup_scan_tasks(&scan);
}
/**
@@ -786,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs)
*/
static int update_cpumask(struct cpuset *cs, const char *buf)
{
+ struct ptr_heap heap;
struct cpuset trialcs;
int retval;
int is_load_balanced;
@@ -820,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
return 0;
+ retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+ if (retval)
+ return retval;
+
is_load_balanced = is_sched_load_balance(&trialcs);
mutex_lock(&callback_mutex);
@@ -830,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
* Scan tasks in the cpuset, and update the cpumasks of any
* that need an update.
*/
- retval = update_tasks_cpumask(cs);
- if (retval < 0)
- return retval;
+ update_tasks_cpumask(cs, &heap);
+
+ heap_free(&heap);
if (is_load_balanced)
- rebuild_sched_domains();
+ async_rebuild_sched_domains();
return 0;
}
@@ -1062,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
- rebuild_sched_domains();
+ async_rebuild_sched_domains();
}
return 0;
@@ -1103,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex);
if (cpus_nonempty && balance_flag_changed)
- rebuild_sched_domains();
+ async_rebuild_sched_domains();
return 0;
}
@@ -1464,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
default:
BUG();
}
+
+ /* Unreachable but makes gcc happy */
+ return 0;
}
static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1476,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
default:
BUG();
}
+
+ /* Unrechable but makes gcc happy */
+ return 0;
}
@@ -1664,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create(
}
/*
- * Locking note on the strange update_flag() call below:
- *
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains(). The get_online_cpus()
- * call in rebuild_sched_domains() must not be made while holding
- * callback_mutex. Elsewhere the kernel nests callback_mutex inside
- * get_online_cpus() calls. So the reverse nesting would risk an
- * ABBA deadlock.
+ * will call async_rebuild_sched_domains().
*/
static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1691,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
struct cgroup_subsys cpuset_subsys = {
.name = "cpuset",
.create = cpuset_create,
- .destroy = cpuset_destroy,
+ .destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
.attach = cpuset_attach,
.populate = cpuset_populate,
@@ -1783,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
}
/*
- * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty
@@ -1831,26 +1921,23 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
* that has tasks along with an empty 'mems'. But if we did see such
* a cpuset, we'd handle it just like we do if its 'cpus' was empty.
*/
-static void scan_for_empty_cpusets(const struct cpuset *root)
+static void scan_for_empty_cpusets(struct cpuset *root)
{
+ LIST_HEAD(queue);
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
- struct list_head queue;
struct cgroup *cont;
nodemask_t oldmems;
- INIT_LIST_HEAD(&queue);
-
list_add_tail((struct list_head *)&root->stack_list, &queue);
while (!list_empty(&queue)) {
- cp = container_of(queue.next, struct cpuset, stack_list);
+ cp = list_first_entry(&queue, struct cpuset, stack_list);
list_del(queue.next);
list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
child = cgroup_cs(cont);
list_add_tail(&child->stack_list, &queue);
}
- cont = cp->css.cgroup;
/* Continue past cpusets with all cpus, mems online */
if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
@@ -1871,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
nodes_empty(cp->mems_allowed))
remove_tasks_in_empty_cpuset(cp);
else {
- update_tasks_cpumask(cp);
+ update_tasks_cpumask(cp, NULL);
update_tasks_nodemask(cp, &oldmems);
}
}
}
/*
- * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug event.
- *
- * Since there are two callers of this routine, one for CPU hotplug
- * events and one for memory node hotplug events, we could have coded
- * two separate routines here. We code it as a single common routine
- * in order to minimize text size.
- */
-
-static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
-{
- cgroup_lock();
-
- top_cpuset.cpus_allowed = cpu_online_map;
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- scan_for_empty_cpusets(&top_cpuset);
-
- /*
- * Scheduler destroys domains on hotplug events.
- * Rebuild them based on the current settings.
- */
- if (rebuild_sd)
- rebuild_sched_domains();
-
- cgroup_unlock();
-}
-
-/*
* The top_cpuset tracks what CPUs and Memory Nodes are online,
* period. This is necessary in order to make cpusets transparent
* (of no affect) on systems that are actively using CPU hotplug
@@ -1914,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
*
* This routine ensures that top_cpuset.cpus_allowed tracks
* cpu_online_map on each CPU hotplug (cpuhp) event.
+ *
+ * Called within get_online_cpus(). Needs to call cgroup_lock()
+ * before calling generate_sched_domains().
*/
-
-static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
+static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
unsigned long phase, void *unused_cpu)
{
+ struct sched_domain_attr *attr;
+ cpumask_t *doms;
+ int ndoms;
+
switch (phase) {
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- common_cpu_mem_hotplug_unplug(1);
break;
+
default:
return NOTIFY_DONE;
}
+ cgroup_lock();
+ top_cpuset.cpus_allowed = cpu_online_map;
+ scan_for_empty_cpusets(&top_cpuset);
+ ndoms = generate_sched_domains(&doms, &attr);
+ cgroup_unlock();
+
+ /* Have scheduler rebuild the domains */
+ partition_sched_domains(ndoms, doms, attr);
+
return NOTIFY_OK;
}
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after you change
- * node_states[N_HIGH_MEMORY].
- * See also the previous routine cpuset_handle_cpuhp().
+ * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
+ * See also the previous routine cpuset_track_online_cpus().
*/
-
void cpuset_track_online_nodes(void)
{
- common_cpu_mem_hotplug_unplug(0);
+ cgroup_lock();
+ top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+ scan_for_empty_cpusets(&top_cpuset);
+ cgroup_unlock();
}
#endif
@@ -1962,7 +2032,7 @@ void __init cpuset_init_smp(void)
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- hotcpu_notifier(cpuset_handle_cpuhp, 0);
+ hotcpu_notifier(cpuset_track_online_cpus, 0);
}
/**
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 7517115a8cc..f013a0c2e11 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -77,15 +77,14 @@ void *dma_mark_declared_memory_occupied(struct device *dev,
{
struct dma_coherent_mem *mem = dev->dma_mem;
int pos, err;
- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
- pages >>= PAGE_SHIFT;
+ size += device_addr & ~PAGE_MASK;
if (!mem)
return ERR_PTR(-EINVAL);
pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
- err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
+ err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
if (err != 0)
return ERR_PTR(err);
return mem->virt_base + (pos << PAGE_SHIFT);
@@ -93,7 +92,7 @@ void *dma_mark_declared_memory_occupied(struct device *dev,
EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
/**
- * Try to allocate memory from the per-device coherent area.
+ * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
*
* @dev: device from which we allocate memory
* @size: size of requested memory area
@@ -101,11 +100,11 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
* @ret: This pointer will be filled with the virtual address
* to allocated area.
*
- * This function should be only called from per-arch %dma_alloc_coherent()
+ * This function should be only called from per-arch dma_alloc_coherent()
* to support allocation from per-device coherent memory pools.
*
* Returns 0 if dma_alloc_coherent should continue with allocating from
- * generic memory areas, or !0 if dma_alloc_coherent should return %ret.
+ * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
*/
int dma_alloc_from_coherent(struct device *dev, ssize_t size,
dma_addr_t *dma_handle, void **ret)
@@ -125,9 +124,10 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
}
return (mem != NULL);
}
+EXPORT_SYMBOL(dma_alloc_from_coherent);
/**
- * Try to free the memory allocated from per-device coherent memory pool.
+ * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
* @dev: device from which the memory was allocated
* @order: the order of pages allocated
* @vaddr: virtual address of allocated pages
@@ -136,7 +136,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
* coherent memory pool and if so, releases that memory.
*
* Returns 1 if we correctly released the memory, or 0 if
- * %dma_release_coherent() should proceed with releasing memory from
+ * dma_release_coherent() should proceed with releasing memory from
* generic pools.
*/
int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
@@ -152,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
}
return 0;
}
+EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/exit.c b/kernel/exit.c
index eb4d6470d1d..85a83c83185 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -112,9 +112,9 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime = cputime_add(sig->utime, tsk->utime);
- sig->stime = cputime_add(sig->stime, tsk->stime);
- sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+ sig->utime = cputime_add(sig->utime, task_utime(tsk));
+ sig->stime = cputime_add(sig->stime, task_stime(tsk));
+ sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
* If there are other users of the mm and the owner (us) is exiting
* we need to find a new owner to take on the responsibility.
*/
- if (!mm)
- return 0;
if (atomic_read(&mm->mm_users) <= 1)
return 0;
if (mm->owner != p)
@@ -627,6 +625,16 @@ retry:
} while_each_thread(g, c);
read_unlock(&tasklist_lock);
+ /*
+ * We found no owner yet mm_users > 1: this implies that we are
+ * most likely racing with swapoff (try_to_unuse()) or /proc or
+ * ptrace or page migration (get_task_mm()). Mark owner as NULL,
+ * so that subsystems can understand the callback and take action.
+ */
+ down_write(&mm->mmap_sem);
+ cgroup_mm_owner_callbacks(mm->owner, NULL);
+ mm->owner = NULL;
+ up_write(&mm->mmap_sem);
return;
assign_new_owner:
@@ -831,26 +839,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
* the child reaper process (ie "init") in our pid
* space.
*/
+static struct task_struct *find_new_reaper(struct task_struct *father)
+{
+ struct pid_namespace *pid_ns = task_active_pid_ns(father);
+ struct task_struct *thread;
+
+ thread = father;
+ while_each_thread(father, thread) {
+ if (thread->flags & PF_EXITING)
+ continue;
+ if (unlikely(pid_ns->child_reaper == father))
+ pid_ns->child_reaper = thread;
+ return thread;
+ }
+
+ if (unlikely(pid_ns->child_reaper == father)) {
+ write_unlock_irq(&tasklist_lock);
+ if (unlikely(pid_ns == &init_pid_ns))
+ panic("Attempted to kill init!");
+
+ zap_pid_ns_processes(pid_ns);
+ write_lock_irq(&tasklist_lock);
+ /*
+ * We can not clear ->child_reaper or leave it alone.
+ * There may by stealth EXIT_DEAD tasks on ->children,
+ * forget_original_parent() must move them somewhere.
+ */
+ pid_ns->child_reaper = init_pid_ns.child_reaper;
+ }
+
+ return pid_ns->child_reaper;
+}
+
static void forget_original_parent(struct task_struct *father)
{
- struct task_struct *p, *n, *reaper = father;
+ struct task_struct *p, *n, *reaper;
LIST_HEAD(ptrace_dead);
write_lock_irq(&tasklist_lock);
-
+ reaper = find_new_reaper(father);
/*
* First clean up ptrace if we were using it.
*/
ptrace_exit(father, &ptrace_dead);
- do {
- reaper = next_thread(reaper);
- if (reaper == father) {
- reaper = task_child_reaper(father);
- break;
- }
- } while (reaper->flags & PF_EXITING);
-
list_for_each_entry_safe(p, n, &father->children, sibling) {
p->real_parent = reaper;
if (p->parent == father) {
@@ -911,15 +943,15 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
tsk->exit_signal = SIGCHLD;
signal = tracehook_notify_death(tsk, &cookie, group_dead);
- if (signal > 0)
+ if (signal >= 0)
signal = do_notify_parent(tsk, signal);
- tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE;
+ tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
/* mt-exec, de_thread() is waiting for us */
if (thread_group_leader(tsk) &&
- tsk->signal->notify_count < 0 &&
- tsk->signal->group_exit_task)
+ tsk->signal->group_exit_task &&
+ tsk->signal->notify_count < 0)
wake_up_process(tsk->signal->group_exit_task);
write_unlock_irq(&tasklist_lock);
@@ -927,7 +959,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
tracehook_report_death(tsk, signal, cookie, group_dead);
/* If the process is dead, release it - nobody will wait for it */
- if (signal < 0)
+ if (signal == DEATH_REAP)
release_task(tsk);
}
@@ -959,39 +991,6 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
-static inline void exit_child_reaper(struct task_struct *tsk)
-{
- if (likely(tsk->group_leader != task_child_reaper(tsk)))
- return;
-
- if (tsk->nsproxy->pid_ns == &init_pid_ns)
- panic("Attempted to kill init!");
-
- /*
- * @tsk is the last thread in the 'cgroup-init' and is exiting.
- * Terminate all remaining processes in the namespace and reap them
- * before exiting @tsk.
- *
- * Note that @tsk (last thread of cgroup-init) may not necessarily
- * be the child-reaper (i.e main thread of cgroup-init) of the
- * namespace i.e the child_reaper may have already exited.
- *
- * Even after a child_reaper exits, we let it inherit orphaned children,
- * because, pid_ns->child_reaper remains valid as long as there is
- * at least one living sub-thread in the cgroup init.
-
- * This living sub-thread of the cgroup-init will be notified when
- * a child inherited by the 'child-reaper' exits (do_notify_parent()
- * uses __group_send_sig_info()). Further, when reaping child processes,
- * do_wait() iterates over children of all living sub threads.
-
- * i.e even though 'child_reaper' thread is listed as the parent of the
- * orphaned children, any living sub-thread in the cgroup-init can
- * perform the role of the child_reaper.
- */
- zap_pid_ns_processes(tsk->nsproxy->pid_ns);
-}
-
NORET_TYPE void do_exit(long code)
{
struct task_struct *tsk = current;
@@ -1051,7 +1050,6 @@ NORET_TYPE void do_exit(long code)
}
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
- exit_child_reaper(tsk);
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe8479..30de644a40c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -802,6 +802,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->leader = 0; /* session leadership doesn't inherit */
sig->tty_old_pgrp = NULL;
+ sig->tty = NULL;
sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
sig->gtime = cputime_zero;
@@ -838,6 +839,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
void __cleanup_signal(struct signal_struct *sig)
{
exit_thread_group_keys(sig);
+ tty_kref_put(sig->tty);
kmem_cache_free(signal_cachep, sig);
}
@@ -1227,7 +1229,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->nsproxy->pid_ns->child_reaper = p;
p->signal->leader_pid = pid;
- p->signal->tty = current->signal->tty;
+ tty_kref_put(p->signal->tty);
+ p->signal->tty = tty_kref_get(current->signal->tty);
set_task_pgrp(p, task_pgrp_nr(current));
set_task_session(p, task_session_nr(current));
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a7..cdec83e722f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
*/
BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
return 1;
- case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+ case HRTIMER_CB_IRQSAFE_PERCPU:
+ case HRTIMER_CB_IRQSAFE_UNLOCKED:
/*
* This is solely for the sched tick emulation with
* dynamic tick support to ensure that we do not
* restart the tick right on the edge and end up with
* the tick timer in the softirq ! The calling site
- * takes care of this.
+ * takes care of this. Also used for hrtimer sleeper !
*/
debug_hrtimer_deactivate(timer);
return 1;
@@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
timer_stats_account_hrtimer(timer);
fn = timer->function;
- if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+ timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
/*
* Used for scheduler timers, avoid lock inversion with
* rq->lock and tasklist_lock.
@@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
sl->timer.function = hrtimer_wakeup;
sl->task = task;
#ifdef CONFIG_HIGH_RES_TIMERS
- sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
#endif
}
@@ -1591,29 +1593,95 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
#ifdef CONFIG_HOTPLUG_CPU
-static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
- struct hrtimer_clock_base *new_base)
+static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+ struct hrtimer_clock_base *new_base, int dcpu)
{
struct hrtimer *timer;
struct rb_node *node;
+ int raise = 0;
while ((node = rb_first(&old_base->active))) {
timer = rb_entry(node, struct hrtimer, node);
BUG_ON(hrtimer_callback_running(timer));
debug_hrtimer_deactivate(timer);
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
+
+ /*
+ * Should not happen. Per CPU timers should be
+ * canceled _before_ the migration code is called
+ */
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+ __remove_hrtimer(timer, old_base,
+ HRTIMER_STATE_INACTIVE, 0);
+ WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
+ timer, timer->function, dcpu);
+ continue;
+ }
+
+ /*
+ * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+ * timer could be seen as !active and just vanish away
+ * under us on another CPU
+ */
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
timer->base = new_base;
/*
* Enqueue the timer. Allow reprogramming of the event device
*/
enqueue_hrtimer(timer, new_base, 1);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+ /*
+ * Happens with high res enabled when the timer was
+ * already expired and the callback mode is
+ * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+ * enqueue code does not move them to the soft irq
+ * pending list for performance/latency reasons, but
+ * in the migration state, we need to do that
+ * otherwise we end up with a stale timer.
+ */
+ if (timer->state == HRTIMER_STATE_MIGRATE) {
+ timer->state = HRTIMER_STATE_PENDING;
+ list_add_tail(&timer->cb_entry,
+ &new_base->cpu_base->cb_pending);
+ raise = 1;
+ }
+#endif
+ /* Clear the migration state bit */
+ timer->state &= ~HRTIMER_STATE_MIGRATE;
+ }
+ return raise;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+ struct hrtimer_cpu_base *new_base)
+{
+ struct hrtimer *timer;
+ int raise = 0;
+
+ while (!list_empty(&old_base->cb_pending)) {
+ timer = list_entry(old_base->cb_pending.next,
+ struct hrtimer, cb_entry);
+
+ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
+ timer->base = &new_base->clock_base[timer->base->index];
+ list_add_tail(&timer->cb_entry, &new_base->cb_pending);
+ raise = 1;
}
+ return raise;
+}
+#else
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+ struct hrtimer_cpu_base *new_base)
+{
+ return 0;
}
+#endif
static void migrate_hrtimers(int cpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
- int i;
+ int i, raise = 0;
BUG_ON(cpu_online(cpu));
old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1626,14 +1694,21 @@ static void migrate_hrtimers(int cpu)
spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- migrate_hrtimer_list(&old_base->clock_base[i],
- &new_base->clock_base[i]);
+ if (migrate_hrtimer_list(&old_base->clock_base[i],
+ &new_base->clock_base[i], cpu))
+ raise = 1;
}
+ if (migrate_hrtimer_pending(old_base, new_base))
+ raise = 1;
+
spin_unlock(&old_base->lock);
spin_unlock(&new_base->lock);
local_irq_enable();
put_cpu_var(hrtimer_bases);
+
+ if (raise)
+ hrtimer_raise_softirq();
}
#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 152abfd3589..60c49e32439 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -89,7 +89,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
set_balance_irq_affinity(irq, cpumask);
#ifdef CONFIG_GENERIC_PENDING_IRQ
- set_pending_irq(irq, cpumask);
+ if (desc->status & IRQ_MOVE_PCNTXT) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ desc->chip->set_affinity(irq, cpumask);
+ spin_unlock_irqrestore(&desc->lock, flags);
+ } else
+ set_pending_irq(irq, cpumask);
#else
desc->affinity = cpumask;
desc->chip->set_affinity(irq, cpumask);
@@ -323,7 +330,8 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK);
if (ret)
- pr_err("setting flow type for irq %u failed (%pF)\n",
+ pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
+ (int)(flags & IRQF_TRIGGER_MASK),
irq, chip->set_type);
return ret;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c6d35d68ee..a09dd29c2fd 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -8,6 +8,7 @@
#include <linux/irq.h>
#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include "internals.h"
@@ -16,23 +17,18 @@ static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
-static int irq_affinity_read_proc(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
- struct irq_desc *desc = irq_desc + (long)data;
+ struct irq_desc *desc = irq_desc + (long)m->private;
cpumask_t *mask = &desc->affinity;
- int len;
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PENDING)
mask = &desc->pending_mask;
#endif
- len = cpumask_scnprintf(page, count, *mask);
-
- if (count - len < 2)
- return -EINVAL;
- len += sprintf(page + len, "\n");
- return len;
+ seq_cpumask(m, mask);
+ seq_putc(m, '\n');
+ return 0;
}
#ifndef is_affinity_mask_valid
@@ -40,11 +36,12 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off,
#endif
int no_irq_affinity;
-static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+static ssize_t irq_affinity_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)data, full_count = count, err;
+ unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
cpumask_t new_value;
+ int err;
if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
irq_balancing_disabled(irq))
@@ -65,28 +62,38 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
if (!cpus_intersects(new_value, cpu_online_map))
/* Special case for empty set - allow the architecture
code to set default SMP affinity. */
- return irq_select_affinity(irq) ? -EINVAL : full_count;
+ return irq_select_affinity(irq) ? -EINVAL : count;
irq_set_affinity(irq, new_value);
- return full_count;
+ return count;
}
-static int default_affinity_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
- int len = cpumask_scnprintf(page, count, irq_default_affinity);
- if (count - len < 2)
- return -EINVAL;
- len += sprintf(page + len, "\n");
- return len;
+ return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
}
-static int default_affinity_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+static const struct file_operations irq_affinity_proc_fops = {
+ .open = irq_affinity_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_affinity_proc_write,
+};
+
+static int default_affinity_show(struct seq_file *m, void *v)
+{
+ seq_cpumask(m, &irq_default_affinity);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static ssize_t default_affinity_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
- unsigned int full_count = count, err;
cpumask_t new_value;
+ int err;
err = cpumask_parse_user(buffer, count, new_value);
if (err)
@@ -105,8 +112,21 @@ static int default_affinity_write(struct file *file, const char __user *buffer,
irq_default_affinity = new_value;
- return full_count;
+ return count;
}
+
+static int default_affinity_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, default_affinity_show, NULL);
+}
+
+static const struct file_operations default_affinity_proc_fops = {
+ .open = default_affinity_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = default_affinity_write,
+};
#endif
static int irq_spurious_read(char *page, char **start, off_t off,
@@ -178,16 +198,9 @@ void register_irq_proc(unsigned int irq)
irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
#ifdef CONFIG_SMP
- {
- /* create /proc/irq/<irq>/smp_affinity */
- entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
-
- if (entry) {
- entry->data = (void *)(long)irq;
- entry->read_proc = irq_affinity_read_proc;
- entry->write_proc = irq_affinity_write_proc;
- }
- }
+ /* create /proc/irq/<irq>/smp_affinity */
+ proc_create_data("smp_affinity", 0600, irq_desc[irq].dir,
+ &irq_affinity_proc_fops, (void *)(long)irq);
#endif
entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
@@ -208,15 +221,8 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
void register_default_affinity_proc(void)
{
#ifdef CONFIG_SMP
- struct proc_dir_entry *entry;
-
- /* create /proc/irq/default_smp_affinity */
- entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir);
- if (entry) {
- entry->data = NULL;
- entry->read_proc = default_affinity_read;
- entry->write_proc = default_affinity_write;
- }
+ proc_create("irq/default_smp_affinity", 0600, NULL,
+ &default_affinity_proc_fops);
#endif
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8a4370e2a3..aef265325cd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -12,7 +12,7 @@
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
@@ -77,7 +77,7 @@ int kexec_should_crash(struct task_struct *p)
*
* The code for the transition from the current kernel to the
* the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
+ * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
* page of memory is necessary, but some architectures require more.
* Because this memory must be identity mapped in the transition from
* virtual to physical addresses it must live in the range
@@ -242,7 +242,7 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
*/
result = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
- get_order(KEXEC_CONTROL_CODE_SIZE));
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
printk(KERN_ERR "Could not allocate control_code_buffer\n");
goto out;
@@ -317,7 +317,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
*/
result = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
- get_order(KEXEC_CONTROL_CODE_SIZE));
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
printk(KERN_ERR "Could not allocate control_code_buffer\n");
goto out;
@@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
*old = addr | (*old & ~PAGE_MASK);
/* The old page I have found cannot be a
- * destination page, so return it.
+ * destination page, so return it if it's
+ * gfp_flags honor the ones passed in.
*/
+ if (!(gfp_mask & __GFP_HIGHMEM) &&
+ PageHighMem(old_page)) {
+ kimage_free_pages(old_page);
+ continue;
+ }
addr = old_addr;
page = old_page;
break;
@@ -924,19 +930,14 @@ static int kimage_load_segment(struct kimage *image,
*/
struct kimage *kexec_image;
struct kimage *kexec_crash_image;
-/*
- * A home grown binary mutex.
- * Nothing can wait so this mutex is safe to use
- * in interrupt context :)
- */
-static int kexec_lock;
+
+static DEFINE_MUTEX(kexec_mutex);
asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
struct kexec_segment __user *segments,
unsigned long flags)
{
struct kimage **dest_image, *image;
- int locked;
int result;
/* We only trust the superuser with rebooting the system. */
@@ -972,8 +973,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
*
* KISS: always take the mutex.
*/
- locked = xchg(&kexec_lock, 1);
- if (locked)
+ if (!mutex_trylock(&kexec_mutex))
return -EBUSY;
dest_image = &kexec_image;
@@ -1015,8 +1015,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
image = xchg(dest_image, image);
out:
- locked = xchg(&kexec_lock, 0); /* Release the mutex */
- BUG_ON(!locked);
+ mutex_unlock(&kexec_mutex);
kimage_free(image);
return result;
@@ -1063,10 +1062,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
void crash_kexec(struct pt_regs *regs)
{
- int locked;
-
-
- /* Take the kexec_lock here to prevent sys_kexec_load
+ /* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
*
@@ -1074,8 +1070,7 @@ void crash_kexec(struct pt_regs *regs)
* of memory the xchg(&kexec_crash_image) would be
* sufficient. But since I reuse the memory...
*/
- locked = xchg(&kexec_lock, 1);
- if (!locked) {
+ if (mutex_trylock(&kexec_mutex)) {
if (kexec_crash_image) {
struct pt_regs fixed_regs;
crash_setup_regs(&fixed_regs, regs);
@@ -1083,8 +1078,7 @@ void crash_kexec(struct pt_regs *regs)
machine_crash_shutdown(&fixed_regs);
machine_kexec(kexec_crash_image);
}
- locked = xchg(&kexec_lock, 0);
- BUG_ON(!locked);
+ mutex_unlock(&kexec_mutex);
}
}
@@ -1426,25 +1420,23 @@ static int __init crash_save_vmcoreinfo_init(void)
module_init(crash_save_vmcoreinfo_init)
-/**
- * kernel_kexec - reboot the system
- *
- * Move into place and start executing a preloaded standalone
- * executable. If nothing was preloaded return an error.
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
*/
int kernel_kexec(void)
{
int error = 0;
- if (xchg(&kexec_lock, 1))
+ if (!mutex_trylock(&kexec_mutex))
return -EBUSY;
if (!kexec_image) {
error = -EINVAL;
goto Unlock;
}
- if (kexec_image->preserve_context) {
#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
mutex_lock(&pm_mutex);
pm_prepare_console();
error = freeze_processes();
@@ -1459,6 +1451,7 @@ int kernel_kexec(void)
error = disable_nonboot_cpus();
if (error)
goto Resume_devices;
+ device_pm_lock();
local_irq_disable();
/* At this point, device_suspend() has been called,
* but *not* device_power_down(). We *must*
@@ -1470,26 +1463,22 @@ int kernel_kexec(void)
error = device_power_down(PMSG_FREEZE);
if (error)
goto Enable_irqs;
- save_processor_state();
+ } else
#endif
- } else {
- blocking_notifier_call_chain(&reboot_notifier_list,
- SYS_RESTART, NULL);
- system_state = SYSTEM_RESTART;
- device_shutdown();
- sysdev_shutdown();
+ {
+ kernel_restart_prepare(NULL);
printk(KERN_EMERG "Starting new kernel\n");
machine_shutdown();
}
machine_kexec(kexec_image);
- if (kexec_image->preserve_context) {
#ifdef CONFIG_KEXEC_JUMP
- restore_processor_state();
+ if (kexec_image->preserve_context) {
device_power_up(PMSG_RESTORE);
Enable_irqs:
local_irq_enable();
+ device_pm_unlock();
enable_nonboot_cpus();
Resume_devices:
device_resume(PMSG_RESTORE);
@@ -1499,11 +1488,10 @@ int kernel_kexec(void)
Restore_console:
pm_restore_console();
mutex_unlock(&pm_mutex);
-#endif
}
+#endif
Unlock:
- xchg(&kexec_lock, 0);
-
+ mutex_unlock(&kexec_mutex);
return error;
}
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 3ec23c3ec97..e4dcfb2272a 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -56,12 +56,14 @@
static int kgdb_break_asap;
+#define KGDB_MAX_THREAD_QUERY 17
struct kgdb_state {
int ex_vector;
int signo;
int err_code;
int cpu;
int pass_exception;
+ unsigned long thr_query;
unsigned long threadid;
long kgdb_usethreadid;
struct pt_regs *linux_regs;
@@ -166,13 +168,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
* Weak aliases for breakpoint management,
* can be overriden by architectures when needed:
*/
-int __weak kgdb_validate_break_address(unsigned long addr)
-{
- char tmp_variable[BREAK_INSTR_SIZE];
-
- return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE);
-}
-
int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
{
int err;
@@ -191,6 +186,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
(char *)bundle, BREAK_INSTR_SIZE);
}
+int __weak kgdb_validate_break_address(unsigned long addr)
+{
+ char tmp_variable[BREAK_INSTR_SIZE];
+ int err;
+ /* Validate setting the breakpoint and then removing it. In the
+ * remove fails, the kernel needs to emit a bad message because we
+ * are deep trouble not being able to put things back the way we
+ * found them.
+ */
+ err = kgdb_arch_set_breakpoint(addr, tmp_variable);
+ if (err)
+ return err;
+ err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
+ if (err)
+ printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
+ "memory destroyed at: %lx", addr);
+ return err;
+}
+
unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
{
return instruction_pointer(regs);
@@ -433,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
{
int hex_val;
int num = 0;
+ int negate = 0;
*long_val = 0;
+ if (**ptr == '-') {
+ negate = 1;
+ (*ptr)++;
+ }
while (**ptr) {
hex_val = hex(**ptr);
if (hex_val < 0)
@@ -446,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
(*ptr)++;
}
+ if (negate)
+ *long_val = -*long_val;
+
return num;
}
@@ -466,7 +488,7 @@ static int write_mem_msg(int binary)
if (err)
return err;
if (CACHE_FLUSH_IS_SAFE)
- flush_icache_range(addr, addr + length + 1);
+ flush_icache_range(addr, addr + length);
return 0;
}
@@ -515,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value)
static struct task_struct *getthread(struct pt_regs *regs, int tid)
{
/*
- * Non-positive TIDs are remapped idle tasks:
+ * Non-positive TIDs are remapped to the cpu shadow information
*/
- if (tid <= 0)
- return idle_task(-tid);
+ if (tid == 0 || tid == -1)
+ tid = -atomic_read(&kgdb_active) - 2;
+ if (tid < 0) {
+ if (kgdb_info[-tid - 2].task)
+ return kgdb_info[-tid - 2].task;
+ else
+ return idle_task(-tid - 2);
+ }
/*
* find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -562,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
/* Signal the primary CPU that we are done: */
atomic_set(&cpu_in_kgdb[cpu], 0);
+ touch_softlockup_watchdog();
clocksource_touch_watchdog();
local_irq_restore(flags);
}
@@ -725,14 +754,15 @@ setundefined:
}
/*
- * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs:
+ * Remap normal tasks to their real PID,
+ * CPU shadow threads are mapped to -CPU - 2
*/
static inline int shadow_pid(int realpid)
{
if (realpid)
return realpid;
- return -1-raw_smp_processor_id();
+ return -raw_smp_processor_id() - 2;
}
static char gdbmsgbuf[BUFMAX + 1];
@@ -826,7 +856,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
} else {
local_debuggerinfo = NULL;
- for (i = 0; i < NR_CPUS; i++) {
+ for_each_online_cpu(i) {
/*
* Try to find the task on some other
* or possibly this node if we do not
@@ -960,10 +990,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks)
/* Handle the 'q' query packets */
static void gdb_cmd_query(struct kgdb_state *ks)
{
- struct task_struct *thread;
+ struct task_struct *g;
+ struct task_struct *p;
unsigned char thref[8];
char *ptr;
int i;
+ int cpu;
+ int finished = 0;
switch (remcom_in_buffer[1]) {
case 's':
@@ -973,22 +1006,34 @@ static void gdb_cmd_query(struct kgdb_state *ks)
break;
}
- if (remcom_in_buffer[1] == 'f')
- ks->threadid = 1;
-
+ i = 0;
remcom_out_buffer[0] = 'm';
ptr = remcom_out_buffer + 1;
-
- for (i = 0; i < 17; ks->threadid++) {
- thread = getthread(ks->linux_regs, ks->threadid);
- if (thread) {
- int_to_threadref(thref, ks->threadid);
+ if (remcom_in_buffer[1] == 'f') {
+ /* Each cpu is a shadow thread */
+ for_each_online_cpu(cpu) {
+ ks->thr_query = 0;
+ int_to_threadref(thref, -cpu - 2);
pack_threadid(ptr, thref);
ptr += BUF_THREAD_ID_SIZE;
*(ptr++) = ',';
i++;
}
}
+
+ do_each_thread(g, p) {
+ if (i >= ks->thr_query && !finished) {
+ int_to_threadref(thref, p->pid);
+ pack_threadid(ptr, thref);
+ ptr += BUF_THREAD_ID_SIZE;
+ *(ptr++) = ',';
+ ks->thr_query++;
+ if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
+ finished = 1;
+ }
+ i++;
+ } while_each_thread(g, p);
+
*(--ptr) = '\0';
break;
@@ -1011,15 +1056,15 @@ static void gdb_cmd_query(struct kgdb_state *ks)
error_packet(remcom_out_buffer, -EINVAL);
break;
}
- if (ks->threadid > 0) {
+ if ((int)ks->threadid > 0) {
kgdb_mem2hex(getthread(ks->linux_regs,
ks->threadid)->comm,
remcom_out_buffer, 16);
} else {
static char tmpstr[23 + BUF_THREAD_ID_SIZE];
- sprintf(tmpstr, "Shadow task %d for pid 0",
- (int)(-ks->threadid-1));
+ sprintf(tmpstr, "shadowCPU%d",
+ (int)(-ks->threadid - 2));
kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
}
break;
@@ -1388,6 +1433,7 @@ acquirelock:
atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
atomic_set(&kgdb_active, -1);
+ touch_softlockup_watchdog();
clocksource_touch_watchdog();
local_irq_restore(flags);
@@ -1418,7 +1464,7 @@ acquirelock:
* Get the passive CPU lock which will hold all the non-primary
* CPU in a spin state while the debugger is active
*/
- if (!kgdb_single_step || !kgdb_contthread) {
+ if (!kgdb_single_step) {
for (i = 0; i < NR_CPUS; i++)
atomic_set(&passive_cpu_wait[i], 1);
}
@@ -1431,7 +1477,7 @@ acquirelock:
#ifdef CONFIG_SMP
/* Signal the other CPUs to enter kgdb_wait() */
- if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup)
+ if ((!kgdb_single_step) && kgdb_do_roundup)
kgdb_roundup_cpus(flags);
#endif
@@ -1450,7 +1496,7 @@ acquirelock:
kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
kgdb_deactivate_sw_breakpoints();
kgdb_single_step = 0;
- kgdb_contthread = NULL;
+ kgdb_contthread = current;
exception_level = 0;
/* Talk to debugger with gdbserial protocol */
@@ -1464,7 +1510,7 @@ acquirelock:
kgdb_info[ks->cpu].task = NULL;
atomic_set(&cpu_in_kgdb[ks->cpu], 0);
- if (!kgdb_single_step || !kgdb_contthread) {
+ if (!kgdb_single_step) {
for (i = NR_CPUS-1; i >= 0; i--)
atomic_set(&passive_cpu_wait[i], 0);
/*
@@ -1480,6 +1526,7 @@ acquirelock:
kgdb_restore:
/* Free kgdb_active */
atomic_set(&kgdb_active, -1);
+ touch_softlockup_watchdog();
clocksource_touch_watchdog();
local_irq_restore(flags);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index d38a6436297..dbda475b13b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
unsigned long nr_lock_classes;
static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+static inline struct lock_class *hlock_class(struct held_lock *hlock)
+{
+ if (!hlock->class_idx) {
+ DEBUG_LOCKS_WARN_ON(1);
+ return NULL;
+ }
+ return lock_classes + hlock->class_idx - 1;
+}
+
#ifdef CONFIG_LOCK_STAT
static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
@@ -222,7 +231,7 @@ static void lock_release_holdtime(struct held_lock *hlock)
holdtime = sched_clock() - hlock->holdtime_stamp;
- stats = get_lock_stats(hlock->class);
+ stats = get_lock_stats(hlock_class(hlock));
if (hlock->read)
lock_time_inc(&stats->read_holdtime, holdtime);
else
@@ -372,6 +381,19 @@ unsigned int nr_process_chains;
unsigned int max_lockdep_depth;
unsigned int max_recursion_depth;
+static unsigned int lockdep_dependency_gen_id;
+
+static bool lockdep_dependency_visit(struct lock_class *source,
+ unsigned int depth)
+{
+ if (!depth)
+ lockdep_dependency_gen_id++;
+ if (source->dep_gen_id == lockdep_dependency_gen_id)
+ return true;
+ source->dep_gen_id = lockdep_dependency_gen_id;
+ return false;
+}
+
#ifdef CONFIG_DEBUG_LOCKDEP
/*
* We cannot printk in early bootup code. Not even early_printk()
@@ -505,7 +527,7 @@ static void print_lockdep_cache(struct lockdep_map *lock)
static void print_lock(struct held_lock *hlock)
{
- print_lock_name(hlock->class);
+ print_lock_name(hlock_class(hlock));
printk(", at: ");
print_ip_sym(hlock->acquire_ip);
}
@@ -558,6 +580,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
{
struct lock_list *entry;
+ if (lockdep_dependency_visit(class, depth))
+ return;
+
if (DEBUG_LOCKS_WARN_ON(depth >= 20))
return;
@@ -850,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
if (!entry)
return 0;
- entry->class = this;
- entry->distance = distance;
if (!save_trace(&entry->trace))
return 0;
+ entry->class = this;
+ entry->distance = distance;
/*
* Since we never remove from the dependency list, the list can
* be walked lockless by other CPUs, it's only allocation
@@ -932,7 +957,7 @@ static noinline int print_circular_bug_tail(void)
if (debug_locks_silent)
return 0;
- this.class = check_source->class;
+ this.class = hlock_class(check_source);
if (!save_trace(&this.trace))
return 0;
@@ -959,6 +984,67 @@ static int noinline print_infinite_recursion_bug(void)
return 0;
}
+unsigned long __lockdep_count_forward_deps(struct lock_class *class,
+ unsigned int depth)
+{
+ struct lock_list *entry;
+ unsigned long ret = 1;
+
+ if (lockdep_dependency_visit(class, depth))
+ return 0;
+
+ /*
+ * Recurse this class's dependency list:
+ */
+ list_for_each_entry(entry, &class->locks_after, entry)
+ ret += __lockdep_count_forward_deps(entry->class, depth + 1);
+
+ return ret;
+}
+
+unsigned long lockdep_count_forward_deps(struct lock_class *class)
+{
+ unsigned long ret, flags;
+
+ local_irq_save(flags);
+ __raw_spin_lock(&lockdep_lock);
+ ret = __lockdep_count_forward_deps(class, 0);
+ __raw_spin_unlock(&lockdep_lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+unsigned long __lockdep_count_backward_deps(struct lock_class *class,
+ unsigned int depth)
+{
+ struct lock_list *entry;
+ unsigned long ret = 1;
+
+ if (lockdep_dependency_visit(class, depth))
+ return 0;
+ /*
+ * Recurse this class's dependency list:
+ */
+ list_for_each_entry(entry, &class->locks_before, entry)
+ ret += __lockdep_count_backward_deps(entry->class, depth + 1);
+
+ return ret;
+}
+
+unsigned long lockdep_count_backward_deps(struct lock_class *class)
+{
+ unsigned long ret, flags;
+
+ local_irq_save(flags);
+ __raw_spin_lock(&lockdep_lock);
+ ret = __lockdep_count_backward_deps(class, 0);
+ __raw_spin_unlock(&lockdep_lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
/*
* Prove that the dependency graph starting at <entry> can not
* lead to <target>. Print an error and return 0 if it does.
@@ -968,6 +1054,9 @@ check_noncircular(struct lock_class *source, unsigned int depth)
{
struct lock_list *entry;
+ if (lockdep_dependency_visit(source, depth))
+ return 1;
+
debug_atomic_inc(&nr_cyclic_check_recursions);
if (depth > max_recursion_depth)
max_recursion_depth = depth;
@@ -977,7 +1066,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
* Check this lock's dependency list:
*/
list_for_each_entry(entry, &source->locks_after, entry) {
- if (entry->class == check_target->class)
+ if (entry->class == hlock_class(check_target))
return print_circular_bug_header(entry, depth+1);
debug_atomic_inc(&nr_cyclic_checks);
if (!check_noncircular(entry->class, depth+1))
@@ -1011,6 +1100,9 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
struct lock_list *entry;
int ret;
+ if (lockdep_dependency_visit(source, depth))
+ return 1;
+
if (depth > max_recursion_depth)
max_recursion_depth = depth;
if (depth >= RECURSION_LIMIT)
@@ -1050,6 +1142,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
struct lock_list *entry;
int ret;
+ if (lockdep_dependency_visit(source, depth))
+ return 1;
+
if (!__raw_spin_is_locked(&lockdep_lock))
return DEBUG_LOCKS_WARN_ON(1);
@@ -1064,6 +1159,11 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
return 2;
}
+ if (!source && debug_locks_off_graph_unlock()) {
+ WARN_ON(1);
+ return 0;
+ }
+
/*
* Check this lock's dependency list:
*/
@@ -1103,9 +1203,9 @@ print_bad_irq_dependency(struct task_struct *curr,
printk("\nand this task is already holding:\n");
print_lock(prev);
printk("which would create a new lock dependency:\n");
- print_lock_name(prev->class);
+ print_lock_name(hlock_class(prev));
printk(" ->");
- print_lock_name(next->class);
+ print_lock_name(hlock_class(next));
printk("\n");
printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
@@ -1146,12 +1246,12 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
find_usage_bit = bit_backwards;
/* fills in <backwards_match> */
- ret = find_usage_backwards(prev->class, 0);
+ ret = find_usage_backwards(hlock_class(prev), 0);
if (!ret || ret == 1)
return ret;
find_usage_bit = bit_forwards;
- ret = find_usage_forwards(next->class, 0);
+ ret = find_usage_forwards(hlock_class(next), 0);
if (!ret || ret == 1)
return ret;
/* ret == 2 */
@@ -1272,18 +1372,32 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
struct lockdep_map *next_instance, int read)
{
struct held_lock *prev;
+ struct held_lock *nest = NULL;
int i;
for (i = 0; i < curr->lockdep_depth; i++) {
prev = curr->held_locks + i;
- if (prev->class != next->class)
+
+ if (prev->instance == next->nest_lock)
+ nest = prev;
+
+ if (hlock_class(prev) != hlock_class(next))
continue;
+
/*
* Allow read-after-read recursion of the same
* lock class (i.e. read_lock(lock)+read_lock(lock)):
*/
if ((read == 2) && prev->read)
return 2;
+
+ /*
+ * We're holding the nest_lock, which serializes this lock's
+ * nesting behaviour.
+ */
+ if (nest)
+ return 2;
+
return print_deadlock_bug(curr, prev, next);
}
return 1;
@@ -1329,7 +1443,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
*/
check_source = next;
check_target = prev;
- if (!(check_noncircular(next->class, 0)))
+ if (!(check_noncircular(hlock_class(next), 0)))
return print_circular_bug_tail();
if (!check_prev_add_irq(curr, prev, next))
@@ -1353,8 +1467,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* chains - the second one will be new, but L1 already has
* L2 added to its dependency list, due to the first chain.)
*/
- list_for_each_entry(entry, &prev->class->locks_after, entry) {
- if (entry->class == next->class) {
+ list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
+ if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
return 2;
@@ -1365,26 +1479,28 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Ok, all validations passed, add the new lock
* to the previous lock's dependency list:
*/
- ret = add_lock_to_list(prev->class, next->class,
- &prev->class->locks_after, next->acquire_ip, distance);
+ ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+ &hlock_class(prev)->locks_after,
+ next->acquire_ip, distance);
if (!ret)
return 0;
- ret = add_lock_to_list(next->class, prev->class,
- &next->class->locks_before, next->acquire_ip, distance);
+ ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+ &hlock_class(next)->locks_before,
+ next->acquire_ip, distance);
if (!ret)
return 0;
/*
* Debugging printouts:
*/
- if (verbose(prev->class) || verbose(next->class)) {
+ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
graph_unlock();
printk("\n new dependency: ");
- print_lock_name(prev->class);
+ print_lock_name(hlock_class(prev));
printk(" => ");
- print_lock_name(next->class);
+ print_lock_name(hlock_class(next));
printk("\n");
dump_stack();
return graph_lock();
@@ -1481,7 +1597,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
struct held_lock *hlock,
u64 chain_key)
{
- struct lock_class *class = hlock->class;
+ struct lock_class *class = hlock_class(hlock);
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
struct held_lock *hlock_curr, *hlock_next;
@@ -1554,7 +1670,7 @@ cache_hit:
if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = cn;
for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class - lock_classes;
+ int lock_id = curr->held_locks[i].class_idx - 1;
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
@@ -1643,14 +1759,13 @@ static void check_chain_key(struct task_struct *curr)
hlock = curr->held_locks + i;
if (chain_key != hlock->prev_chain_key) {
debug_locks_off();
- printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n",
+ WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
curr->lockdep_depth, i,
(unsigned long long)chain_key,
(unsigned long long)hlock->prev_chain_key);
- WARN_ON(1);
return;
}
- id = hlock->class - lock_classes;
+ id = hlock->class_idx - 1;
if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
return;
@@ -1662,11 +1777,10 @@ static void check_chain_key(struct task_struct *curr)
}
if (chain_key != curr->curr_chain_key) {
debug_locks_off();
- printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n",
+ WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
curr->lockdep_depth, i,
(unsigned long long)chain_key,
(unsigned long long)curr->curr_chain_key);
- WARN_ON(1);
}
#endif
}
@@ -1695,7 +1809,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
print_lock(this);
printk("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_stack_trace(this->class->usage_traces + prev_bit, 1);
+ print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
print_irqtrace_events(curr);
printk("\nother info that might help us debug this:\n");
@@ -1714,7 +1828,7 @@ static inline int
valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
- if (unlikely(this->class->usage_mask & (1 << bad_bit)))
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
return print_usage_bug(curr, this, bad_bit, new_bit);
return 1;
}
@@ -1753,7 +1867,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
lockdep_print_held_locks(curr);
printk("\nthe first lock's dependencies:\n");
- print_lock_dependencies(this->class, 0);
+ print_lock_dependencies(hlock_class(this), 0);
printk("\nthe second lock's dependencies:\n");
print_lock_dependencies(other, 0);
@@ -1776,7 +1890,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
find_usage_bit = bit;
/* fills in <forwards_match> */
- ret = find_usage_forwards(this->class, 0);
+ ret = find_usage_forwards(hlock_class(this), 0);
if (!ret || ret == 1)
return ret;
@@ -1795,7 +1909,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
find_usage_bit = bit;
/* fills in <backwards_match> */
- ret = find_usage_backwards(this->class, 0);
+ ret = find_usage_backwards(hlock_class(this), 0);
if (!ret || ret == 1)
return ret;
@@ -1861,7 +1975,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
return 0;
#endif
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_SOFTIRQ:
@@ -1886,7 +2000,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
return 0;
#endif
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_HARDIRQ_READ:
@@ -1899,7 +2013,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_HARDIRQS, "hard"))
return 0;
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_SOFTIRQ_READ:
@@ -1912,7 +2026,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_SOFTIRQS, "soft"))
return 0;
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_HARDIRQS:
@@ -1938,7 +2052,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
return 0;
#endif
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_SOFTIRQS:
@@ -1964,7 +2078,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
return 0;
#endif
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_HARDIRQS_READ:
@@ -1979,7 +2093,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
LOCK_USED_IN_HARDIRQ, "hard"))
return 0;
#endif
- if (hardirq_verbose(this->class))
+ if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_SOFTIRQS_READ:
@@ -1994,7 +2108,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
LOCK_USED_IN_SOFTIRQ, "soft"))
return 0;
#endif
- if (softirq_verbose(this->class))
+ if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
default:
@@ -2310,7 +2424,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
* If already set then do not dirty the cacheline,
* nor do any checks:
*/
- if (likely(this->class->usage_mask & new_mask))
+ if (likely(hlock_class(this)->usage_mask & new_mask))
return 1;
if (!graph_lock())
@@ -2318,14 +2432,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
/*
* Make sure we didnt race:
*/
- if (unlikely(this->class->usage_mask & new_mask)) {
+ if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
graph_unlock();
return 1;
}
- this->class->usage_mask |= new_mask;
+ hlock_class(this)->usage_mask |= new_mask;
- if (!save_trace(this->class->usage_traces + new_bit))
+ if (!save_trace(hlock_class(this)->usage_traces + new_bit))
return 0;
switch (new_bit) {
@@ -2405,7 +2519,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
*/
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
- unsigned long ip)
+ struct lockdep_map *nest_lock, unsigned long ip)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -2459,14 +2573,16 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
return 0;
hlock = curr->held_locks + depth;
-
- hlock->class = class;
+ if (DEBUG_LOCKS_WARN_ON(!class))
+ return 0;
+ hlock->class_idx = class - lock_classes + 1;
hlock->acquire_ip = ip;
hlock->instance = lock;
+ hlock->nest_lock = nest_lock;
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
- hlock->hardirqs_off = hardirqs_off;
+ hlock->hardirqs_off = !!hardirqs_off;
#ifdef CONFIG_LOCK_STAT
hlock->waittime_stamp = 0;
hlock->holdtime_stamp = sched_clock();
@@ -2574,6 +2690,55 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
return 1;
}
+static int
+__lock_set_subclass(struct lockdep_map *lock,
+ unsigned int subclass, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock, *prev_hlock;
+ struct lock_class *class;
+ unsigned int depth;
+ int i;
+
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (hlock->instance == lock)
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+ class = register_lock_class(lock, subclass, 0);
+ hlock->class_idx = class - lock_classes + 1;
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ for (; i < depth; i++) {
+ hlock = curr->held_locks + i;
+ if (!__lock_acquire(hlock->instance,
+ hlock_class(hlock)->subclass, hlock->trylock,
+ hlock->read, hlock->check, hlock->hardirqs_off,
+ hlock->nest_lock, hlock->acquire_ip))
+ return 0;
+ }
+
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ return 0;
+ return 1;
+}
+
/*
* Remove the lock to the list of currently held locks in a
* potentially non-nested (out of order) manner. This is a
@@ -2624,9 +2789,9 @@ found_it:
for (i++; i < depth; i++) {
hlock = curr->held_locks + i;
if (!__lock_acquire(hlock->instance,
- hlock->class->subclass, hlock->trylock,
+ hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->acquire_ip))
+ hlock->nest_lock, hlock->acquire_ip))
return 0;
}
@@ -2669,7 +2834,7 @@ static int lock_release_nested(struct task_struct *curr,
#ifdef CONFIG_DEBUG_LOCKDEP
hlock->prev_chain_key = 0;
- hlock->class = NULL;
+ hlock->class_idx = 0;
hlock->acquire_ip = 0;
hlock->irq_context = 0;
#endif
@@ -2738,18 +2903,36 @@ static void check_flags(unsigned long flags)
#endif
}
+void
+lock_set_subclass(struct lockdep_map *lock,
+ unsigned int subclass, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ check_flags(flags);
+ if (__lock_set_subclass(lock, subclass, ip))
+ check_chain_key(current);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_set_subclass);
+
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
*/
void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
- int trylock, int read, int check, unsigned long ip)
+ int trylock, int read, int check,
+ struct lockdep_map *nest_lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(!lock_stat && !prove_locking))
- return;
-
if (unlikely(current->lockdep_recursion))
return;
@@ -2758,7 +2941,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
current->lockdep_recursion = 1;
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), ip);
+ irqs_disabled_flags(flags), nest_lock, ip);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
@@ -2770,9 +2953,6 @@ void lock_release(struct lockdep_map *lock, int nested,
{
unsigned long flags;
- if (unlikely(!lock_stat && !prove_locking))
- return;
-
if (unlikely(current->lockdep_recursion))
return;
@@ -2845,11 +3025,11 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
found_it:
hlock->waittime_stamp = sched_clock();
- point = lock_contention_point(hlock->class, ip);
+ point = lock_contention_point(hlock_class(hlock), ip);
- stats = get_lock_stats(hlock->class);
+ stats = get_lock_stats(hlock_class(hlock));
if (point < ARRAY_SIZE(stats->contention_point))
- stats->contention_point[i]++;
+ stats->contention_point[point]++;
if (lock->cpu != smp_processor_id())
stats->bounces[bounce_contended + !!hlock->read]++;
put_lock_stats(stats);
@@ -2893,7 +3073,7 @@ found_it:
hlock->holdtime_stamp = now;
}
- stats = get_lock_stats(hlock->class);
+ stats = get_lock_stats(hlock_class(hlock));
if (waittime) {
if (hlock->read)
lock_time_inc(&stats->read_waittime, waittime);
@@ -2988,6 +3168,7 @@ static void zap_class(struct lock_class *class)
list_del_rcu(&class->hash_entry);
list_del_rcu(&class->lock_entry);
+ class->key = NULL;
}
static inline int within(const void *addr, void *start, unsigned long size)
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index c3600a091a2..56b196932c0 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -17,9 +17,6 @@
*/
#define MAX_LOCKDEP_ENTRIES 8192UL
-#define MAX_LOCKDEP_KEYS_BITS 11
-#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
-
#define MAX_LOCKDEP_CHAINS_BITS 14
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
@@ -53,6 +50,22 @@ extern unsigned int nr_process_chains;
extern unsigned int max_lockdep_depth;
extern unsigned int max_recursion_depth;
+#ifdef CONFIG_PROVE_LOCKING
+extern unsigned long lockdep_count_forward_deps(struct lock_class *);
+extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#else
+static inline unsigned long
+lockdep_count_forward_deps(struct lock_class *class)
+{
+ return 0;
+}
+static inline unsigned long
+lockdep_count_backward_deps(struct lock_class *class)
+{
+ return 0;
+}
+#endif
+
#ifdef CONFIG_DEBUG_LOCKDEP
/*
* Various lockdep statistics:
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 9b0e940e254..20dbcbf9c7d 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, void *v)
{
}
-static unsigned long count_forward_deps(struct lock_class *class)
-{
- struct lock_list *entry;
- unsigned long ret = 1;
-
- /*
- * Recurse this class's dependency list:
- */
- list_for_each_entry(entry, &class->locks_after, entry)
- ret += count_forward_deps(entry->class);
-
- return ret;
-}
-
-static unsigned long count_backward_deps(struct lock_class *class)
-{
- struct lock_list *entry;
- unsigned long ret = 1;
-
- /*
- * Recurse this class's dependency list:
- */
- list_for_each_entry(entry, &class->locks_before, entry)
- ret += count_backward_deps(entry->class);
-
- return ret;
-}
-
static void print_name(struct seq_file *m, struct lock_class *class)
{
char str[128];
@@ -110,7 +82,6 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- unsigned long nr_forward_deps, nr_backward_deps;
struct lock_class *class = v;
struct lock_list *entry;
char c1, c2, c3, c4;
@@ -124,11 +95,10 @@ static int l_show(struct seq_file *m, void *v)
#ifdef CONFIG_DEBUG_LOCKDEP
seq_printf(m, " OPS:%8ld", class->ops);
#endif
- nr_forward_deps = count_forward_deps(class);
- seq_printf(m, " FD:%5ld", nr_forward_deps);
-
- nr_backward_deps = count_backward_deps(class);
- seq_printf(m, " BD:%5ld", nr_backward_deps);
+#ifdef CONFIG_PROVE_LOCKING
+ seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
+ seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
+#endif
get_usage_chars(class, &c1, &c2, &c3, &c4);
seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
@@ -229,6 +199,9 @@ static int lc_show(struct seq_file *m, void *v)
for (i = 0; i < chain->depth; i++) {
class = lock_chain_get_class(chain, i);
+ if (!class->key)
+ continue;
+
seq_printf(m, "[%p] ", class->key);
print_name(m, class);
seq_puts(m, "\n");
@@ -350,7 +323,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
nr_hardirq_read_unsafe++;
- sum_forward_deps += count_forward_deps(class);
+#ifdef CONFIG_PROVE_LOCKING
+ sum_forward_deps += lockdep_count_forward_deps(class);
+#endif
}
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
@@ -497,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
{
unsigned long rem;
+ nr += 5; /* for display rounding */
rem = do_div(nr, 1000); /* XXX: do_div_signed */
- snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10);
+ snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
}
static void seq_time(struct seq_file *m, s64 time)
diff --git a/kernel/marker.c b/kernel/marker.c
index 971da531790..7d1faecd7a5 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -126,6 +126,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
struct marker_probe_closure *multi;
int i;
/*
+ * Read mdata->ptype before mdata->multi.
+ */
+ smp_rmb();
+ multi = mdata->multi;
+ /*
* multi points to an array, therefore accessing the array
* depends on reading multi. However, even in this case,
* we must insure that the pointer is read _before_ the array
@@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
* in the fast path, so put the explicit barrier here.
*/
smp_read_barrier_depends();
- multi = mdata->multi;
for (i = 0; multi[i].func; i++) {
va_start(args, call_private);
multi[i].func(multi[i].probe_private, call_private,
@@ -175,6 +179,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
struct marker_probe_closure *multi;
int i;
/*
+ * Read mdata->ptype before mdata->multi.
+ */
+ smp_rmb();
+ multi = mdata->multi;
+ /*
* multi points to an array, therefore accessing the array
* depends on reading multi. However, even in this case,
* we must insure that the pointer is read _before_ the array
@@ -182,7 +191,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
* in the fast path, so put the explicit barrier here.
*/
smp_read_barrier_depends();
- multi = mdata->multi;
for (i = 0; multi[i].func; i++)
multi[i].func(multi[i].probe_private, call_private,
mdata->format, &args);
diff --git a/kernel/module.c b/kernel/module.c
index 61d212120df..9db11911e04 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1799,7 +1799,7 @@ static void *module_alloc_update_bounds(unsigned long size)
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
-static struct module *load_module(void __user *umod,
+static noinline struct module *load_module(void __user *umod,
unsigned long len,
const char __user *uargs)
{
@@ -2288,7 +2288,7 @@ sys_init_module(void __user *umod,
/* Start the module */
if (mod->init != NULL)
- ret = mod->init();
+ ret = do_one_initcall(mod->init);
if (ret < 0) {
/* Init routine failed: abort. Try to protect us from
buggy refcounters. */
diff --git a/kernel/mutex.c b/kernel/mutex.c
index bcdc9ac8ef6..12c779dc65d 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -34,6 +34,7 @@
/***
* mutex_init - initialize the mutex
* @lock: the mutex to be initialized
+ * @key: the lock_class_key for the class; used by mutex lock debugging
*
* Initialize the mutex to unlocked state.
*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 21575fc46d0..1d3ef29a258 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,6 @@
*/
#include <linux/module.h>
-#include <linux/version.h>
#include <linux/nsproxy.h>
#include <linux/init_task.h>
#include <linux/mnt_namespace.h>
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ea567b78d1a..fab8ea86fac 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -179,9 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
-
- /* Child reaper for the pid namespace is going away */
- pid_ns->child_reaper = NULL;
acct_exit_ns(pid_ns);
return;
}
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 8cb75702638..dfdec524d1b 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -24,7 +24,7 @@
* requirement that the application has is cleaned up when closes the file
* pointer or exits the pm_qos_object will get an opportunity to clean up.
*
- * mark gross mgross@linux.intel.com
+ * Mark Gross <mgross@linux.intel.com>
*/
#include <linux/pm_qos_params.h>
@@ -43,7 +43,7 @@
#include <linux/uaccess.h>
/*
- * locking rule: all changes to target_value or requirements or notifiers lists
+ * locking rule: all changes to requirements or notifiers lists
* or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
* held, taken with _irqsave. One lock to rule them all
*/
@@ -66,7 +66,7 @@ struct pm_qos_object {
struct miscdevice pm_qos_power_miscdev;
char *name;
s32 default_value;
- s32 target_value;
+ atomic_t target_value;
s32 (*comparitor)(s32, s32);
};
@@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
.notifiers = &cpu_dma_lat_notifier,
.name = "cpu_dma_latency",
.default_value = 2000 * USEC_PER_SEC,
- .target_value = 2000 * USEC_PER_SEC,
+ .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
.comparitor = min_compare
};
@@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = {
.notifiers = &network_lat_notifier,
.name = "network_latency",
.default_value = 2000 * USEC_PER_SEC,
- .target_value = 2000 * USEC_PER_SEC,
+ .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
.comparitor = min_compare
};
@@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = {
.notifiers = &network_throughput_notifier,
.name = "network_throughput",
.default_value = 0,
- .target_value = 0,
+ .target_value = ATOMIC_INIT(0),
.comparitor = max_compare
};
@@ -150,11 +150,11 @@ static void update_target(int target)
extreme_value = pm_qos_array[target]->comparitor(
extreme_value, node->value);
}
- if (pm_qos_array[target]->target_value != extreme_value) {
+ if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
call_notifier = 1;
- pm_qos_array[target]->target_value = extreme_value;
+ atomic_set(&pm_qos_array[target]->target_value, extreme_value);
pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
- pm_qos_array[target]->target_value);
+ atomic_read(&pm_qos_array[target]->target_value));
}
spin_unlock_irqrestore(&pm_qos_lock, flags);
@@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor)
*/
int pm_qos_requirement(int pm_qos_class)
{
- int ret_val;
- unsigned long flags;
-
- spin_lock_irqsave(&pm_qos_lock, flags);
- ret_val = pm_qos_array[pm_qos_class]->target_value;
- spin_unlock_irqrestore(&pm_qos_lock, flags);
-
- return ret_val;
+ return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
}
EXPORT_SYMBOL_GPL(pm_qos_requirement);
@@ -211,8 +204,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement);
* @value: defines the qos request
*
* This function inserts a new entry in the pm_qos_class list of requested qos
- * performance charactoistics. It recomputes the agregate QoS expectations for
- * the pm_qos_class of parrameters.
+ * performance characteristics. It recomputes the aggregate QoS expectations
+ * for the pm_qos_class of parameters.
*/
int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
{
@@ -250,10 +243,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
* @name: identifies the request
* @value: defines the qos request
*
- * Updates an existing qos requierement for the pm_qos_class of parameters along
+ * Updates an existing qos requirement for the pm_qos_class of parameters along
* with updating the target pm_qos_class value.
*
- * If the named request isn't in the lest then no change is made.
+ * If the named request isn't in the list then no change is made.
*/
int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
{
@@ -287,7 +280,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
* @pm_qos_class: identifies which list of qos request to us
* @name: identifies the request
*
- * Will remove named qos request from pm_qos_class list of parrameters and
+ * Will remove named qos request from pm_qos_class list of parameters and
* recompute the current target value for the pm_qos_class.
*/
void pm_qos_remove_requirement(int pm_qos_class, char *name)
@@ -319,7 +312,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
* @notifier: notifier block managed by caller.
*
* will register the notifier into a notification chain that gets called
- * uppon changes to the pm_qos_class target value.
+ * upon changes to the pm_qos_class target value.
*/
int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
{
@@ -338,7 +331,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
* @notifier: notifier block to be removed.
*
* will remove the notifier from the notification chain that gets called
- * uppon changes to the pm_qos_class target value.
+ * upon changes to the pm_qos_class target value.
*/
int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
{
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9a21681aa80..5131e547116 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -289,21 +289,29 @@ void do_schedule_next_timer(struct siginfo *info)
else
schedule_next_timer(timr);
- info->si_overrun = timr->it_overrun_last;
+ info->si_overrun += timr->it_overrun_last;
}
if (timr)
unlock_timer(timr, flags);
}
-int posix_timer_event(struct k_itimer *timr,int si_private)
+int posix_timer_event(struct k_itimer *timr, int si_private)
{
- memset(&timr->sigq->info, 0, sizeof(siginfo_t));
+ /*
+ * FIXME: if ->sigq is queued we can race with
+ * dequeue_signal()->do_schedule_next_timer().
+ *
+ * If dequeue_signal() sees the "right" value of
+ * si_sys_private it calls do_schedule_next_timer().
+ * We re-queue ->sigq and drop ->it_lock().
+ * do_schedule_next_timer() locks the timer
+ * and re-schedules it while ->sigq is pending.
+ * Not really bad, but not that we want.
+ */
timr->sigq->info.si_sys_private = si_private;
- /* Send signal to the process that owns this timer.*/
timr->sigq->info.si_signo = timr->it_sigev_signo;
- timr->sigq->info.si_errno = 0;
timr->sigq->info.si_code = SI_TIMER;
timr->sigq->info.si_tid = timr->it_id;
timr->sigq->info.si_value = timr->it_sigev_value;
@@ -433,8 +441,9 @@ static struct k_itimer * alloc_posix_timer(void)
return tmr;
if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
kmem_cache_free(posix_timers_cache, tmr);
- tmr = NULL;
+ return NULL;
}
+ memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
return tmr;
}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f011e0870b5..bbd85c60f74 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -21,6 +21,7 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
+#include <linux/ftrace.h>
#include "power.h"
@@ -255,7 +256,7 @@ static int create_image(int platform_mode)
int hibernation_snapshot(int platform_mode)
{
- int error;
+ int error, ftrace_save;
/* Free memory before shutting down devices. */
error = swsusp_shrink_memory();
@@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode)
goto Close;
suspend_console();
+ ftrace_save = __ftrace_enabled_save();
error = device_suspend(PMSG_FREEZE);
if (error)
goto Recover_platform;
@@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode)
Resume_devices:
device_resume(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+ __ftrace_enabled_restore(ftrace_save);
resume_console();
Close:
platform_end(platform_mode);
@@ -366,10 +369,11 @@ static int resume_target_kernel(void)
int hibernation_restore(int platform_mode)
{
- int error;
+ int error, ftrace_save;
pm_prepare_console();
suspend_console();
+ ftrace_save = __ftrace_enabled_save();
error = device_suspend(PMSG_QUIESCE);
if (error)
goto Finish;
@@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode)
platform_restore_cleanup(platform_mode);
device_resume(PMSG_RECOVER);
Finish:
+ __ftrace_enabled_restore(ftrace_save);
resume_console();
pm_restore_console();
return error;
@@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode)
int hibernation_platform_enter(void)
{
- int error;
+ int error, ftrace_save;
if (!hibernation_ops)
return -ENOSYS;
@@ -411,6 +416,7 @@ int hibernation_platform_enter(void)
goto Close;
suspend_console();
+ ftrace_save = __ftrace_enabled_save();
error = device_suspend(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -445,6 +451,7 @@ int hibernation_platform_enter(void)
hibernation_ops->finish();
Resume_devices:
device_resume(PMSG_RESTORE);
+ __ftrace_enabled_restore(ftrace_save);
resume_console();
Close:
hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0b7476f5d2a..540b16b6856 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,6 +21,7 @@
#include <linux/freezer.h>
#include <linux/vmstat.h>
#include <linux/syscalls.h>
+#include <linux/ftrace.h>
#include "power.h"
@@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state)
*/
int suspend_devices_and_enter(suspend_state_t state)
{
- int error;
+ int error, ftrace_save;
if (!suspend_ops)
return -ENOSYS;
@@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state)
goto Close;
}
suspend_console();
+ ftrace_save = __ftrace_enabled_save();
suspend_test_start();
error = device_suspend(PMSG_SUSPEND);
if (error) {
@@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
device_resume(PMSG_RESUME);
suspend_test_finish("resume devices");
+ __ftrace_enabled_restore(ftrace_save);
resume_console();
Close:
if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0abf9a463f..80ccac849e4 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -14,7 +14,6 @@
#include <linux/module.h>
#include <linux/file.h>
#include <linux/utsname.h>
-#include <linux/version.h>
#include <linux/delay.h>
#include <linux/bitops.h>
#include <linux/genhd.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index a7f7559c5f6..a430fd04008 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1291,32 +1291,16 @@ static int __init disable_boot_consoles(void)
}
late_initcall(disable_boot_consoles);
-/**
- * tty_write_message - write a message to a certain tty, not just the console.
- * @tty: the destination tty_struct
- * @msg: the message to write
- *
- * This is used for messages that need to be redirected to a specific tty.
- * We don't put it into the syslog queue right now maybe in the future if
- * really needed.
- */
-void tty_write_message(struct tty_struct *tty, char *msg)
-{
- if (tty && tty->ops->write)
- tty->ops->write(tty, msg, strlen(msg));
- return;
-}
-
#if defined CONFIG_PRINTK
-DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
/*
* printk rate limiting, lifted from the networking subsystem.
*
- * This enforces a rate limit: not more than one kernel message
- * every printk_ratelimit_jiffies to make a denial-of-service
- * attack impossible.
+ * This enforces a rate limit: not more than 10 kernel messages
+ * every 5s to make a denial-of-service attack impossible.
*/
+DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
+
int printk_ratelimit(void)
{
return __ratelimit(&printk_ratelimit_state);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 082b3fcb32a..356699a96d5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -140,7 +140,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
if (!dumpable && !capable(CAP_SYS_PTRACE))
return -EPERM;
- return security_ptrace(current, task, mode);
+ return security_ptrace_may_access(task, mode);
}
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -499,8 +499,7 @@ repeat:
goto repeat;
}
- ret = security_ptrace(current->parent, current,
- PTRACE_MODE_ATTACH);
+ ret = security_ptrace_traceme(current->parent);
/*
* Set the ptrace bit in the process ptrace flags.
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index aad93cdc9f6..37f72e55154 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
+#include <linux/time.h>
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
@@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
static struct rcu_ctrlblk rcu_ctrlblk = {
.cur = -300,
.completed = -300,
+ .pending = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
+ .pending = -300,
.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
@@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
{
int cpu;
cpumask_t cpumask;
+ unsigned long flags;
+
set_need_resched();
+ spin_lock_irqsave(&rcp->lock, flags);
if (unlikely(!rcp->signaled)) {
rcp->signaled = 1;
/*
@@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
for_each_cpu_mask_nr(cpu, cpumask)
smp_send_reschedule(cpu);
}
+ spin_unlock_irqrestore(&rcp->lock, flags);
}
#else
static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
}
#endif
+static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
+ struct rcu_data *rdp)
+{
+ long batch;
+
+ head->next = NULL;
+ smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
+
+ /*
+ * Determine the batch number of this callback.
+ *
+ * Using ACCESS_ONCE to avoid the following error when gcc eliminates
+ * local variable "batch" and emits codes like this:
+ * 1) rdp->batch = rcp->cur + 1 # gets old value
+ * ......
+ * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
+ * then [*nxttail[0], *nxttail[1]) may contain callbacks
+ * that batch# = rdp->batch, see the comment of struct rcu_data.
+ */
+ batch = ACCESS_ONCE(rcp->cur) + 1;
+
+ if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
+ /* process callbacks */
+ rdp->nxttail[0] = rdp->nxttail[1];
+ rdp->nxttail[1] = rdp->nxttail[2];
+ if (rcu_batch_after(batch - 1, rdp->batch))
+ rdp->nxttail[0] = rdp->nxttail[2];
+ }
+
+ rdp->batch = batch;
+ *rdp->nxttail[2] = head;
+ rdp->nxttail[2] = &head->next;
+
+ if (unlikely(++rdp->qlen > qhimark)) {
+ rdp->blimit = INT_MAX;
+ force_quiescent_state(rdp, &rcu_ctrlblk);
+ }
+}
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+ rcp->gp_start = jiffies;
+ rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ int cpu;
+ long delta;
+ unsigned long flags;
+
+ /* Only let one CPU complain about others per time interval. */
+
+ spin_lock_irqsave(&rcp->lock, flags);
+ delta = jiffies - rcp->jiffies_stall;
+ if (delta < 2 || rcp->cur != rcp->completed) {
+ spin_unlock_irqrestore(&rcp->lock, flags);
+ return;
+ }
+ rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+ spin_unlock_irqrestore(&rcp->lock, flags);
+
+ /* OK, time to rat on our buddy... */
+
+ printk(KERN_ERR "RCU detected CPU stalls:");
+ for_each_possible_cpu(cpu) {
+ if (cpu_isset(cpu, rcp->cpumask))
+ printk(" %d", cpu);
+ }
+ printk(" (detected by %d, t=%ld jiffies)\n",
+ smp_processor_id(), (long)(jiffies - rcp->gp_start));
+}
+
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ unsigned long flags;
+
+ printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+ smp_processor_id(), jiffies,
+ jiffies - rcp->gp_start);
+ dump_stack();
+ spin_lock_irqsave(&rcp->lock, flags);
+ if ((long)(jiffies - rcp->jiffies_stall) >= 0)
+ rcp->jiffies_stall =
+ jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+ spin_unlock_irqrestore(&rcp->lock, flags);
+ set_need_resched(); /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+ long delta;
+
+ delta = jiffies - rcp->jiffies_stall;
+ if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+
+ /* We haven't checked in, so go dump stack. */
+ print_cpu_stall(rcp);
+
+ } else if (rcp->cur != rcp->completed && delta >= 2) {
+
+ /* They had two seconds to dump stack, so complain. */
+ print_other_cpu_stall(rcp);
+ }
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+}
+
+static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
@@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
- struct rcu_data *rdp;
head->func = func;
- head->next = NULL;
local_irq_save(flags);
- rdp = &__get_cpu_var(rcu_data);
- *rdp->nxttail = head;
- rdp->nxttail = &head->next;
- if (unlikely(++rdp->qlen > qhimark)) {
- rdp->blimit = INT_MAX;
- force_quiescent_state(rdp, &rcu_ctrlblk);
- }
+ __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head,
void (*func)(struct rcu_head *rcu))
{
unsigned long flags;
- struct rcu_data *rdp;
head->func = func;
- head->next = NULL;
local_irq_save(flags);
- rdp = &__get_cpu_var(rcu_bh_data);
- *rdp->nxttail = head;
- rdp->nxttail = &head->next;
-
- if (unlikely(++rdp->qlen > qhimark)) {
- rdp->blimit = INT_MAX;
- force_quiescent_state(rdp, &rcu_bh_ctrlblk);
- }
-
+ __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
static inline void raise_rcu_softirq(void)
{
raise_softirq(RCU_SOFTIRQ);
- /*
- * The smp_mb() here is required to ensure that this cpu's
- * __rcu_process_callbacks() reads the most recently updated
- * value of rcu->cur.
- */
- smp_mb();
}
/*
@@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void)
*/
static void rcu_do_batch(struct rcu_data *rdp)
{
+ unsigned long flags;
struct rcu_head *next, *list;
int count = 0;
@@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
}
rdp->donelist = list;
- local_irq_disable();
+ local_irq_save(flags);
rdp->qlen -= count;
- local_irq_enable();
+ local_irq_restore(flags);
if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
rdp->blimit = blimit;
@@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
* rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
* period (if necessary).
*/
+
/*
* Register a new batch of callbacks, and start it up if there is currently no
* active batch and the batch to be registered has not already occurred.
@@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp)
*/
static void rcu_start_batch(struct rcu_ctrlblk *rcp)
{
- if (rcp->next_pending &&
+ if (rcp->cur != rcp->pending &&
rcp->completed == rcp->cur) {
- rcp->next_pending = 0;
- /*
- * next_pending == 0 must be visible in
- * __rcu_process_callbacks() before it can see new value of cur.
- */
- smp_wmb();
rcp->cur++;
+ record_gp_stall_check_time(rcp);
/*
* Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
+ unsigned long flags;
+
if (rdp->quiescbatch != rcp->cur) {
/* start new grace period: */
rdp->qs_pending = 1;
@@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
return;
rdp->qs_pending = 0;
- spin_lock(&rcp->lock);
+ spin_lock_irqsave(&rcp->lock, flags);
/*
* rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
* during cpu startup. Ignore the quiescent state.
@@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
if (likely(rdp->quiescbatch == rcp->cur))
cpu_quiet(rdp->cpu, rcp);
- spin_unlock(&rcp->lock);
+ spin_unlock_irqrestore(&rcp->lock, flags);
}
@@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
* which is dead and hence not processing interrupts.
*/
static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
- struct rcu_head **tail)
+ struct rcu_head **tail, long batch)
{
- local_irq_disable();
- *this_rdp->nxttail = list;
- if (list)
- this_rdp->nxttail = tail;
- local_irq_enable();
+ unsigned long flags;
+
+ if (list) {
+ local_irq_save(flags);
+ this_rdp->batch = batch;
+ *this_rdp->nxttail[2] = list;
+ this_rdp->nxttail[2] = tail;
+ local_irq_restore(flags);
+ }
}
static void __rcu_offline_cpu(struct rcu_data *this_rdp,
struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
- /* if the cpu going offline owns the grace period
+ unsigned long flags;
+
+ /*
+ * if the cpu going offline owns the grace period
* we can block indefinitely waiting for it, so flush
* it here
*/
- spin_lock_bh(&rcp->lock);
+ spin_lock_irqsave(&rcp->lock, flags);
if (rcp->cur != rcp->completed)
cpu_quiet(rdp->cpu, rcp);
- spin_unlock_bh(&rcp->lock);
- rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
- rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
- rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
+ rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
+ rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
+ spin_unlock(&rcp->lock);
- local_irq_disable();
this_rdp->qlen += rdp->qlen;
- local_irq_enable();
+ local_irq_restore(flags);
}
static void rcu_offline_cpu(int cpu)
@@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu)
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
- if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
- *rdp->donetail = rdp->curlist;
- rdp->donetail = rdp->curtail;
- rdp->curlist = NULL;
- rdp->curtail = &rdp->curlist;
- }
+ unsigned long flags;
+ long completed_snap;
- if (rdp->nxtlist && !rdp->curlist) {
- local_irq_disable();
- rdp->curlist = rdp->nxtlist;
- rdp->curtail = rdp->nxttail;
- rdp->nxtlist = NULL;
- rdp->nxttail = &rdp->nxtlist;
- local_irq_enable();
+ if (rdp->nxtlist) {
+ local_irq_save(flags);
+ completed_snap = ACCESS_ONCE(rcp->completed);
/*
- * start the next batch of callbacks
+ * move the other grace-period-completed entries to
+ * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
*/
+ if (!rcu_batch_before(completed_snap, rdp->batch))
+ rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
+ else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
+ rdp->nxttail[0] = rdp->nxttail[1];
- /* determine batch number */
- rdp->batch = rcp->cur + 1;
- /* see the comment and corresponding wmb() in
- * the rcu_start_batch()
+ /*
+ * the grace period for entries in
+ * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
+ * move these entries to donelist
*/
- smp_rmb();
+ if (rdp->nxttail[0] != &rdp->nxtlist) {
+ *rdp->donetail = rdp->nxtlist;
+ rdp->donetail = rdp->nxttail[0];
+ rdp->nxtlist = *rdp->nxttail[0];
+ *rdp->donetail = NULL;
+
+ if (rdp->nxttail[1] == rdp->nxttail[0])
+ rdp->nxttail[1] = &rdp->nxtlist;
+ if (rdp->nxttail[2] == rdp->nxttail[0])
+ rdp->nxttail[2] = &rdp->nxtlist;
+ rdp->nxttail[0] = &rdp->nxtlist;
+ }
+
+ local_irq_restore(flags);
+
+ if (rcu_batch_after(rdp->batch, rcp->pending)) {
+ unsigned long flags2;
- if (!rcp->next_pending) {
/* and start it/schedule start if it's a new batch */
- spin_lock(&rcp->lock);
- rcp->next_pending = 1;
- rcu_start_batch(rcp);
- spin_unlock(&rcp->lock);
+ spin_lock_irqsave(&rcp->lock, flags2);
+ if (rcu_batch_after(rdp->batch, rcp->pending)) {
+ rcp->pending = rdp->batch;
+ rcu_start_batch(rcp);
+ }
+ spin_unlock_irqrestore(&rcp->lock, flags2);
}
}
@@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
static void rcu_process_callbacks(struct softirq_action *unused)
{
+ /*
+ * Memory references from any prior RCU read-side critical sections
+ * executed by the interrupted code must be see before any RCU
+ * grace-period manupulations below.
+ */
+
+ smp_mb(); /* See above block comment. */
+
__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
+
+ /*
+ * Memory references from any later RCU read-side critical sections
+ * executed by the interrupted code must be see after any RCU
+ * grace-period manupulations above.
+ */
+
+ smp_mb(); /* See above block comment. */
}
static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
- /* This cpu has pending rcu entries and the grace period
- * for them has completed.
- */
- if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
- return 1;
+ /* Check for CPU stalls, if enabled. */
+ check_cpu_stall(rcp);
- /* This cpu has no pending entries, but there are new entries */
- if (!rdp->curlist && rdp->nxtlist)
- return 1;
+ if (rdp->nxtlist) {
+ long completed_snap = ACCESS_ONCE(rcp->completed);
+
+ /*
+ * This cpu has pending rcu entries and the grace period
+ * for them has completed.
+ */
+ if (!rcu_batch_before(completed_snap, rdp->batch))
+ return 1;
+ if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
+ rdp->nxttail[0] != rdp->nxttail[1])
+ return 1;
+ if (rdp->nxttail[0] != &rdp->nxtlist)
+ return 1;
+
+ /*
+ * This cpu has pending rcu entries and the new batch
+ * for then hasn't been started nor scheduled start
+ */
+ if (rcu_batch_after(rdp->batch, rcp->pending))
+ return 1;
+ }
/* This cpu has finished callbacks to invoke */
if (rdp->donelist)
@@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu)
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
- return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
+ return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
}
+/*
+ * Top-level function driving RCU grace-period detection, normally
+ * invoked from the scheduler-clock interrupt. This function simply
+ * increments counters that are read only from softirq by this same
+ * CPU, so there are no memory barriers required.
+ */
void rcu_check_callbacks(int cpu, int user)
{
if (user ||
@@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user)
static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rcp->lock, flags);
memset(rdp, 0, sizeof(*rdp));
- rdp->curtail = &rdp->curlist;
- rdp->nxttail = &rdp->nxtlist;
+ rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
rdp->donetail = &rdp->donelist;
rdp->quiescbatch = rcp->completed;
rdp->qs_pending = 0;
rdp->cpu = cpu;
rdp->blimit = blimit;
+ spin_unlock_irqrestore(&rcp->lock, flags);
}
static void __cpuinit rcu_online_cpu(int cpu)
@@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
*/
void __init __rcu_init(void)
{
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+ printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
/* Register notifier for non-boot CPUs */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f14f372cf6f..467d5940f62 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head *head)
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*/
+void synchronize_rcu(void); /* Makes kernel-doc tools happy */
synchronize_rcu_xxx(synchronize_rcu, call_rcu)
EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 27827931ca0..ca4bbbe04aa 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -59,14 +59,6 @@
#include <linux/rcupreempt_trace.h>
/*
- * Macro that prevents the compiler from reordering accesses, but does
- * absolutely -nothing- to prevent CPUs from reordering. This is used
- * only to mediate communication between mainline code and hardware
- * interrupt and NMI handlers.
- */
-#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
-
-/*
* PREEMPT_RCU data structures.
*/
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 5edf82c34bb..35c2d3360ec 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -308,11 +308,16 @@ out:
static int __init rcupreempt_trace_init(void)
{
+ int ret;
+
mutex_init(&rcupreempt_trace_mutex);
rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
if (!rcupreempt_trace_buf)
return 1;
- return rcupreempt_debugfs_init();
+ ret = rcupreempt_debugfs_init();
+ if (ret)
+ kfree(rcupreempt_trace_buf);
+ return ret;
}
static void __exit rcupreempt_trace_cleanup(void)
diff --git a/kernel/relay.c b/kernel/relay.c
index 04006ef970b..8d13a7855c0 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -944,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf,
size_t n_subbufs = buf->chan->n_subbufs;
size_t read_subbuf;
+ if (buf->subbufs_produced == buf->subbufs_consumed &&
+ buf->offset == buf->bytes_consumed)
+ return;
+
if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
relay_subbufs_consumed(buf->chan, buf->cpu, 1);
buf->bytes_consumed = 0;
@@ -975,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
relay_file_read_consume(buf, read_pos, 0);
+ consumed = buf->subbufs_consumed;
+
if (unlikely(buf->offset > subbuf_size)) {
if (produced == consumed)
return 0;
@@ -993,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
if (consumed > produced)
produced += n_subbufs * subbuf_size;
- if (consumed == produced)
+ if (consumed == produced) {
+ if (buf->offset == subbuf_size &&
+ buf->subbufs_produced > buf->subbufs_consumed)
+ return 1;
return 0;
+ }
return 1;
}
diff --git a/kernel/resource.c b/kernel/resource.c
index 74af2d7cb5a..414d6fc9131 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new,
EXPORT_SYMBOL(allocate_resource);
-/**
- * insert_resource - Inserts a resource in the resource tree
- * @parent: parent of the new resource
- * @new: new resource to insert
- *
- * Returns 0 on success, -EBUSY if the resource can't be inserted.
- *
- * This function is equivalent to request_resource when no conflict
- * happens. If a conflict happens, and the conflicting resources
- * entirely fit within the range of the new resource, then the new
- * resource is inserted and the conflicting resources become children of
- * the new resource.
+/*
+ * Insert a resource into the resource tree. If successful, return NULL,
+ * otherwise return the conflicting resource (compare to __request_resource())
*/
-int insert_resource(struct resource *parent, struct resource *new)
+static struct resource * __insert_resource(struct resource *parent, struct resource *new)
{
- int result;
struct resource *first, *next;
- write_lock(&resource_lock);
-
for (;; parent = first) {
- result = 0;
first = __request_resource(parent, new);
if (!first)
- goto out;
+ return first;
- result = -EBUSY;
if (first == parent)
- goto out;
+ return first;
if ((first->start > new->start) || (first->end < new->end))
break;
@@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new)
for (next = first; ; next = next->sibling) {
/* Partial overlap? Bad, and unfixable */
if (next->start < new->start || next->end > new->end)
- goto out;
+ return next;
if (!next->sibling)
break;
if (next->sibling->start > new->end)
break;
}
- result = 0;
-
new->parent = parent;
new->sibling = next->sibling;
new->child = first;
@@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new)
next = next->sibling;
next->sibling = new;
}
+ return NULL;
+}
- out:
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ *
+ * This function is equivalent to request_resource when no conflict
+ * happens. If a conflict happens, and the conflicting resources
+ * entirely fit within the range of the new resource, then the new
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+ struct resource *conflict;
+
+ write_lock(&resource_lock);
+ conflict = __insert_resource(parent, new);
+ write_unlock(&resource_lock);
+ return conflict ? -EBUSY : 0;
+}
+
+/**
+ * insert_resource_expand_to_fit - Insert a resource into the resource tree
+ * @root: root resource descriptor
+ * @new: new resource to insert
+ *
+ * Insert a resource into the resource tree, possibly expanding it in order
+ * to make it encompass any conflicting resources.
+ */
+void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
+{
+ if (new->parent)
+ return;
+
+ write_lock(&resource_lock);
+ for (;;) {
+ struct resource *conflict;
+
+ conflict = __insert_resource(root, new);
+ if (!conflict)
+ break;
+ if (conflict == root)
+ break;
+
+ /* Ok, expand resource to cover the conflict, then try again .. */
+ if (conflict->start < new->start)
+ new->start = conflict->start;
+ if (conflict->end > new->end)
+ new->end = conflict->end;
+
+ printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+ }
write_unlock(&resource_lock);
- return result;
}
/**
@@ -478,6 +516,74 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
return result;
}
+static void __init __reserve_region_with_split(struct resource *root,
+ resource_size_t start, resource_size_t end,
+ const char *name)
+{
+ struct resource *parent = root;
+ struct resource *conflict;
+ struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+
+ if (!res)
+ return;
+
+ res->name = name;
+ res->start = start;
+ res->end = end;
+ res->flags = IORESOURCE_BUSY;
+
+ for (;;) {
+ conflict = __request_resource(parent, res);
+ if (!conflict)
+ break;
+ if (conflict != parent) {
+ parent = conflict;
+ if (!(conflict->flags & IORESOURCE_BUSY))
+ continue;
+ }
+
+ /* Uhhuh, that didn't work out.. */
+ kfree(res);
+ res = NULL;
+ break;
+ }
+
+ if (!res) {
+ printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n",
+ conflict->name, conflict->start, conflict->end,
+ name, start, end);
+
+ /* failed, split and try again */
+
+ /* conflict coverred whole area */
+ if (conflict->start <= start && conflict->end >= end)
+ return;
+
+ if (conflict->start > start)
+ __reserve_region_with_split(root, start, conflict->start-1, name);
+ if (!(conflict->flags & IORESOURCE_BUSY)) {
+ resource_size_t common_start, common_end;
+
+ common_start = max(conflict->start, start);
+ common_end = min(conflict->end, end);
+ if (common_start < common_end)
+ __reserve_region_with_split(root, common_start, common_end, name);
+ }
+ if (conflict->end < end)
+ __reserve_region_with_split(root, conflict->end+1, end, name);
+ }
+
+}
+
+void reserve_region_with_split(struct resource *root,
+ resource_size_t start, resource_size_t end,
+ const char *name)
+{
+ write_lock(&resource_lock);
+ __reserve_region_with_split(root, start, end, name);
+ write_unlock(&resource_lock);
+}
+
EXPORT_SYMBOL(adjust_resource);
/**
@@ -490,7 +596,7 @@ resource_size_t resource_alignment(struct resource *res)
{
switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
case IORESOURCE_SIZEALIGN:
- return res->end - res->start + 1;
+ return resource_size(res);
case IORESOURCE_STARTALIGN:
return res->start;
default:
diff --git a/kernel/sched.c b/kernel/sched.c
index 0236958addc..6f230596bd0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+}
+
+static inline int rt_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
}
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
ktime_t now;
- if (rt_b->rt_runtime == RUNTIME_INF)
+ if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
return;
if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
#endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
@@ -600,14 +605,13 @@ struct rq {
/* BKL stats */
unsigned int bkl_count;
#endif
- struct lock_class_key rq_lock_key;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
{
- rq->curr->sched_class->check_preempt_curr(rq, p);
+ rq->curr->sched_class->check_preempt_curr(rq, p, sync);
}
static inline int cpu_of(struct rq *rq)
@@ -809,9 +813,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
* ratelimit for updating the group shares.
- * default: 0.5ms
+ * default: 0.25ms
*/
-const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
+unsigned int sysctl_sched_shares_ratelimit = 250000;
/*
* period over which we measure -rt task cpu usage in us.
@@ -834,7 +838,7 @@ static inline u64 global_rt_period(void)
static inline u64 global_rt_runtime(void)
{
- if (sysctl_sched_rt_period < 0)
+ if (sysctl_sched_rt_runtime < 0)
return RUNTIME_INF;
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
@@ -1088,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_DONE;
}
-static void init_hrtick(void)
+static __init void init_hrtick(void)
{
hotcpu_notifier(hotplug_hrtick, 0);
}
@@ -1103,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
}
-static void init_hrtick(void)
+static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */
@@ -1120,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq)
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
}
-#else
+#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
@@ -1134,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq)
static inline void init_hrtick(void)
{
}
-#endif
+#endif /* CONFIG_SCHED_HRTICK */
/*
* resched_task - mark a task 'to be rescheduled now'.
@@ -1381,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
update_load_sub(&rq->load, load);
}
-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- if (rq->nr_running)
- rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
- return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+typedef int (*tg_visitor)(struct task_group *, void *);
/*
* Iterate the full tree, calling @down when first entering a node and @up when
* leaving it for the final time.
*/
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
+ int ret;
rcu_read_lock();
parent = &root_task_group;
down:
- (*down)(parent, cpu, sd);
+ ret = (*down)(parent, data);
+ if (ret)
+ goto out_unlock;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
@@ -1420,15 +1410,43 @@ down:
up:
continue;
}
- (*up)(parent, cpu, sd);
+ ret = (*up)(parent, data);
+ if (ret)
+ goto out_unlock;
child = parent;
parent = parent->parent;
if (parent)
goto up;
+out_unlock:
rcu_read_unlock();
+
+ return ret;
+}
+
+static int tg_nop(struct task_group *tg, void *data)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->nr_running)
+ rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+ return rq->avg_load_per_task;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
@@ -1487,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
* This needs to be done in a bottom-up fashion because the rq weight of a
* parent group depends on the shares of its child groups.
*/
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long rq_weight = 0;
unsigned long shares = 0;
+ struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
@@ -1516,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
__update_group_shares_cpu(tg, i, shares, rq_weight);
spin_unlock_irqrestore(&rq->lock, flags);
}
+
+ return 0;
}
/*
@@ -1523,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
* This needs to be done in a top-down fashion because the load of a child
* group is a fraction of its parents load.
*/
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
{
unsigned long load;
+ long cpu = (long)data;
if (!tg->parent) {
load = cpu_rq(cpu)->load.weight;
@@ -1537,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
}
tg->cfs_rq[cpu]->h_load = load;
-}
-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+ return 0;
}
static void update_shares(struct sched_domain *sd)
@@ -1551,7 +1568,7 @@ static void update_shares(struct sched_domain *sd)
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
- walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+ walk_tg_tree(tg_nop, tg_shares_up, sd);
}
}
@@ -1562,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
spin_lock(&rq->lock);
}
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
{
- walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
#else
@@ -1922,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
- if (!match_state || p->state == match_state) {
- ncsw = p->nivcsw + p->nvcsw;
- if (unlikely(!ncsw))
- ncsw = 1;
- }
+ if (!match_state || p->state == match_state)
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, &flags);
/*
@@ -2286,7 +2300,7 @@ out_running:
trace_mark(kernel_sched_wakeup,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
- check_preempt_curr(rq, p);
+ check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
@@ -2421,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
trace_mark(kernel_sched_wakeup_new,
"pid %d state %ld ## rq %p task %p rq->curr %p",
p->pid, p->state, rq, p, rq->curr);
- check_preempt_curr(rq, p);
+ check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
p->sched_class->task_wake_up(rq, p);
@@ -2759,10 +2773,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
} else {
if (rq1 < rq2) {
spin_lock(&rq1->lock);
- spin_lock(&rq2->lock);
+ spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
} else {
spin_lock(&rq2->lock);
- spin_lock(&rq1->lock);
+ spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
}
}
update_rq_clock(rq1);
@@ -2805,14 +2819,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
spin_lock(&busiest->lock);
- spin_lock(&this_rq->lock);
+ spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
ret = 1;
} else
- spin_lock(&busiest->lock);
+ spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
}
return ret;
}
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(busiest->lock)
+{
+ spin_unlock(&busiest->lock);
+ lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
@@ -2874,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
* Note that idle threads have a prio of MAX_PRIO, for this test
* to be always true for them.
*/
- check_preempt_curr(this_rq, p);
+ check_preempt_curr(this_rq, p, 0);
}
/*
@@ -3637,7 +3658,7 @@ redo:
ld_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, CPU_NEWLY_IDLE,
&all_pinned);
- spin_unlock(&busiest->lock);
+ double_unlock_balance(this_rq, busiest);
if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), *cpus);
@@ -3752,7 +3773,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
else
schedstat_inc(sd, alb_failed);
}
- spin_unlock(&target_rq->lock);
+ double_unlock_balance(busiest_rq, target_rq);
}
#ifdef CONFIG_NO_HZ
@@ -4173,6 +4194,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
}
/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+cputime_t task_utime(struct task_struct *p)
+{
+ return p->utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+ return p->stime;
+}
+#else
+cputime_t task_utime(struct task_struct *p)
+{
+ clock_t utime = cputime_to_clock_t(p->utime),
+ total = utime + cputime_to_clock_t(p->stime);
+ u64 temp;
+
+ /*
+ * Use CFS's precise accounting:
+ */
+ temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+
+ if (total) {
+ temp *= utime;
+ do_div(temp, total);
+ }
+ utime = (clock_t)temp;
+
+ p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
+ return p->prev_utime;
+}
+
+cputime_t task_stime(struct task_struct *p)
+{
+ clock_t stime;
+
+ /*
+ * Use CFS's precise accounting. (we subtract utime from
+ * the total, to make sure the total observed by userspace
+ * grows monotonically - apps rely on that):
+ */
+ stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+ cputime_to_clock_t(task_utime(p));
+
+ if (stime >= 0)
+ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
+
+ return p->prev_stime;
+}
+#endif
+
+inline cputime_t task_gtime(struct task_struct *p)
+{
+ return p->gtime;
+}
+
+/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
@@ -4562,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
void complete(struct completion *x)
{
unsigned long flags;
@@ -4573,6 +4662,12 @@ void complete(struct completion *x)
}
EXPORT_SYMBOL(complete);
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x: holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
void complete_all(struct completion *x)
{
unsigned long flags;
@@ -4593,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue_tail(&x->wait, &wait);
do {
- if ((state == TASK_INTERRUPTIBLE &&
- signal_pending(current)) ||
- (state == TASK_KILLABLE &&
- fatal_signal_pending(current))) {
+ if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
break;
}
@@ -4624,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
return timeout;
}
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
void __sched wait_for_completion(struct completion *x)
{
wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
{
@@ -4637,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
}
EXPORT_SYMBOL(wait_for_completion_timeout);
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
int __sched wait_for_completion_interruptible(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4646,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
unsigned long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
@@ -4654,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
int __sched wait_for_completion_killable(struct completion *x)
{
long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -4663,6 +4796,52 @@ int __sched wait_for_completion_killable(struct completion *x)
}
EXPORT_SYMBOL(wait_for_completion_killable);
+/**
+ * try_wait_for_completion - try to decrement a completion without blocking
+ * @x: completion structure
+ *
+ * Returns: 0 if a decrement cannot be done without blocking
+ * 1 if a decrement succeeded.
+ *
+ * If a completion is being used as a counting completion,
+ * attempt to decrement the counter without blocking. This
+ * enables us to avoid waiting if the resource the completion
+ * is protecting is not available.
+ */
+bool try_wait_for_completion(struct completion *x)
+{
+ int ret = 1;
+
+ spin_lock_irq(&x->wait.lock);
+ if (!x->done)
+ ret = 0;
+ else
+ x->done--;
+ spin_unlock_irq(&x->wait.lock);
+ return ret;
+}
+EXPORT_SYMBOL(try_wait_for_completion);
+
+/**
+ * completion_done - Test to see if a completion has any waiters
+ * @x: completion structure
+ *
+ * Returns: 0 if there are waiters (wait_for_completion() in progress)
+ * 1 if there are no waiters.
+ *
+ */
+bool completion_done(struct completion *x)
+{
+ int ret = 1;
+
+ spin_lock_irq(&x->wait.lock);
+ if (!x->done)
+ ret = 0;
+ spin_unlock_irq(&x->wait.lock);
+ return ret;
+}
+EXPORT_SYMBOL(completion_done);
+
static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
@@ -5004,19 +5183,22 @@ recheck:
return -EPERM;
}
+ if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Do not allow realtime tasks into groups that have no runtime
- * assigned.
- */
- if (user
- && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
- return -EPERM;
+ /*
+ * Do not allow realtime tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0)
+ return -EPERM;
#endif
- retval = security_task_setscheduler(p, policy, param);
- if (retval)
- return retval;
+ retval = security_task_setscheduler(p, policy, param);
+ if (retval)
+ return retval;
+ }
+
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
@@ -5732,6 +5914,8 @@ static inline void sched_init_granularity(void)
sysctl_sched_latency = limit;
sysctl_sched_wakeup_granularity *= factor;
+
+ sysctl_sched_shares_ratelimit *= factor;
}
#ifdef CONFIG_SMP
@@ -5842,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
set_task_cpu(p, dest_cpu);
if (on_rq) {
activate_task(rq_dest, p, 0);
- check_preempt_curr(rq_dest, p);
+ check_preempt_curr(rq_dest, p, 0);
}
done:
ret = 1;
@@ -6167,7 +6351,7 @@ set_table_entry(struct ctl_table *entry,
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(12);
+ struct ctl_table *table = sd_alloc_ctl_entry(13);
if (table == NULL)
return NULL;
@@ -6195,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax);
- /* &table[11] is terminator */
+ set_table_entry(&table[11], "name", sd->name,
+ CORENAME_MAX_SIZE, 0444, proc_dostring);
+ /* &table[12] is terminator */
return table;
}
@@ -7079,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type) sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type) do { } while (0)
+#endif
+
#define SD_INIT(sd, type) sd_init_##type(sd)
+
#define SD_INIT_FUNC(type) \
static noinline void sd_init_##type(struct sched_domain *sd) \
{ \
memset(sd, 0, sizeof(*sd)); \
*sd = SD_##type##_INIT; \
sd->level = SD_LV_##type; \
+ SD_INIT_NAME(sd, type); \
}
SD_INIT_FUNC(CPU)
@@ -7581,24 +7775,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
* and partition_sched_domains() will fallback to the single partition
* 'fallback_doms', it also forces the domains to be rebuilt.
*
+ * If doms_new==NULL it will be replaced with cpu_online_map.
+ * ndoms_new==0 is a special case for destroying existing domains.
+ * It will not create the default domain.
+ *
* Call with hotplug lock held
*/
void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
struct sched_domain_attr *dattr_new)
{
- int i, j;
+ int i, j, n;
mutex_lock(&sched_domains_mutex);
/* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl();
- if (doms_new == NULL)
- ndoms_new = 0;
+ n = doms_new ? ndoms_new : 0;
/* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) {
- for (j = 0; j < ndoms_new; j++) {
+ for (j = 0; j < n; j++) {
if (cpus_equal(doms_cur[i], doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
@@ -7611,7 +7808,6 @@ match1:
if (doms_new == NULL) {
ndoms_cur = 0;
- ndoms_new = 1;
doms_new = &fallback_doms;
cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
dattr_new = NULL;
@@ -7648,8 +7844,13 @@ match2:
int arch_reinit_sched_domains(void)
{
get_online_cpus();
+
+ /* Destroy domains first to force the rebuild */
+ partition_sched_domains(0, NULL, NULL);
+
rebuild_sched_domains();
put_online_cpus();
+
return 0;
}
@@ -7671,34 +7872,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
}
#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *page)
+static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+ char *page)
{
return sprintf(page, "%u\n", sched_mc_power_savings);
}
-static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 0);
}
-static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
- sched_mc_power_savings_store);
+static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
+ sched_mc_power_savings_show,
+ sched_mc_power_savings_store);
#endif
#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *page)
+static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+ char *page)
{
return sprintf(page, "%u\n", sched_smt_power_savings);
}
-static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 1);
}
-static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+ sched_smt_power_savings_show,
sched_smt_power_savings_store);
#endif
@@ -7733,7 +7934,7 @@ static int update_sched_domains(struct notifier_block *nfb,
case CPU_ONLINE_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- partition_sched_domains(0, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
return NOTIFY_OK;
default:
@@ -7998,7 +8199,6 @@ void __init sched_init(void)
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
- lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
@@ -8121,20 +8321,25 @@ void __might_sleep(char *file, int line)
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
- if ((in_atomic() || irqs_disabled()) &&
- system_state == SYSTEM_RUNNING && !oops_in_progress) {
- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
- return;
- prev_jiffy = jiffies;
- printk(KERN_ERR "BUG: sleeping function called from invalid"
- " context at %s:%d\n", file, line);
- printk("in_atomic():%d, irqs_disabled():%d\n",
- in_atomic(), irqs_disabled());
- debug_show_held_locks(current);
- if (irqs_disabled())
- print_irqtrace_events(current);
- dump_stack();
- }
+ if ((!in_atomic() && !irqs_disabled()) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
+ return;
+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+ return;
+ prev_jiffy = jiffies;
+
+ printk(KERN_ERR
+ "BUG: sleeping function called from invalid context at %s:%d\n",
+ file, line);
+ printk(KERN_ERR
+ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(),
+ current->pid, current->comm);
+
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+ dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
@@ -8455,8 +8660,8 @@ struct task_group *sched_create_group(struct task_group *parent)
WARN_ON(!parent); /* root should already exist */
tg->parent = parent;
- list_add_rcu(&tg->siblings, &parent->children);
INIT_LIST_HEAD(&tg->children);
+ list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
return tg;
@@ -8632,73 +8837,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
- return 1ULL << 16;
+ return 1ULL << 20;
- return div64_u64(runtime << 16, period);
+ return div64_u64(runtime << 20, period);
}
-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
{
- struct task_group *tgi, *parent = tg->parent;
- unsigned long total = 0;
+ struct task_struct *g, *p;
- if (!parent) {
- if (global_rt_period() < period)
- return 0;
+ do_each_thread(g, p) {
+ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+ return 1;
+ } while_each_thread(g, p);
- return to_ratio(period, runtime) <
- to_ratio(global_rt_period(), global_rt_runtime());
- }
+ return 0;
+}
- if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
- return 0;
+struct rt_schedulable_data {
+ struct task_group *tg;
+ u64 rt_period;
+ u64 rt_runtime;
+};
- rcu_read_lock();
- list_for_each_entry_rcu(tgi, &parent->children, siblings) {
- if (tgi == tg)
- continue;
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+ struct rt_schedulable_data *d = data;
+ struct task_group *child;
+ unsigned long total, sum = 0;
+ u64 period, runtime;
- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
- tgi->rt_bandwidth.rt_runtime);
+ period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ runtime = tg->rt_bandwidth.rt_runtime;
+
+ if (tg == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
}
- rcu_read_unlock();
- return total + to_ratio(period, runtime) <=
- to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
- parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
- struct task_group *tgi;
- unsigned long total = 0;
- unsigned long global_ratio =
- to_ratio(global_rt_period(), global_rt_runtime());
+ /*
+ * Cannot have more runtime than the period.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
- rcu_read_lock();
- list_for_each_entry_rcu(tgi, &task_groups, list) {
- if (tgi == tg)
- continue;
+ /*
+ * Ensure we don't starve existing RT tasks.
+ */
+ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ return -EBUSY;
+
+ total = to_ratio(period, runtime);
+
+ /*
+ * Nobody can have more than the global setting allows.
+ */
+ if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+ return -EINVAL;
- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
- tgi->rt_bandwidth.rt_runtime);
+ /*
+ * The sum of our children's runtime should not exceed our own.
+ */
+ list_for_each_entry_rcu(child, &tg->children, siblings) {
+ period = ktime_to_ns(child->rt_bandwidth.rt_period);
+ runtime = child->rt_bandwidth.rt_runtime;
+
+ if (child == d->tg) {
+ period = d->rt_period;
+ runtime = d->rt_runtime;
+ }
+
+ sum += to_ratio(period, runtime);
}
- rcu_read_unlock();
- return total + to_ratio(period, runtime) < global_ratio;
+ if (sum > total)
+ return -EINVAL;
+
+ return 0;
}
-#endif
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
- struct task_struct *g, *p;
- do_each_thread(g, p) {
- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
- return 1;
- } while_each_thread(g, p);
- return 0;
+ struct rt_schedulable_data data = {
+ .tg = tg,
+ .rt_period = period,
+ .rt_runtime = runtime,
+ };
+
+ return walk_tg_tree(tg_schedulable, tg_nop, &data);
}
static int tg_set_bandwidth(struct task_group *tg,
@@ -8708,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg,
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
- err = -EBUSY;
- goto unlock;
- }
- if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
- err = -EINVAL;
+ err = __rt_schedulable(tg, rt_period, rt_runtime);
+ if (err)
goto unlock;
- }
spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8784,16 +9006,25 @@ long sched_group_rt_period(struct task_group *tg)
static int sched_rt_global_constraints(void)
{
- struct task_group *tg = &root_task_group;
- u64 rt_runtime, rt_period;
+ u64 runtime, period;
int ret = 0;
- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- rt_runtime = tg->rt_bandwidth.rt_runtime;
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
+ runtime = global_rt_runtime();
+ period = global_rt_period();
+
+ /*
+ * Sanity check on the sysctl variables.
+ */
+ if (runtime > period && runtime != RUNTIME_INF)
+ return -EINVAL;
mutex_lock(&rt_constraints_mutex);
- if (!__rt_schedulable(tg, rt_period, rt_runtime))
- ret = -EINVAL;
+ read_lock(&tasklist_lock);
+ ret = __rt_schedulable(NULL, 0, 0);
+ read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
return ret;
@@ -8804,6 +9035,9 @@ static int sched_rt_global_constraints(void)
unsigned long flags;
int i;
+ if (sysctl_sched_rt_period <= 0)
+ return -EINVAL;
+
spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8864,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
if (!cgrp->parent) {
/* This is early initialization for the top cgroup */
- init_task_group.css.cgroup = cgrp;
return &init_task_group.css;
}
@@ -8873,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- /* Bind the cgroup to task_group object we just created */
- tg->css.cgroup = cgrp;
-
return &tg->css;
}
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 22ed55d1167..e8ab096ddfe 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -12,19 +12,17 @@
*
* Create a semi stable clock from a mixture of other events, including:
* - gtod
- * - jiffies
* - sched_clock()
* - explicit idle events
*
* We use gtod as base and the unstable clock deltas. The deltas are filtered,
- * making it monotonic and keeping it within an expected window. This window
- * is set up using jiffies.
+ * making it monotonic and keeping it within an expected window.
*
* Furthermore, explicit sleep and wakeup hooks allow us to account for time
* that is otherwise invisible (TSC gets stopped).
*
* The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
- * consistent between cpus (never more than 1 jiffies difference).
+ * consistent between cpus (never more than 2 jiffies difference).
*/
#include <linux/sched.h>
#include <linux/percpu.h>
@@ -32,13 +30,19 @@
#include <linux/ktime.h>
#include <linux/module.h>
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+ return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+}
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static __read_mostly int sched_clock_running;
-#define MULTI_SHIFT 15
-/* Max is double, Min is 1/2 */
-#define MAX_MULTI (2LL << MULTI_SHIFT)
-#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
struct sched_clock_data {
/*
@@ -48,15 +52,9 @@ struct sched_clock_data {
*/
raw_spinlock_t lock;
- unsigned long tick_jiffies;
- u64 prev_raw;
u64 tick_raw;
u64 tick_gtod;
u64 clock;
- s64 multi;
-#ifdef CONFIG_NO_HZ
- int check_max;
-#endif
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
@@ -71,121 +69,69 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
return &per_cpu(sched_clock_data, cpu);
}
-static __read_mostly int sched_clock_running;
-
void sched_clock_init(void)
{
u64 ktime_now = ktime_to_ns(ktime_get());
- unsigned long now_jiffies = jiffies;
int cpu;
for_each_possible_cpu(cpu) {
struct sched_clock_data *scd = cpu_sdc(cpu);
scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
- scd->tick_jiffies = now_jiffies;
- scd->prev_raw = 0;
scd->tick_raw = 0;
scd->tick_gtod = ktime_now;
scd->clock = ktime_now;
- scd->multi = 1 << MULTI_SHIFT;
-#ifdef CONFIG_NO_HZ
- scd->check_max = 1;
-#endif
}
sched_clock_running = 1;
}
-#ifdef CONFIG_NO_HZ
/*
- * The dynamic ticks makes the delta jiffies inaccurate. This
- * prevents us from checking the maximum time update.
- * Disable the maximum check during stopped ticks.
+ * min,max except they take wrapping into account
*/
-void sched_clock_tick_stop(int cpu)
-{
- struct sched_clock_data *scd = cpu_sdc(cpu);
-
- scd->check_max = 0;
-}
-void sched_clock_tick_start(int cpu)
+static inline u64 wrap_min(u64 x, u64 y)
{
- struct sched_clock_data *scd = cpu_sdc(cpu);
-
- scd->check_max = 1;
+ return (s64)(x - y) < 0 ? x : y;
}
-static int check_max(struct sched_clock_data *scd)
+static inline u64 wrap_max(u64 x, u64 y)
{
- return scd->check_max;
+ return (s64)(x - y) > 0 ? x : y;
}
-#else
-static int check_max(struct sched_clock_data *scd)
-{
- return 1;
-}
-#endif /* CONFIG_NO_HZ */
/*
* update the percpu scd from the raw @now value
*
* - filter out backward motion
- * - use jiffies to generate a min,max window to clip the raw values
+ * - use the GTOD tick value to create a window to filter crazy TSC values
*/
-static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
+static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
{
- unsigned long now_jiffies = jiffies;
- long delta_jiffies = now_jiffies - scd->tick_jiffies;
- u64 clock = scd->clock;
- u64 min_clock, max_clock;
- s64 delta = now - scd->prev_raw;
+ s64 delta = now - scd->tick_raw;
+ u64 clock, min_clock, max_clock;
WARN_ON_ONCE(!irqs_disabled());
- /*
- * At schedule tick the clock can be just under the gtod. We don't
- * want to push it too prematurely.
- */
- min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
- if (min_clock > TICK_NSEC)
- min_clock -= TICK_NSEC / 2;
-
- if (unlikely(delta < 0)) {
- clock++;
- goto out;
- }
+ if (unlikely(delta < 0))
+ delta = 0;
/*
- * The clock must stay within a jiffie of the gtod.
- * But since we may be at the start of a jiffy or the end of one
- * we add another jiffy buffer.
+ * scd->clock = clamp(scd->tick_gtod + delta,
+ * max(scd->tick_gtod, scd->clock),
+ * scd->tick_gtod + TICK_NSEC);
*/
- max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
- delta *= scd->multi;
- delta >>= MULTI_SHIFT;
+ clock = scd->tick_gtod + delta;
+ min_clock = wrap_max(scd->tick_gtod, scd->clock);
+ max_clock = scd->tick_gtod + TICK_NSEC;
- if (unlikely(clock + delta > max_clock) && check_max(scd)) {
- if (clock < max_clock)
- clock = max_clock;
- else
- clock++;
- } else {
- clock += delta;
- }
+ clock = wrap_max(clock, min_clock);
+ clock = wrap_min(clock, max_clock);
- out:
- if (unlikely(clock < min_clock))
- clock = min_clock;
+ scd->clock = clock;
- if (time)
- *time = clock;
- else {
- scd->prev_raw = now;
- scd->clock = clock;
- }
+ return scd->clock;
}
static void lock_double_clock(struct sched_clock_data *data1,
@@ -203,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1,
u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd = cpu_sdc(cpu);
- u64 now, clock;
+ u64 now, clock, this_clock, remote_clock;
if (unlikely(!sched_clock_running))
return 0ull;
@@ -212,43 +158,44 @@ u64 sched_clock_cpu(int cpu)
now = sched_clock();
if (cpu != raw_smp_processor_id()) {
- /*
- * in order to update a remote cpu's clock based on our
- * unstable raw time rebase it against:
- * tick_raw (offset between raw counters)
- * tick_gotd (tick offset between cpus)
- */
struct sched_clock_data *my_scd = this_scd();
lock_double_clock(scd, my_scd);
- now -= my_scd->tick_raw;
- now += scd->tick_raw;
+ this_clock = __update_sched_clock(my_scd, now);
+ remote_clock = scd->clock;
- now += my_scd->tick_gtod;
- now -= scd->tick_gtod;
+ /*
+ * Use the opportunity that we have both locks
+ * taken to couple the two clocks: we take the
+ * larger time as the latest time for both
+ * runqueues. (this creates monotonic movement)
+ */
+ if (likely((s64)(remote_clock - this_clock) < 0)) {
+ clock = this_clock;
+ scd->clock = clock;
+ } else {
+ /*
+ * Should be rare, but possible:
+ */
+ clock = remote_clock;
+ my_scd->clock = remote_clock;
+ }
__raw_spin_unlock(&my_scd->lock);
-
- __update_sched_clock(scd, now, &clock);
-
- __raw_spin_unlock(&scd->lock);
-
} else {
__raw_spin_lock(&scd->lock);
- __update_sched_clock(scd, now, NULL);
- clock = scd->clock;
- __raw_spin_unlock(&scd->lock);
+ clock = __update_sched_clock(scd, now);
}
+ __raw_spin_unlock(&scd->lock);
+
return clock;
}
void sched_clock_tick(void)
{
struct sched_clock_data *scd = this_scd();
- unsigned long now_jiffies = jiffies;
- s64 mult, delta_gtod, delta_raw;
u64 now, now_gtod;
if (unlikely(!sched_clock_running))
@@ -260,29 +207,9 @@ void sched_clock_tick(void)
now = sched_clock();
__raw_spin_lock(&scd->lock);
- __update_sched_clock(scd, now, NULL);
- /*
- * update tick_gtod after __update_sched_clock() because that will
- * already observe 1 new jiffy; adding a new tick_gtod to that would
- * increase the clock 2 jiffies.
- */
- delta_gtod = now_gtod - scd->tick_gtod;
- delta_raw = now - scd->tick_raw;
-
- if ((long)delta_raw > 0) {
- mult = delta_gtod << MULTI_SHIFT;
- do_div(mult, delta_raw);
- scd->multi = mult;
- if (scd->multi > MAX_MULTI)
- scd->multi = MAX_MULTI;
- else if (scd->multi < MIN_MULTI)
- scd->multi = MIN_MULTI;
- } else
- scd->multi = 1 << MULTI_SHIFT;
-
scd->tick_raw = now;
scd->tick_gtod = now_gtod;
- scd->tick_jiffies = now_jiffies;
+ __update_sched_clock(scd, now);
__raw_spin_unlock(&scd->lock);
}
@@ -300,37 +227,28 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
*/
void sched_clock_idle_wakeup_event(u64 delta_ns)
{
- struct sched_clock_data *scd = this_scd();
- u64 now = sched_clock();
-
- /*
- * Override the previous timestamp and ignore all
- * sched_clock() deltas that occured while we idled,
- * and use the PM-provided delta_ns to advance the
- * rq clock:
- */
- __raw_spin_lock(&scd->lock);
- scd->prev_raw = now;
- scd->clock += delta_ns;
- scd->multi = 1 << MULTI_SHIFT;
- __raw_spin_unlock(&scd->lock);
-
+ sched_clock_tick();
touch_softlockup_watchdog();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-#endif
+#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-/*
- * Scheduler clock - returns current time in nanosec units.
- * This is default implementation.
- * Architectures and sub-architectures can override this.
- */
-unsigned long long __attribute__((weak)) sched_clock(void)
+void sched_clock_init(void)
{
- return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+ sched_clock_running = 1;
}
+u64 sched_clock_cpu(int cpu)
+{
+ if (unlikely(!sched_clock_running))
+ return 0;
+
+ return sched_clock();
+}
+
+#endif
+
unsigned long long cpu_clock(int cpu)
{
unsigned long long clock;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index bbe6b31c3c5..ad958c1ec70 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -333,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
unsigned long flags;
int num_threads = 1;
- rcu_read_lock();
if (lock_task_sighand(p, &flags)) {
num_threads = atomic_read(&p->signal->count);
unlock_task_sighand(p, &flags);
}
- rcu_read_unlock();
SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
SEQ_printf(m,
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cf2cd6ce4cb..18fd17172eb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- * -20 |
- * |
- * 0 --------+-------
- * .'
- * 19 .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
- struct load_weight lw = {
- .weight = NICE_0_LOAD,
- .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
- };
-
- for_each_sched_entity(se) {
- struct load_weight *se_lw = &se->load;
- unsigned long rw = cfs_rq_of(se)->load.weight;
-
-#ifdef CONFIG_FAIR_SCHED_GROUP
- struct cfs_rq *cfs_rq = se->my_q;
- struct task_group *tg = NULL
-
- if (cfs_rq)
- tg = cfs_rq->tg;
-
- if (tg && tg->shares < NICE_0_LOAD) {
- /*
- * scale shares to what it would have been had
- * tg->weight been NICE_0_LOAD:
- *
- * weight = 1024 * shares / tg->weight
- */
- lw.weight *= se->load.weight;
- lw.weight /= tg->shares;
-
- lw.inv_weight = 0;
-
- se_lw = &lw;
- rw += lw.weight - se->load.weight;
- } else
-#endif
-
- if (se->load.weight < NICE_0_LOAD) {
- se_lw = &lw;
- rw += NICE_0_LOAD - se->load.weight;
- }
-
- delta = calc_delta_mine(delta, rw, se_lw);
- }
-
- return delta;
-}
-
-/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
@@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_add(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
inc_cpu_load(rq_of(cfs_rq), se->load.weight);
- if (entity_is_task(se))
+ if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, se->load.weight);
+ list_add(&se->group_node, &cfs_rq->tasks);
+ }
cfs_rq->nr_running++;
se->on_rq = 1;
- list_add(&se->group_node, &cfs_rq->tasks);
}
static void
@@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
dec_cpu_load(rq_of(cfs_rq), se->load.weight);
- if (entity_is_task(se))
+ if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, -se->load.weight);
+ list_del_init(&se->group_node);
+ }
cfs_rq->nr_running--;
se->on_rq = 0;
- list_del_init(&se->group_node);
}
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -899,7 +843,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
* doesn't make sense. Rely on vruntime for fairness.
*/
if (rq->curr != p)
- delta = max(10000LL, delta);
+ delta = max_t(s64, 10000LL, delta);
hrtick_start(rq, delta);
}
@@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu,
long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
- long more_w;
if (!tg->parent)
return wl;
@@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu,
if (!wl && sched_feat(ASYM_EFF_LOAD))
return wl;
- /*
- * Instead of using this increment, also add the difference
- * between when the shares were last updated and now.
- */
- more_w = se->my_q->load.weight - se->my_q->rq_weight;
- wl += more_w;
- wg += more_w;
-
for_each_sched_entity(se) {
-#define D(n) (likely(n) ? (n) : 1)
-
long S, rw, s, a, b;
+ long more_w;
+
+ /*
+ * Instead of using this increment, also add the difference
+ * between when the shares were last updated and now.
+ */
+ more_w = se->my_q->load.weight - se->my_q->rq_weight;
+ wl += more_w;
+ wg += more_w;
S = se->my_q->tg->shares;
s = se->my_q->shares;
@@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu,
a = S*(rw + wl);
b = S*rw + s*wg;
- wl = s*(a-b)/D(b);
+ wl = s*(a-b);
+
+ if (likely(b))
+ wl /= b;
+
/*
* Assume the group is already running and will
* thus already be accounted for in the weight.
@@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu,
* alter the group weight.
*/
wg = 0;
-#undef D
}
return wl;
@@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
#endif
static int
-wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
struct task_struct *p, int prev_cpu, int this_cpu, int sync,
int idx, unsigned long load, unsigned long this_load,
unsigned int imbalance)
@@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
+ if (!sync && sched_feat(SYNC_WAKEUPS) &&
+ curr->se.avg_overlap < sysctl_sched_migration_cost &&
+ p->se.avg_overlap < sysctl_sched_migration_cost)
+ sync = 1;
+
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
@@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
* a reasonable amount of time then attract this newly
* woken task:
*/
- if (sync && balanced) {
- if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost)
- return 1;
- }
+ if (sync && balanced)
+ return 1;
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
- if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
- balanced) {
+ if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
+ tl_per_task)) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
@@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
struct sched_domain *sd, *this_sd = NULL;
int prev_cpu, this_cpu, new_cpu;
unsigned long load, this_load;
- struct rq *rq, *this_rq;
+ struct rq *this_rq;
unsigned int imbalance;
int idx;
prev_cpu = task_cpu(p);
- rq = task_rq(p);
this_cpu = smp_processor_id();
this_rq = cpu_rq(this_cpu);
new_cpu = prev_cpu;
+ if (prev_cpu == this_cpu)
+ goto out;
/*
* 'this_sd' is the first domain that both
* this_cpu and prev_cpu are present in:
@@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
- if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+ if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
load, this_load, imbalance))
return this_cpu;
- if (prev_cpu == this_cpu)
- goto out;
-
/*
* Start passive balancing when half the imbalance_pct
* limit is reached.
@@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
* + nice tasks.
*/
if (sched_feat(ASYM_GRAN))
- gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
- else
- gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
+ gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
return gran;
}
/*
- * Should 'se' preempt 'curr'.
- *
- * |s1
- * |s2
- * |s3
- * g
- * |<--->|c
- *
- * w(c, s1) = -1
- * w(c, s2) = 0
- * w(c, s3) = 1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
- s64 gran, vdiff = curr->vruntime - se->vruntime;
-
- if (vdiff < 0)
- return -1;
-
- gran = wakeup_gran(curr);
- if (vdiff > gran)
- return 1;
-
- return 0;
-}
-
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
- int depth = 0;
-
- for_each_sched_entity(se)
- depth++;
-
- return depth;
-}
-
-/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
{
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se, *pse = &p->se;
- int se_depth, pse_depth;
+ s64 delta_exec;
if (unlikely(rt_prio(p->prio))) {
update_rq_clock(rq);
@@ -1351,6 +1254,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
cfs_rq_of(pse)->next = pse;
/*
+ * We can come here with TIF_NEED_RESCHED already set from new task
+ * wake up path.
+ */
+ if (test_tsk_need_resched(curr))
+ return;
+
+ /*
* Batch tasks do not preempt (their preemption is driven by
* the tick):
*/
@@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
if (!sched_feat(WAKEUP_PREEMPT))
return;
- /*
- * preemption test can be made between sibling entities who are in the
- * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
- * both tasks until we find their ancestors who are siblings of common
- * parent.
- */
-
- /* First walk up until both entities are at same depth */
- se_depth = depth_se(se);
- pse_depth = depth_se(pse);
-
- while (se_depth > pse_depth) {
- se_depth--;
- se = parent_entity(se);
- }
-
- while (pse_depth > se_depth) {
- pse_depth--;
- pse = parent_entity(pse);
- }
-
- while (!is_same_group(se, pse)) {
- se = parent_entity(se);
- pse = parent_entity(pse);
+ if (sched_feat(WAKEUP_OVERLAP) && (sync ||
+ (se->avg_overlap < sysctl_sched_migration_cost &&
+ pse->avg_overlap < sysctl_sched_migration_cost))) {
+ resched_task(curr);
+ return;
}
- if (wakeup_preempt_entity(se, pse) == 1)
+ delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+ if (delta_exec > wakeup_gran(pse))
resched_task(curr);
}
@@ -1442,18 +1334,13 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
struct task_struct *p = NULL;
struct sched_entity *se;
- while (next != &cfs_rq->tasks) {
- se = list_entry(next, struct sched_entity, group_node);
- next = next->next;
+ if (next == &cfs_rq->tasks)
+ return NULL;
- /* Skip over entities that are not tasks */
- if (entity_is_task(se)) {
- p = task_of(se);
- break;
- }
- }
+ se = list_entry(next, struct sched_entity, group_node);
+ p = task_of(se);
+ cfs_rq->balance_iterator = next->next;
- cfs_rq->balance_iterator = next;
return p;
}
@@ -1502,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
rcu_read_lock();
update_h_load(busiest_cpu);
- list_for_each_entry(tg, &task_groups, list) {
+ list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
unsigned long busiest_h_load = busiest_cfs_rq->h_load;
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1615,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
+ resched_task(rq->curr);
}
enqueue_task_fair(rq, p, 0);
- resched_task(rq->curr);
}
/*
@@ -1637,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
if (p->prio > oldprio)
resched_task(rq->curr);
} else
- check_preempt_curr(rq, p);
+ check_preempt_curr(rq, p, 0);
}
/*
@@ -1654,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
if (running)
resched_task(rq->curr);
else
- check_preempt_curr(rq, p);
+ check_preempt_curr(rq, p, 0);
}
/* Account for a task changing its policy or group.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 862b06bd560..7c9e8f4a049 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,6 +8,7 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
SCHED_FEAT(HRTICK, 1)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(ASYM_GRAN, 1)
-SCHED_FEAT(LB_BIAS, 0)
+SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
+SCHED_FEAT(WAKEUP_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe6..dec4ccabe2f 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
/*
* Idle tasks are unconditionally rescheduled:
*/
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
{
resched_task(rq->idle);
}
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
if (running)
resched_task(rq->curr);
else
- check_preempt_curr(rq, p);
+ check_preempt_curr(rq, p, 0);
}
static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
if (p->prio > oldprio)
resched_task(rq->curr);
} else
- check_preempt_curr(rq, p);
+ check_preempt_curr(rq, p, 0);
}
/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 908c04f9dad..cdf5740ab03 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
+ struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
struct sched_rt_entity *rt_se = rt_rq->rt_se;
- if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
- struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
-
- enqueue_rt_entity(rt_se);
+ if (rt_rq->rt_nr_running) {
+ if (rt_se && !on_rt_rq(rt_se))
+ enqueue_rt_entity(rt_se);
if (rt_rq->highest_prio < curr->prio)
resched_task(curr);
}
@@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
+ if (rt_rq->rt_nr_running)
+ resched_task(rq_of_rt_rq(rt_rq)->curr);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -229,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SMP
+/*
+ * We ran out of runtime, see if we can borrow some from our neighbours.
+ */
static int do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -248,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
continue;
spin_lock(&iter->rt_runtime_lock);
+ /*
+ * Either all rqs have inf runtime and there's nothing to steal
+ * or __disable_runtime() below sets a specific rq to inf to
+ * indicate its been disabled and disalow stealing.
+ */
if (iter->rt_runtime == RUNTIME_INF)
goto next;
+ /*
+ * From runqueues with spare time, take 1/n part of their
+ * spare time, but no more than our period.
+ */
diff = iter->rt_runtime - iter->rt_time;
if (diff > 0) {
diff = div_u64((u64)diff, weight);
@@ -272,6 +286,9 @@ next:
return more;
}
+/*
+ * Ensure this RQ takes back all the runtime it lend to its neighbours.
+ */
static void __disable_runtime(struct rq *rq)
{
struct root_domain *rd = rq->rd;
@@ -287,18 +304,34 @@ static void __disable_runtime(struct rq *rq)
spin_lock(&rt_b->rt_runtime_lock);
spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * Either we're all inf and nobody needs to borrow, or we're
+ * already disabled and thus have nothing to do, or we have
+ * exactly the right amount of runtime to take out.
+ */
if (rt_rq->rt_runtime == RUNTIME_INF ||
rt_rq->rt_runtime == rt_b->rt_runtime)
goto balanced;
spin_unlock(&rt_rq->rt_runtime_lock);
+ /*
+ * Calculate the difference between what we started out with
+ * and what we current have, that's the amount of runtime
+ * we lend and now have to reclaim.
+ */
want = rt_b->rt_runtime - rt_rq->rt_runtime;
+ /*
+ * Greedy reclaim, take back as much as we can.
+ */
for_each_cpu_mask(i, rd->span) {
struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
s64 diff;
- if (iter == rt_rq)
+ /*
+ * Can't reclaim from ourselves or disabled runqueues.
+ */
+ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
continue;
spin_lock(&iter->rt_runtime_lock);
@@ -317,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
}
spin_lock(&rt_rq->rt_runtime_lock);
+ /*
+ * We cannot be left wanting - that would mean some runtime
+ * leaked out of the system.
+ */
BUG_ON(want);
balanced:
+ /*
+ * Disable all the borrow logic by pretending we have inf
+ * runtime - in which case borrowing doesn't make sense.
+ */
rt_rq->rt_runtime = RUNTIME_INF;
spin_unlock(&rt_rq->rt_runtime_lock);
spin_unlock(&rt_b->rt_runtime_lock);
@@ -341,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
if (unlikely(!scheduler_running))
return;
+ /*
+ * Reset each runqueue's bandwidth settings
+ */
for_each_leaf_rt_rq(rt_rq, rq) {
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -348,6 +392,7 @@ static void __enable_runtime(struct rq *rq)
spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_runtime = rt_b->rt_runtime;
rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
spin_unlock(&rt_rq->rt_runtime_lock);
spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -386,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int i, idle = 1;
cpumask_t span;
- if (rt_b->rt_runtime == RUNTIME_INF)
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return 1;
span = sched_rt_period_mask();
@@ -438,9 +483,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
- if (runtime == RUNTIME_INF)
- return 0;
-
if (rt_rq->rt_throttled)
return rt_rq_throttled(rt_rq);
@@ -487,13 +529,18 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
+ if (!rt_bandwidth_enabled())
+ return;
+
for_each_sched_rt_entity(rt_se) {
rt_rq = rt_rq_of_se(rt_se);
spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_time += delta_exec;
- if (sched_rt_runtime_exceeded(rt_rq))
- resched_task(curr);
+ if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+ rt_rq->rt_time += delta_exec;
+ if (sched_rt_runtime_exceeded(rt_rq))
+ resched_task(curr);
+ }
spin_unlock(&rt_rq->rt_runtime_lock);
}
}
@@ -782,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
/*
* Preempt the current task with a newly woken task if needed:
*/
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
{
if (p->prio < rq->curr->prio) {
resched_task(rq->curr);
@@ -861,6 +908,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
#define RT_MAX_TRIES 3
static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
+
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -1022,7 +1071,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
break;
/* try again */
- spin_unlock(&lowest_rq->lock);
+ double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
}
@@ -1091,7 +1140,7 @@ static int push_rt_task(struct rq *rq)
resched_task(lowest_rq->curr);
- spin_unlock(&lowest_rq->lock);
+ double_unlock_balance(rq, lowest_rq);
ret = 1;
out:
@@ -1197,7 +1246,7 @@ static int pull_rt_task(struct rq *this_rq)
}
skip:
- spin_unlock(&src_rq->lock);
+ double_unlock_balance(this_rq, src_rq);
}
return ret;
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index aaaeae8244e..94a62c0d4ad 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
waiter.up = 0;
for (;;) {
- if (state == TASK_INTERRUPTIBLE && signal_pending(task))
- goto interrupted;
- if (state == TASK_KILLABLE && fatal_signal_pending(task))
+ if (signal_pending_state(state, task))
goto interrupted;
if (timeout <= 0)
goto timed_out;
diff --git a/kernel/signal.c b/kernel/signal.c
index 954f77d7e3b..e661b01d340 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1304,6 +1304,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
q->info.si_overrun++;
goto out;
}
+ q->info.si_overrun = 0;
signalfd_notify(t, sig);
pending = group ? &t->signal->shared_pending : &t->pending;
@@ -1337,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
struct siginfo info;
unsigned long flags;
struct sighand_struct *psig;
+ int ret = sig;
BUG_ON(sig == -1);
@@ -1401,7 +1403,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
* is implementation-defined: we do (if you don't want
* it, just use SIG_IGN instead).
*/
- tsk->exit_signal = -1;
+ ret = tsk->exit_signal = -1;
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = -1;
}
@@ -1410,7 +1412,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
__wake_up_parent(tsk, tsk->parent);
spin_unlock_irqrestore(&psig->siglock, flags);
- return sig;
+ return ret;
}
static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
diff --git a/kernel/smp.c b/kernel/smp.c
index 96fc7c0edc5..f362a855377 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -135,7 +135,8 @@ void generic_smp_call_function_interrupt(void)
*/
smp_wmb();
data->csd.flags &= ~CSD_FLAG_WAIT;
- } else
+ }
+ if (data->csd.flags & CSD_FLAG_ALLOC)
call_rcu(&data->rcu_head, rcu_free_call_data);
}
rcu_read_unlock();
@@ -209,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
{
struct call_single_data d;
unsigned long flags;
- /* prevent preemption and reschedule on another processor */
+ /* prevent preemption and reschedule on another processor,
+ as well as CPU removal */
int me = get_cpu();
+ int err = 0;
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
@@ -219,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
local_irq_save(flags);
func(info);
local_irq_restore(flags);
- } else {
+ } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
struct call_single_data *data = NULL;
if (!wait) {
@@ -235,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
data->func = func;
data->info = info;
generic_exec_single(cpu, data);
+ } else {
+ err = -ENXIO; /* CPU not online */
}
put_cpu();
- return 0;
+ return err;
}
EXPORT_SYMBOL(smp_call_function_single);
@@ -260,6 +265,42 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
generic_exec_single(cpu, data);
}
+/* Dummy function */
+static void quiesce_dummy(void *unused)
+{
+}
+
+/*
+ * Ensure stack based data used in call function mask is safe to free.
+ *
+ * This is needed by smp_call_function_mask when using on-stack data, because
+ * a single call function queue is shared by all CPUs, and any CPU may pick up
+ * the data item on the queue at any time before it is deleted. So we need to
+ * ensure that all CPUs have transitioned through a quiescent state after
+ * this call.
+ *
+ * This is a very slow function, implemented by sending synchronous IPIs to
+ * all possible CPUs. For this reason, we have to alloc data rather than use
+ * stack based data even in the case of synchronous calls. The stack based
+ * data is then just used for deadlock/oom fallback which will be very rare.
+ *
+ * If a faster scheme can be made, we could go back to preferring stack based
+ * data -- the data allocation/free is non-zero cost.
+ */
+static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
+{
+ struct call_single_data data;
+ int cpu;
+
+ data.func = quiesce_dummy;
+ data.info = NULL;
+
+ for_each_cpu_mask(cpu, mask) {
+ data.flags = CSD_FLAG_WAIT;
+ generic_exec_single(cpu, &data);
+ }
+}
+
/**
* smp_call_function_mask(): Run a function on a set of other CPUs.
* @mask: The set of cpus to run on.
@@ -285,6 +326,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
cpumask_t allbutself;
unsigned long flags;
int cpu, num_cpus;
+ int slowpath = 0;
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
@@ -306,15 +348,16 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
return smp_call_function_single(cpu, func, info, wait);
}
- if (!wait) {
- data = kmalloc(sizeof(*data), GFP_ATOMIC);
- if (data)
- data->csd.flags = CSD_FLAG_ALLOC;
- }
- if (!data) {
+ data = kmalloc(sizeof(*data), GFP_ATOMIC);
+ if (data) {
+ data->csd.flags = CSD_FLAG_ALLOC;
+ if (wait)
+ data->csd.flags |= CSD_FLAG_WAIT;
+ } else {
data = &d;
data->csd.flags = CSD_FLAG_WAIT;
wait = 1;
+ slowpath = 1;
}
spin_lock_init(&data->lock);
@@ -331,8 +374,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
arch_send_call_function_ipi(mask);
/* optionally wait for the CPUs to complete */
- if (wait)
+ if (wait) {
csd_flag_wait(&data->csd);
+ if (unlikely(slowpath))
+ smp_call_function_mask_quiesce_stack(mask);
+ }
return 0;
}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b75b492fbfc..cb838ee93a8 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -233,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
do_each_thread(g, t) {
if (!--max_count)
goto unlock;
- if (t->state & TASK_UNINTERRUPTIBLE)
+ /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+ if (t->state == TASK_UNINTERRUPTIBLE)
check_hung_task(t, now);
} while_each_thread(g, t);
unlock:
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index a1fb54c93cd..29ab20749dd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -290,8 +290,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
}
-
EXPORT_SYMBOL(_spin_lock_nested);
+
unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
{
unsigned long flags;
@@ -311,9 +311,17 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
#endif
return flags;
}
-
EXPORT_SYMBOL(_spin_lock_irqsave_nested);
+void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
+ struct lockdep_map *nest_lock)
+{
+ preempt_disable();
+ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
+ LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
+}
+EXPORT_SYMBOL(_spin_lock_nest_lock);
+
#endif
void __lockfunc _spin_unlock(spinlock_t *lock)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e446c7c7d6a..af3c7cea258 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -65,7 +65,6 @@ static void ack_state(void)
static int stop_cpu(struct stop_machine_data *smdata)
{
enum stopmachine_state curstate = STOPMACHINE_NONE;
- int uninitialized_var(ret);
/* Simple state machine */
do {
diff --git a/kernel/sys.c b/kernel/sys.c
index c01858090a9..234d9454294 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -169,9 +169,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
pgrp = find_vpid(who);
else
pgrp = task_pgrp(current);
- do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
error = set_one_prio(p, niceval, error);
- } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case PRIO_USER:
user = current->user;
@@ -229,11 +229,11 @@ asmlinkage long sys_getpriority(int which, int who)
pgrp = find_vpid(who);
else
pgrp = task_pgrp(current);
- do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
niceval = 20 - task_nice(p);
if (niceval > retval)
retval = niceval;
- } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case PRIO_USER:
user = current->user;
@@ -274,7 +274,7 @@ void emergency_restart(void)
}
EXPORT_SYMBOL_GPL(emergency_restart);
-static void kernel_restart_prepare(char *cmd)
+void kernel_restart_prepare(char *cmd)
{
blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
system_state = SYSTEM_RESTART;
@@ -1060,9 +1060,7 @@ asmlinkage long sys_setsid(void)
group_leader->signal->leader = 1;
__set_special_pids(sid);
- spin_lock(&group_leader->sighand->siglock);
- group_leader->signal->tty = NULL;
- spin_unlock(&group_leader->sighand->siglock);
+ proc_clear_tty(group_leader);
err = session;
out:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index fe471334727..c468c3c6dfc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -80,7 +80,6 @@ extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
-extern int maps_protect;
extern int latencytop_enabled;
extern int sysctl_nr_open_min, sysctl_nr_open_max;
#ifdef CONFIG_RCU_TORTURE_TEST
@@ -118,10 +117,8 @@ extern char modprobe_path[];
extern int sg_big_buff;
#endif
-#ifdef __sparc__
-extern char reboot_command [];
-extern int stop_a_enabled;
-extern int scons_pwroff;
+#ifdef CONFIG_SPARC
+#include <asm/system.h>
#endif
#ifdef __hppa__
@@ -159,6 +156,7 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
static struct ctl_table root_table[];
static struct ctl_table_root sysctl_table_root;
static struct ctl_table_header root_table_header = {
+ .count = 1,
.ctl_table = root_table,
.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
.root = &sysctl_table_root,
@@ -414,7 +412,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
-#ifdef __sparc__
+#ifdef CONFIG_SPARC
{
.ctl_name = KERN_SPARC_REBOOT,
.procname = "reboot-cmd",
@@ -809,16 +807,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
-#ifdef CONFIG_PROC_FS
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "maps_protect",
- .data = &maps_protect,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
-#endif
{
.ctl_name = CTL_UNNUMBERED,
.procname = "poweroff_cmd",
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d1e3e1a197..f8d968063ce 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
}
/**
+ * clockevents_shutdown - shutdown the device and clear next_event
+ * @dev: device to shutdown
+ */
+void clockevents_shutdown(struct clock_event_device *dev)
+{
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+ dev->next_event.tv64 = KTIME_MAX;
+}
+
+/**
* clockevents_program_event - Reprogram the clock event device.
* @expires: absolute expiry time (monotonic clock)
*
@@ -177,7 +187,7 @@ void clockevents_register_device(struct clock_event_device *dev)
/*
* Noop handler when we shut down an event device
*/
-static void clockevents_handle_noop(struct clock_event_device *dev)
+void clockevents_handle_noop(struct clock_event_device *dev)
{
}
@@ -199,7 +209,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
* released list and do a notify add later.
*/
if (old) {
- old->event_handler = clockevents_handle_noop;
clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
list_del(&old->list);
list_add(&old->list, &clockevents_released);
@@ -207,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
if (new) {
BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
- clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_shutdown(new);
}
local_irq_restore(flags);
}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196..1ad46f3df6e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy)
if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
fail = update_persistent_clock(now);
- next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
+ next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
if (next.tv_nsec <= 0)
next.tv_nsec += NSEC_PER_SEC;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 31463d370b9..cb01cd8f919 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void)
*/
static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
{
+ ktime_t next;
+
tick_do_periodic_broadcast();
/*
@@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
/*
* Setup the next period for devices, which do not have
- * periodic mode:
+ * periodic mode. We read dev->next_event first and add to it
+ * when the event alrady expired. clockevents_program_event()
+ * sets dev->next_event only when the event is really
+ * programmed to the device.
*/
- for (;;) {
- ktime_t next = ktime_add(dev->next_event, tick_period);
+ for (next = dev->next_event; ;) {
+ next = ktime_add(next, tick_period);
if (!clockevents_program_event(dev, next, ktime_get()))
return;
@@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why)
struct clock_event_device *bc, *dev;
struct tick_device *td;
unsigned long flags, *reason = why;
- int cpu;
+ int cpu, bc_stopped;
spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -223,14 +228,16 @@ static void tick_do_broadcast_on_off(void *why)
if (!tick_device_is_functional(dev))
goto out;
+ bc_stopped = cpus_empty(tick_broadcast_mask);
+
switch (*reason) {
case CLOCK_EVT_NOTIFY_BROADCAST_ON:
case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
if (!cpu_isset(cpu, tick_broadcast_mask)) {
cpu_set(cpu, tick_broadcast_mask);
- if (td->mode == TICKDEV_MODE_PERIODIC)
- clockevents_set_mode(dev,
- CLOCK_EVT_MODE_SHUTDOWN);
+ if (tick_broadcast_device.mode ==
+ TICKDEV_MODE_PERIODIC)
+ clockevents_shutdown(dev);
}
if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
tick_broadcast_force = 1;
@@ -239,15 +246,17 @@ static void tick_do_broadcast_on_off(void *why)
if (!tick_broadcast_force &&
cpu_isset(cpu, tick_broadcast_mask)) {
cpu_clear(cpu, tick_broadcast_mask);
- if (td->mode == TICKDEV_MODE_PERIODIC)
+ if (tick_broadcast_device.mode ==
+ TICKDEV_MODE_PERIODIC)
tick_setup_periodic(dev, 0);
}
break;
}
- if (cpus_empty(tick_broadcast_mask))
- clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
- else {
+ if (cpus_empty(tick_broadcast_mask)) {
+ if (!bc_stopped)
+ clockevents_shutdown(bc);
+ } else if (bc_stopped) {
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
tick_broadcast_start_periodic(bc);
else
@@ -298,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
if (bc && cpus_empty(tick_broadcast_mask))
- clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_shutdown(bc);
}
spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -313,7 +322,7 @@ void tick_suspend_broadcast(void)
bc = tick_broadcast_device.evtdev;
if (bc)
- clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_shutdown(bc);
spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
@@ -364,16 +373,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void)
static int tick_broadcast_set_event(ktime_t expires, int force)
{
struct clock_event_device *bc = tick_broadcast_device.evtdev;
- ktime_t now = ktime_get();
- int res;
-
- for(;;) {
- res = clockevents_program_event(bc, expires, now);
- if (!res || !force)
- return res;
- now = ktime_get();
- expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
- }
+
+ return tick_dev_program_event(bc, expires, force);
}
int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -491,14 +492,52 @@ static void tick_broadcast_clear_oneshot(int cpu)
cpu_clear(cpu, tick_broadcast_oneshot_mask);
}
+static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
+{
+ struct tick_device *td;
+ int cpu;
+
+ for_each_cpu_mask_nr(cpu, *mask) {
+ td = &per_cpu(tick_cpu_device, cpu);
+ if (td->evtdev)
+ td->evtdev->next_event = expires;
+ }
+}
+
/**
* tick_broadcast_setup_oneshot - setup the broadcast device
*/
void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
- bc->event_handler = tick_handle_oneshot_broadcast;
- clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
- bc->next_event.tv64 = KTIME_MAX;
+ /* Set it up only once ! */
+ if (bc->event_handler != tick_handle_oneshot_broadcast) {
+ int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
+ int cpu = smp_processor_id();
+ cpumask_t mask;
+
+ bc->event_handler = tick_handle_oneshot_broadcast;
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+
+ /* Take the do_timer update */
+ tick_do_timer_cpu = cpu;
+
+ /*
+ * We must be careful here. There might be other CPUs
+ * waiting for periodic broadcast. We need to set the
+ * oneshot_mask bits for those and program the
+ * broadcast device to fire.
+ */
+ mask = tick_broadcast_mask;
+ cpu_clear(cpu, mask);
+ cpus_or(tick_broadcast_oneshot_mask,
+ tick_broadcast_oneshot_mask, mask);
+
+ if (was_periodic && !cpus_empty(mask)) {
+ tick_broadcast_init_next_event(&mask, tick_next_period);
+ tick_broadcast_set_event(tick_next_period, 1);
+ } else
+ bc->next_event.tv64 = KTIME_MAX;
+ }
}
/*
@@ -538,4 +577,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
+/*
+ * Check, whether the broadcast device is in one shot mode
+ */
+int tick_broadcast_oneshot_active(void)
+{
+ return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
+}
+
#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 80c4336f418..df12434b43c 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
*/
ktime_t tick_next_period;
ktime_t tick_period;
-int tick_do_timer_cpu __read_mostly = -1;
+int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
DEFINE_SPINLOCK(tick_device_lock);
/*
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
if (!tick_device_is_functional(dev))
return;
- if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
+ if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+ !tick_broadcast_oneshot_active()) {
clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
} else {
unsigned long seq;
@@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td,
* If no cpu took the do_timer update, assign it to
* this cpu:
*/
- if (tick_do_timer_cpu == -1) {
+ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
tick_do_timer_cpu = cpu;
tick_next_period = ktime_get();
tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -161,6 +162,7 @@ static void tick_setup_device(struct tick_device *td,
} else {
handler = td->evtdev->event_handler;
next_event = td->evtdev->next_event;
+ td->evtdev->event_handler = clockevents_handle_noop;
}
td->evtdev = newdev;
@@ -248,7 +250,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
* not give it back to the clockevents layer !
*/
if (tick_is_broadcast_device(curdev)) {
- clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_shutdown(curdev);
curdev = NULL;
}
clockevents_exchange_device(curdev, newdev);
@@ -299,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup)
if (*cpup == tick_do_timer_cpu) {
int cpu = first_cpu(cpu_online_map);
- tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1;
+ tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
+ TICK_DO_TIMER_NONE;
}
spin_unlock_irqrestore(&tick_device_lock, flags);
}
@@ -310,7 +313,7 @@ static void tick_suspend(void)
unsigned long flags;
spin_lock_irqsave(&tick_device_lock, flags);
- clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_shutdown(td->evtdev);
spin_unlock_irqrestore(&tick_device_lock, flags);
}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f13f2b7f4fd..469248782c2 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
/*
* tick internal variable and functions used by low/high res code
*/
+
+#define TICK_DO_TIMER_NONE -1
+#define TICK_DO_TIMER_BOOT -2
+
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern spinlock_t tick_device_lock;
extern ktime_t tick_next_period;
@@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
+extern void clockevents_shutdown(struct clock_event_device *dev);
+
/*
* NO_HZ / high resolution timer shared code
*/
@@ -17,6 +23,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_setup_oneshot(struct clock_event_device *newdev,
void (*handler)(struct clock_event_device *),
ktime_t nextevt);
+extern int tick_dev_program_event(struct clock_event_device *dev,
+ ktime_t expires, int force);
extern int tick_program_event(ktime_t expires, int force);
extern void tick_oneshot_notify(void);
extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
@@ -27,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
extern void tick_broadcast_switch_to_oneshot(void);
extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern int tick_broadcast_oneshot_active(void);
# else /* BROADCAST */
static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
@@ -35,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
static inline void tick_broadcast_switch_to_oneshot(void) { }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
# endif /* !BROADCAST */
#else /* !ONESHOT */
@@ -64,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
return 0;
}
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
#endif /* !TICK_ONESHOT */
/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 450c04935b6..2e8de678e76 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -23,24 +23,56 @@
#include "tick-internal.h"
/**
- * tick_program_event
+ * tick_program_event internal worker function
*/
-int tick_program_event(ktime_t expires, int force)
+int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
+ int force)
{
- struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
ktime_t now = ktime_get();
+ int i;
- while (1) {
+ for (i = 0;;) {
int ret = clockevents_program_event(dev, expires, now);
if (!ret || !force)
return ret;
+
+ /*
+ * We tried 2 times to program the device with the given
+ * min_delta_ns. If that's not working then we double it
+ * and emit a warning.
+ */
+ if (++i > 2) {
+ /* Increase the min. delta and try again */
+ if (!dev->min_delta_ns)
+ dev->min_delta_ns = 5000;
+ else
+ dev->min_delta_ns += dev->min_delta_ns >> 1;
+
+ printk(KERN_WARNING
+ "CE: %s increasing min_delta_ns to %lu nsec\n",
+ dev->name ? dev->name : "?",
+ dev->min_delta_ns << 1);
+
+ i = 0;
+ }
+
now = ktime_get();
- expires = ktime_add(now, ktime_set(0, dev->min_delta_ns));
+ expires = ktime_add_ns(now, dev->min_delta_ns);
}
}
/**
+ * tick_program_event
+ */
+int tick_program_event(ktime_t expires, int force)
+{
+ struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+
+ return tick_dev_program_event(dev, expires, force);
+}
+
+/**
* tick_resume_onshot - resume oneshot mode
*/
void tick_resume_oneshot(void)
@@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
{
newdev->event_handler = handler;
clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
- clockevents_program_event(newdev, next_event, ktime_get());
+ tick_dev_program_event(newdev, next_event, 1);
}
/**
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 825b4c00fe4..a4d21939816 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/tick.h>
+#include <linux/module.h>
#include <asm/irq_regs.h>
@@ -75,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now)
incr * ticks);
}
do_timer(++ticks);
+
+ /* Keep the tick_next_period variable up to date */
+ tick_next_period = ktime_add(last_jiffies_update, tick_period);
}
write_sequnlock(&xtime_lock);
}
@@ -162,6 +166,8 @@ void tick_nohz_stop_idle(int cpu)
ts->idle_lastupdate = now;
ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
ts->idle_active = 0;
+
+ sched_clock_idle_wakeup_event(0);
}
}
@@ -177,6 +183,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
}
ts->idle_entrytime = now;
ts->idle_active = 1;
+ sched_clock_idle_sleep_event();
return now;
}
@@ -184,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- *last_update_time = ktime_to_us(ts->idle_lastupdate);
+ if (!tick_nohz_enabled)
+ return -1;
+
+ if (ts->idle_active)
+ *last_update_time = ktime_to_us(ts->idle_lastupdate);
+ else
+ *last_update_time = ktime_to_us(ktime_get());
+
return ktime_to_us(ts->idle_sleeptime);
}
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
/**
* tick_nohz_stop_sched_tick - stop the idle tick from the idle task
@@ -218,7 +233,7 @@ void tick_nohz_stop_sched_tick(int inidle)
*/
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
- tick_do_timer_cpu = -1;
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
}
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -289,7 +304,6 @@ void tick_nohz_stop_sched_tick(int inidle)
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
rcu_enter_nohz();
- sched_clock_tick_stop(cpu);
}
/*
@@ -301,7 +315,7 @@ void tick_nohz_stop_sched_tick(int inidle)
* invoked.
*/
if (cpu == tick_do_timer_cpu)
- tick_do_timer_cpu = -1;
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
ts->idle_sleeps++;
@@ -392,7 +406,6 @@ void tick_nohz_restart_sched_tick(void)
select_nohz_load_balancer(0);
now = ktime_get();
tick_do_update_jiffies64(now);
- sched_clock_tick_start(cpu);
cpu_clear(cpu, nohz_cpu_mask);
/*
@@ -467,7 +480,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
* this duty, then the jiffies update is still serialized by
* xtime_lock.
*/
- if (unlikely(tick_do_timer_cpu == -1))
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
tick_do_timer_cpu = cpu;
/* Check, if the jiffies need an update */
@@ -569,7 +582,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
* this duty, then the jiffies update is still serialized by
* xtime_lock.
*/
- if (unlikely(tick_do_timer_cpu == -1))
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
tick_do_timer_cpu = cpu;
#endif
@@ -621,7 +634,7 @@ void tick_setup_sched_timer(void)
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
ts->sched_timer.function = tick_sched_timer;
- ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
/* Get the next period (per cpu) */
ts->sched_timer.expires = tick_init_jiffy_update();
@@ -645,17 +658,21 @@ void tick_setup_sched_timer(void)
ts->nohz_mode = NOHZ_MODE_HIGHRES;
#endif
}
+#endif /* HIGH_RES_TIMERS */
+#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
void tick_cancel_sched_timer(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+# ifdef CONFIG_HIGH_RES_TIMERS
if (ts->sched_timer.base)
hrtimer_cancel(&ts->sched_timer);
+# endif
ts->nohz_mode = NOHZ_MODE_INACTIVE;
}
-#endif /* HIGH_RES_TIMERS */
+#endif
/**
* Async notification about clocksource changes
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index bb948e52ce2..db58fb66a13 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = stack_trace_timer_fn;
- hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+ hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
}
diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a09..39d6159fae4 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
{
struct user_struct *up = container_of(kobj, struct user_struct, kobj);
- return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
+ return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
}
static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
unsigned long rt_runtime;
int rc;
- sscanf(buf, "%lu", &rt_runtime);
+ sscanf(buf, "%ld", &rt_runtime);
rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a9ab0596de4..532858fa5b8 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -6,7 +6,6 @@
*/
#include <linux/module.h>
-#include <linux/version.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 64d398f1244..815237a55af 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -12,7 +12,6 @@
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
-#include <linux/version.h>
#include <linux/err.h>
#include <linux/slab.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index fe3a56c2256..4ab9659d269 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -12,7 +12,6 @@
#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
-#include <linux/version.h>
#include <linux/sysctl.h>
static void *get_uts(ctl_table *table, int write)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ec7e4f62aaf..4048e92aa04 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
BUG_ON(get_wq_data(work) != cwq);
work_clear_pending(work);
- lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
- lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_acquire(&lockdep_map);
f(work);
- lock_release(&lockdep_map, 1, _THIS_IP_);
- lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+ lock_map_release(&lockdep_map);
+ lock_map_release(&cwq->wq->lockdep_map);
if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
@@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_struct *wq)
int cpu;
might_sleep();
- lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
- lock_release(&wq->lockdep_map, 1, _THIS_IP_);
+ lock_map_acquire(&wq->lockdep_map);
+ lock_map_release(&wq->lockdep_map);
for_each_cpu_mask_nr(cpu, *cpu_map)
flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
}
@@ -441,8 +441,8 @@ int flush_work(struct work_struct *work)
if (!cwq)
return 0;
- lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
- lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_release(&cwq->wq->lockdep_map);
prev = NULL;
spin_lock_irq(&cwq->lock);
@@ -536,8 +536,8 @@ static void wait_on_work(struct work_struct *work)
might_sleep();
- lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
- lock_release(&work->lockdep_map, 1, _THIS_IP_);
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
cwq = get_wq_data(work);
if (!cwq)
@@ -830,10 +830,21 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
start_workqueue_thread(cwq, -1);
} else {
cpu_maps_update_begin();
+ /*
+ * We must place this wq on list even if the code below fails.
+ * cpu_down(cpu) can remove cpu from cpu_populated_map before
+ * destroy_workqueue() takes the lock, in that case we leak
+ * cwq[cpu]->thread.
+ */
spin_lock(&workqueue_lock);
list_add(&wq->list, &workqueues);
spin_unlock(&workqueue_lock);
-
+ /*
+ * We must initialize cwqs for each possible cpu even if we
+ * are going to call destroy_workqueue() finally. Otherwise
+ * cpu_up() can hit the uninitialized cwq once we drop the
+ * lock.
+ */
for_each_possible_cpu(cpu) {
cwq = init_cpu_workqueue(wq, cpu);
if (err || !cpu_online(cpu))
@@ -861,8 +872,8 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
if (cwq->thread == NULL)
return;
- lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
- lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+ lock_map_acquire(&cwq->wq->lockdep_map);
+ lock_map_release(&cwq->wq->lockdep_map);
flush_cpu_workqueue(cwq);
/*