Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              |   2
-rw-r--r--  kernel/audit.c               |   8
-rw-r--r--  kernel/cpuset.c              |  45
-rw-r--r--  kernel/die_notifier.c        |  38
-rw-r--r--  kernel/exit.c                |   1
-rw-r--r--  kernel/fork.c                |  86
-rw-r--r--  kernel/futex.c               | 106
-rw-r--r--  kernel/hrtimer.c             |   1
-rw-r--r--  kernel/irq/handle.c          |   4
-rw-r--r--  kernel/irq/manage.c          |  10
-rw-r--r--  kernel/irq/proc.c            |  15
-rw-r--r--  kernel/irq/spurious.c        |   4
-rw-r--r--  kernel/itimer.c              |  60
-rw-r--r--  kernel/kallsyms.c            |  81
-rw-r--r--  kernel/kexec.c               |   4
-rw-r--r--  kernel/kmod.c                |   7
-rw-r--r--  kernel/kprobes.c             | 293
-rw-r--r--  kernel/lockdep.c             |  51
-rw-r--r--  kernel/module.c              |  79
-rw-r--r--  kernel/nsproxy.c             | 139
-rw-r--r--  kernel/params.c              |   2
-rw-r--r--  kernel/pid.c                 |  11
-rw-r--r--  kernel/posix-cpu-timers.c    |  14
-rw-r--r--  kernel/posix-timers.c        |   1
-rw-r--r--  kernel/power/Kconfig         |   4
-rw-r--r--  kernel/power/process.c       |   6
-rw-r--r--  kernel/power/snapshot.c      |   1
-rw-r--r--  kernel/power/swap.c          |   1
-rw-r--r--  kernel/printk.c              |  27
-rw-r--r--  kernel/rcutorture.c          |  45
-rw-r--r--  kernel/rwsem.c               |   2
-rw-r--r--  kernel/sched.c               | 365
-rw-r--r--  kernel/signal.c              |   1
-rw-r--r--  kernel/softlockup.c          |  48
-rw-r--r--  kernel/stop_machine.c        |   8
-rw-r--r--  kernel/sys.c                 |  19
-rw-r--r--  kernel/sysctl.c              |  11
-rw-r--r--  kernel/time.c                |  61
-rw-r--r--  kernel/time/Makefile         |   2
-rw-r--r--  kernel/time/tick-common.c    |   8
-rw-r--r--  kernel/time/tick-internal.h  |   1
-rw-r--r--  kernel/time/tick-sched.c     |  51
-rw-r--r--  kernel/time/timekeeping.c    | 476
-rw-r--r--  kernel/time/timer_list.c     |  15
-rw-r--r--  kernel/time/timer_stats.c    |  14
-rw-r--r--  kernel/timer.c               | 528
-rw-r--r--  kernel/uid16.c               |   1
-rw-r--r--  kernel/utsname.c             |  41
48 files changed, 1672 insertions(+), 1126 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index ac6b27abb1a..642d4277c2e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
diff --git a/kernel/audit.c b/kernel/audit.c
index 4e9d2082968..d13276d4141 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -515,8 +515,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
err = -EPERM;
break;
case AUDIT_USER:
- case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
- case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
+ case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
@@ -614,8 +614,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
loginuid, sid);
break;
case AUDIT_USER:
- case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
- case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
+ case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d240349cbf0..88b416dfbc7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -42,7 +42,6 @@
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
@@ -822,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf)
return -EACCES;
trialcs = *cs;
- retval = cpulist_parse(buf, trialcs.cpus_allowed);
- if (retval < 0)
- return retval;
+
+ /*
+ * We allow a cpuset's cpus_allowed to be empty; if it has attached
+ * tasks, we'll catch it later when we validate the change and return
+ * -ENOSPC.
+ */
+ if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
+ cpus_clear(trialcs.cpus_allowed);
+ } else {
+ retval = cpulist_parse(buf, trialcs.cpus_allowed);
+ if (retval < 0)
+ return retval;
+ }
cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
- if (cpus_empty(trialcs.cpus_allowed))
+ /* cpus_allowed cannot be empty for a cpuset with attached tasks. */
+ if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed))
return -ENOSPC;
retval = validate_change(cs, &trialcs);
if (retval < 0)
@@ -919,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf)
return -EACCES;
trialcs = *cs;
- retval = nodelist_parse(buf, trialcs.mems_allowed);
- if (retval < 0)
- goto done;
+
+ /*
+ * We allow a cpuset's mems_allowed to be empty; if it has attached
+ * tasks, we'll catch it later when we validate the change and return
+ * -ENOSPC.
+ */
+ if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
+ nodes_clear(trialcs.mems_allowed);
+ } else {
+ retval = nodelist_parse(buf, trialcs.mems_allowed);
+ if (retval < 0)
+ goto done;
+ }
nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
oldmem = cs->mems_allowed;
if (nodes_equal(oldmem, trialcs.mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
- if (nodes_empty(trialcs.mems_allowed)) {
+ /* mems_allowed cannot be empty for a cpuset with attached tasks. */
+ if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
retval = -ENOSPC;
goto done;
}
@@ -2200,10 +2221,6 @@ void cpuset_fork(struct task_struct *child)
* it is holding that mutex while calling check_for_release(),
* which calls kmalloc(), so can't be called holding callback_mutex().
*
- * We don't need to task_lock() this reference to tsk->cpuset,
- * because tsk is already marked PF_EXITING, so attach_task() won't
- * mess with it, or task is a failed fork, never visible to attach_task.
- *
* the_top_cpuset_hack:
*
* Set the exiting tasks cpuset to the root cpuset (top_cpuset).
@@ -2242,8 +2259,10 @@ void cpuset_exit(struct task_struct *tsk)
{
struct cpuset *cs;
+ task_lock(current);
cs = tsk->cpuset;
tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
+ task_unlock(current);
if (notify_on_release(cs)) {
char *pathbuf = NULL;
diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c
new file mode 100644
index 00000000000..0d98827887a
--- /dev/null
+++ b/kernel/die_notifier.c
@@ -0,0 +1,38 @@
+
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/vmalloc.h>
+#include <linux/kdebug.h>
+
+
+static ATOMIC_NOTIFIER_HEAD(die_chain);
+
+int notify_die(enum die_val val, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig)
+{
+ struct die_args args = {
+ .regs = regs,
+ .str = str,
+ .err = err,
+ .trapnr = trap,
+ .signr = sig,
+
+ };
+
+ return atomic_notifier_call_chain(&die_chain, val, &args);
+}
+
+int register_die_notifier(struct notifier_block *nb)
+{
+ vmalloc_sync_all();
+ return atomic_notifier_chain_register(&die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_die_notifier);
+
+
diff --git a/kernel/exit.c b/kernel/exit.c
index 92369240d91..f5a7abb621f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -7,7 +7,6 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
-#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index b7d169def94..a8dd75d4992 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -14,7 +14,6 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/unistd.h>
-#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
@@ -1516,26 +1515,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
}
/*
- * Unshare the mnt_namespace structure if it is being shared
- */
-static int unshare_mnt_namespace(unsigned long unshare_flags,
- struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
-{
- struct mnt_namespace *ns = current->nsproxy->mnt_ns;
-
- if ((unshare_flags & CLONE_NEWNS) && ns) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
- if (!*new_nsp)
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/*
* Unsharing of sighand is not supported yet
*/
static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
@@ -1593,16 +1572,6 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n
return 0;
}
-#ifndef CONFIG_IPC_NS
-static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns)
-{
- if (flags & CLONE_NEWIPC)
- return -EINVAL;
-
- return 0;
-}
-#endif
-
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
@@ -1615,14 +1584,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
{
int err = 0;
struct fs_struct *fs, *new_fs = NULL;
- struct mnt_namespace *ns, *new_ns = NULL;
struct sighand_struct *new_sigh = NULL;
struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct sem_undo_list *new_ulist = NULL;
struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
- struct uts_namespace *uts, *new_uts = NULL;
- struct ipc_namespace *ipc, *new_ipc = NULL;
check_unshare_flags(&unshare_flags);
@@ -1637,36 +1603,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
goto bad_unshare_out;
if ((err = unshare_fs(unshare_flags, &new_fs)))
goto bad_unshare_cleanup_thread;
- if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
- goto bad_unshare_cleanup_fs;
if ((err = unshare_sighand(unshare_flags, &new_sigh)))
- goto bad_unshare_cleanup_ns;
+ goto bad_unshare_cleanup_fs;
if ((err = unshare_vm(unshare_flags, &new_mm)))
goto bad_unshare_cleanup_sigh;
if ((err = unshare_fd(unshare_flags, &new_fd)))
goto bad_unshare_cleanup_vm;
if ((err = unshare_semundo(unshare_flags, &new_ulist)))
goto bad_unshare_cleanup_fd;
- if ((err = unshare_utsname(unshare_flags, &new_uts)))
+ if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_fs)))
goto bad_unshare_cleanup_semundo;
- if ((err = unshare_ipcs(unshare_flags, &new_ipc)))
- goto bad_unshare_cleanup_uts;
-
- if (new_ns || new_uts || new_ipc) {
- old_nsproxy = current->nsproxy;
- new_nsproxy = dup_namespaces(old_nsproxy);
- if (!new_nsproxy) {
- err = -ENOMEM;
- goto bad_unshare_cleanup_ipc;
- }
- }
- if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
- new_uts || new_ipc) {
+ if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
task_lock(current);
if (new_nsproxy) {
+ old_nsproxy = current->nsproxy;
current->nsproxy = new_nsproxy;
new_nsproxy = old_nsproxy;
}
@@ -1677,12 +1631,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
new_fs = fs;
}
- if (new_ns) {
- ns = current->nsproxy->mnt_ns;
- current->nsproxy->mnt_ns = new_ns;
- new_ns = ns;
- }
-
if (new_mm) {
mm = current->mm;
active_mm = current->active_mm;
@@ -1698,32 +1646,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
new_fd = fd;
}
- if (new_uts) {
- uts = current->nsproxy->uts_ns;
- current->nsproxy->uts_ns = new_uts;
- new_uts = uts;
- }
-
- if (new_ipc) {
- ipc = current->nsproxy->ipc_ns;
- current->nsproxy->ipc_ns = new_ipc;
- new_ipc = ipc;
- }
-
task_unlock(current);
}
if (new_nsproxy)
put_nsproxy(new_nsproxy);
-bad_unshare_cleanup_ipc:
- if (new_ipc)
- put_ipc_ns(new_ipc);
-
-bad_unshare_cleanup_uts:
- if (new_uts)
- put_uts_ns(new_uts);
-
bad_unshare_cleanup_semundo:
bad_unshare_cleanup_fd:
if (new_fd)
@@ -1738,10 +1666,6 @@ bad_unshare_cleanup_sigh:
if (atomic_dec_and_test(&new_sigh->count))
kmem_cache_free(sighand_cachep, new_sigh);
-bad_unshare_cleanup_ns:
- if (new_ns)
- put_mnt_ns(new_ns);
-
bad_unshare_cleanup_fs:
if (new_fs)
put_fs_struct(new_fs);
diff --git a/kernel/futex.c b/kernel/futex.c
index 5a270b5e3f9..600bc9d801f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -48,6 +48,7 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
+#include <linux/module.h>
#include <asm/futex.h>
#include "rtmutex_common.h"
@@ -55,32 +56,6 @@
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
- * Futexes are matched on equal values of this key.
- * The key type depends on whether it's a shared or private mapping.
- * Don't rearrange members without looking at hash_futex().
- *
- * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We set bit 0 to indicate if it's an inode-based key.
- */
-union futex_key {
- struct {
- unsigned long pgoff;
- struct inode *inode;
- int offset;
- } shared;
- struct {
- unsigned long address;
- struct mm_struct *mm;
- int offset;
- } private;
- struct {
- unsigned long word;
- void *ptr;
- int offset;
- } both;
-};
-
-/*
* Priority Inheritance state:
*/
struct futex_pi_state {
@@ -175,7 +150,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
*
* Should be called with &current->mm->mmap_sem but NOT any spinlocks.
*/
-static int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -246,6 +221,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
}
return err;
}
+EXPORT_SYMBOL_GPL(get_futex_key);
/*
* Take a reference to the resource addressed by a key.
@@ -254,7 +230,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
* NOTE: mmap_sem MUST be held between get_futex_key() and calling this
* function, if it is called at all. mmap_sem keeps key->shared.inode valid.
*/
-static inline void get_key_refs(union futex_key *key)
+inline void get_futex_key_refs(union futex_key *key)
{
if (key->both.ptr != 0) {
if (key->both.offset & 1)
@@ -263,12 +239,13 @@ static inline void get_key_refs(union futex_key *key)
atomic_inc(&key->private.mm->mm_count);
}
}
+EXPORT_SYMBOL_GPL(get_futex_key_refs);
/*
* Drop a reference to the resource addressed by a key.
* The hash bucket spinlock must not be held.
*/
-static void drop_key_refs(union futex_key *key)
+void drop_futex_key_refs(union futex_key *key)
{
if (key->both.ptr != 0) {
if (key->both.offset & 1)
@@ -277,6 +254,7 @@ static void drop_key_refs(union futex_key *key)
mmdrop(key->private.mm);
}
}
+EXPORT_SYMBOL_GPL(drop_futex_key_refs);
static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
{
@@ -873,7 +851,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
this->lock_ptr = &hb2->lock;
}
this->key = key2;
- get_key_refs(&key2);
+ get_futex_key_refs(&key2);
drop_count++;
if (ret - nr_wake >= nr_requeue)
@@ -886,9 +864,9 @@ out_unlock:
if (hb1 != hb2)
spin_unlock(&hb2->lock);
- /* drop_key_refs() must be called outside the spinlocks. */
+ /* drop_futex_key_refs() must be called outside the spinlocks. */
while (--drop_count >= 0)
- drop_key_refs(&key1);
+ drop_futex_key_refs(&key1);
out:
up_read(&current->mm->mmap_sem);
@@ -906,7 +884,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
init_waitqueue_head(&q->waiters);
- get_key_refs(&q->key);
+ get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;
@@ -925,7 +903,7 @@ static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
spin_unlock(&hb->lock);
- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
}
/*
@@ -980,7 +958,7 @@ static int unqueue_me(struct futex_q *q)
ret = 1;
}
- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
return ret;
}
@@ -999,15 +977,18 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
spin_unlock(&hb->lock);
- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
}
-static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
+static long futex_wait_restart(struct restart_block *restart);
+static int futex_wait_abstime(u32 __user *uaddr, u32 val,
+ int timed, unsigned long abs_time)
{
struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
struct futex_hash_bucket *hb;
struct futex_q q;
+ unsigned long time_left = 0;
u32 uval;
int ret;
@@ -1087,8 +1068,21 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
* !list_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (likely(!list_empty(&q.list)))
- time = schedule_timeout(time);
+ time_left = 0;
+ if (likely(!list_empty(&q.list))) {
+ unsigned long rel_time;
+
+ if (timed) {
+ unsigned long now = jiffies;
+ if (time_after(now, abs_time))
+ rel_time = 0;
+ else
+ rel_time = abs_time - now;
+ } else
+ rel_time = MAX_SCHEDULE_TIMEOUT;
+
+ time_left = schedule_timeout(rel_time);
+ }
__set_current_state(TASK_RUNNING);
/*
@@ -1099,13 +1093,25 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
return 0;
- if (time == 0)
+ if (time_left == 0)
return -ETIMEDOUT;
+
/*
* We expect signal_pending(current), but another thread may
* have handled it for us already.
*/
- return -EINTR;
+ if (time_left == MAX_SCHEDULE_TIMEOUT)
+ return -ERESTARTSYS;
+ else {
+ struct restart_block *restart;
+ restart = &current_thread_info()->restart_block;
+ restart->fn = futex_wait_restart;
+ restart->arg0 = (unsigned long)uaddr;
+ restart->arg1 = (unsigned long)val;
+ restart->arg2 = (unsigned long)timed;
+ restart->arg3 = abs_time;
+ return -ERESTART_RESTARTBLOCK;
+ }
out_unlock_release_sem:
queue_unlock(&q, hb);
@@ -1115,6 +1121,24 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
return ret;
}
+static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
+{
+ int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
+ return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
+}
+
+static long futex_wait_restart(struct restart_block *restart)
+{
+ u32 __user *uaddr = (u32 __user *)restart->arg0;
+ u32 val = (u32)restart->arg1;
+ int timed = (int)restart->arg2;
+ unsigned long abs_time = restart->arg3;
+
+ restart->fn = do_no_restart_syscall;
+ return (long)futex_wait_abstime(uaddr, val, timed, abs_time);
+}
+
+
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 1b3033105b4..c9f4f044a8a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -669,6 +669,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
return orun;
}
+EXPORT_SYMBOL_GPL(hrtimer_forward);
/*
* enqueue_hrtimer - internal function to (re)start a timer
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index aff1f0fabb0..32e1ab1477d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -48,7 +48,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
*
* Controller mappings for all interrupt sources:
*/
-struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
+struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS-1] = {
.status = IRQ_DISABLED,
.chip = &no_irq_chip,
@@ -180,6 +180,8 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
if (desc->chip->ack)
desc->chip->ack(irq);
action_ret = handle_IRQ_event(irq, desc->action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
desc->chip->end(irq);
return 1;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5597c157442..203a518b6f1 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -317,10 +317,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
}
*p = new;
-#if defined(CONFIG_IRQ_PER_CPU)
- if (new->flags & IRQF_PERCPU)
- desc->status |= IRQ_PER_CPU;
-#endif
+
/* Exclude IRQ from balancing */
if (new->flags & IRQF_NOBALANCING)
desc->status |= IRQ_NO_BALANCING;
@@ -328,6 +325,11 @@ int setup_irq(unsigned int irq, struct irqaction *new)
if (!shared) {
irq_chip_set_defaults(desc->chip);
+#if defined(CONFIG_IRQ_PER_CPU)
+ if (new->flags & IRQF_PERCPU)
+ desc->status |= IRQ_PER_CPU;
+#endif
+
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
if (desc->chip && desc->chip->set_type)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 2db91eb54ad..ddde0ef9ccd 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -66,12 +66,19 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
{
struct irq_desc *desc = irq_desc + irq;
struct irqaction *action;
+ unsigned long flags;
+ int ret = 1;
- for (action = desc->action ; action; action = action->next)
+ spin_lock_irqsave(&desc->lock, flags);
+ for (action = desc->action ; action; action = action->next) {
if ((action != new_action) && action->name &&
- !strcmp(new_action->name, action->name))
- return 0;
- return 1;
+ !strcmp(new_action->name, action->name)) {
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
}
void register_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 9d8c79b4882..b0d81aae472 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -146,7 +146,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
if (unlikely(irqfixup)) {
/* Don't punish working computers */
- if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
+ if ((irqfixup == 2 && ((irq == 0) ||
+ (desc->action->flags & IRQF_IRQPOLL))) ||
+ action_ret == IRQ_NONE) {
int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
desc->irqs_unhandled -= ok;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 307c6a632ef..3205e8e114f 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -7,7 +7,6 @@
/* These are all the functions necessary to implement itimers */
#include <linux/mm.h>
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/syscalls.h>
#include <linux/time.h>
@@ -139,59 +138,11 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
}
/*
- * We do not care about correctness. We just sanitize the values so
- * the ktime_t operations which expect normalized values do not
- * break. This converts negative values to long timeouts similar to
- * the code in kernel versions < 2.6.16
- *
- * Print a limited number of warning messages when an invalid timeval
- * is detected.
- */
-static void fixup_timeval(struct timeval *tv, int interval)
-{
- static int warnlimit = 10;
- unsigned long tmp;
-
- if (warnlimit > 0) {
- warnlimit--;
- printk(KERN_WARNING
- "setitimer: %s (pid = %d) provided "
- "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
- current->comm, current->pid,
- interval ? "it_interval" : "it_value",
- tv->tv_sec, (long) tv->tv_usec);
- }
-
- tmp = tv->tv_usec;
- if (tmp >= USEC_PER_SEC) {
- tv->tv_usec = tmp % USEC_PER_SEC;
- tv->tv_sec += tmp / USEC_PER_SEC;
- }
-
- tmp = tv->tv_sec;
- if (tmp > LONG_MAX)
- tv->tv_sec = LONG_MAX;
-}
-
-/*
* Returns true if the timeval is in canonical form
*/
#define timeval_valid(t) \
(((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
-/*
- * Check for invalid timevals, sanitize them and print a limited
- * number of warnings.
- */
-static void check_itimerval(struct itimerval *value) {
-
- if (unlikely(!timeval_valid(&value->it_value)))
- fixup_timeval(&value->it_value, 0);
-
- if (unlikely(!timeval_valid(&value->it_interval)))
- fixup_timeval(&value->it_interval, 1);
-}
-
int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
{
struct task_struct *tsk = current;
@@ -201,15 +152,10 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
/*
* Validate the timevals in value.
- *
- * Note: Although the spec requires that invalid values shall
- * return -EINVAL, we just fixup the value and print a limited
- * number of warnings in order not to break users of this
- * historical misfeature.
- *
- * Scheduled for replacement in March 2007
*/
- check_itimerval(value);
+ if (!timeval_valid(&value->it_value) ||
+ !timeval_valid(&value->it_interval))
+ return -EINVAL;
switch (which) {
case ITIMER_REAL:
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5a0de840973..f1bda23140b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -214,8 +214,10 @@ static unsigned long get_symbol_pos(unsigned long addr,
symbol_end = (unsigned long)_etext;
}
- *symbolsize = symbol_end - symbol_start;
- *offset = addr - symbol_start;
+ if (symbolsize)
+ *symbolsize = symbol_end - symbol_start;
+ if (offset)
+ *offset = addr - symbol_start;
return low;
}
@@ -267,6 +269,42 @@ const char *kallsyms_lookup(unsigned long addr,
return NULL;
}
+int lookup_symbol_name(unsigned long addr, char *symname)
+{
+ symname[0] = '\0';
+ symname[KSYM_NAME_LEN] = '\0';
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, NULL, NULL);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos), symname);
+ return 0;
+ }
+ /* see if it's in a module */
+ return lookup_module_symbol_name(addr, symname);
+}
+
+int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ name[0] = '\0';
+ name[KSYM_NAME_LEN] = '\0';
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, size, offset);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos), name);
+ modname[0] = '\0';
+ return 0;
+ }
+ /* see if it's in a module */
+ return lookup_module_symbol_attrs(addr, size, offset, modname, name);
+}
+
/* Look up a kernel symbol and return it in a text buffer. */
int sprint_symbol(char *buffer, unsigned long address)
{
@@ -301,25 +339,20 @@ void __print_symbol(const char *fmt, unsigned long address)
struct kallsym_iter
{
loff_t pos;
- struct module *owner;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols */
char type;
char name[KSYM_NAME_LEN+1];
+ char module_name[MODULE_NAME_LEN + 1];
+ int exported;
};
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
- iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
- &iter->value, &iter->type,
- iter->name, sizeof(iter->name));
- if (iter->owner == NULL)
+ if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
+ &iter->type, iter->name, iter->module_name,
+ &iter->exported) < 0)
return 0;
-
- /* Label it "global" if it is exported, "local" if not exported. */
- iter->type = is_exported(iter->name, iter->owner)
- ? toupper(iter->type) : tolower(iter->type);
-
return 1;
}
@@ -328,7 +361,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
{
unsigned off = iter->nameoff;
- iter->owner = NULL;
+ iter->module_name[0] = '\0';
iter->value = kallsyms_addresses[iter->pos];
iter->type = kallsyms_get_symbol_type(off);
@@ -392,12 +425,17 @@ static int s_show(struct seq_file *m, void *p)
if (!iter->name[0])
return 0;
- if (iter->owner)
+ if (iter->module_name[0]) {
+ char type;
+
+ /* Label it "global" if it is exported,
+ * "local" if not exported. */
+ type = iter->exported ? toupper(iter->type) :
+ tolower(iter->type);
seq_printf(m, "%0*lx %c %s\t[%s]\n",
(int)(2*sizeof(void*)),
- iter->value, iter->type, iter->name,
- module_name(iter->owner));
- else
+ iter->value, type, iter->name, iter->module_name);
+ } else
seq_printf(m, "%0*lx %c %s\n",
(int)(2*sizeof(void*)),
iter->value, iter->type, iter->name);
@@ -432,18 +470,11 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return ret;
}
-static int kallsyms_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = (struct seq_file *)file->private_data;
- kfree(m->private);
- return seq_release(inode, file);
-}
-
static const struct file_operations kallsyms_operations = {
.open = kallsyms_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = kallsyms_release,
+ .release = seq_release_private,
};
static int __init kallsyms_init(void)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2a59c8a01ae..25db14b89e8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1118,8 +1118,8 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
memset(&prstatus, 0, sizeof(prstatus));
prstatus.pr_pid = current->pid;
elf_core_copy_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
- sizeof(prstatus));
+ buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
final_note(buf);
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 796276141e5..49cc4b9c1a8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -23,7 +23,6 @@
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/mnt_namespace.h>
#include <linux/completion.h>
@@ -166,6 +165,12 @@ static int ____call_usermodehelper(void *data)
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed(current, CPU_MASK_ALL);
+ /*
+ * Our parent is keventd, which runs with elevated scheduling priority.
+ * Avoid propagating that into the userspace child.
+ */
+ set_user_nice(current, 0);
+
retval = -EPERM;
if (current->fs->root)
retval = kernel_execve(sub_info->path,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d25a9ada3f8..9e47d8c493f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,16 +35,19 @@
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/stddef.h>
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/kdebug.h>
+
#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
-#include <asm/kdebug.h>
+#include <asm/uaccess.h>
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
@@ -63,6 +66,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
static atomic_t kprobe_count;
+/* NOTE: change this value only with kprobe_mutex held */
+static bool kprobe_enabled;
+
DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -132,9 +138,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
struct kprobe_insn_page *kip;
struct hlist_node *pos;
- retry:
- hlist_for_each(pos, &kprobe_insn_pages) {
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+ retry:
+ hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
if (kip->nused < INSNS_PER_PAGE) {
int i;
for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -155,9 +160,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
}
/* All out of space. Need to allocate a new page. Use slot 0. */
kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
- if (!kip) {
+ if (!kip)
return NULL;
- }
/*
* Use module_alloc so this page is within +/- 2GB of where the
@@ -213,9 +217,8 @@ static int __kprobes collect_garbage_slots(void)
if (check_safety() != 0)
return -EAGAIN;
- hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
+ hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
int i;
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
if (kip->ngarbage == 0)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
@@ -234,8 +237,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
struct kprobe_insn_page *kip;
struct hlist_node *pos;
- hlist_for_each(pos, &kprobe_insn_pages) {
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+ hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
if (kip->insns <= slot &&
slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
int i = (slot - kip->insns) / MAX_INSN_SIZE;
@@ -248,9 +250,9 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
break;
}
}
- if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
+
+ if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
collect_garbage_slots();
- }
}
#endif
@@ -316,7 +318,6 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
reset_kprobe_instance();
}
}
- return;
}
static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
@@ -362,46 +363,6 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
}
/* Called with kretprobe_lock held */
-struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
-{
- struct hlist_node *node;
- struct kretprobe_instance *ri;
- hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
- return ri;
- return NULL;
-}
-
-/* Called with kretprobe_lock held */
-static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
- *rp)
-{
- struct hlist_node *node;
- struct kretprobe_instance *ri;
- hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
- return ri;
- return NULL;
-}
-
-/* Called with kretprobe_lock held */
-void __kprobes add_rp_inst(struct kretprobe_instance *ri)
-{
- /*
- * Remove rp inst off the free list -
- * Add it back when probed function returns
- */
- hlist_del(&ri->uflist);
-
- /* Add rp inst onto table */
- INIT_HLIST_NODE(&ri->hlist);
- hlist_add_head(&ri->hlist,
- &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
-
- /* Also add this rp inst to the used list. */
- INIT_HLIST_NODE(&ri->uflist);
- hlist_add_head(&ri->uflist, &ri->rp->used_instances);
-}
-
-/* Called with kretprobe_lock held */
void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
struct hlist_head *head)
{
@@ -454,7 +415,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
static inline void free_rp_inst(struct kretprobe *rp)
{
struct kretprobe_instance *ri;
- while ((ri = get_free_rp_inst(rp)) != NULL) {
+ struct hlist_node *pos, *next;
+
+ hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) {
hlist_del(&ri->uflist);
kfree(ri);
}
@@ -535,8 +498,8 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
static int __kprobes in_kprobes_functions(unsigned long addr)
{
- if (addr >= (unsigned long)__kprobes_text_start
- && addr < (unsigned long)__kprobes_text_end)
+ if (addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end)
return -EINVAL;
return 0;
}
@@ -563,19 +526,24 @@ static int __kprobes __register_kprobe(struct kprobe *p,
return -EINVAL;
p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
- if ((!kernel_text_address((unsigned long) p->addr)) ||
- in_kprobes_functions((unsigned long) p->addr))
+ if (!kernel_text_address((unsigned long) p->addr) ||
+ in_kprobes_functions((unsigned long) p->addr))
return -EINVAL;
p->mod_refcounted = 0;
- /* Check are we probing a module */
- if ((probed_mod = module_text_address((unsigned long) p->addr))) {
+
+ /*
+ * Check if we are probing a module.
+ */
+ probed_mod = module_text_address((unsigned long) p->addr);
+ if (probed_mod) {
struct module *calling_mod = module_text_address(called_from);
- /* We must allow modules to probe themself and
- * in this case avoid incrementing the module refcount,
- * so as to allow unloading of self probing modules.
+ /*
+ * We must allow modules to probe themselves and in this case
+ * avoid incrementing the module refcount, so as to allow
+ * unloading of self probing modules.
*/
- if (calling_mod && (calling_mod != probed_mod)) {
+ if (calling_mod && calling_mod != probed_mod) {
if (unlikely(!try_module_get(probed_mod)))
return -EINVAL;
p->mod_refcounted = 1;
@@ -593,19 +561,21 @@ static int __kprobes __register_kprobe(struct kprobe *p,
goto out;
}
- if ((ret = arch_prepare_kprobe(p)) != 0)
+ ret = arch_prepare_kprobe(p);
+ if (ret)
goto out;
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
- if (atomic_add_return(1, &kprobe_count) == \
+ if (kprobe_enabled) {
+ if (atomic_add_return(1, &kprobe_count) == \
(ARCH_INACTIVE_KPROBE_COUNT + 1))
- register_page_fault_notifier(&kprobe_page_fault_nb);
-
- arch_arm_kprobe(p);
+ register_page_fault_notifier(&kprobe_page_fault_nb);
+ arch_arm_kprobe(p);
+ }
out:
mutex_unlock(&kprobe_mutex);
@@ -616,8 +586,7 @@ out:
int __kprobes register_kprobe(struct kprobe *p)
{
- return __register_kprobe(p,
- (unsigned long)__builtin_return_address(0));
+ return __register_kprobe(p, (unsigned long)__builtin_return_address(0));
}
void __kprobes unregister_kprobe(struct kprobe *p)
@@ -641,11 +610,16 @@ void __kprobes unregister_kprobe(struct kprobe *p)
return;
}
valid_p:
- if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) &&
- (p->list.next == &old_p->list) &&
- (p->list.prev == &old_p->list))) {
- /* Only probe on the hash list */
- arch_disarm_kprobe(p);
+ if (old_p == p ||
+ (old_p->pre_handler == aggr_pre_handler &&
+ p->list.next == &old_p->list && p->list.prev == &old_p->list)) {
+ /*
+ * Only probe on the hash list. Disarm only if kprobes are
+ * enabled - otherwise, the breakpoint would already have
+ * been removed. We save on flushing icache.
+ */
+ if (kprobe_enabled)
+ arch_disarm_kprobe(p);
hlist_del_rcu(&old_p->hlist);
cleanup_p = 1;
} else {
@@ -656,9 +630,11 @@ valid_p:
mutex_unlock(&kprobe_mutex);
synchronize_sched();
- if (p->mod_refcounted &&
- (mod = module_text_address((unsigned long)p->addr)))
- module_put(mod);
+ if (p->mod_refcounted) {
+ mod = module_text_address((unsigned long)p->addr);
+ if (mod)
+ module_put(mod);
+ }
if (cleanup_p) {
if (p != old_p) {
@@ -729,7 +705,21 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
/*TODO: consider to only swap the RA after the last pre_handler fired */
spin_lock_irqsave(&kretprobe_lock, flags);
- arch_prepare_kretprobe(rp, regs);
+ if (!hlist_empty(&rp->free_instances)) {
+ struct kretprobe_instance *ri;
+
+ ri = hlist_entry(rp->free_instances.first,
+ struct kretprobe_instance, uflist);
+ ri->rp = rp;
+ ri->task = current;
+ arch_prepare_kretprobe(ri, regs);
+
+ /* XXX(hch): why is there no hlist_move_head? */
+ hlist_del(&ri->uflist);
+ hlist_add_head(&ri->uflist, &ri->rp->used_instances);
+ hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task));
+ } else
+ rp->nmissed++;
spin_unlock_irqrestore(&kretprobe_lock, flags);
return 0;
}
@@ -792,11 +782,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
unsigned long flags;
struct kretprobe_instance *ri;
+ struct hlist_node *pos, *next;
unregister_kprobe(&rp->kp);
+
/* No race here */
spin_lock_irqsave(&kretprobe_lock, flags);
- while ((ri = get_used_rp_inst(rp)) != NULL) {
+ hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
ri->rp = NULL;
hlist_del(&ri->uflist);
}
@@ -816,6 +808,9 @@ static int __init init_kprobes(void)
}
atomic_set(&kprobe_count, 0);
+ /* By default, kprobes are enabled */
+ kprobe_enabled = true;
+
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
@@ -825,7 +820,7 @@ static int __init init_kprobes(void)
#ifdef CONFIG_DEBUG_FS
static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
- const char *sym, int offset,char *modname)
+ const char *sym, int offset,char *modname)
{
char *kprobe_type;
@@ -867,13 +862,13 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
struct kprobe *p, *kp;
const char *sym = NULL;
unsigned int i = *(loff_t *) v;
- unsigned long size, offset = 0;
+ unsigned long offset = 0;
char *modname, namebuf[128];
head = &kprobe_table[i];
preempt_disable();
hlist_for_each_entry_rcu(p, node, head, hlist) {
- sym = kallsyms_lookup((unsigned long)p->addr, &size,
+ sym = kallsyms_lookup((unsigned long)p->addr, NULL,
&offset, &modname, namebuf);
if (p->pre_handler == aggr_pre_handler) {
list_for_each_entry_rcu(kp, &p->list, list)
@@ -904,21 +899,149 @@ static struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
+static void __kprobes enable_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* If kprobes are already enabled, just return */
+ if (kprobe_enabled)
+ goto already_enabled;
+
+ /*
+ * Re-register the page fault notifier only if there are any
+ * active probes at the time of enabling kprobes globally
+ */
+ if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT)
+ register_page_fault_notifier(&kprobe_page_fault_nb);
+
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, node, head, hlist)
+ arch_arm_kprobe(p);
+ }
+
+ kprobe_enabled = true;
+ printk(KERN_INFO "Kprobes globally enabled\n");
+
+already_enabled:
+ mutex_unlock(&kprobe_mutex);
+ return;
+}
+
+static void __kprobes disable_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* If kprobes are already disabled, just return */
+ if (!kprobe_enabled)
+ goto already_disabled;
+
+ kprobe_enabled = false;
+ printk(KERN_INFO "Kprobes globally disabled\n");
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, node, head, hlist) {
+ if (!arch_trampoline_kprobe(p))
+ arch_disarm_kprobe(p);
+ }
+ }
+
+ mutex_unlock(&kprobe_mutex);
+ /* Allow all currently running kprobes to complete */
+ synchronize_sched();
+
+ mutex_lock(&kprobe_mutex);
+ /* Unconditionally unregister the page_fault notifier */
+ unregister_page_fault_notifier(&kprobe_page_fault_nb);
+
+already_disabled:
+ mutex_unlock(&kprobe_mutex);
+ return;
+}
+
+/*
+ * XXX: The debugfs bool file interface doesn't allow for callbacks
+ * when the bool state is switched. We can reuse that facility when
+ * available
+ */
+static ssize_t read_enabled_file_bool(struct file *file,
+ char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[3];
+
+ if (kprobe_enabled)
+ buf[0] = '1';
+ else
+ buf[0] = '0';
+ buf[1] = '\n';
+ buf[2] = 0x00;
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t write_enabled_file_bool(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ int buf_size;
+
+ buf_size = min(count, (sizeof(buf)-1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ switch (buf[0]) {
+ case 'y':
+ case 'Y':
+ case '1':
+ enable_all_kprobes();
+ break;
+ case 'n':
+ case 'N':
+ case '0':
+ disable_all_kprobes();
+ break;
+ }
+
+ return count;
+}
+
+static struct file_operations fops_kp = {
+ .read = read_enabled_file_bool,
+ .write = write_enabled_file_bool,
+};
+
static int __kprobes debugfs_kprobe_init(void)
{
struct dentry *dir, *file;
+ unsigned int value = 1;
dir = debugfs_create_dir("kprobes", NULL);
if (!dir)
return -ENOMEM;
- file = debugfs_create_file("list", 0444, dir , 0 ,
+ file = debugfs_create_file("list", 0444, dir, NULL,
&debugfs_kprobes_operations);
if (!file) {
debugfs_remove(dir);
return -ENOMEM;
}
+ file = debugfs_create_file("enabled", 0600, dir,
+ &value, &fops_kp);
+ if (!file) {
+ debugfs_remove(dir);
+ return -ENOMEM;
+ }
+
return 0;
}
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7065a687ac5..1a5ff2211d8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -257,9 +257,8 @@ static int save_trace(struct stack_trace *trace)
trace->entries = stack_trace + nr_stack_trace_entries;
trace->skip = 3;
- trace->all_contexts = 0;
- save_stack_trace(trace, NULL);
+ save_stack_trace(trace);
trace->max_entries = trace->nr_entries;
@@ -341,10 +340,7 @@ static const char *usage_str[] =
const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
{
- unsigned long offs, size;
- char *modname;
-
- return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str);
+ return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
}
void
@@ -1313,8 +1309,9 @@ out_unlock_set:
/*
* Look up a dependency chain. If the key is not present yet then
- * add it and return 0 - in this case the new dependency chain is
- * validated. If the key is already hashed, return 1.
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
*/
static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
{
@@ -1577,7 +1574,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
* Mark a lock with a usage bit, and validate the state transition:
*/
static int mark_lock(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit, unsigned long ip)
+ enum lock_usage_bit new_bit)
{
unsigned int new_mask = 1 << new_bit, ret = 1;
@@ -1600,14 +1597,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
this->class->usage_mask |= new_mask;
-#ifdef CONFIG_TRACE_IRQFLAGS
- if (new_bit == LOCK_ENABLED_HARDIRQS ||
- new_bit == LOCK_ENABLED_HARDIRQS_READ)
- ip = curr->hardirq_enable_ip;
- else if (new_bit == LOCK_ENABLED_SOFTIRQS ||
- new_bit == LOCK_ENABLED_SOFTIRQS_READ)
- ip = curr->softirq_enable_ip;
-#endif
if (!save_trace(this->class->usage_traces + new_bit))
return 0;
@@ -1806,7 +1795,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
* Mark all held locks with a usage bit:
*/
static int
-mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
+mark_held_locks(struct task_struct *curr, int hardirq)
{
enum lock_usage_bit usage_bit;
struct held_lock *hlock;
@@ -1826,7 +1815,7 @@ mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
else
usage_bit = LOCK_ENABLED_SOFTIRQS;
}
- if (!mark_lock(curr, hlock, usage_bit, ip))
+ if (!mark_lock(curr, hlock, usage_bit))
return 0;
}
@@ -1879,7 +1868,7 @@ void trace_hardirqs_on(void)
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
- if (!mark_held_locks(curr, 1, ip))
+ if (!mark_held_locks(curr, 1))
return;
/*
* If we have softirqs enabled, then set the usage
@@ -1887,7 +1876,7 @@ void trace_hardirqs_on(void)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, 0, ip))
+ if (!mark_held_locks(curr, 0))
return;
curr->hardirq_enable_ip = ip;
@@ -1955,7 +1944,7 @@ void trace_softirqs_on(unsigned long ip)
* enabled too:
*/
if (curr->hardirqs_enabled)
- mark_held_locks(curr, 0, ip);
+ mark_held_locks(curr, 0);
}
/*
@@ -2093,43 +2082,43 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (read) {
if (curr->hardirq_context)
if (!mark_lock(curr, hlock,
- LOCK_USED_IN_HARDIRQ_READ, ip))
+ LOCK_USED_IN_HARDIRQ_READ))
return 0;
if (curr->softirq_context)
if (!mark_lock(curr, hlock,
- LOCK_USED_IN_SOFTIRQ_READ, ip))
+ LOCK_USED_IN_SOFTIRQ_READ))
return 0;
} else {
if (curr->hardirq_context)
- if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip))
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
return 0;
if (curr->softirq_context)
- if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip))
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
return 0;
}
}
if (!hardirqs_off) {
if (read) {
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_HARDIRQS_READ, ip))
+ LOCK_ENABLED_HARDIRQS_READ))
return 0;
if (curr->softirqs_enabled)
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_SOFTIRQS_READ, ip))
+ LOCK_ENABLED_SOFTIRQS_READ))
return 0;
} else {
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_HARDIRQS, ip))
+ LOCK_ENABLED_HARDIRQS))
return 0;
if (curr->softirqs_enabled)
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_SOFTIRQS, ip))
+ LOCK_ENABLED_SOFTIRQS))
return 0;
}
}
#endif
/* mark it as used: */
- if (!mark_lock(curr, hlock, LOCK_USED, ip))
+ if (!mark_lock(curr, hlock, LOCK_USED))
return 0;
out_calc_hash:
/*
diff --git a/kernel/module.c b/kernel/module.c
index 1eb8ca565ba..d36e45477fa 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/init.h>
+#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -310,14 +311,14 @@ static int split_block(unsigned int i, unsigned short size)
{
/* Reallocation required? */
if (pcpu_num_used + 1 > pcpu_num_allocated) {
- int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2,
- GFP_KERNEL);
+ int *new;
+
+ new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
+ GFP_KERNEL);
if (!new)
return 0;
- memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
pcpu_num_allocated *= 2;
- kfree(pcpu_size);
pcpu_size = new;
}
@@ -1471,7 +1472,7 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
}
#ifdef CONFIG_KALLSYMS
-int is_exported(const char *name, const struct module *mod)
+static int is_exported(const char *name, const struct module *mod)
{
if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
return 1;
@@ -2097,8 +2098,10 @@ static const char *get_ksymbol(struct module *mod,
if (!best)
return NULL;
- *size = nextval - mod->symtab[best].st_value;
- *offset = addr - mod->symtab[best].st_value;
+ if (size)
+ *size = nextval - mod->symtab[best].st_value;
+ if (offset)
+ *offset = addr - mod->symtab[best].st_value;
return mod->strtab + mod->symtab[best].st_name;
}
@@ -2123,8 +2126,58 @@ const char *module_address_lookup(unsigned long addr,
return NULL;
}
-struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
- char *type, char *name, size_t namelen)
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ if (within(addr, mod->module_init, mod->init_size) ||
+ within(addr, mod->module_core, mod->core_size)) {
+ const char *sym;
+
+ sym = get_ksymbol(mod, addr, NULL, NULL);
+ if (!sym)
+ goto out;
+ strlcpy(symname, sym, KSYM_NAME_LEN + 1);
+ mutex_unlock(&module_mutex);
+ return 0;
+ }
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return -ERANGE;
+}
+
+int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ if (within(addr, mod->module_init, mod->init_size) ||
+ within(addr, mod->module_core, mod->core_size)) {
+ const char *sym;
+
+ sym = get_ksymbol(mod, addr, size, offset);
+ if (!sym)
+ goto out;
+ if (modname)
+ strlcpy(modname, mod->name, MODULE_NAME_LEN + 1);
+ if (name)
+ strlcpy(name, sym, KSYM_NAME_LEN + 1);
+ mutex_unlock(&module_mutex);
+ return 0;
+ }
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name, char *module_name, int *exported)
{
struct module *mod;
@@ -2134,14 +2187,16 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
*value = mod->symtab[symnum].st_value;
*type = mod->symtab[symnum].st_info;
strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
- namelen);
+ KSYM_NAME_LEN + 1);
+ strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1);
+ *exported = is_exported(name, mod);
mutex_unlock(&module_mutex);
- return mod;
+ return 0;
}
symnum -= mod->num_symtab;
}
mutex_unlock(&module_mutex);
- return NULL;
+ return -ERANGE;
}
static unsigned long mod_find_symname(struct module *mod, const char *name)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5b9ee6f6bb..1bc4b55241a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -38,10 +38,8 @@ void get_task_namespaces(struct task_struct *tsk)
/*
* creates a copy of "orig" with refcount 1.
- * This does not grab references to the contained namespaces,
- * so that needs to be done by dup_namespaces.
*/
-static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
+static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
{
struct nsproxy *ns;
@@ -52,26 +50,49 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
}
/*
- * copies the nsproxy, setting refcount to 1, and grabbing a
- * reference to all contained namespaces. Called from
- * sys_unshare()
+ * Create a new nsproxy and all of its associated namespaces.
+ * Return the newly created nsproxy. Do not attach this to the task,
+ * leave it to the caller to do proper locking and attach it to task.
*/
-struct nsproxy *dup_namespaces(struct nsproxy *orig)
+static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk,
+ struct fs_struct *new_fs)
{
- struct nsproxy *ns = clone_namespaces(orig);
+ struct nsproxy *new_nsp;
- if (ns) {
- if (ns->mnt_ns)
- get_mnt_ns(ns->mnt_ns);
- if (ns->uts_ns)
- get_uts_ns(ns->uts_ns);
- if (ns->ipc_ns)
- get_ipc_ns(ns->ipc_ns);
- if (ns->pid_ns)
- get_pid_ns(ns->pid_ns);
- }
+ new_nsp = clone_nsproxy(tsk->nsproxy);
+ if (!new_nsp)
+ return ERR_PTR(-ENOMEM);
- return ns;
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
+ if (IS_ERR(new_nsp->mnt_ns))
+ goto out_ns;
+
+ new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
+ if (IS_ERR(new_nsp->uts_ns))
+ goto out_uts;
+
+ new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
+ if (IS_ERR(new_nsp->ipc_ns))
+ goto out_ipc;
+
+ new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns);
+ if (IS_ERR(new_nsp->pid_ns))
+ goto out_pid;
+
+ return new_nsp;
+
+out_pid:
+ if (new_nsp->ipc_ns)
+ put_ipc_ns(new_nsp->ipc_ns);
+out_ipc:
+ if (new_nsp->uts_ns)
+ put_uts_ns(new_nsp->uts_ns);
+out_uts:
+ if (new_nsp->mnt_ns)
+ put_mnt_ns(new_nsp->mnt_ns);
+out_ns:
+ kfree(new_nsp);
+ return ERR_PTR(-ENOMEM);
}
/*
@@ -92,47 +113,21 @@ int copy_namespaces(int flags, struct task_struct *tsk)
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
return 0;
- new_ns = clone_namespaces(old_ns);
- if (!new_ns) {
- err = -ENOMEM;
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
goto out;
}
- tsk->nsproxy = new_ns;
-
- err = copy_mnt_ns(flags, tsk);
- if (err)
- goto out_ns;
-
- err = copy_utsname(flags, tsk);
- if (err)
- goto out_uts;
-
- err = copy_ipcs(flags, tsk);
- if (err)
- goto out_ipc;
-
- err = copy_pid_ns(flags, tsk);
- if (err)
- goto out_pid;
+ new_ns = create_new_namespaces(flags, tsk, tsk->fs);
+ if (IS_ERR(new_ns)) {
+ err = PTR_ERR(new_ns);
+ goto out;
+ }
+ tsk->nsproxy = new_ns;
out:
put_nsproxy(old_ns);
return err;
-
-out_pid:
- if (new_ns->ipc_ns)
- put_ipc_ns(new_ns->ipc_ns);
-out_ipc:
- if (new_ns->uts_ns)
- put_uts_ns(new_ns->uts_ns);
-out_uts:
- if (new_ns->mnt_ns)
- put_mnt_ns(new_ns->mnt_ns);
-out_ns:
- tsk->nsproxy = old_ns;
- kfree(new_ns);
- goto out;
}
void free_nsproxy(struct nsproxy *ns)
@@ -147,3 +142,41 @@ void free_nsproxy(struct nsproxy *ns)
put_pid_ns(ns->pid_ns);
kfree(ns);
}
+
+/*
+ * Called from unshare. Unshare all the namespaces part of nsproxy.
+ * On success, returns the new nsproxy and a reference to old nsproxy
+ * to make sure it stays around.
+ */
+int unshare_nsproxy_namespaces(unsigned long unshare_flags,
+ struct nsproxy **new_nsp, struct fs_struct *new_fs)
+{
+ struct nsproxy *old_ns = current->nsproxy;
+ int err = 0;
+
+ if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
+ return 0;
+
+#ifndef CONFIG_IPC_NS
+ if (unshare_flags & CLONE_NEWIPC)
+ return -EINVAL;
+#endif
+
+#ifndef CONFIG_UTS_NS
+ if (unshare_flags & CLONE_NEWUTS)
+ return -EINVAL;
+#endif
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ get_nsproxy(old_ns);
+
+ *new_nsp = create_new_namespaces(unshare_flags, current,
+ new_fs ? new_fs : current->fs);
+ if (IS_ERR(*new_nsp)) {
+ err = PTR_ERR(*new_nsp);
+ put_nsproxy(old_ns);
+ }
+ return err;
+}
diff --git a/kernel/params.c b/kernel/params.c
index 312172320b4..e61c46c97ce 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -269,7 +269,7 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
return param_get_bool(buffer, &dummy);
}
-/* We cheat here and temporarily mangle the string. */
+/* We break the rule and mangle the string. */
static int param_array(const char *name,
const char *val,
unsigned int min, unsigned int max,
diff --git a/kernel/pid.c b/kernel/pid.c
index 9c80bc23d6b..d3ad724afa8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -360,16 +360,11 @@ struct pid *find_ge_pid(int nr)
}
EXPORT_SYMBOL_GPL(find_get_pid);
-int copy_pid_ns(int flags, struct task_struct *tsk)
+struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns)
{
- struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
- int err = 0;
-
- if (!old_ns)
- return 0;
-
+ BUG_ON(!old_ns);
get_pid_ns(old_ns);
- return err;
+ return old_ns;
}
void free_pid_ns(struct kref *kref)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 657f7769741..1de710e1837 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -971,7 +971,7 @@ static void check_thread_timers(struct task_struct *tsk,
maxfire = 20;
tsk->it_prof_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
@@ -986,7 +986,7 @@ static void check_thread_timers(struct task_struct *tsk,
maxfire = 20;
tsk->it_virt_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
@@ -1001,7 +1001,7 @@ static void check_thread_timers(struct task_struct *tsk,
maxfire = 20;
tsk->it_sched_expires = 0;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || tsk->sched_time < t->expires.sched) {
@@ -1057,7 +1057,7 @@ static void check_process_timers(struct task_struct *tsk,
maxfire = 20;
prof_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
@@ -1072,7 +1072,7 @@ static void check_process_timers(struct task_struct *tsk,
maxfire = 20;
virt_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
@@ -1087,7 +1087,7 @@ static void check_process_timers(struct task_struct *tsk,
maxfire = 20;
sched_expires = 0;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || sched_time < t->expires.sched) {
@@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
*/
head = &tsk->signal->cpu_timers[clock_idx];
if (list_empty(head) ||
- cputime_ge(list_entry(head->next,
+ cputime_ge(list_first_entry(head,
struct cpu_timer_list, entry)->expires.cpu,
*newval)) {
/*
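The list_entry(timers->next, ...) to list_first_entry(timers, ...) conversions above are purely cosmetic: list_first_entry() is simply list_entry() applied to the head's ->next pointer. A standalone sketch of the pattern, with the kernel's container_of()-style macros reduced to a minimal userspace form:

/* Standalone sketch of the list_first_entry() pattern used above,
 * mirroring the kernel macros on a minimal doubly linked list. */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member)       container_of(ptr, type, member)
#define list_first_entry(ptr, type, member) list_entry((ptr)->next, type, member)

struct item { int value; struct list_head entry; };

int main(void)
{
        struct list_head head = { &head, &head };
        struct item a = { .value = 42 };

        /* list_add_tail(), open-coded */
        a.entry.next = &head;
        a.entry.prev = head.prev;
        head.prev->next = &a.entry;
        head.prev = &a.entry;

        struct item *first = list_first_entry(&head, struct item, entry);
        printf("first item: %d\n", first->value);       /* prints 42 */
        return 0;
}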
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 44318ca7197..588c99da030 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -31,7 +31,6 @@
* POSIX clocks & timers
*/
#include <linux/mm.h>
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/time.h>
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 877721708fa..495b7d4dd33 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -79,7 +79,7 @@ config PM_SYSFS_DEPRECATED
config SOFTWARE_SUSPEND
bool "Software Suspend (Hibernation)"
- depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
+ depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
---help---
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
@@ -139,7 +139,7 @@ config PM_STD_PARTITION
config SUSPEND_SMP
bool
- depends on HOTPLUG_CPU && X86 && PM
+ depends on HOTPLUG_CPU && (X86 || PPC64) && PM
default y
config APM_EMULATION
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0eb5c420e8e..08841938738 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -8,7 +8,6 @@
#undef DEBUG
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/module.h>
@@ -25,10 +24,9 @@
static inline int freezeable(struct task_struct * p)
{
- if ((p == current) ||
+ if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
- (p->exit_state == EXIT_ZOMBIE) ||
- (p->exit_state == EXIT_DEAD))
+ (p->exit_state != 0))
return 0;
return 1;
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 128da11f01c..b7039772b05 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -14,7 +14,6 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/suspend.h>
-#include <linux/smp_lock.h>
#include <linux/delay.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index e83ed9945a8..b8b235cc19d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,7 +12,6 @@
*/
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/version.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 4b47e59248d..0bbdeac2810 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -20,7 +20,6 @@
#include <linux/mm.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
-#include <linux/smp_lock.h>
#include <linux/console.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -931,8 +930,16 @@ void register_console(struct console *console)
{
int i;
unsigned long flags;
+ struct console *bootconsole = NULL;
- if (preferred_console < 0)
+ if (console_drivers) {
+ if (console->flags & CON_BOOT)
+ return;
+ if (console_drivers->flags & CON_BOOT)
+ bootconsole = console_drivers;
+ }
+
+ if (preferred_console < 0 || bootconsole || !console_drivers)
preferred_console = selected_console;
/*
@@ -978,8 +985,11 @@ void register_console(struct console *console)
if (!(console->flags & CON_ENABLED))
return;
- if (console_drivers && (console_drivers->flags & CON_BOOT)) {
- unregister_console(console_drivers);
+ if (bootconsole) {
+ printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
+ bootconsole->name, bootconsole->index,
+ console->name, console->index);
+ unregister_console(bootconsole);
console->flags &= ~CON_PRINTBUFFER;
}
@@ -1030,16 +1040,11 @@ int unregister_console(struct console *console)
}
}
- /* If last console is removed, we re-enable picking the first
- * one that gets registered. Without that, pmac early boot console
- * would prevent fbcon from taking over.
- *
+ /*
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
*/
- if (console_drivers == NULL)
- preferred_console = selected_console;
- else if (console->flags & CON_CONSDEV)
+ if (console_drivers != NULL && console->flags & CON_CONSDEV)
console_drivers->flags |= CON_CONSDEV;
release_console_sem();
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index bcd14e83ef3..55ba82a85a6 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -502,10 +502,6 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
-static struct rcu_torture_ops *torture_ops[] =
- { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops,
- &sched_ops, NULL };
-
/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
@@ -534,7 +530,7 @@ rcu_torture_writer(void *arg)
rp->rtort_mbtest = 1;
rcu_assign_pointer(rcu_torture_current, rp);
smp_wmb();
- if (old_rp != NULL) {
+ if (old_rp) {
i = old_rp->rtort_pipe_count;
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
@@ -685,7 +681,7 @@ rcu_torture_printk(char *page)
atomic_read(&rcu_torture_wcount[i]));
}
cnt += sprintf(&page[cnt], "\n");
- if (cur_ops->stats != NULL)
+ if (cur_ops->stats)
cnt += cur_ops->stats(&page[cnt]);
return cnt;
}
@@ -749,13 +745,13 @@ static void rcu_torture_shuffle_tasks(void)
set_cpus_allowed(current, tmp_mask);
- if (reader_tasks != NULL) {
+ if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
if (reader_tasks[i])
set_cpus_allowed(reader_tasks[i], tmp_mask);
}
- if (fakewriter_tasks != NULL) {
+ if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
if (fakewriter_tasks[i])
set_cpus_allowed(fakewriter_tasks[i], tmp_mask);
@@ -808,21 +804,21 @@ rcu_torture_cleanup(void)
int i;
fullstop = 1;
- if (shuffler_task != NULL) {
+ if (shuffler_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
kthread_stop(shuffler_task);
}
shuffler_task = NULL;
- if (writer_task != NULL) {
+ if (writer_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
kthread_stop(writer_task);
}
writer_task = NULL;
- if (reader_tasks != NULL) {
+ if (reader_tasks) {
for (i = 0; i < nrealreaders; i++) {
- if (reader_tasks[i] != NULL) {
+ if (reader_tasks[i]) {
VERBOSE_PRINTK_STRING(
"Stopping rcu_torture_reader task");
kthread_stop(reader_tasks[i]);
@@ -834,9 +830,9 @@ rcu_torture_cleanup(void)
}
rcu_torture_current = NULL;
- if (fakewriter_tasks != NULL) {
+ if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++) {
- if (fakewriter_tasks[i] != NULL) {
+ if (fakewriter_tasks[i]) {
VERBOSE_PRINTK_STRING(
"Stopping rcu_torture_fakewriter task");
kthread_stop(fakewriter_tasks[i]);
@@ -847,7 +843,7 @@ rcu_torture_cleanup(void)
fakewriter_tasks = NULL;
}
- if (stats_task != NULL) {
+ if (stats_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
kthread_stop(stats_task);
}
@@ -858,7 +854,7 @@ rcu_torture_cleanup(void)
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
- if (cur_ops->cleanup != NULL)
+ if (cur_ops->cleanup)
cur_ops->cleanup();
if (atomic_read(&n_rcu_torture_error))
rcu_torture_print_module_parms("End of test: FAILURE");
@@ -866,27 +862,28 @@ rcu_torture_cleanup(void)
rcu_torture_print_module_parms("End of test: SUCCESS");
}
-static int
+static int __init
rcu_torture_init(void)
{
int i;
int cpu;
int firsterr = 0;
+ static struct rcu_torture_ops *torture_ops[] =
+ { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
+ &srcu_ops, &sched_ops, };
/* Process args and tell the world that the torturer is on the job. */
-
- for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
cur_ops = torture_ops[i];
- if (strcmp(torture_type, cur_ops->name) == 0) {
+ if (strcmp(torture_type, cur_ops->name) == 0)
break;
- }
}
- if (cur_ops == NULL) {
+ if (i == ARRAY_SIZE(torture_ops)) {
printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
torture_type);
return (-EINVAL);
}
- if (cur_ops->init != NULL)
+ if (cur_ops->init)
cur_ops->init(); /* no "goto unwind" prior to this point!!! */
if (nreaders >= 0)
@@ -899,7 +896,7 @@ rcu_torture_init(void)
/* Set up the freelist. */
INIT_LIST_HEAD(&rcu_torture_freelist);
- for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
+ for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
rcu_tortures[i].rtort_mbtest = 0;
list_add_tail(&rcu_tortures[i].rtort_free,
&rcu_torture_freelist);
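The init path now bounds the lookup loop with ARRAY_SIZE() instead of a NULL sentinel in the ops table. A standalone sketch of that lookup-by-name pattern, with ARRAY_SIZE() written out as the usual sizeof division:

/* Sketch of the bounded lookup-by-name loop used by rcu_torture_init(). */
#include <stdio.h>
#include <string.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

struct ops { const char *name; };

int main(void)
{
        static const struct ops table[] = { { "rcu" }, { "rcu_bh" }, { "srcu" } };
        const char *wanted = "srcu";
        size_t i;

        for (i = 0; i < ARRAY_SIZE(table); i++)
                if (strcmp(wanted, table[i].name) == 0)
                        break;

        if (i == ARRAY_SIZE(table))
                printf("invalid type: %s\n", wanted);
        else
                printf("found %s at index %zu\n", table[i].name, i);
        return 0;
}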
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 291ded556aa..9a87886b022 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -60,7 +60,7 @@ int down_write_trylock(struct rw_semaphore *sem)
int ret = __down_write_trylock(sem);
if (ret == 1)
- rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+ rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
return ret;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 0227f1625a7..a3a04085e79 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,8 +52,9 @@
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <asm/tlb.h>
+#include <linux/reciprocal_div.h>
+#include <asm/tlb.h>
#include <asm/unistd.h>
/*
@@ -168,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
#define TASK_PREEMPTS_CURR(p, rq) \
- ((p)->prio < (rq)->curr->prio)
+ (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active))
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio)
return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+ return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+ sg->__cpu_power += val;
+ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
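sg_div_cpu_power() above leans on <linux/reciprocal_div.h>, which replaces a runtime division by a pre-computed multiply-and-shift. A standalone sketch of that arithmetic, assuming the 32-bit scheme of that header, where reciprocal_value() is computed once per cpu_power update:

/* Standalone sketch of the reciprocal-divide trick behind
 * sg_div_cpu_power(). */
#include <stdio.h>
#include <stdint.h>

static uint32_t reciprocal_value(uint32_t k)
{
        uint64_t val = (1ULL << 32) + (k - 1);
        return (uint32_t)(val / k);     /* ceil(2^32 / k) */
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
        return (uint32_t)(((uint64_t)a * r) >> 32);     /* ~ a / k */
}

int main(void)
{
        uint32_t cpu_power = 1024;                      /* e.g. SCHED_LOAD_SCALE */
        uint32_t r = reciprocal_value(cpu_power);       /* cached per update */
        uint32_t load = 52736;

        printf("%u / %u = %u, reciprocal form = %u\n",
               (unsigned)load, (unsigned)cpu_power,
               (unsigned)(load / cpu_power),
               (unsigned)reciprocal_divide(load, r));
        return 0;
}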
@@ -223,6 +245,10 @@ struct rq {
unsigned long raw_weighted_load;
#ifdef CONFIG_SMP
unsigned long cpu_load[3];
+ unsigned char idle_at_tick;
+#ifdef CONFIG_NO_HZ
+ unsigned char in_nohz_recently;
+#endif
#endif
unsigned long long nr_switches;
@@ -278,7 +304,7 @@ struct rq {
struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct rq, runqueues);
+static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static inline int cpu_of(struct rq *rq)
{
@@ -1049,6 +1075,17 @@ static void resched_task(struct task_struct *p)
if (!tsk_is_polling(p))
smp_send_reschedule(cpu);
}
+
+static void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&rq->lock, flags))
+ return;
+ resched_task(cpu_curr(cpu));
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
#else
static inline void resched_task(struct task_struct *p)
{
@@ -1241,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
if (local_group) {
this_load = avg_load;
@@ -1368,7 +1406,16 @@ static int wake_idle(int cpu, struct task_struct *p)
struct sched_domain *sd;
int i;
- if (idle_cpu(cpu))
+ /*
+ * If it is idle, then it is the best cpu to run this task.
+ *
+ * This cpu is also the best, if it has more than one task already.
+ * Siblings must also be busy (in most cases) as they didn't already
+ * pick up the extra load from this cpu and hence we need not check
+ * sibling runqueue info. This will avoid the checks and cache miss
+ * penalties associated with that.
+ */
+ if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
@@ -2352,12 +2399,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
}
total_load += avg_load;
- total_pwr += group->cpu_power;
+ total_pwr += group->__cpu_power;
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
- group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+ group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
if (local_group) {
this_load = avg_load;
@@ -2468,8 +2516,8 @@ group_next:
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->cpu_power,
- (avg_load - this_load) * this->cpu_power)
+ *imbalance = min(max_pull * busiest->__cpu_power,
+ (avg_load - this_load) * this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
@@ -2503,28 +2551,29 @@ small_imbalance:
* moving them.
*/
- pwr_now += busiest->cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->cpu_power *
- min(this_load_per_task, this_load);
+ pwr_now += busiest->__cpu_power *
+ min(busiest_load_per_task, max_load);
+ pwr_now += this->__cpu_power *
+ min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- busiest->cpu_power;
+ tmp = sg_div_cpu_power(busiest,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
if (max_load > tmp)
- pwr_move += busiest->cpu_power *
+ pwr_move += busiest->__cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
- if (max_load * busiest->cpu_power <
+ if (max_load * busiest->__cpu_power <
busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = max_load * busiest->cpu_power / this->cpu_power;
+ tmp = sg_div_cpu_power(this,
+ max_load * busiest->__cpu_power);
else
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- this->cpu_power;
- pwr_move += this->cpu_power *
- min(this_load_per_task, this_load + tmp);
+ tmp = sg_div_cpu_power(this,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += this->__cpu_power *
+ min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
@@ -2657,6 +2706,12 @@ redo:
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
+ /*
+ * some other cpu did the load balance for us.
+ */
+ if (nr_moved && this_cpu != smp_processor_id())
+ resched_cpu(this_cpu);
+
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), cpus);
@@ -2927,32 +2982,98 @@ static void update_load(struct rq *this_rq)
}
}
+#ifdef CONFIG_NO_HZ
+static struct {
+ atomic_t load_balancer;
+ cpumask_t cpu_mask;
+} nohz ____cacheline_aligned = {
+ .load_balancer = ATOMIC_INIT(-1),
+ .cpu_mask = CPU_MASK_NONE,
+};
+
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+ * load balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, then there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep till the next wakeup event
+ * arrives...
*
+ * For the ilb owner, the tick is not stopped and will be used
+ * for idle load balancing. The ilb owner will still be part of
+ * nohz.cpu_mask.
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner, and remains the owner until it becomes busy or
+ * until all cpus in the system stop their ticks, at which point there
+ * is no need for an ilb owner.
+ *
+ * When the ilb owner becomes busy, it nominates another owner during
+ * the next busy scheduler_tick().
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+ int cpu = smp_processor_id();
+
+ if (stop_tick) {
+ cpu_set(cpu, nohz.cpu_mask);
+ cpu_rq(cpu)->in_nohz_recently = 1;
+
+ /*
+ * If we are going offline and still the leader, give up!
+ */
+ if (cpu_is_offline(cpu) &&
+ atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ return 0;
+ }
+
+ /* time for ilb owner also to sleep */
+ if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ atomic_set(&nohz.load_balancer, -1);
+ return 0;
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+ return 1;
+ } else if (atomic_read(&nohz.load_balancer) == cpu)
+ return 1;
+ } else {
+ if (!cpu_isset(cpu, nohz.cpu_mask))
+ return 0;
+
+ cpu_clear(cpu, nohz.cpu_mask);
+
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ }
+ return 0;
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in arch_init_sched_domains.
*/
-static DEFINE_SPINLOCK(balancing);
-
-static void run_rebalance_domains(struct softirq_action *h)
+static inline void rebalance_domains(int cpu, enum idle_type idle)
{
- int this_cpu = smp_processor_id(), balance = 1;
- struct rq *this_rq = cpu_rq(this_cpu);
+ int balance = 1;
+ struct rq *rq = cpu_rq(cpu);
unsigned long interval;
struct sched_domain *sd;
- /*
- * We are idle if there are no processes running. This
- * is valid even if we are the idle process (SMT).
- */
- enum idle_type idle = !this_rq->nr_running ?
- SCHED_IDLE : NOT_IDLE;
- /* Earliest time when we have to call run_rebalance_domains again */
+ /* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
- for_each_domain(this_cpu, sd) {
+ for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
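The nohz.load_balancer field above acts as an atomically claimed owner slot: a cpu becomes the idle-load-balance owner only if it wins the cmpxchg from -1, and only the owner may clear the slot. A standalone sketch of that claim/release protocol, using GCC atomic builtins instead of the kernel's atomic_t API and ignoring the offline and all-idle special cases:

/* Sketch of the "claim the ilb owner slot with cmpxchg" pattern. */
#include <stdio.h>

static int load_balancer = -1;          /* -1: no owner */

static int claim_ilb_owner(int cpu)
{
        if (__sync_val_compare_and_swap(&load_balancer, -1, cpu) == -1)
                return 1;               /* we just became the owner */
        return load_balancer == cpu;    /* or we already were */
}

static void release_ilb_owner(int cpu)
{
        /* only the current owner may clear the slot */
        __sync_val_compare_and_swap(&load_balancer, cpu, -1);
}

int main(void)
{
        printf("cpu1 claims: %d\n", claim_ilb_owner(1));        /* 1 */
        printf("cpu2 claims: %d\n", claim_ilb_owner(2));        /* 0, cpu1 owns it */
        release_ilb_owner(1);
        printf("cpu2 claims: %d\n", claim_ilb_owner(2));        /* 1 */
        return 0;
}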
@@ -2971,7 +3092,7 @@ static void run_rebalance_domains(struct softirq_action *h)
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
+ if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
@@ -2995,7 +3116,114 @@ out:
if (!balance)
break;
}
- this_rq->next_balance = next_balance;
+ rq->next_balance = next_balance;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+ int local_cpu = smp_processor_id();
+ struct rq *local_rq = cpu_rq(local_cpu);
+ enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
+
+ rebalance_domains(local_cpu, idle);
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * If this cpu is the owner for idle load balancing, then do the
+ * balancing on behalf of the other idle cpus whose ticks are
+ * stopped.
+ */
+ if (local_rq->idle_at_tick &&
+ atomic_read(&nohz.load_balancer) == local_cpu) {
+ cpumask_t cpus = nohz.cpu_mask;
+ struct rq *rq;
+ int balance_cpu;
+
+ cpu_clear(local_cpu, cpus);
+ for_each_cpu_mask(balance_cpu, cpus) {
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. The next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched())
+ break;
+
+ rebalance_domains(balance_cpu, SCHED_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(local_rq->next_balance, rq->next_balance))
+ local_rq->next_balance = rq->next_balance;
+ }
+ }
+#endif
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_NO_HZ
+ /*
+ * If we were in the nohz mode recently and busy at the current
+ * scheduler tick, then check if we need to nominate a new idle
+ * load balancer.
+ */
+ if (rq->in_nohz_recently && !rq->idle_at_tick) {
+ rq->in_nohz_recently = 0;
+
+ if (atomic_read(&nohz.load_balancer) == cpu) {
+ cpu_clear(cpu, nohz.cpu_mask);
+ atomic_set(&nohz.load_balancer, -1);
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /*
+ * simple selection for now: Nominate the
+ * first cpu in the nohz list to be the next
+ * ilb owner.
+ *
+ * TBD: Traverse the sched domains and nominate
+ * the nearest cpu in the nohz.cpu_mask.
+ */
+ int ilb = first_cpu(nohz.cpu_mask);
+
+ if (ilb != NR_CPUS)
+ resched_cpu(ilb);
+ }
+ }
+
+ /*
+ * If this cpu is idle and doing idle load balancing for all the
+ * cpus with ticks stopped, is it time for that to stop?
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+ cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ resched_cpu(cpu);
+ return;
+ }
+
+ /*
+ * If this cpu is idle and the idle load balancing is done by
+ * someone else, then there is no need to raise the SCHED_SOFTIRQ
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+ cpu_isset(cpu, nohz.cpu_mask))
+ return;
+#endif
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
}
#else
/*
@@ -3218,16 +3446,17 @@ void scheduler_tick(void)
unsigned long long now = sched_clock();
struct task_struct *p = current;
int cpu = smp_processor_id();
+ int idle_at_tick = idle_cpu(cpu);
struct rq *rq = cpu_rq(cpu);
update_cpu_clock(p, rq, now);
- if (p != rq->idle)
+ if (!idle_at_tick)
task_running_tick(rq, p);
#ifdef CONFIG_SMP
update_load(rq);
- if (time_after_eq(jiffies, rq->next_balance))
- raise_softirq(SCHED_SOFTIRQ);
+ rq->idle_at_tick = idle_at_tick;
+ trigger_load_balance(cpu);
#endif
}
@@ -3847,13 +4076,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
struct prio_array *array;
unsigned long flags;
struct rq *rq;
- int oldprio;
+ int delta;
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
- oldprio = p->prio;
+ delta = prio - p->prio;
array = p->array;
if (array)
dequeue_task(p, array);
@@ -3869,13 +4098,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
enqueue_task(p, array);
/*
* Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
+ * our priority decreased, or if our priority became higher
+ * than the current's.
*/
- if (task_running(rq, p)) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else if (TASK_PREEMPTS_CURR(p, rq))
+ if (TASK_PREEMPTS_CURR(p, rq) ||
+ (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
task_rq_unlock(rq, &flags);
@@ -3923,10 +4150,12 @@ void set_user_nice(struct task_struct *p, long nice)
enqueue_task(p, array);
inc_raw_weighted_load(rq, p);
/*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if our priority became higher
+ * than the current's.
*/
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
+ if (TASK_PREEMPTS_CURR(p, rq) ||
+ (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
out_unlock:
@@ -4153,13 +4382,11 @@ recheck:
__activate_task(p, rq);
/*
* Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
+ * our priority decreased, or if our priority became higher
+ * than the current's.
*/
- if (task_running(rq, p)) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else if (TASK_PREEMPTS_CURR(p, rq))
+ if (TASK_PREEMPTS_CURR(p, rq) ||
+ (task_running(rq, p) && p->prio > oldprio))
resched_task(rq->curr);
}
__task_rq_unlock(rq);
@@ -4750,6 +4977,8 @@ void show_state_filter(unsigned long state_filter)
show_task(p);
} while_each_thread(g, p);
+ touch_all_softlockup_watchdogs();
+
read_unlock(&tasklist_lock);
/*
* Only show locks if all tasks are dumped:
@@ -5304,7 +5533,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
break;
}
- if (!group->cpu_power) {
+ if (!group->__cpu_power) {
printk("\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -5481,7 +5710,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
continue;
sg->cpumask = CPU_MASK_NONE;
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
for_each_cpu_mask(j, span) {
if (group_fn(j, cpu_map, NULL) != group)
@@ -6170,7 +6399,7 @@ next_sg:
continue;
}
- sg->cpu_power += sd->groups->cpu_power;
+ sg_inc_cpu_power(sg, sd->groups->__cpu_power);
}
sg = sg->next;
if (sg != group_head)
@@ -6245,6 +6474,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
child = sd->child;
+ sd->groups->__cpu_power = 0;
+
/*
* For perf policy, if the groups in child domain share resources
* (for example cores sharing some portions of the cache hierarchy
@@ -6255,18 +6486,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
(child->flags &
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
return;
}
- sd->groups->cpu_power = 0;
-
/*
* add cpu_power of each child group to this groups cpu_power
*/
group = child->groups;
do {
- sd->groups->cpu_power += group->cpu_power;
+ sg_inc_cpu_power(sd->groups, group->__cpu_power);
group = group->next;
} while (group != child->groups);
}
@@ -6426,7 +6655,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
sd = &per_cpu(node_domains, j);
sd->groups = sg;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = nodemask;
sg->next = sg;
cpus_or(covered, covered, nodemask);
@@ -6454,7 +6683,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
"Can not alloc domain group for node %d\n", j);
goto error;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = tmp;
sg->next = prev->next;
cpus_or(covered, covered, tmp);
diff --git a/kernel/signal.c b/kernel/signal.c
index 2b4087d545a..1368e67c848 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -12,7 +12,6 @@
#include <linux/slab.h>
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 50afeb81330..8fa7040247a 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -34,12 +34,32 @@ static struct notifier_block panic_block = {
.notifier_call = softlock_panic,
};
+/*
+ * Returns seconds, approximately. We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(void)
+{
+ return sched_clock() >> 30; /* 2^30 ~= 10^9 */
+}
+
void touch_softlockup_watchdog(void)
{
- __raw_get_cpu_var(touch_timestamp) = jiffies;
+ __raw_get_cpu_var(touch_timestamp) = get_timestamp();
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
+void touch_all_softlockup_watchdogs(void)
+{
+ int cpu;
+
+ /* Cause each CPU to re-update its timestamp rather than complain */
+ for_each_online_cpu(cpu)
+ per_cpu(touch_timestamp, cpu) = 0;
+}
+EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
+
/*
* This callback runs from the timer interrupt, and checks
* whether the watchdog thread has hung or not:
@@ -48,9 +68,18 @@ void softlockup_tick(void)
{
int this_cpu = smp_processor_id();
unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
+ unsigned long print_timestamp;
+ unsigned long now;
+
+ if (touch_timestamp == 0) {
+ touch_softlockup_watchdog();
+ return;
+ }
+
+ print_timestamp = per_cpu(print_timestamp, this_cpu);
- /* prevent double reports: */
- if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
+ /* report at most once a second */
+ if (print_timestamp < (touch_timestamp + 1) ||
did_panic ||
!per_cpu(watchdog_task, this_cpu))
return;
@@ -61,12 +90,14 @@ void softlockup_tick(void)
return;
}
+ now = get_timestamp();
+
/* Wake up the high-prio watchdog task every second: */
- if (time_after(jiffies, touch_timestamp + HZ))
+ if (now > (touch_timestamp + 1))
wake_up_process(per_cpu(watchdog_task, this_cpu));
/* Warn about unreasonable 10+ seconds delays: */
- if (time_after(jiffies, touch_timestamp + 10*HZ)) {
+ if (now > (touch_timestamp + 10)) {
per_cpu(print_timestamp, this_cpu) = touch_timestamp;
spin_lock(&print_lock);
@@ -82,11 +113,14 @@ void softlockup_tick(void)
*/
static int watchdog(void * __bind_cpu)
{
- struct sched_param param = { .sched_priority = 99 };
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
sched_setscheduler(current, SCHED_FIFO, &param);
current->flags |= PF_NOFREEZE;
+ /* initialize timestamp */
+ touch_softlockup_watchdog();
+
/*
* Run briefly once per second to reset the softlockup timestamp.
* If this gets delayed for more than 10 seconds then the
@@ -118,7 +152,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
printk("watchdog for %i failed\n", hotcpu);
return NOTIFY_BAD;
}
- per_cpu(touch_timestamp, hotcpu) = jiffies;
+ per_cpu(touch_timestamp, hotcpu) = 0;
per_cpu(watchdog_task, hotcpu) = p;
kthread_bind(p, hotcpu);
break;
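get_timestamp() above approximates a nanosecond-to-second conversion with a right shift by 30, since 2^30 ns is about 1.074 s, avoiding a 64-bit divide in the tick path. A small standalone sketch showing how close the approximation is:

/* Sketch of the ">> 30" approximation used by get_timestamp(). */
#include <stdio.h>

int main(void)
{
        unsigned long long ns = 60ULL * 1000000000ULL;  /* 60 s worth of ns */

        printf("exact seconds: %llu\n", ns / 1000000000ULL);    /* 60 */
        printf("ns >> 30:      %llu\n", ns >> 30);              /* 55, ~7%% short */
        return 0;
}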
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 12458040e66..daabb74ee0b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,11 +1,12 @@
/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
* GPL v2 and any later version.
*/
-#include <linux/stop_machine.h>
-#include <linux/kthread.h>
-#include <linux/sched.h>
#include <linux/cpu.h>
#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/stop_machine.h>
#include <linux/syscalls.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
@@ -208,3 +209,4 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
return ret;
}
+EXPORT_SYMBOL_GPL(stop_machine_run);
diff --git a/kernel/sys.c b/kernel/sys.c
index fe1f3ab2047..926bf9d7ac4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1923,6 +1923,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
if (retval)
return retval;
+ if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
+ /*
+ * The caller is asking for an immediate RLIMIT_CPU
+ * expiry. But we use the zero value to mean "it was
+ * never set". So let's cheat and make it one second
+ * instead
+ */
+ new_rlim.rlim_cur = 1;
+ }
+
task_lock(current->group_leader);
*old_rlim = new_rlim;
task_unlock(current->group_leader);
@@ -1944,15 +1954,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
unsigned long rlim_cur = new_rlim.rlim_cur;
cputime_t cputime;
- if (rlim_cur == 0) {
- /*
- * The caller is asking for an immediate RLIMIT_CPU
- * expiry. But we use the zero value to mean "it was
- * never set". So let's cheat and make it one second
- * instead
- */
- rlim_cur = 1;
- }
cputime = secs_to_cputime(rlim_cur);
read_lock(&tasklist_lock);
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c904748f229..f0664bd5011 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
+extern int maps_protect;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -603,6 +604,16 @@ static ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#ifdef CONFIG_PROC_FS
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "maps_protect",
+ .data = &maps_protect,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
{ .ctl_name = 0 }
};
diff --git a/kernel/time.c b/kernel/time.c
index ba18ec4899b..f04791f6940 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -31,7 +31,6 @@
#include <linux/timex.h>
#include <linux/capability.h>
#include <linux/errno.h>
-#include <linux/smp_lock.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/fs.h>
@@ -247,6 +246,36 @@ struct timespec current_fs_time(struct super_block *sb)
}
EXPORT_SYMBOL(current_fs_time);
+/*
+ * Convert jiffies to milliseconds and back.
+ *
+ * Avoid unnecessary multiplications/divisions in the
+ * two most common HZ cases:
+ */
+unsigned int inline jiffies_to_msecs(const unsigned long j)
+{
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ return (MSEC_PER_SEC / HZ) * j;
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
+#else
+ return (j * MSEC_PER_SEC) / HZ;
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_msecs);
+
+unsigned int inline jiffies_to_usecs(const unsigned long j)
+{
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ return (USEC_PER_SEC / HZ) * j;
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+ return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
+#else
+ return (j * USEC_PER_SEC) / HZ;
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_usecs);
+
/**
* timespec_trunc - Truncate timespec to a granularity
* @t: Timespec
@@ -473,36 +502,6 @@ struct timeval ns_to_timeval(const s64 nsec)
EXPORT_SYMBOL(ns_to_timeval);
/*
- * Convert jiffies to milliseconds and back.
- *
- * Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases:
- */
-unsigned int jiffies_to_msecs(const unsigned long j)
-{
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- return (MSEC_PER_SEC / HZ) * j;
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
- return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
-#else
- return (j * MSEC_PER_SEC) / HZ;
-#endif
-}
-EXPORT_SYMBOL(jiffies_to_msecs);
-
-unsigned int jiffies_to_usecs(const unsigned long j)
-{
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
- return (USEC_PER_SEC / HZ) * j;
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
- return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
-#else
- return (j * USEC_PER_SEC) / HZ;
-#endif
-}
-EXPORT_SYMBOL(jiffies_to_usecs);
-
-/*
* When we convert to jiffies then we interpret incoming values
* the following way:
*
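The jiffies_to_msecs() #if ladder moved above picks, at compile time, the cheapest conversion for the configured HZ. A standalone sketch that evaluates the same three cases at runtime for a few common HZ values:

/* Runtime illustration of the compile-time branches in jiffies_to_msecs(). */
#include <stdio.h>

#define MSEC_PER_SEC 1000UL

static unsigned int j_to_ms(unsigned long j, unsigned long hz)
{
        if (hz <= MSEC_PER_SEC && !(MSEC_PER_SEC % hz))
                return (MSEC_PER_SEC / hz) * j;         /* e.g. HZ=100, 250, 1000 */
        else if (hz > MSEC_PER_SEC && !(hz % MSEC_PER_SEC))
                return (j + (hz / MSEC_PER_SEC) - 1) / (hz / MSEC_PER_SEC);
        else
                return (j * MSEC_PER_SEC) / hz;         /* general fallback */
}

int main(void)
{
        printf("HZ=100:  10 jiffies = %u ms\n", j_to_ms(10, 100));      /* 100 */
        printf("HZ=1000: 10 jiffies = %u ms\n", j_to_ms(10, 1000));     /* 10 */
        printf("HZ=1024: 10 jiffies = %u ms\n", j_to_ms(10, 1024));     /* 9 */
        return 0;
}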
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 93bccba1f26..99b6034fc86 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
-obj-y += ntp.o clocksource.o jiffies.o timer_list.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index bfda3f7f071..a96ec9ab345 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
*/
ktime_t tick_next_period;
ktime_t tick_period;
-static int tick_do_timer_cpu = -1;
+int tick_do_timer_cpu __read_mostly = -1;
DEFINE_SPINLOCK(tick_device_lock);
/*
@@ -295,6 +295,12 @@ static void tick_shutdown(unsigned int *cpup)
clockevents_exchange_device(dev, NULL);
td->evtdev = NULL;
}
+ /* Transfer the do_timer job away from this cpu */
+ if (*cpup == tick_do_timer_cpu) {
+ int cpu = first_cpu(cpu_online_map);
+
+ tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1;
+ }
spin_unlock_irqrestore(&tick_device_lock, flags);
}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index c9d203bde51..bb13f272490 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -5,6 +5,7 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern spinlock_t tick_device_lock;
extern ktime_t tick_next_period;
extern ktime_t tick_period;
+extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 51556b95f60..3483e6cb954 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -217,10 +217,30 @@ void tick_nohz_stop_sched_tick(void)
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
+ if (select_nohz_load_balancer(1)) {
+ /*
+ * sched tick not stopped!
+ */
+ cpu_clear(cpu, nohz_cpu_mask);
+ goto out;
+ }
+
ts->idle_tick = ts->sched_timer.expires;
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
}
+
+ /*
+ * If this cpu is the one which updates jiffies, then
+ * give up the assignment and let it be taken by the
+ * cpu which runs the tick timer next, which might be
+ * this cpu as well. If we don't drop this here the
+ * jiffies might be stale and do_timer() never
+ * invoked.
+ */
+ if (cpu == tick_do_timer_cpu)
+ tick_do_timer_cpu = -1;
+
/*
* calculate the expiry time for the next timer wheel
* timer
@@ -273,6 +293,7 @@ void tick_nohz_restart_sched_tick(void)
now = ktime_get();
local_irq_disable();
+ select_nohz_load_balancer(0);
tick_do_update_jiffies64(now);
cpu_clear(cpu, nohz_cpu_mask);
@@ -338,12 +359,24 @@ static void tick_nohz_handler(struct clock_event_device *dev)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
+ int cpu = smp_processor_id();
ktime_t now = ktime_get();
dev->next_event.tv64 = KTIME_MAX;
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two cpus happen to assign themself to
+ * this duty, then the jiffies update is still serialized by
+ * xtime_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == -1))
+ tick_do_timer_cpu = cpu;
+
/* Check, if the jiffies need an update */
- tick_do_update_jiffies64(now);
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
/*
* When we are idle and the tick is stopped, we have to touch
@@ -431,9 +464,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
struct hrtimer_cpu_base *base = timer->base->cpu_base;
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
+ int cpu = smp_processor_id();
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two cpus happen to assign themselves to
+ * this duty, then the jiffies update is still serialized by
+ * xtime_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == -1))
+ tick_do_timer_cpu = cpu;
+#endif
/* Check, if the jiffies need an update */
- tick_do_update_jiffies64(now);
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
/*
* Do not call, when we are not in irq context and have
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
new file mode 100644
index 00000000000..f9217bf644f
--- /dev/null
+++ b/kernel/time/timekeeping.c
@@ -0,0 +1,476 @@
+/*
+ * linux/kernel/time/timekeeping.c
+ *
+ * Kernel timekeeping code and accessor functions
+ *
+ * This code was moved from linux/kernel/timer.c.
+ * Please see that file for copyright and history logs.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/sysdev.h>
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/tick.h>
+
+
+/*
+ * This read-write spinlock protects us from races in SMP while
+ * playing with xtime and avenrun.
+ */
+__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+
+EXPORT_SYMBOL(xtime_lock);
+
+
+/*
+ * The current time
+ * wall_to_monotonic is what we need to add to xtime (or xtime corrected
+ * for sub jiffie times) to get to monotonic time. Monotonic is pegged
+ * at zero at system boot time, so wall_to_monotonic will be negative,
+ * however, we will ALWAYS keep the tv_nsec part positive so we can use
+ * the usual normalization.
+ */
+struct timespec xtime __attribute__ ((aligned (16)));
+struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
+
+EXPORT_SYMBOL(xtime);
+
+
+static struct clocksource *clock; /* pointer to current clocksource */
+
+
+#ifdef CONFIG_GENERIC_TIME
+/**
+ * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
+ *
+ * private function, must hold xtime_lock lock when being
+ * called. Returns the number of nanoseconds since the
+ * last call to update_wall_time() (adjusted by NTP scaling)
+ */
+static inline s64 __get_nsec_offset(void)
+{
+ cycle_t cycle_now, cycle_delta;
+ s64 ns_offset;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ ns_offset = cyc2ns(clock, cycle_delta);
+
+ return ns_offset;
+}
+
+/**
+ * __get_realtime_clock_ts - Returns the time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec. Used by
+ * do_gettimeofday() and get_realtime_clock_ts().
+ */
+static inline void __get_realtime_clock_ts(struct timespec *ts)
+{
+ unsigned long seq;
+ s64 nsecs;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ *ts = xtime;
+ nsecs = __get_nsec_offset();
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ timespec_add_ns(ts, nsecs);
+}
+
+/**
+ * getnstimeofday - Returns the time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+ __get_realtime_clock_ts(ts);
+}
+
+EXPORT_SYMBOL(getnstimeofday);
+
+/**
+ * do_gettimeofday - Returns the time of day in a timeval
+ * @tv: pointer to the timeval to be set
+ *
+ * NOTE: Users should be converted to using get_realtime_clock_ts()
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+ struct timespec now;
+
+ __get_realtime_clock_ts(&now);
+ tv->tv_sec = now.tv_sec;
+ tv->tv_usec = now.tv_nsec/1000;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+/**
+ * do_settimeofday - Sets the time of day
+ * @tv: pointer to the timespec variable containing the new time
+ *
+ * Sets the time of day to the new time and update NTP and notify hrtimers
+ */
+int do_settimeofday(struct timespec *tv)
+{
+ unsigned long flags;
+ time_t wtm_sec, sec = tv->tv_sec;
+ long wtm_nsec, nsec = tv->tv_nsec;
+
+ if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+ return -EINVAL;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ nsec -= __get_nsec_offset();
+
+ wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+ wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+
+ set_normalized_timespec(&xtime, sec, nsec);
+ set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+ clock->error = 0;
+ ntp_clear();
+
+ update_vsyscall(&xtime, clock);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+
+ return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+/**
+ * change_clocksource - Swaps clocksources if a new one is available
+ *
+ * Accumulates current time interval and initializes new clocksource
+ */
+static void change_clocksource(void)
+{
+ struct clocksource *new;
+ cycle_t now;
+ u64 nsec;
+
+ new = clocksource_get_next();
+
+ if (clock == new)
+ return;
+
+ now = clocksource_read(new);
+ nsec = __get_nsec_offset();
+ timespec_add_ns(&xtime, nsec);
+
+ clock = new;
+ clock->cycle_last = now;
+
+ clock->error = 0;
+ clock->xtime_nsec = 0;
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+
+ tick_clock_notify();
+
+ printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+ clock->name);
+}
+#else
+static inline void change_clocksource(void) { }
+#endif
+
+/**
+ * timekeeping_is_continuous - check to see if timekeeping is free running
+ */
+int timekeeping_is_continuous(void)
+{
+ unsigned long seq;
+ int ret;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ return ret;
+}
+
+/**
+ * read_persistent_clock - Return time in seconds from the persistent clock.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Returns seconds from epoch using the battery backed persistent clock.
+ * Returns zero if unsupported.
+ *
+ * XXX - Do be sure to remove it once all arches implement it.
+ */
+unsigned long __attribute__((weak)) read_persistent_clock(void)
+{
+ return 0;
+}
+
+/*
+ * timekeeping_init - Initializes the clocksource and common timekeeping values
+ */
+void __init timekeeping_init(void)
+{
+ unsigned long flags;
+ unsigned long sec = read_persistent_clock();
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ ntp_clear();
+
+ clock = clocksource_get_next();
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+ clock->cycle_last = clocksource_read(clock);
+
+ xtime.tv_sec = sec;
+ xtime.tv_nsec = 0;
+ set_normalized_timespec(&wall_to_monotonic,
+ -xtime.tv_sec, -xtime.tv_nsec);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+
+/* flag for if timekeeping is suspended */
+static int timekeeping_suspended;
+/* time in seconds when suspend began */
+static unsigned long timekeeping_suspend_time;
+
+/**
+ * timekeeping_resume - Resumes the generic timekeeping subsystem.
+ * @dev: unused
+ *
+ * This is for the generic clocksource timekeeping.
+ * xtime/wall_to_monotonic/jiffies/etc are
+ * still managed by arch specific suspend/resume code.
+ */
+static int timekeeping_resume(struct sys_device *dev)
+{
+ unsigned long flags;
+ unsigned long now = read_persistent_clock();
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ if (now && (now > timekeeping_suspend_time)) {
+ unsigned long sleep_length = now - timekeeping_suspend_time;
+
+ xtime.tv_sec += sleep_length;
+ wall_to_monotonic.tv_sec -= sleep_length;
+ }
+ /* re-base the last cycle value */
+ clock->cycle_last = clocksource_read(clock);
+ clock->error = 0;
+ timekeeping_suspended = 0;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ touch_softlockup_watchdog();
+
+ clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+
+ /* Resume hrtimers */
+ hres_timers_resume();
+
+ return 0;
+}
+
+static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+ timekeeping_suspended = 1;
+ timekeeping_suspend_time = read_persistent_clock();
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+
+ return 0;
+}
+
+/* sysfs resume/suspend bits for timekeeping */
+static struct sysdev_class timekeeping_sysclass = {
+ .resume = timekeeping_resume,
+ .suspend = timekeeping_suspend,
+ set_kset_name("timekeeping"),
+};
+
+static struct sys_device device_timer = {
+ .id = 0,
+ .cls = &timekeeping_sysclass,
+};
+
+static int __init timekeeping_init_device(void)
+{
+ int error = sysdev_class_register(&timekeeping_sysclass);
+ if (!error)
+ error = sysdev_register(&device_timer);
+ return error;
+}
+
+device_initcall(timekeeping_init_device);
+
+/*
+ * If the error is already larger, we look ahead even further
+ * to compensate for late or lost adjustments.
+ */
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+ s64 *offset)
+{
+ s64 tick_error, i;
+ u32 look_ahead, adj;
+ s32 error2, mult;
+
+ /*
+ * Use the current error value to determine how much to look ahead.
+ * The larger the error the slower we adjust for it to avoid problems
+ * with losing too many ticks, otherwise we would overadjust and
+ * produce an even larger error. The smaller the adjustment the
+ * faster we try to adjust for it, as lost ticks can do less harm
+ * here. This is tuned so that an error of about 1 msec is adjusted
+ * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
+ */
+ error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = abs(error2);
+ for (look_ahead = 0; error2 > 0; look_ahead++)
+ error2 >>= 2;
+
+ /*
+ * Now calculate the error in (1 << look_ahead) ticks, but first
+ * remove the single look ahead already included in the error.
+ */
+ tick_error = current_tick_length() >>
+ (TICK_LENGTH_SHIFT - clock->shift + 1);
+ tick_error -= clock->xtime_interval >> 1;
+ error = ((error - tick_error) >> look_ahead) + tick_error;
+
+ /* Finally calculate the adjustment shift value. */
+ i = *interval;
+ mult = 1;
+ if (error < 0) {
+ error = -error;
+ *interval = -*interval;
+ *offset = -*offset;
+ mult = -1;
+ }
+ for (adj = 0; error > i; adj++)
+ error >>= 1;
+
+ *interval <<= adj;
+ *offset <<= adj;
+ return mult << adj;
+}
+
+/*
+ * Adjust the multiplier to reduce the error value,
+ * this is optimized for the most common adjustments of -1,0,1,
+ * for other values we can do a bit more work.
+ */
+static void clocksource_adjust(struct clocksource *clock, s64 offset)
+{
+ s64 error, interval = clock->cycle_interval;
+ int adj;
+
+ error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
+ if (error > interval) {
+ error >>= 2;
+ if (likely(error <= interval))
+ adj = 1;
+ else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else if (error < -interval) {
+ error >>= 2;
+ if (likely(error >= -interval)) {
+ adj = -1;
+ interval = -interval;
+ offset = -offset;
+ } else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else
+ return;
+
+ clock->mult += adj;
+ clock->xtime_interval += interval;
+ clock->xtime_nsec -= offset;
+ clock->error -= (interval - offset) <<
+ (TICK_LENGTH_SHIFT - clock->shift);
+}
+
+/**
+ * update_wall_time - Uses the current clocksource to increment the wall time
+ *
+ * Called from the timer interrupt, must hold a write on xtime_lock.
+ */
+void update_wall_time(void)
+{
+ cycle_t offset;
+
+ /* Make sure we're fully resumed: */
+ if (unlikely(timekeeping_suspended))
+ return;
+
+#ifdef CONFIG_GENERIC_TIME
+ offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+#else
+ offset = clock->cycle_interval;
+#endif
+ clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+
+ /* normally this loop will run just once, however in the
+ * case of lost or late ticks, it will accumulate correctly.
+ */
+ while (offset >= clock->cycle_interval) {
+ /* accumulate one interval */
+ clock->xtime_nsec += clock->xtime_interval;
+ clock->cycle_last += clock->cycle_interval;
+ offset -= clock->cycle_interval;
+
+ if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
+ clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+
+ /* interpolator bits */
+ time_interpolator_update(clock->xtime_interval
+ >> clock->shift);
+
+ /* accumulate error between NTP and clock interval */
+ clock->error += current_tick_length();
+ clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
+ }
+
+ /* correct the clock when NTP error is too big */
+ clocksource_adjust(clock, offset);
+
+ /* store full nanoseconds into xtime */
+ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
+ clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+
+ /* check to see if there is a new clocksource to use */
+ change_clocksource();
+ update_vsyscall(&xtime, clock);
+}
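__get_nsec_offset() in the new file converts a cycle delta to nanoseconds via cyc2ns(), i.e. scaling the delta by the clocksource's mult and shift. A standalone sketch of that arithmetic; the 1 GHz clocksource and its mult/shift values are made-up illustration numbers, not a real clocksource registration:

/* Sketch of the (cycles * mult) >> shift scaling behind __get_nsec_offset(). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* hypothetical 1 GHz clocksource: mult/shift give ~1 ns per cycle */
        uint32_t shift = 22;
        uint32_t mult = (uint32_t)(((uint64_t)1000000000 << shift) / 1000000000);

        uint64_t cycle_last = 1000000;  /* at the previous update_wall_time() */
        uint64_t cycle_now = 1003500;

        uint64_t delta = cycle_now - cycle_last;        /* masked in the real code */
        uint64_t ns = (delta * mult) >> shift;

        printf("offset since last update: %llu ns\n",
               (unsigned long long)ns);                 /* 3500 */
        return 0;
}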
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 59df5e8555a..b734ca4bc75 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -38,17 +38,12 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
static void print_name_offset(struct seq_file *m, void *sym)
{
- unsigned long addr = (unsigned long)sym;
- char namebuf[KSYM_NAME_LEN+1];
- unsigned long size, offset;
- const char *sym_name;
- char *modname;
-
- sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
- if (sym_name)
- SEQ_printf(m, "%s", sym_name);
- else
+ char symname[KSYM_NAME_LEN+1];
+
+ if (lookup_symbol_name((unsigned long)sym, symname) < 0)
SEQ_printf(m, "<%p>", sym);
+ else
+ SEQ_printf(m, "%s", symname);
}
static void
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1bc4882e28e..868f1bceb07 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -257,16 +257,12 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
static void print_name_offset(struct seq_file *m, unsigned long addr)
{
- char namebuf[KSYM_NAME_LEN+1];
- unsigned long size, offset;
- const char *sym_name;
- char *modname;
-
- sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
- if (sym_name)
- seq_printf(m, "%s", sym_name);
- else
+ char symname[KSYM_NAME_LEN+1];
+
+ if (lookup_symbol_name(addr, symname) < 0)
seq_printf(m, "<%p>", (void *)addr);
+ else
+ seq_printf(m, "%s", symname);
}
static int tstats_show(struct seq_file *m, void *v)
diff --git a/kernel/timer.c b/kernel/timer.c
index b22bd39740d..7a6448340f9 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
/*
* linux/kernel/timer.c
*
- * Kernel internal timers, kernel timekeeping, basic process system calls
+ * Kernel internal timers, basic process system calls
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
@@ -74,7 +74,7 @@ struct tvec_t_base_s {
tvec_t tv3;
tvec_t tv4;
tvec_t tv5;
-} ____cacheline_aligned_in_smp;
+} ____cacheline_aligned;
typedef struct tvec_t_base_s tvec_base_t;
@@ -82,6 +82,37 @@ tvec_base_t boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
+/*
+ * Note that all tvec_bases are 2 byte aligned and the lower bit of
+ * base in timer_list is guaranteed to be zero. Use the LSB for
+ * the new flag to indicate whether the timer is deferrable
+ */
+#define TBASE_DEFERRABLE_FLAG (0x1)
+
+/* Functions below help us manage 'deferrable' flag */
+static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
+{
+ return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
+}
+
+static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
+{
+ return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
+}
+
+static inline void timer_set_deferrable(struct timer_list *timer)
+{
+ timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
+ TBASE_DEFERRABLE_FLAG));
+}
+
+static inline void
+timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
+{
+ timer->base = (tvec_base_t *)((unsigned long)(new_base) |
+ tbase_get_deferrable(timer->base));
+}
+
/**
* __round_jiffies - function to round jiffies to a full second
* @j: the time in (absolute) jiffies that should be rounded
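The TBASE_* helpers added above rely on tvec_base_t being at least 2-byte aligned, so bit 0 of the stored base pointer is free to carry the "deferrable" flag. A standalone sketch of that pointer-tagging trick, with a dummy struct standing in for tvec_base_t:

/* Sketch of the pointer-tagging trick behind TBASE_DEFERRABLE_FLAG. */
#include <stdio.h>
#include <stdint.h>

#define DEFERRABLE_FLAG 0x1UL

struct base { int dummy; };             /* stand-in for tvec_base_t */

static unsigned int get_deferrable(struct base *b)
{
        return (unsigned int)((uintptr_t)b & DEFERRABLE_FLAG);
}

static struct base *get_base(struct base *b)
{
        return (struct base *)((uintptr_t)b & ~DEFERRABLE_FLAG);
}

static struct base *set_deferrable(struct base *b)
{
        return (struct base *)((uintptr_t)b | DEFERRABLE_FLAG);
}

int main(void)
{
        static struct base real_base;   /* aligned, so bit 0 is free */
        struct base *tagged = set_deferrable(&real_base);

        printf("deferrable: %u, same base: %d\n",
               get_deferrable(tagged), get_base(tagged) == &real_base);
        return 0;
}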
@@ -295,6 +326,13 @@ void fastcall init_timer(struct timer_list *timer)
}
EXPORT_SYMBOL(init_timer);
+void fastcall init_timer_deferrable(struct timer_list *timer)
+{
+ init_timer(timer);
+ timer_set_deferrable(timer);
+}
+EXPORT_SYMBOL(init_timer_deferrable);
+
static inline void detach_timer(struct timer_list *timer,
int clear_pending)
{
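A hedged usage sketch of the new API introduced above: a deferrable timer is one that need not wake an idle CPU on its own and is serviced when the CPU wakes for some other reason. Everything named my_* below is hypothetical driver code; only init_timer_deferrable()/mod_timer() and the timer_list fields come from the interface shown in this patch:

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_poll_timer;		/* hypothetical */

static void my_poll_fn(unsigned long data)	/* hypothetical callback */
{
	/* low-priority housekeeping that can tolerate being deferred */
}

static void my_start_polling(void)
{
	init_timer_deferrable(&my_poll_timer);
	my_poll_timer.function = my_poll_fn;
	my_poll_timer.data = 0;
	mod_timer(&my_poll_timer, jiffies + 10 * HZ);	/* roughly every 10 seconds */
}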
@@ -325,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
tvec_base_t *base;
for (;;) {
- base = timer->base;
+ tvec_base_t *prelock_base = timer->base;
+ base = tbase_get_base(prelock_base);
if (likely(base != NULL)) {
spin_lock_irqsave(&base->lock, *flags);
- if (likely(base == timer->base))
+ if (likely(prelock_base == timer->base))
return base;
/* The timer has migrated to another CPU */
spin_unlock_irqrestore(&base->lock, *flags);
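The lock_timer_base() change keeps the established pattern: read the base pointer, lock that base, then re-check the pointer and retry if the timer migrated to another CPU in between; the comparison now simply has to tolerate the flag bit. A rough user-space analogue of the pattern, with pthread mutexes standing in for the per-base spinlocks (struct base/struct item and the GCC __atomic builtins are illustrative assumptions, not kernel code):

#include <pthread.h>
#include <stddef.h>

struct base { pthread_mutex_t lock; };
struct item { struct base *base; };	/* may be changed by a concurrent migrator */

static struct base *lock_item_base(struct item *it)
{
	for (;;) {
		struct base *b = __atomic_load_n(&it->base, __ATOMIC_ACQUIRE);

		if (b != NULL) {
			pthread_mutex_lock(&b->lock);
			if (b == __atomic_load_n(&it->base, __ATOMIC_RELAXED))
				return b;	/* still the owner: caller holds its lock */
			pthread_mutex_unlock(&b->lock);	/* migrated under us: retry */
		}
		/* b == NULL means a migration is in flight; spin and retry */
	}
}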
@@ -365,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
*/
if (likely(base->running_timer != timer)) {
/* See the comment in lock_timer_base() */
- timer->base = NULL;
+ timer_set_base(timer, NULL);
spin_unlock(&base->lock);
base = new_base;
spin_lock(&base->lock);
- timer->base = base;
+ timer_set_base(timer, base);
}
}
@@ -397,7 +436,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
spin_lock_irqsave(&base->lock, flags);
- timer->base = base;
+ timer_set_base(timer, base);
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
}
@@ -550,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
* don't have to detach them individually.
*/
list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
- BUG_ON(timer->base != base);
+ BUG_ON(tbase_get_base(timer->base) != base);
internal_add_timer(base, timer);
}
@@ -590,7 +629,7 @@ static inline void __run_timers(tvec_base_t *base)
void (*fn)(unsigned long);
unsigned long data;
- timer = list_entry(head->next,struct timer_list,entry);
+ timer = list_first_entry(head, struct timer_list, entry);
fn = timer->function;
data = timer->data;
@@ -636,6 +675,9 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base)
index = slot = timer_jiffies & TVR_MASK;
do {
list_for_each_entry(nte, base->tv1.vec + slot, entry) {
+ if (tbase_get_deferrable(nte->base))
+ continue;
+
found = 1;
expires = nte->expires;
/* Look at the cascade bucket(s)? */
@@ -752,455 +794,6 @@ unsigned long next_timer_interrupt(void)
#endif
-/******************************************************************/
-
-/*
- * The current time
- * wall_to_monotonic is what we need to add to xtime (or xtime corrected
- * for sub jiffie times) to get to monotonic time. Monotonic is pegged
- * at zero at system boot time, so wall_to_monotonic will be negative,
- * however, we will ALWAYS keep the tv_nsec part positive so we can use
- * the usual normalization.
- */
-struct timespec xtime __attribute__ ((aligned (16)));
-struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
-
-EXPORT_SYMBOL(xtime);
-
-
-/* XXX - all of this timekeeping code should be later moved to time.c */
-#include <linux/clocksource.h>
-static struct clocksource *clock; /* pointer to current clocksource */
-
-#ifdef CONFIG_GENERIC_TIME
-/**
- * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
- *
- * private function, must hold xtime_lock lock when being
- * called. Returns the number of nanoseconds since the
- * last call to update_wall_time() (adjusted by NTP scaling)
- */
-static inline s64 __get_nsec_offset(void)
-{
- cycle_t cycle_now, cycle_delta;
- s64 ns_offset;
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- /* convert to nanoseconds: */
- ns_offset = cyc2ns(clock, cycle_delta);
-
- return ns_offset;
-}
-
-/**
- * __get_realtime_clock_ts - Returns the time of day in a timespec
- * @ts: pointer to the timespec to be set
- *
- * Returns the time of day in a timespec. Used by
- * do_gettimeofday() and get_realtime_clock_ts().
- */
-static inline void __get_realtime_clock_ts(struct timespec *ts)
-{
- unsigned long seq;
- s64 nsecs;
-
- do {
- seq = read_seqbegin(&xtime_lock);
-
- *ts = xtime;
- nsecs = __get_nsec_offset();
-
- } while (read_seqretry(&xtime_lock, seq));
-
- timespec_add_ns(ts, nsecs);
-}
-
-/**
- * getnstimeofday - Returns the time of day in a timespec
- * @ts: pointer to the timespec to be set
- *
- * Returns the time of day in a timespec.
- */
-void getnstimeofday(struct timespec *ts)
-{
- __get_realtime_clock_ts(ts);
-}
-
-EXPORT_SYMBOL(getnstimeofday);
-
-/**
- * do_gettimeofday - Returns the time of day in a timeval
- * @tv: pointer to the timeval to be set
- *
- * NOTE: Users should be converted to using get_realtime_clock_ts()
- */
-void do_gettimeofday(struct timeval *tv)
-{
- struct timespec now;
-
- __get_realtime_clock_ts(&now);
- tv->tv_sec = now.tv_sec;
- tv->tv_usec = now.tv_nsec/1000;
-}
-
-EXPORT_SYMBOL(do_gettimeofday);
-/**
- * do_settimeofday - Sets the time of day
- * @tv: pointer to the timespec variable containing the new time
- *
- * Sets the time of day to the new time and update NTP and notify hrtimers
- */
-int do_settimeofday(struct timespec *tv)
-{
- unsigned long flags;
- time_t wtm_sec, sec = tv->tv_sec;
- long wtm_nsec, nsec = tv->tv_nsec;
-
- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
- return -EINVAL;
-
- write_seqlock_irqsave(&xtime_lock, flags);
-
- nsec -= __get_nsec_offset();
-
- wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
- wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
- set_normalized_timespec(&xtime, sec, nsec);
- set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
- clock->error = 0;
- ntp_clear();
-
- update_vsyscall(&xtime, clock);
-
- write_sequnlock_irqrestore(&xtime_lock, flags);
-
- /* signal hrtimers about time change */
- clock_was_set();
-
- return 0;
-}
-
-EXPORT_SYMBOL(do_settimeofday);
-
-/**
- * change_clocksource - Swaps clocksources if a new one is available
- *
- * Accumulates current time interval and initializes new clocksource
- */
-static void change_clocksource(void)
-{
- struct clocksource *new;
- cycle_t now;
- u64 nsec;
-
- new = clocksource_get_next();
-
- if (clock == new)
- return;
-
- now = clocksource_read(new);
- nsec = __get_nsec_offset();
- timespec_add_ns(&xtime, nsec);
-
- clock = new;
- clock->cycle_last = now;
-
- clock->error = 0;
- clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
-
- tick_clock_notify();
-
- printk(KERN_INFO "Time: %s clocksource has been installed.\n",
- clock->name);
-}
-#else
-static inline void change_clocksource(void) { }
-#endif
-
-/**
- * timekeeping_is_continuous - check to see if timekeeping is free running
- */
-int timekeeping_is_continuous(void)
-{
- unsigned long seq;
- int ret;
-
- do {
- seq = read_seqbegin(&xtime_lock);
-
- ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
-
- } while (read_seqretry(&xtime_lock, seq));
-
- return ret;
-}
-
-/**
- * read_persistent_clock - Return time in seconds from the persistent clock.
- *
- * Weak dummy function for arches that do not yet support it.
- * Returns seconds from epoch using the battery backed persistent clock.
- * Returns zero if unsupported.
- *
- * XXX - Do be sure to remove it once all arches implement it.
- */
-unsigned long __attribute__((weak)) read_persistent_clock(void)
-{
- return 0;
-}
-
-/*
- * timekeeping_init - Initializes the clocksource and common timekeeping values
- */
-void __init timekeeping_init(void)
-{
- unsigned long flags;
- unsigned long sec = read_persistent_clock();
-
- write_seqlock_irqsave(&xtime_lock, flags);
-
- ntp_clear();
-
- clock = clocksource_get_next();
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
- clock->cycle_last = clocksource_read(clock);
-
- xtime.tv_sec = sec;
- xtime.tv_nsec = 0;
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
-
- write_sequnlock_irqrestore(&xtime_lock, flags);
-}
-
-/* flag for if timekeeping is suspended */
-static int timekeeping_suspended;
-/* time in seconds when suspend began */
-static unsigned long timekeeping_suspend_time;
-
-/**
- * timekeeping_resume - Resumes the generic timekeeping subsystem.
- * @dev: unused
- *
- * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/etc are
- * still managed by arch specific suspend/resume code.
- */
-static int timekeeping_resume(struct sys_device *dev)
-{
- unsigned long flags;
- unsigned long now = read_persistent_clock();
-
- write_seqlock_irqsave(&xtime_lock, flags);
-
- if (now && (now > timekeeping_suspend_time)) {
- unsigned long sleep_length = now - timekeeping_suspend_time;
-
- xtime.tv_sec += sleep_length;
- wall_to_monotonic.tv_sec -= sleep_length;
- }
- /* re-base the last cycle value */
- clock->cycle_last = clocksource_read(clock);
- clock->error = 0;
- timekeeping_suspended = 0;
- write_sequnlock_irqrestore(&xtime_lock, flags);
-
- touch_softlockup_watchdog();
-
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-
- /* Resume hrtimers */
- hres_timers_resume();
-
- return 0;
-}
-
-static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
-{
- unsigned long flags;
-
- write_seqlock_irqsave(&xtime_lock, flags);
- timekeeping_suspended = 1;
- timekeeping_suspend_time = read_persistent_clock();
- write_sequnlock_irqrestore(&xtime_lock, flags);
-
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
-
- return 0;
-}
-
-/* sysfs resume/suspend bits for timekeeping */
-static struct sysdev_class timekeeping_sysclass = {
- .resume = timekeeping_resume,
- .suspend = timekeeping_suspend,
- set_kset_name("timekeeping"),
-};
-
-static struct sys_device device_timer = {
- .id = 0,
- .cls = &timekeeping_sysclass,
-};
-
-static int __init timekeeping_init_device(void)
-{
- int error = sysdev_class_register(&timekeeping_sysclass);
- if (!error)
- error = sysdev_register(&device_timer);
- return error;
-}
-
-device_initcall(timekeeping_init_device);
-
-/*
- * If the error is already larger, we look ahead even further
- * to compensate for late or lost adjustments.
- */
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
- s64 *offset)
-{
- s64 tick_error, i;
- u32 look_ahead, adj;
- s32 error2, mult;
-
- /*
- * Use the current error value to determine how much to look ahead.
- * The larger the error the slower we adjust for it to avoid problems
- * with losing too many ticks, otherwise we would overadjust and
- * produce an even larger error. The smaller the adjustment the
- * faster we try to adjust for it, as lost ticks can do less harm
- * here. This is tuned so that an error of about 1 msec is adjusted
- * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
- */
- error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
- error2 = abs(error2);
- for (look_ahead = 0; error2 > 0; look_ahead++)
- error2 >>= 2;
-
- /*
- * Now calculate the error in (1 << look_ahead) ticks, but first
- * remove the single look ahead already included in the error.
- */
- tick_error = current_tick_length() >>
- (TICK_LENGTH_SHIFT - clock->shift + 1);
- tick_error -= clock->xtime_interval >> 1;
- error = ((error - tick_error) >> look_ahead) + tick_error;
-
- /* Finally calculate the adjustment shift value. */
- i = *interval;
- mult = 1;
- if (error < 0) {
- error = -error;
- *interval = -*interval;
- *offset = -*offset;
- mult = -1;
- }
- for (adj = 0; error > i; adj++)
- error >>= 1;
-
- *interval <<= adj;
- *offset <<= adj;
- return mult << adj;
-}
-
-/*
- * Adjust the multiplier to reduce the error value,
- * this is optimized for the most common adjustments of -1,0,1,
- * for other values we can do a bit more work.
- */
-static void clocksource_adjust(struct clocksource *clock, s64 offset)
-{
- s64 error, interval = clock->cycle_interval;
- int adj;
-
- error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
- if (error > interval) {
- error >>= 2;
- if (likely(error <= interval))
- adj = 1;
- else
- adj = clocksource_bigadjust(error, &interval, &offset);
- } else if (error < -interval) {
- error >>= 2;
- if (likely(error >= -interval)) {
- adj = -1;
- interval = -interval;
- offset = -offset;
- } else
- adj = clocksource_bigadjust(error, &interval, &offset);
- } else
- return;
-
- clock->mult += adj;
- clock->xtime_interval += interval;
- clock->xtime_nsec -= offset;
- clock->error -= (interval - offset) <<
- (TICK_LENGTH_SHIFT - clock->shift);
-}
-
-/**
- * update_wall_time - Uses the current clocksource to increment the wall time
- *
- * Called from the timer interrupt, must hold a write on xtime_lock.
- */
-static void update_wall_time(void)
-{
- cycle_t offset;
-
- /* Make sure we're fully resumed: */
- if (unlikely(timekeeping_suspended))
- return;
-
-#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
-#else
- offset = clock->cycle_interval;
-#endif
- clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
-
- /* normally this loop will run just once, however in the
- * case of lost or late ticks, it will accumulate correctly.
- */
- while (offset >= clock->cycle_interval) {
- /* accumulate one interval */
- clock->xtime_nsec += clock->xtime_interval;
- clock->cycle_last += clock->cycle_interval;
- offset -= clock->cycle_interval;
-
- if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
- clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
- xtime.tv_sec++;
- second_overflow();
- }
-
- /* interpolator bits */
- time_interpolator_update(clock->xtime_interval
- >> clock->shift);
-
- /* accumulate error between NTP and clock interval */
- clock->error += current_tick_length();
- clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
- }
-
- /* correct the clock when NTP error is too big */
- clocksource_adjust(clock, offset);
-
- /* store full nanoseconds into xtime */
- xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
- clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
-
- /* check to see if there is a new clocksource to use */
- change_clocksource();
- update_vsyscall(&xtime, clock);
-}
-
/*
* Called from the timer interrupt handler to charge one tick to the current
* process. user_tick is 1 if the tick is user time, 0 for system.
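For reference, the timekeeping code removed above (it moves into kernel/time/timekeeping.c earlier in this diff) converts a raw cycle delta into nanoseconds with fixed-point arithmetic: ns = (cycles * mult) >> shift. A standalone sketch of that conversion with invented mult/shift values for a pretend 1 MHz counter, not taken from any real clocksource:

#include <stdint.h>
#include <stdio.h>

/* ns = (cycles * mult) >> shift, with mult precomputed as
 * (NSEC_PER_SEC << shift) / counter_frequency */
static uint64_t cyc2ns_sketch(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

int main(void)
{
	const uint64_t freq  = 1000000;		/* pretend: a 1 MHz free-running counter */
	const uint32_t shift = 10;
	const uint32_t mult  = (uint32_t)((1000000000ULL << shift) / freq);

	/* 3 cycles of a 1 MHz clock are exactly 3000 ns */
	printf("%llu ns\n", (unsigned long long)cyc2ns_sketch(3, mult, shift));
	return 0;
}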
@@ -1264,14 +857,6 @@ static inline void calc_load(unsigned long ticks)
}
/*
- * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
- */
-__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
-
-EXPORT_SYMBOL(xtime_lock);
-
-/*
* This function runs timers and the timer-tq in bottom half context.
*/
static void run_timer_softirq(struct softirq_action *h)
@@ -1617,6 +1202,13 @@ static int __devinit init_timers_cpu(int cpu)
cpu_to_node(cpu));
if (!base)
return -ENOMEM;
+
+ /* Make sure that tvec_base is 2 byte aligned */
+ if (tbase_get_deferrable(base)) {
+ WARN_ON(1);
+ kfree(base);
+ return -ENOMEM;
+ }
memset(base, 0, sizeof(*base));
per_cpu(tvec_bases, cpu) = base;
} else {
@@ -1656,9 +1248,9 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
struct timer_list *timer;
while (!list_empty(head)) {
- timer = list_entry(head->next, struct timer_list, entry);
+ timer = list_first_entry(head, struct timer_list, entry);
detach_timer(timer, 0);
- timer->base = new_base;
+ timer_set_base(timer, new_base);
internal_add_timer(new_base, timer);
}
}
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 187e2a42387..dd308ba4e03 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -6,7 +6,6 @@
#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
-#include <linux/smp_lock.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index c859164a699..160c8c5136b 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -32,58 +32,25 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
}
/*
- * unshare the current process' utsname namespace.
- * called only in sys_unshare()
- */
-int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
-{
- if (unshare_flags & CLONE_NEWUTS) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- *new_uts = clone_uts_ns(current->nsproxy->uts_ns);
- if (!*new_uts)
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/*
* Copy task tsk's utsname namespace, or clone it if flags
* specifies CLONE_NEWUTS. In latter case, changes to the
* utsname of this process won't be seen by parent, and vice
* versa.
*/
-int copy_utsname(int flags, struct task_struct *tsk)
+struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns)
{
- struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
struct uts_namespace *new_ns;
- int err = 0;
-
- if (!old_ns)
- return 0;
+ BUG_ON(!old_ns);
get_uts_ns(old_ns);
if (!(flags & CLONE_NEWUTS))
- return 0;
-
- if (!capable(CAP_SYS_ADMIN)) {
- err = -EPERM;
- goto out;
- }
+ return old_ns;
new_ns = clone_uts_ns(old_ns);
- if (!new_ns) {
- err = -ENOMEM;
- goto out;
- }
- tsk->nsproxy->uts_ns = new_ns;
-out:
put_uts_ns(old_ns);
- return err;
+ return new_ns;
}
void free_uts_ns(struct kref *kref)