diff options
Diffstat (limited to 'kernel')
37 files changed, 476 insertions, 321 deletions
diff --git a/kernel/async.c b/kernel/async.c index 968ef9457d4..27235f5de19 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -92,19 +92,18 @@ extern int initcall_debug; static async_cookie_t __lowest_in_progress(struct list_head *running) { struct async_entry *entry; + if (!list_empty(running)) { entry = list_first_entry(running, struct async_entry, list); return entry->cookie; - } else if (!list_empty(&async_pending)) { - entry = list_first_entry(&async_pending, - struct async_entry, list); - return entry->cookie; - } else { - /* nothing in progress... next_cookie is "infinity" */ - return next_cookie; } + list_for_each_entry(entry, &async_pending, list) + if (entry->running == running) + return entry->cookie; + + return next_cookie; /* "infinity" value */ } static async_cookie_t lowest_in_progress(struct list_head *running) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a6fe71fd5d1..713098ee5a0 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1028,7 +1028,7 @@ static void audit_update_watch(struct audit_parent *parent, if (audit_enabled) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "auid=%u ses=%u", audit_get_loginuid(current), @@ -1067,7 +1067,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) e = container_of(r, struct audit_entry, rule); if (audit_enabled) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "auid=%u ses=%u", audit_get_loginuid(current), diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 382109b5bae..a7267bfd376 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1133,8 +1133,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, free_cg_links: free_cg_links(&tmp_cg_links); drop_new_super: - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); 
return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..875ffbdd96d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1409,7 +1409,7 @@ long do_fork(unsigned long clone_flags, } audit_finish_fork(p); - tracehook_report_clone(trace, regs, clone_flags, nr, p); + tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to diff --git a/kernel/futex.c b/kernel/futex.c index eef8cd26b5e..d546b2d53a6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -193,6 +193,7 @@ static void drop_futex_key_refs(union futex_key *key) * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) * * Returns a negative error code or 0 * The key words are stored in *key on success. @@ -203,7 +204,8 @@ static void drop_futex_key_refs(union futex_key *key) * * lock_page() might sleep, the caller should not hold a spinlock. 
*/ -static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) +static int +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -226,7 +228,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) * but access_ok() should be faster than find_vma() */ if (!fshared) { - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) + if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) return -EFAULT; key->private.mm = mm; key->private.address = address; @@ -235,7 +237,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) } again: - err = get_user_pages_fast(address, 1, 0, &page); + err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); if (err < 0) return err; @@ -677,7 +679,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -723,10 +725,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) goto out_put_key1; @@ -814,10 +816,10 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, drop_count = 0; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ); if (unlikely(ret != 0)) goto out_put_key1; @@ -1140,7 +1142,7 @@ static int 
futex_wait(u32 __user *uaddr, int fshared, q.bitset = bitset; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -1330,7 +1332,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, q.pi_state = NULL; retry: q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); + ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; @@ -1594,7 +1596,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); if (unlikely(ret != 0)) goto out; diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 3394f8f5296..7d047808419 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o +obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c687ba4363f..13c68e71b72 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); spin_unlock(&desc->lock); } @@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, 
desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); @@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Start handling the irq */ if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) { + if (desc->chip->eoi) desc->chip->eoi(irq); - desc = irq_remap_to_desc(irq, desc); - } } void @@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) { + if (desc->chip != &no_irq_chip) mask_ack_irq(desc, irq); - desc = irq_remap_to_desc(irq, desc); - } desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d82142be8dd..18041a254d3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include <linux/irq.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> @@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = { .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; -void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) { - int node; void *ptr; - node = cpu_to_node(cpu); - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); + if (slab_is_available()) + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), + GFP_ATOMIC, node); + else + ptr = alloc_bootmem_node(NODE_DATA(node), + nr * sizeof(*desc->kstat_irqs)); /* * don't overwite if can not get new one * init_copy_kstat_irqs() could still use old one */ if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", - cpu, node); + printk(KERN_DEBUG " alloc kstat_irqs on node 
%d\n", node); desc->kstat_irqs = ptr; } } -static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP - desc->cpu = cpu; + desc->node = node; #endif lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, cpu, nr_cpu_ids); + init_kstat_irqs(desc, node, nr_cpu_ids); if (!desc->kstat_irqs) { printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } - arch_init_chip_data(desc, cpu); + init_desc_masks(desc); + arch_init_chip_data(desc, node); } /* @@ -169,7 +173,8 @@ int __init early_irq_init(void) desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); irq_desc_ptrs[i] = desc + i; } @@ -187,11 +192,10 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; unsigned long flags; - int node; if (irq >= nr_irqs) { WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", @@ -210,15 +214,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) if (desc) goto out_unlock; - node = cpu_to_node(cpu); - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", - irq, cpu, node); + if (slab_is_available()) + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + else + desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); + + printk(KERN_DEBUG " alloc 
irq_desc for %d on node %d\n", irq, node); if (!desc) { printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } - init_one_irq_desc(irq, desc, cpu); + init_one_irq_desc(irq, desc, node); irq_desc_ptrs[irq] = desc; @@ -256,7 +262,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq = i; - init_alloc_desc_masks(&desc[i], 0, true); + alloc_desc_masks(&desc[i], 0, true); + init_desc_masks(&desc[i]); desc[i].kstat_irqs = kstat_irqs_all[i]; } return arch_early_irq_init(); @@ -267,7 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { return irq_to_desc(irq); } @@ -363,8 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!"); - if (!(action->flags & IRQF_DISABLED)) local_irq_enable_in_hardirq(); @@ -455,11 +460,8 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - /* get new one */ - desc = irq_remap_to_desc(irq, desc); - } if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -470,10 +472,8 @@ unsigned int __do_IRQ(unsigned int irq) } spin_lock(&desc->lock); - if (desc->chip->ack) { + if (desc->chip->ack) desc->chip->ack(irq); - desc = irq_remap_to_desc(irq, desc); - } /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 01ce20eab38..73468253143 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, 
unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; -extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); extern spinlock_t sparse_irq_lock; @@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); +extern void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2734eca5924..aaf5c9d0577 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -static void +void irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) { struct irqaction *action = desc->action; @@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) - desc->chip->set_affinity(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } + } else { desc->status |= IRQ_MOVE_PENDING; cpumask_copy(desc->pending_mask, cpumask); } #else - cpumask_copy(desc->affinity, cpumask); - desc->chip->set_affinity(irq, cpumask); + if (!desc->chip->set_affinity(irq, cpumask)) { + cpumask_copy(desc->affinity, cpumask); + irq_set_thread_affinity(desc, cpumask); + } #endif - irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index e05ad9be43b..cfe767ca154 100644 --- 
a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,5 +1,8 @@ #include <linux/irq.h> +#include <linux/interrupt.h> + +#include "internals.h" void move_masked_irq(int irq) { @@ -39,11 +42,12 @@ void move_masked_irq(int irq) * masking the irqs. */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - cpumask_and(desc->affinity, - desc->pending_mask, cpu_online_mask); - desc->chip->set_affinity(irq, desc->affinity); - } + < nr_cpu_ids)) + if (!desc->chip->set_affinity(irq, desc->pending_mask)) { + cpumask_copy(desc->affinity, desc->pending_mask); + irq_set_thread_affinity(desc, desc->pending_mask); + } + cpumask_clear(desc->pending_mask); } diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 44bbdcbaf8d..2f69bee57bf 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -15,9 +15,9 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc, - int cpu, int nr) + int node, int nr) { - init_kstat_irqs(desc, cpu, nr); + init_kstat_irqs(desc, node, nr); if (desc->kstat_irqs != old_desc->kstat_irqs) memcpy(desc->kstat_irqs, old_desc->kstat_irqs, @@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) } static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int cpu) + struct irq_desc *desc, int node) { memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!init_alloc_desc_masks(desc, cpu, false)) { + if (!alloc_desc_masks(desc, node, false)) { printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " "for migration.\n", irq); return false; } spin_lock_init(&desc->lock); - desc->cpu = cpu; + desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, cpu); + 
arch_init_copy_chip_data(old_desc, desc, node); return true; } @@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) } static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int cpu) + int node) { struct irq_desc *desc; unsigned int irq; unsigned long flags; - int node; irq = old_desc->irq; @@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, if (desc && old_desc != desc) goto out_unlock; - node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); if (!desc) { printk(KERN_ERR "irq %d: can not get new irq_desc " @@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = old_desc; goto out_unlock; } - if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { + if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { /* still use old one */ kfree(desc); desc = old_desc; @@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, /* free the old one */ free_one_irq_desc(old_desc, desc); - spin_unlock(&old_desc->lock); kfree(old_desc); - spin_lock(&desc->lock); return desc; @@ -109,24 +105,14 @@ out_unlock: return desc; } -struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) +struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) { - int old_cpu; - int node, old_node; - /* those all static, do move them */ if (desc->irq < NR_IRQS_LEGACY) return desc; - old_cpu = desc->cpu; - if (old_cpu != cpu) { - node = cpu_to_node(cpu); - old_node = cpu_to_node(old_cpu); - if (old_node != node) - desc = __real_move_irq_desc(desc, cpu); - else - desc->cpu = cpu; - } + if (desc->node != node) + desc = __real_move_irq_desc(desc, node); return desc; } diff --git a/kernel/kexec.c b/kernel/kexec.c index 5a758c6e495..e4983770913 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1451,7 +1451,6 @@ int kernel_kexec(void) error = device_suspend(PMSG_FREEZE); if (error) goto 
Resume_console; - device_pm_lock(); /* At this point, device_suspend() has been called, * but *not* device_power_down(). We *must* * device_power_down() now. Otherwise, drivers for @@ -1489,7 +1488,6 @@ int kernel_kexec(void) enable_nonboot_cpus(); device_power_up(PMSG_RESTORE); Resume_devices: - device_pm_unlock(); device_resume(PMSG_RESTORE); Resume_console: resume_console(); diff --git a/kernel/kgdb.c b/kernel/kgdb.c index e4dcfb2272a..9147a3190c9 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1583,8 +1583,8 @@ static void sysrq_handle_gdb(int key, struct tty_struct *tty) static struct sysrq_key_op sysrq_gdb_op = { .handler = sysrq_handle_gdb, - .help_msg = "Gdb", - .action_msg = "GDB", + .help_msg = "debug(G)", + .action_msg = "DEBUG", }; #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index b750675251e..7e95bedb2bf 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -370,8 +370,10 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->argv = argv; sub_info->envp = envp; sub_info->cred = prepare_usermodehelper_creds(); - if (!sub_info->cred) + if (!sub_info->cred) { + kfree(sub_info); return NULL; + } out: return sub_info; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a5e74ddee0e..c0fa54b276d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -319,6 +319,22 @@ struct kprobe __kprobes *get_kprobe(void *addr) return NULL; } +/* Arm a kprobe with text_mutex */ +static void __kprobes arm_kprobe(struct kprobe *kp) +{ + mutex_lock(&text_mutex); + arch_arm_kprobe(kp); + mutex_unlock(&text_mutex); +} + +/* Disarm a kprobe with text_mutex */ +static void __kprobes disarm_kprobe(struct kprobe *kp) +{ + mutex_lock(&text_mutex); + arch_disarm_kprobe(kp); + mutex_unlock(&text_mutex); +} + /* * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list @@ -538,7 +554,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) ap->flags 
&= ~KPROBE_FLAG_DISABLED; if (!kprobes_all_disarmed) /* Arm the breakpoint again. */ - arch_arm_kprobe(ap); + arm_kprobe(ap); } return 0; } @@ -789,11 +805,8 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) * enabled and not gone - otherwise, the breakpoint would * already have been removed. We save on flushing icache. */ - if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) { - mutex_lock(&text_mutex); - arch_disarm_kprobe(p); - mutex_unlock(&text_mutex); - } + if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) + disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); } else { if (p->break_handler && !kprobe_gone(p)) @@ -810,7 +823,7 @@ noclean: if (!kprobe_disabled(old_p)) { try_to_disable_aggr_kprobe(old_p); if (!kprobes_all_disarmed && kprobe_disabled(old_p)) - arch_disarm_kprobe(old_p); + disarm_kprobe(old_p); } } return 0; @@ -1364,7 +1377,7 @@ int __kprobes disable_kprobe(struct kprobe *kp) try_to_disable_aggr_kprobe(p); if (!kprobes_all_disarmed && kprobe_disabled(p)) - arch_disarm_kprobe(p); + disarm_kprobe(p); out: mutex_unlock(&kprobe_mutex); return ret; @@ -1393,7 +1406,7 @@ int __kprobes enable_kprobe(struct kprobe *kp) } if (!kprobes_all_disarmed && kprobe_disabled(p)) - arch_arm_kprobe(p); + arm_kprobe(p); p->flags &= ~KPROBE_FLAG_DISABLED; if (p != kp) diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index a2cc7e9a6e8..699a2ac3a0d 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -54,9 +54,9 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. 
*/ -#define MAX_LOCKDEP_ENTRIES 8192UL +#define MAX_LOCKDEP_ENTRIES 16384UL -#define MAX_LOCKDEP_CHAINS_BITS 14 +#define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) diff --git a/kernel/mutex.c b/kernel/mutex.c index 507cf2b5e9f..6ca5fe96e39 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - __schedule(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/panic.c b/kernel/panic.c index 3dcaa166135..984b3ecbd72 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -340,34 +340,46 @@ void oops_exit(void) } #ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath(const char *file, int line, const char *fmt, ...) -{ +struct slowpath_args { + const char *fmt; va_list args; - char function[KSYM_SYMBOL_LEN]; - unsigned long caller = (unsigned long)__builtin_return_address(0); - const char *board; +}; - sprint_symbol(function, caller); +static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) +{ + const char *board; printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, - line, function); + printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); board = dmi_get_system_info(DMI_PRODUCT_NAME); if (board) printk(KERN_WARNING "Hardware name: %s\n", board); - if (fmt) { - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - } + if (args) + vprintk(args->fmt, args->args); print_modules(); dump_stack(); print_oops_end_marker(); add_taint(TAINT_WARN); } -EXPORT_SYMBOL(warn_slowpath); + +void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 
+{ + struct slowpath_args args; + + args.fmt = fmt; + va_start(args.args, fmt); + warn_slowpath_common(file, line, __builtin_return_address(0), &args); + va_end(args.args); +} +EXPORT_SYMBOL(warn_slowpath_fmt); + +void warn_slowpath_null(const char *file, int line) +{ + warn_slowpath_common(file, line, __builtin_return_address(0), NULL); +} +EXPORT_SYMBOL(warn_slowpath_null); #endif #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c9dcf98b446..bece7c0b67b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1420,19 +1420,19 @@ void run_posix_cpu_timers(struct task_struct *tsk) * timer call will interfere. */ list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { - int firing; + int cpu_firing; + spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.entry); - firing = timer->it.cpu.firing; + cpu_firing = timer->it.cpu.firing; timer->it.cpu.firing = 0; /* * The firing flag is -1 if we collided with a reset * of the timer, which already reported this * almost-firing as an overrun. So don't generate an event. */ - if (likely(firing >= 0)) { + if (likely(cpu_firing >= 0)) cpu_timer_fire(timer); - } spin_unlock(&timer->it_lock); } } diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e71ca9cd81b..5cb080e7eeb 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -215,8 +215,6 @@ static int create_image(int platform_mode) if (error) return error; - device_pm_lock(); - /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. * Otherwise, drivers for some devices (e.g. 
interrupt controllers) @@ -227,7 +225,7 @@ static int create_image(int platform_mode) if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); - goto Unlock; + return error; } error = platform_pre_snapshot(platform_mode); @@ -241,9 +239,9 @@ static int create_image(int platform_mode) local_irq_disable(); - sysdev_suspend(PMSG_FREEZE); + error = sysdev_suspend(PMSG_FREEZE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " + printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); goto Enable_irqs; } @@ -280,9 +278,6 @@ static int create_image(int platform_mode) device_power_up(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Unlock: - device_pm_unlock(); - return error; } @@ -344,13 +339,11 @@ static int resume_target_kernel(bool platform_mode) { int error; - device_pm_lock(); - error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); - goto Unlock; + return error; } error = platform_pre_restore(platform_mode); @@ -403,9 +396,6 @@ static int resume_target_kernel(bool platform_mode) device_power_up(PMSG_RECOVER); - Unlock: - device_pm_unlock(); - return error; } @@ -464,11 +454,9 @@ int hibernation_platform_enter(void) goto Resume_devices; } - device_pm_lock(); - error = device_power_down(PMSG_HIBERNATE); if (error) - goto Unlock; + goto Resume_devices; error = hibernation_ops->prepare(); if (error) @@ -493,9 +481,6 @@ int hibernation_platform_enter(void) device_power_up(PMSG_RESTORE); - Unlock: - device_pm_unlock(); - Resume_devices: entering_platform_hibernation = false; device_resume(PMSG_RESTORE); diff --git a/kernel/power/main.c b/kernel/power/main.c index f99ed6a75ea..868028280d1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -289,12 +289,10 @@ static int suspend_enter(suspend_state_t state) { int error; - device_pm_lock(); - if (suspend_ops->prepare) { 
error = suspend_ops->prepare(); if (error) - goto Done; + return error; } error = device_power_down(PMSG_SUSPEND); @@ -343,9 +341,6 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); - Done: - device_pm_unlock(); - return error; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0692ab5a0d6..42c317874cf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -304,6 +304,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data) if (child->ptrace) { child->exit_code = data; dead = __ptrace_detach(current, child); + if (!child->exit_state) + wake_up_process(child); } write_unlock_irq(&tasklist_lock); diff --git a/kernel/sched.c b/kernel/sched.c index b902e587a3a..c3c04e25656 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -630,6 +630,10 @@ struct rq { struct list_head migration_queue; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -2458,6 +2464,17 @@ out: return success; } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
+ */ int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); @@ -2856,19 +2873,72 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) { - unsigned long i, running = 0, uninterruptible = 0; + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; - } +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; + + if (time_before(jiffies, upd)) + return; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? 
active * FIXED_1 : 0; - return running + uninterruptible; + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); + } } /* @@ -2899,6 +2969,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -4240,10 +4315,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; + cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. 
+ * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has at least one idle-CPU + * and at least one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has at least one busy cpu + * and at least one idle cpu. + */ + if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + return 0; + + if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job.
+ */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.cpu_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.ilb_grp_nohz_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return cpumask_first(nohz.cpu_mask); +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return cpumask_first(nohz.cpu_mask); +} +#endif + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4298,8 +4489,24 @@ int select_nohz_load_balancer(int stop_tick) /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) + } else if (atomic_read(&nohz.load_balancer) == cpu) { + int new_ilb; + + if (!(sched_smt_power_savings || + sched_mc_power_savings)) + return 1; + /* + * Check to see if there is a more power-efficient + * ilb. 
+ */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, -1); + resched_cpu(new_ilb); + return 0; + } return 1; + } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; @@ -4468,15 +4675,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) } if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); + int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -4732,7 +4931,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, one_jiffy, one_jiffy_scaled); - else if (p != rq->idle) + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, one_jiffy, one_jiffy_scaled); else @@ -5007,13 +5206,15 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; +need_resched: + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -5070,15 +5271,9 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); @@ -5221,7 +5416,7 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. 
*/ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; @@ -5241,6 +5436,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5279,6 +5477,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5315,6 +5516,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { @@ -5332,6 +5536,9 @@ EXPORT_SYMBOL(complete); * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
*/ void complete_all(struct completion *x) { @@ -6490,8 +6697,9 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); } @@ -6970,6 +7178,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7204,6 +7420,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); + rq->calc_load_update = calc_load_update; + rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7243,7 +7461,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -7753,8 +7971,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the comments in include/linux/sched.h:struct sched_group + * and struct sched_domain.
) */ struct static_sched_group { struct sched_group sg; @@ -7875,7 +8094,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) struct sched_domain *sd; sd = &per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { + if (j != group_first_cpu(sd->groups)) { /* * Only add "power" once for each * physical package. @@ -7953,7 +8172,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + if (cpu != group_first_cpu(sd->groups)) return; child = sd->child; @@ -8938,6 +9157,8 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9045,6 +9266,9 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ @@ -9055,6 +9279,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); + alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); #endif alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ @@ -9800,6 +10025,13 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. 
+ */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 819f17ac796..e1d16c9a768 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -38,7 +38,8 @@ */ unsigned long long __attribute__((weak)) sched_clock(void) { - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); + return (unsigned long long)(jiffies - INITIAL_JIFFIES) + * (NSEC_PER_SEC / HZ); } static __read_mostly int sched_clock_running; diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index cdd3c89574c..344712a5e3e 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -165,7 +165,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) vec->count = 0; if (bootmem) alloc_bootmem_cpumask_var(&vec->mask); - else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL)) + else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3816f217f11..5f9650e8fe7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) find_matching_se(&se, &pse); - while (se) { - BUG_ON(!pse); + BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - break; - } - - se = parent_entity(se); - pse = parent_entity(pse); - } + if (wakeup_preempt_entity(se, pse) == 1) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 8a21a2e28c1..499672c10cb 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - + /* 
adjust the active tasks as we might go into a long sleep */ + calc_load_account_active(rq); return rq->idle; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f2c66f8f971..9bf0d2a7304 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void) unsigned int i; for_each_possible_cpu(i) - alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i)); } #endif /* CONFIG_SMP */ diff --git a/kernel/smp.c b/kernel/smp.c index 858baac568e..ad63d850120 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return NOTIFY_BAD; break; diff --git a/kernel/softirq.c b/kernel/softirq.c index b525dd34851..f674f332a02 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -828,7 +828,7 @@ int __init __weak arch_early_irq_init(void) return 0; } -int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) +int __weak arch_init_chip_data(struct irq_desc *desc, int node) { return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7dd59b..6a463716ecb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -101,7 +101,9 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; -static int one_thousand = 1000; + +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -729,6 +731,14 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, + .procname = "bootloader_version", + .data 
= &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "kstack_depth_to_print", .data = &kstack_depth_to_print, .maxlen = sizeof(int), @@ -1006,7 +1016,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &dirty_bytes_handler, .strategy = &sysctl_intvec, - .extra1 = &one_ul, + .extra1 = &dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", @@ -1031,28 +1041,6 @@ static struct ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_min", - .data = &nr_pdflush_threads_min, - .maxlen = sizeof nr_pdflush_threads_min, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &one, - .extra2 = &nr_pdflush_threads_max, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_max", - .data = &nr_pdflush_threads_max, - .maxlen = sizeof nr_pdflush_threads_max, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &nr_pdflush_threads_min, - .extra2 = &one_thousand, - }, - { .ctl_name = VM_SWAPPINESS, .procname = "swappiness", .data = &vm_swappiness, diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 21a5ca84951..83c4417b6a3 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -93,7 +93,17 @@ void tick_handle_periodic(struct clock_event_device *dev) for (;;) { if (!clockevents_program_event(dev, next, ktime_get())) return; - tick_periodic(cpu); + /* + * Have to be careful here. If we're in oneshot mode, + * before we call tick_periodic() in a loop, we need + * to be sure we're using a real hardware clocksource. + * Otherwise we could get trapped in an infinite + * loop, as the tick_periodic() increments jiffies, + * which then will increment time, possibly causing + * the loop to trigger again and again.
+ */ + if (timekeeping_valid_for_hres()) + tick_periodic(cpu); next = ktime_add(next, tick_period); } } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e..52a8bf8931f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,7 +22,7 @@ /* * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. + * playing with xtime. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c3..a26ed294f93 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1123,47 +1123,6 @@ void update_process_times(int user_tick) } /* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (unlikely(count < 0)) { - active_tasks = count_active_tasks(); - do { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - count += LOAD_FREQ; - } while (count < 0); - } -} - -/* * This function runs timers and the timer-tq in bottom half context. */ static void run_timer_softirq(struct softirq_action *h) @@ -1187,16 +1146,6 @@ void run_local_timers(void) } /* - * Called by the timer interrupt. 
xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ - update_wall_time(); - calc_load(ticks); -} - -/* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. * jiffies is defined in the linker script... @@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); + update_wall_time(); + calc_global_load(); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; - unsigned long seq; + struct timespec tp; memset(info, 0, sizeof(struct sysinfo)); - do { - struct timespec tp; - seq = read_seqbegin(&xtime_lock); - - /* - * This is annoying. The below is the same thing - * posix_get_clock_monotonic() does, but it wants to - * take the lock which we want to cover the loads stuff - * too. - */ - - getnstimeofday(&tp); - tp.tv_sec += wall_to_monotonic.tv_sec; - tp.tv_nsec += wall_to_monotonic.tv_nsec; - monotonic_to_bootbased(&tp); - if (tp.tv_nsec - NSEC_PER_SEC >= 0) { - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0); - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); + info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ce5dc6372b..cda81ec58d9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2380,7 +2380,7 @@ static const char readme_msg[] = "# echo print-parent > /debug/tracing/trace_options\n" "# echo 1 > /debug/tracing/tracing_enabled\n" "# cat /debug/tracing/trace > /tmp/trace.txt\n" - "echo 0 > /debug/tracing/tracing_enabled\n" + "# echo 0 > /debug/tracing/tracing_enabled\n" ; static ssize_t @@ -3448,6 +3448,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!ref) break; + ref->ref = 1; ref->buffer = info->tr->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer); if (!ref->page) { diff --git a/kernel/wait.c b/kernel/wait.c index 42a2dbc181c..ea7c3b4275c 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_common(q, mode, 1, 0, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); |