diff options
Diffstat (limited to 'kernel')
125 files changed, 11224 insertions, 5601 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks new file mode 100644 index 00000000000..88c92fb4461 --- /dev/null +++ b/kernel/Kconfig.locks @@ -0,0 +1,202 @@ +# +# The ARCH_INLINE foo is necessary because select ignores "depends on" +# +config ARCH_INLINE_SPIN_TRYLOCK + bool + +config ARCH_INLINE_SPIN_TRYLOCK_BH + bool + +config ARCH_INLINE_SPIN_LOCK + bool + +config ARCH_INLINE_SPIN_LOCK_BH + bool + +config ARCH_INLINE_SPIN_LOCK_IRQ + bool + +config ARCH_INLINE_SPIN_LOCK_IRQSAVE + bool + +config ARCH_INLINE_SPIN_UNLOCK + bool + +config ARCH_INLINE_SPIN_UNLOCK_BH + bool + +config ARCH_INLINE_SPIN_UNLOCK_IRQ + bool + +config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + bool + + +config ARCH_INLINE_READ_TRYLOCK + bool + +config ARCH_INLINE_READ_LOCK + bool + +config ARCH_INLINE_READ_LOCK_BH + bool + +config ARCH_INLINE_READ_LOCK_IRQ + bool + +config ARCH_INLINE_READ_LOCK_IRQSAVE + bool + +config ARCH_INLINE_READ_UNLOCK + bool + +config ARCH_INLINE_READ_UNLOCK_BH + bool + +config ARCH_INLINE_READ_UNLOCK_IRQ + bool + +config ARCH_INLINE_READ_UNLOCK_IRQRESTORE + bool + + +config ARCH_INLINE_WRITE_TRYLOCK + bool + +config ARCH_INLINE_WRITE_LOCK + bool + +config ARCH_INLINE_WRITE_LOCK_BH + bool + +config ARCH_INLINE_WRITE_LOCK_IRQ + bool + +config ARCH_INLINE_WRITE_LOCK_IRQSAVE + bool + +config ARCH_INLINE_WRITE_UNLOCK + bool + +config ARCH_INLINE_WRITE_UNLOCK_BH + bool + +config ARCH_INLINE_WRITE_UNLOCK_IRQ + bool + +config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + bool + +# +# lock_* functions are inlined when: +# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y +# +# trylock_* functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# +# unlock and unlock_irq functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# or +# - DEBUG_SPINLOCK=n and PREEMPT=n +# +# unlock_bh and unlock_irqrestore functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# + +config INLINE_SPIN_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK + +config INLINE_SPIN_TRYLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH + +config INLINE_SPIN_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK + +config INLINE_SPIN_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_BH + +config INLINE_SPIN_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_IRQ + +config INLINE_SPIN_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_IRQSAVE + +config INLINE_SPIN_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK) + +config INLINE_SPIN_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH + +config INLINE_SPIN_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH) + +config INLINE_SPIN_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + + +config INLINE_READ_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK + +config INLINE_READ_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK + +config INLINE_READ_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_BH + +config INLINE_READ_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_IRQ + +config INLINE_READ_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_IRQSAVE + +config INLINE_READ_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK) + +config INLINE_READ_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH + +config INLINE_READ_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH) + +config INLINE_READ_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE + + +config INLINE_WRITE_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK + +config INLINE_WRITE_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK + +config INLINE_WRITE_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_BH + +config INLINE_WRITE_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_IRQ + +config INLINE_WRITE_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_IRQSAVE + +config INLINE_WRITE_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK) + +config INLINE_WRITE_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH + +config INLINE_WRITE_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH) + +config INLINE_WRITE_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + +config MUTEX_SPIN_ON_OWNER + def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES diff --git a/kernel/Makefile b/kernel/Makefile index b8d4cd8ac0b..864ff75d65f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -4,7 +4,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ - sysctl.o capability.o ptrace.o timer.o user.o \ + sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ @@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -82,6 +83,7 @@ obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o +obj-$(CONFIG_TINY_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o @@ -94,7 +96,10 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o +obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index 9a4715a2f6b..a6605ca921b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -536,7 +536,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, do_div(elapsed, AHZ); ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ - current_uid_gid(&ac.ac_uid, &ac.ac_gid); + ac.ac_uid = orig_cred->uid; + ac.ac_gid = orig_cred->gid; #if ACCT_VERSION==2 ac.ac_ahz = AHZ; #endif diff --git a/kernel/bounds.c b/kernel/bounds.c index 3c530138183..98a51f26c13 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -12,7 +12,7 @@ void foo(void) { - /* The enum constants to put into include/linux/bounds.h */ + /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); /* End of constants */ diff --git a/kernel/capability.c b/kernel/capability.c index 4e17041963f..7f876e60521 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -29,7 +29,6 @@ EXPORT_SYMBOL(__cap_empty_set); EXPORT_SYMBOL(__cap_full_set); EXPORT_SYMBOL(__cap_init_eff_set); -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES int file_caps_enabled = 1; static int __init file_caps_disable(char *str) @@ -38,7 +37,6 @@ static int __init file_caps_disable(char *str) return 1; } __setup("no_file_caps", file_caps_disable); -#endif /* * More recent versions of libcap are available from: @@ -169,8 +167,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) kernel_cap_t pE, pI, pP; ret = cap_validate_magic(header, &tocopy); - if (ret != 0) - return ret; + if ((dataptr == NULL) || (ret != 0)) + return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -238,7 +236,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; - unsigned i, tocopy; + unsigned i, tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; @@ -255,8 +253,11 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; - if (copy_from_user(&kdata, data, - tocopy * sizeof(struct __user_cap_data_struct))) + copybytes = tocopy * sizeof(struct __user_cap_data_struct); + if (copybytes > sizeof(kdata)) + return -EFAULT; + + if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; for (i = 0; i < tocopy; i++) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ca83b73fba1..0249f4be9b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); if (cft->write_u64) { - u64 val = simple_strtoull(buffer, &end, 0); + u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_u64(cgrp, cft, val); } else { - s64 val = simple_strtoll(buffer, &end, 0); + s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_s64(cgrp, cft, val); @@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); - retval = cft->write_string(cgrp, cft, buffer); + retval = cft->write_string(cgrp, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ba0f1ecb21..291ac586f37 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { + set_cpu_active(cpu, true); + nr_calls--; __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); @@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) /* Ensure that we are not runnable on dying cpu */ cpumask_copy(old_allowed, ¤t->cpus_allowed); - set_cpus_allowed_ptr(current, - cpumask_of(cpumask_any_but(cpu_online_mask, cpu))); + set_cpus_allowed_ptr(current, cpu_active_mask); err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { + set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) @@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu) err = _cpu_down(cpu, 0); - if (cpu_online(cpu)) - set_cpu_active(cpu, true); - out: cpu_maps_update_done(); stop_machine_destroy(); @@ -387,15 +386,23 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); + + for_each_online_cpu(cpu) { + if (cpu == first_cpu) + continue; + set_cpu_active(cpu, false); + } + + synchronize_sched(); + printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; error = _cpu_down(cpu, 1); - if (!error) { + if (!error) cpumask_set_cpu(cpu, frozen_cpus); - printk("CPU%d is down\n", cpu); - } else { + else { printk(KERN_ERR "Error taking CPU%d down: %d\n", cpu, error); break; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b5cb469d254..ba401fab459 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -537,8 +537,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ -/* FIXME: see the FIXME in partition_sched_domains() */ -static int generate_sched_domains(struct cpumask **domains, +static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { LIST_HEAD(q); /* queue of cpusets to be scanned */ @@ -546,7 +545,7 @@ static int generate_sched_domains(struct cpumask **domains, struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ - struct cpumask *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -557,7 +556,8 @@ static int generate_sched_domains(struct cpumask **domains, /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - doms = kmalloc(cpumask_size(), GFP_KERNEL); + ndoms = 1; + doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -566,9 +566,8 @@ static int generate_sched_domains(struct cpumask **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms, top_cpuset.cpus_allowed); + cpumask_copy(doms[0], top_cpuset.cpus_allowed); - ndoms = 1; goto done; } @@ -636,7 +635,7 @@ restart: * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. */ - doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL); + doms = alloc_sched_domains(ndoms); if (!doms) goto done; @@ -656,7 +655,7 @@ restart: continue; } - dp = doms + nslot; + dp = doms[nslot]; if (nslot == ndoms) { static int warnings = 10; @@ -718,7 +717,7 @@ done: static void do_rebuild_sched_domains(struct work_struct *unused) { struct sched_domain_attr *attr; - struct cpumask *doms; + cpumask_var_t *doms; int ndoms; get_online_cpus(); @@ -738,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) { } -static int generate_sched_domains(struct cpumask **domains, +static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { *domains = NULL; @@ -873,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) + if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) return -EINVAL; } retval = validate_change(cs, trialcs); @@ -2011,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) } /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; @@ -2020,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_online_mask); + cpu_active_mask); nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); mutex_unlock(&callback_mutex); @@ -2052,14 +2051,16 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { struct sched_domain_attr *attr; - struct cpumask *doms; + cpumask_var_t *doms; int ndoms; switch (phase) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: break; default: @@ -2068,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, cgroup_lock(); mutex_lock(&callback_mutex); - cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); scan_for_empty_cpusets(&top_cpuset); ndoms = generate_sched_domains(&doms, &attr); @@ -2115,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self, void __init cpuset_init_smp(void) { - cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); @@ -2537,15 +2538,9 @@ const struct file_operations proc_cpuset_operations = { }; #endif /* CONFIG_PROC_PID_CPUSET */ -/* Display task cpus_allowed, mems_allowed in /proc/<pid>/status file. */ +/* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { - seq_printf(m, "Cpus_allowed:\t"); - seq_cpumask(m, &task->cpus_allowed); - seq_printf(m, "\n"); - seq_printf(m, "Cpus_allowed_list:\t"); - seq_cpumask_list(m, &task->cpus_allowed); - seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); seq_nodemask(m, &task->mems_allowed); seq_printf(m, "\n"); diff --git a/kernel/exit.c b/kernel/exit.c index e61891f8012..546774a31a6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -49,6 +49,7 @@ #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> +#include <linux/hw_breakpoint.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -67,10 +68,10 @@ static void __unhash_process(struct task_struct *p) detach_pid(p, PIDTYPE_SID); list_del_rcu(&p->tasks); + list_del_init(&p->sibling); __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); - list_del_init(&p->sibling); } /* @@ -110,9 +111,9 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); - sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); + sig->gtime = cputime_add(sig->gtime, tsk->gtime); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -359,10 +360,8 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - proc_sid_connector(curr); - } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); @@ -737,12 +736,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father) /* * Any that need to be release_task'd are put on the @dead list. */ -static void reparent_thread(struct task_struct *father, struct task_struct *p, +static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - if (p->pdeath_signal) - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); - list_move_tail(&p->sibling, &p->real_parent->children); if (task_detached(p)) @@ -781,12 +777,18 @@ static void forget_original_parent(struct task_struct *father) reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { - p->real_parent = reaper; - if (p->parent == father) { - BUG_ON(task_ptrace(p)); - p->parent = p->real_parent; - } - reparent_thread(father, p, &dead_children); + struct task_struct *t = p; + do { + t->real_parent = reaper; + if (t->parent == father) { + BUG_ON(task_ptrace(t)); + t->parent = t->real_parent; + } + if (t->pdeath_signal) + group_send_sig_info(t->pdeath_signal, + SEND_SIG_NOINFO, t); + } while_each_thread(p, t); + reparent_leader(father, p, &dead_children); } write_unlock_irq(&tasklist_lock); @@ -934,7 +936,7 @@ NORET_TYPE void do_exit(long code) * an exiting task cleaning up the robust pi futexes. */ smp_mb(); - spin_unlock_wait(&tsk->pi_lock); + raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", @@ -972,7 +974,7 @@ NORET_TYPE void do_exit(long code) exit_thread(); cgroup_exit(tsk, 1); - if (group_dead && tsk->signal->leader) + if (group_dead) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); @@ -980,6 +982,10 @@ NORET_TYPE void do_exit(long code) proc_exit_connector(tsk); /* + * FIXME: do that only when needed, using sched_exit tracepoint + */ + flush_ptrace_hw_breakpoint(tsk); + /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ @@ -1006,7 +1012,7 @@ NORET_TYPE void do_exit(long code) tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) - exit_io_context(); + exit_io_context(tsk); if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); @@ -1207,6 +1213,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) struct signal_struct *psig; struct signal_struct *sig; unsigned long maxrss; + cputime_t tgutime, tgstime; /* * The resource counters for the group leader are in its @@ -1222,20 +1229,23 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * need to protect the access to parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_times() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ + thread_group_times(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(tgutime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(tgstime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, @@ -1544,14 +1554,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { - /* - * Do not consider detached threads. - */ - if (!task_detached(p)) { - int ret = wait_consider_task(wo, 0, p); - if (ret) - return ret; - } + int ret = wait_consider_task(wo, 0, p); + if (ret) + return ret; } return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 4c20fff8c13..5b2959b3ffc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -64,6 +64,7 @@ #include <linux/magic.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> +#include <linux/user-return-notifier.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -91,7 +92,7 @@ int nr_processes(void) int cpu; int total = 0; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; @@ -249,6 +250,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto out; setup_thread_stack(tsk, orig); + clear_user_return_notifier(tsk); stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */ @@ -884,6 +886,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + sig->prev_utime = sig->prev_stime = cputime_zero; +#endif sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; @@ -934,9 +939,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) static void rt_mutex_init_task(struct task_struct *p) { - spin_lock_init(&p->pi_lock); + raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init(&p->pi_waiters, &p->pi_lock); + plist_head_init_raw(&p->pi_waiters, &p->pi_lock); p->pi_blocked_on = NULL; #endif } @@ -1066,8 +1071,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->gtime = cputime_zero; p->utimescaled = cputime_zero; p->stimescaled = cputime_zero; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING p->prev_utime = cputime_zero; p->prev_stime = cputime_zero; +#endif p->default_timer_slack_ns = current->timer_slack_ns; @@ -1120,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif +#ifdef CONFIG_CGROUP_MEM_RES_CTLR + p->memcg_batch.do_batch = 0; + p->memcg_batch.memcg = NULL; +#endif p->bts = NULL; @@ -1199,9 +1210,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->sas_ss_sp = p->sas_ss_size = 0; /* - * Syscall tracing should be turned off in the child regardless - * of CLONE_PTRACE. + * Syscall tracing and stepping should be turned off in the + * child regardless of CLONE_PTRACE. */ + user_disable_single_step(p); clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); @@ -1279,7 +1291,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - list_add_tail(&p->sibling, &p->real_parent->children); tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { @@ -1291,6 +1302,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); + list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } @@ -1310,7 +1322,8 @@ bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: - put_io_context(p->io_context); + if (p->io_context) + exit_io_context(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: diff --git a/kernel/futex.c b/kernel/futex.c index 4949d336d88..8e3c3ffe1b9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) */ static inline int match_futex(union futex_key *key1, union futex_key *key2) { - return (key1->both.word == key2->both.word + return (key1 && key2 + && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } @@ -303,8 +304,14 @@ void put_futex_key(int fshared, union futex_key *key) */ static int fault_in_user_writeable(u32 __user *uaddr) { - int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, - 1, 1, 0, NULL, NULL); + struct mm_struct *mm = current->mm; + int ret; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, (unsigned long)uaddr, + 1, 1, 0, NULL, NULL); + up_read(&mm->mmap_sem); + return ret < 0 ? ret : 0; } @@ -396,9 +403,9 @@ static void free_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock_irq(&pi_state->owner->pi_lock); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock_irq(&pi_state->owner->pi_lock); rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); } @@ -463,18 +470,18 @@ void exit_pi_state_list(struct task_struct *curr) * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: */ - spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; hb = hash_futex(&key); - spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock_irq(&curr->pi_lock); spin_lock(&hb->lock); - spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&curr->pi_lock); /* * We dropped the pi-lock, so re-check whether this * task still owns the PI-state: @@ -488,15 +495,15 @@ void exit_pi_state_list(struct task_struct *curr) WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); pi_state->owner = NULL; - spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock_irq(&curr->pi_lock); rt_mutex_unlock(&pi_state->pi_mutex); spin_unlock(&hb->lock); - spin_lock_irq(&curr->pi_lock); + raw_spin_lock_irq(&curr->pi_lock); } - spin_unlock_irq(&curr->pi_lock); + raw_spin_unlock_irq(&curr->pi_lock); } static int @@ -551,7 +558,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, * change of the task flags, we do this protected by * p->pi_lock: */ - spin_lock_irq(&p->pi_lock); + raw_spin_lock_irq(&p->pi_lock); if (unlikely(p->flags & PF_EXITING)) { /* * The task is on the way out. When PF_EXITPIDONE is @@ -560,7 +567,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, */ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; - spin_unlock_irq(&p->pi_lock); + raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); return ret; } @@ -579,7 +586,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; - spin_unlock_irq(&p->pi_lock); + raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); @@ -753,7 +760,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; - spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -782,23 +789,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) else if (curval != uval) ret = -EINVAL; if (ret) { - spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); return ret; } } - spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock_irq(&pi_state->owner->pi_lock); - spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock_irq(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock_irq(&new_owner->pi_lock); - spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -1003,7 +1010,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb2->lock; + q->list.plist.spinlock = &hb2->lock; #endif } get_futex_key_refs(key2); @@ -1028,7 +1035,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - drop_futex_key_refs(&q->key); get_futex_key_refs(key); q->key = *key; @@ -1040,7 +1046,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, q->lock_ptr = &hb->lock; #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb->lock; + q->list.plist.spinlock = &hb->lock; #endif wake_up_state(q->task, TASK_NORMAL); @@ -1226,6 +1232,7 @@ retry_private: */ if (ret == 1) { WARN_ON(pi_state); + drop_count++; task_count++; ret = get_futex_value_locked(&curval2, uaddr2); if (!ret) @@ -1304,6 +1311,7 @@ retry_private: if (ret == 1) { /* We got the lock. */ requeue_pi_wake_futex(this, &key2, hb2); + drop_count++; continue; } else if (ret) { /* -EDEADLK */ @@ -1386,7 +1394,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); #ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.lock = &hb->lock; + q->list.plist.spinlock = &hb->lock; #endif plist_add(&q->list, &hb->chain); q->task = current; @@ -1521,18 +1529,18 @@ retry: * itself. */ if (pi_state->owner != NULL) { - spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock_irq(&pi_state->owner->pi_lock); } pi_state->owner = newowner; - spin_lock_irq(&newowner->pi_lock); + raw_spin_lock_irq(&newowner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); - spin_unlock_irq(&newowner->pi_lock); + raw_spin_unlock_irq(&newowner->pi_lock); return 0; /* @@ -1791,6 +1799,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, current->timer_slack_ns); } +retry: /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) @@ -1808,9 +1817,14 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out_put_key; /* - * We expect signal_pending(current), but another thread may - * have handled it for us already. + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. */ + if (!signal_pending(current)) { + put_futex_key(fshared, &q.key); + goto retry; + } + ret = -ERESTARTSYS; if (!abs_time) goto out_put_key; @@ -2118,9 +2132,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, */ plist_del(&q->list, &q->list.plist); + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; if (timeout && !timeout->task) ret = -ETIMEDOUT; - else + else if (signal_pending(current)) ret = -ERESTARTNOINTR; } return ret; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 3e1c36e7998..0086628b6e9 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -127,11 +127,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, for (;;) { base = timer->base; if (likely(base != NULL)) { - spin_lock_irqsave(&base->cpu_base->lock, *flags); + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ - spin_unlock_irqrestore(&base->cpu_base->lock, *flags); + raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } @@ -208,13 +208,13 @@ again: /* See the comment in lock_timer_base() */ timer->base = NULL; - spin_unlock(&base->cpu_base->lock); - spin_lock(&new_base->cpu_base->lock); + raw_spin_unlock(&base->cpu_base->lock); + raw_spin_lock(&new_base->cpu_base->lock); if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { cpu = this_cpu; - spin_unlock(&new_base->cpu_base->lock); - spin_lock(&base->cpu_base->lock); + raw_spin_unlock(&new_base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); timer->base = base; goto again; } @@ -230,7 +230,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { struct hrtimer_clock_base *base = timer->base; - spin_lock_irqsave(&base->cpu_base->lock, *flags); + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } @@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { - ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; @@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (expires.tv64 < 0) return -ETIME; - if (expires.tv64 >= expires_next->tv64) + if (expires.tv64 >= cpu_base->expires_next.tv64) + return 0; + + /* + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (cpu_base->hang_detected) return 0; /* @@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, */ res = tick_program_event(expires, 0); if (!IS_ERR_VALUE(res)) - *expires_next = expires; + cpu_base->expires_next = expires; return res; } @@ -619,12 +628,12 @@ static void retrigger_next_event(void *arg) base = &__get_cpu_var(hrtimer_bases); /* Adjust CLOCK_REALTIME offset */ - spin_lock(&base->lock); + raw_spin_lock(&base->lock); base->clock_base[CLOCK_REALTIME].offset = timespec_to_ktime(realtime_offset); hrtimer_force_reprogram(base, 0); - spin_unlock(&base->lock); + raw_spin_unlock(&base->lock); } /* @@ -685,9 +694,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { if (wakeup) { - spin_unlock(&base->cpu_base->lock); + raw_spin_unlock(&base->cpu_base->lock); raise_softirq_irqoff(HRTIMER_SOFTIRQ); - spin_lock(&base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); } else __raise_softirq_irqoff(HRTIMER_SOFTIRQ); @@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ -#ifdef CONFIG_TIMER_STATS -void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) +static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) { +#ifdef CONFIG_TIMER_STATS if (timer->start_site) return; - - timer->start_site = addr; + timer->start_site = __builtin_return_address(0); memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); timer->start_pid = current->pid; +#endif } + +static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; +#endif +} + +static inline void timer_stats_account_hrtimer(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (likely(!timer_stats_active)) + return; + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, 0); #endif +} /* * Counterpart to lock_hrtimer_base above: @@ -765,7 +790,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { - spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); + raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** @@ -1098,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void) unsigned long flags; int i; - spin_lock_irqsave(&cpu_base->lock, flags); + raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!hrtimer_hres_active()) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { @@ -1115,7 +1140,7 @@ ktime_t hrtimer_get_next_event(void) } } - spin_unlock_irqrestore(&cpu_base->lock, flags); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); if (mindelta.tv64 < 0) mindelta.tv64 = 0; @@ -1197,11 +1222,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) * they get migrated to another cpu, therefore its safe to unlock * the timer base. */ - spin_unlock(&cpu_base->lock); + raw_spin_unlock(&cpu_base->lock); trace_hrtimer_expire_entry(timer, now); restart = fn(timer); trace_hrtimer_expire_exit(timer); - spin_lock(&cpu_base->lock); + raw_spin_lock(&cpu_base->lock); /* * Note: We clear the CALLBACK bit after enqueue_hrtimer and @@ -1217,29 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) #ifdef CONFIG_HIGH_RES_TIMERS -static int force_clock_reprogram; - -/* - * After 5 iteration's attempts, we consider that hrtimer_interrupt() - * is hanging, which could happen with something that slows the interrupt - * such as the tracing. Then we force the clock reprogramming for each future - * hrtimer interrupts to avoid infinite loops and use the min_delta_ns - * threshold that we will overwrite. - * The next tick event will be scheduled to 3 times we currently spend on - * hrtimer_interrupt(). This gives a good compromise, the cpus will spend - * 1/4 of their time to process the hrtimer interrupts. This is enough to - * let it running without serious starvation. - */ - -static inline void -hrtimer_interrupt_hanging(struct clock_event_device *dev, - ktime_t try_time) -{ - force_clock_reprogram = 1; - dev->min_delta_ns = (unsigned long)try_time.tv64 * 3; - printk(KERN_WARNING "hrtimer: interrupt too slow, " - "forcing clock min delta to %lu ns\n", dev->min_delta_ns); -} /* * High resolution timer interrupt * Called with interrupts disabled @@ -1248,24 +1250,18 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; - ktime_t expires_next, now; - int nr_retries = 0; - int i; + ktime_t expires_next, now, entry_time, delta; + int i, retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; - retry: - /* 5 retries is enough to notice a hang */ - if (!(++nr_retries % 5)) - hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now)); - - now = ktime_get(); - + entry_time = now = ktime_get(); +retry: expires_next.tv64 = KTIME_MAX; - spin_lock(&cpu_base->lock); + raw_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1321,13 +1317,51 @@ void hrtimer_interrupt(struct clock_event_device *dev) * against it. */ cpu_base->expires_next = expires_next; - spin_unlock(&cpu_base->lock); + raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ - if (expires_next.tv64 != KTIME_MAX) { - if (tick_program_event(expires_next, force_clock_reprogram)) - goto retry; + if (expires_next.tv64 == KTIME_MAX || + !tick_program_event(expires_next, 0)) { + cpu_base->hang_detected = 0; + return; } + + /* + * The next timer was already expired due to: + * - tracing + * - long lasting callbacks + * - being scheduled away when running in a VM + * + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event. + */ + now = ktime_get(); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; + /* + * Give the system a chance to do something else than looping + * here. We stored the entry time, so we know exactly how long + * we spent here. We schedule the next event this amount of + * time away. + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; + delta = ktime_sub(now, entry_time); + if (delta.tv64 > cpu_base->max_hang_time.tv64) + cpu_base->max_hang_time = delta; + /* + * Limit it to a sensible value as we enforce a longer + * delay. Give the CPU at least 100ms to catch up. + */ + if (delta.tv64 > 100 * NSEC_PER_MSEC) + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + else + expires_next = ktime_add(now, delta); + tick_program_event(expires_next, 1); + printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", + ktime_to_ns(delta)); } /* @@ -1423,7 +1457,7 @@ void hrtimer_run_queues(void) gettime = 0; } - spin_lock(&cpu_base->lock); + raw_spin_lock(&cpu_base->lock); while ((node = base->first)) { struct hrtimer *timer; @@ -1435,7 +1469,7 @@ void hrtimer_run_queues(void) __run_hrtimer(timer, &base->softirq_time); } - spin_unlock(&cpu_base->lock); + raw_spin_unlock(&cpu_base->lock); } } @@ -1591,7 +1625,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - spin_lock_init(&cpu_base->lock); + raw_spin_lock_init(&cpu_base->lock); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) cpu_base->clock_base[i].cpu_base = cpu_base; @@ -1649,16 +1683,16 @@ static void migrate_hrtimers(int scpu) * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ - spin_lock(&new_base->lock); - spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } - spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); + raw_spin_unlock(&old_base->lock); + raw_spin_unlock(&new_base->lock); /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d4e84174740..0c642d51aac 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -144,7 +144,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_lock(); do_each_thread(g, t) { - if (!--max_count) + if (!max_count--) goto unlock; if (!--batch_count) { batch_count = HUNG_TASK_BATCHING; diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c new file mode 100644 index 00000000000..dbcbf6a33a0 --- /dev/null +++ b/kernel/hw_breakpoint.c @@ -0,0 +1,453 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) IBM Corporation, 2009 + * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> + * + * Thanks to Ingo Molnar for his many suggestions. + * + * Authors: Alan Stern <stern@rowland.harvard.edu> + * K.Prasad <prasad@linux.vnet.ibm.com> + * Frederic Weisbecker <fweisbec@gmail.com> + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + * This file contains the arch-independent routines. + */ + +#include <linux/irqflags.h> +#include <linux/kallsyms.h> +#include <linux/notifier.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/smp.h> + +#include <linux/hw_breakpoint.h> + +/* + * Constraints data + */ + +/* Number of pinned cpu breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); + +/* Number of pinned task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); + +/* Number of non-pinned cpu/task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); + +/* Gather the number of total pinned and un-pinned bp in a cpuset */ +struct bp_busy_slots { + unsigned int pinned; + unsigned int flexible; +}; + +/* Serialize accesses to the above constraints */ +static DEFINE_MUTEX(nr_bp_mutex); + +/* + * Report the maximum number of pinned breakpoints a task + * have in this cpu + */ +static unsigned int max_task_bp_pinned(int cpu) +{ + int i; + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + + for (i = HBP_NUM -1; i >= 0; i--) { + if (tsk_pinned[i] > 0) + return i + 1; + } + + return 0; +} + +static int task_bp_pinned(struct task_struct *tsk) +{ + struct perf_event_context *ctx = tsk->perf_event_ctxp; + struct list_head *list; + struct perf_event *bp; + unsigned long flags; + int count = 0; + + if (WARN_ONCE(!ctx, "No perf context for this task")) + return 0; + + list = &ctx->event_list; + + raw_spin_lock_irqsave(&ctx->lock, flags); + + /* + * The current breakpoint counter is not included in the list + * at the open() callback time + */ + list_for_each_entry(bp, list, event_entry) { + if (bp->attr.type == PERF_TYPE_BREAKPOINT) + count++; + } + + raw_spin_unlock_irqrestore(&ctx->lock, flags); + + return count; +} + +/* + * Report the number of pinned/un-pinned breakpoints we have in + * a given cpu (cpu > -1) or in all of them (cpu = -1). + */ +static void +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + + if (cpu >= 0) { + slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + if (!tsk) + slots->pinned += max_task_bp_pinned(cpu); + else + slots->pinned += task_bp_pinned(tsk); + slots->flexible = per_cpu(nr_bp_flexible, cpu); + + return; + } + + for_each_online_cpu(cpu) { + unsigned int nr; + + nr = per_cpu(nr_cpu_bp_pinned, cpu); + if (!tsk) + nr += max_task_bp_pinned(cpu); + else + nr += task_bp_pinned(tsk); + + if (nr > slots->pinned) + slots->pinned = nr; + + nr = per_cpu(nr_bp_flexible, cpu); + + if (nr > slots->flexible) + slots->flexible = nr; + } +} + +/* + * Add a pinned breakpoint for the given task in our constraint table + */ +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +{ + unsigned int *tsk_pinned; + int count = 0; + + count = task_bp_pinned(tsk); + + tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); + if (enable) { + tsk_pinned[count]++; + if (count > 0) + tsk_pinned[count-1]--; + } else { + tsk_pinned[count]--; + if (count > 0) + tsk_pinned[count-1]++; + } +} + +/* + * Add/remove the given breakpoint in our constraint table + */ +static void toggle_bp_slot(struct perf_event *bp, bool enable) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + + /* Pinned counter task profiling */ + if (tsk) { + if (cpu >= 0) { + toggle_bp_task_slot(tsk, cpu, enable); + return; + } + + for_each_online_cpu(cpu) + toggle_bp_task_slot(tsk, cpu, enable); + return; + } + + /* Pinned counter cpu profiling */ + if (enable) + per_cpu(nr_cpu_bp_pinned, bp->cpu)++; + else + per_cpu(nr_cpu_bp_pinned, bp->cpu)--; +} + +/* + * Contraints to check before allowing this new breakpoint counter: + * + * == Non-pinned counter == (Considered as pinned for now) + * + * - If attached to a single cpu, check: + * + * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM + * + * -> If there are already non-pinned counters in this cpu, it means + * there is already a free slot for them. + * Otherwise, we check that the maximum number of per task + * breakpoints (for this cpu) plus the number of per cpu breakpoint + * (for this cpu) doesn't cover every registers. + * + * - If attached to every cpus, check: + * + * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM + * + * -> This is roughly the same, except we check the number of per cpu + * bp for every cpu and we keep the max one. Same for the per tasks + * breakpoints. + * + * + * == Pinned counter == + * + * - If attached to a single cpu, check: + * + * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM + * + * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * one register at least (or they will never be fed). + * + * - If attached to every cpus, check: + * + * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM + */ +int reserve_bp_slot(struct perf_event *bp) +{ + struct bp_busy_slots slots = {0}; + int ret = 0; + + mutex_lock(&nr_bp_mutex); + + fetch_bp_busy_slots(&slots, bp); + + /* Flexible counters need to keep at least one slot */ + if (slots.pinned + (!!slots.flexible) == HBP_NUM) { + ret = -ENOSPC; + goto end; + } + + toggle_bp_slot(bp, true); + +end: + mutex_unlock(&nr_bp_mutex); + + return ret; +} + +void release_bp_slot(struct perf_event *bp) +{ + mutex_lock(&nr_bp_mutex); + + toggle_bp_slot(bp, false); + + mutex_unlock(&nr_bp_mutex); +} + + +int register_perf_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = reserve_bp_slot(bp); + if (ret) + return ret; + + /* + * Ptrace breakpoints can be temporary perf events only + * meant to reserve a slot. In this case, it is created disabled and + * we don't want to check the params right now (as we put a null addr) + * But perf tools create events as disabled and we want to check + * the params for them. + * This is a quick hack that will be removed soon, once we remove + * the tmp breakpoints from ptrace + */ + if (!bp->attr.disabled || !bp->overflow_handler) + ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + + return ret; +} + +/** + * register_user_hw_breakpoint - register a hardware breakpoint for user space + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +struct perf_event * +register_user_hw_breakpoint(struct perf_event_attr *attr, + perf_overflow_handler_t triggered, + struct task_struct *tsk) +{ + return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); +} +EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); + +/** + * modify_user_hw_breakpoint - modify a user-space hardware breakpoint + * @bp: the breakpoint structure to modify + * @attr: new breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) +{ + u64 old_addr = bp->attr.bp_addr; + int old_type = bp->attr.bp_type; + int old_len = bp->attr.bp_len; + int err = 0; + + perf_event_disable(bp); + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (attr->disabled) + goto end; + + err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + if (!err) + perf_event_enable(bp); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + if (!bp->attr.disabled) + perf_event_enable(bp); + + return err; + } + +end: + bp->attr.disabled = attr->disabled; + + return 0; +} +EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); + +/** + * unregister_hw_breakpoint - unregister a user-space hardware breakpoint + * @bp: the breakpoint structure to unregister + */ +void unregister_hw_breakpoint(struct perf_event *bp) +{ + if (!bp) + return; + perf_event_release_kernel(bp); +} +EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); + +/** + * register_wide_hw_breakpoint - register a wide breakpoint in the kernel + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * + * @return a set of per_cpu pointers to perf events + */ +struct perf_event ** +register_wide_hw_breakpoint(struct perf_event_attr *attr, + perf_overflow_handler_t triggered) +{ + struct perf_event **cpu_events, **pevent, *bp; + long err; + int cpu; + + cpu_events = alloc_percpu(typeof(*cpu_events)); + if (!cpu_events) + return ERR_PTR(-ENOMEM); + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); + + *pevent = bp; + + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto fail; + } + } + + return cpu_events; + +fail: + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + if (IS_ERR(*pevent)) + break; + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); + /* return the error if any */ + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); + +/** + * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel + * @cpu_events: the per cpu set of events to unregister + */ +void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +{ + int cpu; + struct perf_event **pevent; + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); +} +EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); + +static struct notifier_block hw_breakpoint_exceptions_nb = { + .notifier_call = hw_breakpoint_exceptions_notify, + /* we need to be notified first */ + .priority = 0x7fffffff +}; + +static int __init init_hw_breakpoint(void) +{ + return register_die_notifier(&hw_breakpoint_exceptions_nb); +} +core_initcall(init_hw_breakpoint); + + +struct pmu perf_ops_bp = { + .enable = arch_install_hw_breakpoint, + .disable = arch_uninstall_hw_breakpoint, + .read = hw_breakpoint_pmu_read, + .unthrottle = hw_breakpoint_pmu_unthrottle +}; diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 1de9700f416..2295a31ef11 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -45,7 +45,7 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. */ for_each_irq_desc_reverse(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* * An old-style architecture might still have @@ -61,7 +61,7 @@ unsigned long probe_irq_on(void) desc->chip->set_type(i, IRQ_TYPE_PROBE); desc->chip->startup(i); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } /* Wait for longstanding interrupts to trigger. */ @@ -73,13 +73,13 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; if (desc->chip->startup(i)) desc->status |= IRQ_PENDING; } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } /* @@ -91,7 +91,7 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -103,7 +103,7 @@ unsigned long probe_irq_on(void) if (i < 32) mask |= 1 << i; } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } return mask; @@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val) desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); @@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val) unsigned int status; for_each_irq_desc(i, desc) { - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { @@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val) desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } mutex_unlock(&probing_active); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c1660194d11..ecc3fa28f66 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,7 +34,7 @@ void dynamic_irq_init(unsigned int irq) } /* Ensure we don't have left over values from a previous use of this irq */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; desc->handle_irq = handle_bad_irq; @@ -51,7 +51,7 @@ void dynamic_irq_init(unsigned int irq) cpumask_clear(desc->pending_mask); #endif #endif - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } /** @@ -68,9 +68,9 @@ void dynamic_irq_cleanup(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); if (desc->action) { - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", irq); return; @@ -82,7 +82,7 @@ void dynamic_irq_cleanup(unsigned int irq) desc->chip = &no_irq_chip; desc->name = NULL; clear_kstat_irqs(desc); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } @@ -104,10 +104,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -133,9 +133,9 @@ int set_irq_type(unsigned int irq, unsigned int type) if (type == IRQ_TYPE_NONE) return 0; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); ret = __irq_set_trigger(desc, irq, type); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_type); @@ -158,19 +158,19 @@ int set_irq_data(unsigned int irq, void *data) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(set_irq_data); /** - * set_irq_data - set irq type data for an irq + * set_irq_msi - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * - * Set the hardware irq controller data for an irq + * Set the MSI descriptor entry for an irq */ int set_irq_msi(unsigned int irq, struct msi_desc *entry) { @@ -183,11 +183,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) entry->irq = irq; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -214,9 +214,9 @@ int set_irq_chip_data(unsigned int irq, void *data) return -EINVAL; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->chip_data = data; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -241,12 +241,12 @@ void set_irq_nested_thread(unsigned int irq, int nest) if (!desc) return; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); if (nest) desc->status |= IRQ_NESTED_THREAD; else desc->status &= ~IRQ_NESTED_THREAD; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL_GPL(set_irq_nested_thread); @@ -343,7 +343,7 @@ void handle_nested_irq(unsigned int irq) might_sleep(); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); kstat_incr_irqs_this_cpu(irq, desc); @@ -352,17 +352,17 @@ void handle_nested_irq(unsigned int irq) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } EXPORT_SYMBOL_GPL(handle_nested_irq); @@ -384,7 +384,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -396,16 +396,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } /** @@ -424,7 +424,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); mask_ack_irq(desc, irq); if (unlikely(desc->status & IRQ_INPROGRESS)) @@ -441,13 +441,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->status |= IRQ_INPROGRESS; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; if (unlikely(desc->status & IRQ_ONESHOT)) @@ -455,7 +455,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) desc->chip->unmask(irq); out_unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); @@ -475,7 +475,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) struct irqaction *action; irqreturn_t action_ret; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out; @@ -497,18 +497,18 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status |= IRQ_INPROGRESS; desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } /** @@ -530,7 +530,7 @@ out: void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); @@ -576,21 +576,21 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) } desc->status &= ~IRQ_PENDING; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); desc->status &= ~IRQ_INPROGRESS; out_unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } /** - * handle_percpu_IRQ - Per CPU local irq handler + * handle_percpu_irq - Per CPU local irq handler * @irq: the interrupt number * @desc: the interrupt description structure for this irq * @@ -643,7 +643,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, } chip_bus_lock(irq, desc); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* Uninstall? */ if (handle == handle_bad_irq) { @@ -661,7 +661,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->depth = 0; desc->chip->startup(irq); } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL_GPL(__set_irq_handler); @@ -692,9 +692,9 @@ void __init set_irq_noprobe(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } void __init set_irq_probe(unsigned int irq) @@ -707,7 +707,7 @@ void __init set_irq_probe(unsigned int irq) return; } - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 17c71bb565c..814940e7f48 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -80,7 +80,7 @@ static struct irq_desc irq_desc_init = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), }; void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) @@ -108,7 +108,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) { memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - spin_lock_init(&desc->lock); + raw_spin_lock_init(&desc->lock); desc->irq = irq; #ifdef CONFIG_SMP desc->node = node; @@ -130,7 +130,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) /* * Protect the sparse_irqs: */ -DEFINE_SPINLOCK(sparse_irq_lock); +DEFINE_RAW_SPINLOCK(sparse_irq_lock); struct irq_desc **irq_desc_ptrs __read_mostly; @@ -141,7 +141,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), } }; @@ -212,7 +212,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) if (desc) return desc; - spin_lock_irqsave(&sparse_irq_lock, flags); + raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ desc = irq_desc_ptrs[irq]; @@ -234,7 +234,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) irq_desc_ptrs[irq] = desc; out_unlock: - spin_unlock_irqrestore(&sparse_irq_lock, flags); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); return desc; } @@ -247,7 +247,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), } }; @@ -473,7 +473,7 @@ unsigned int __do_IRQ(unsigned int irq) return 1; } - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (desc->chip->ack) desc->chip->ack(irq); /* @@ -517,13 +517,13 @@ unsigned int __do_IRQ(unsigned int irq) for (;;) { irqreturn_t action_ret; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (likely(!(desc->status & IRQ_PENDING))) break; desc->status &= ~IRQ_PENDING; @@ -536,7 +536,7 @@ out: * disabled while the handler was running. */ desc->chip->end(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); return 1; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 1b5d742c6a7..b2821f070a3 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -18,7 +18,7 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); extern void clear_kstat_irqs(struct irq_desc *desc); -extern spinlock_t sparse_irq_lock; +extern raw_spinlock_t sparse_irq_lock; #ifdef CONFIG_SPARSE_IRQ /* irq_desc_ptrs allocated at boot time */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index bde4c667d24..eb6078ca60c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); status = desc->status; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ } while (status & IRQ_INPROGRESS); @@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) if (!desc->chip->set_affinity) return -EINVAL; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { @@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) } #endif desc->status |= IRQ_AFFINITY_SET; - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq) unsigned long flags; int ret; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); if (!ret) irq_set_thread_affinity(desc); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq) return; chip_bus_lock(irq, desc); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, false); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(disable_irq_nosync); @@ -308,9 +308,9 @@ void enable_irq(unsigned int irq) return; chip_bus_lock(irq, desc); - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(irq, desc); } EXPORT_SYMBOL(enable_irq); @@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); @@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_wake); @@ -484,12 +484,12 @@ static int irq_wait_for_interrupt(struct irqaction *action) static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { chip_bus_lock(irq, desc); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->chip->unmask(irq); } - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(irq, desc); } @@ -514,9 +514,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) return; } - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); cpumask_copy(mask, desc->affinity); - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); @@ -545,7 +545,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); - spin_lock_irq(&desc->lock); + raw_spin_lock_irq(&desc->lock); if (unlikely(desc->status & IRQ_DISABLED)) { /* * CHECKME: We might need a dedicated @@ -555,9 +555,9 @@ static int irq_thread(void *data) * retriggers the interrupt itself --- tglx */ desc->status |= IRQ_PENDING; - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); } else { - spin_unlock_irq(&desc->lock); + raw_spin_unlock_irq(&desc->lock); action->thread_fn(action->irq, action->dev_id); @@ -679,7 +679,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* * The following block of code has to be executed atomically */ - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); old_ptr = &desc->action; old = *old_ptr; if (old) { @@ -775,7 +775,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) __enable_irq(desc, irq, false); } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); /* * Strictly no need to wake it up, but hung_task complains @@ -802,7 +802,7 @@ mismatch: ret = -EBUSY; out_thread: - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); if (new->thread) { struct task_struct *t = new->thread; @@ -844,7 +844,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); /* * There can be multiple actions per IRQ descriptor, find the right @@ -856,7 +856,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return NULL; } @@ -884,7 +884,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -1067,7 +1067,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, kfree(action); #ifdef CONFIG_DEBUG_SHIRQ - if (irqflags & IRQF_SHARED) { + if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index fcb6c96f262..24196228083 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -27,7 +27,7 @@ void move_masked_irq(int irq) if (!desc->chip->set_affinity) return; - assert_spin_locked(&desc->lock); + assert_raw_spin_locked(&desc->lock); /* * If there was a valid mask to work with, please diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 3fd30197da2..26bac9d8f86 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -42,7 +42,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, "for migration.\n", irq); return false; } - spin_lock_init(&desc->lock); + raw_spin_lock_init(&desc->lock); desc->node = node; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); @@ -67,7 +67,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, irq = old_desc->irq; - spin_lock_irqsave(&sparse_irq_lock, flags); + raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ desc = irq_desc_ptrs[irq]; @@ -91,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, } irq_desc_ptrs[irq] = desc; - spin_unlock_irqrestore(&sparse_irq_lock, flags); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); /* free the old one */ free_one_irq_desc(old_desc, desc); @@ -100,7 +100,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, return desc; out_unlock: - spin_unlock_irqrestore(&sparse_irq_lock, flags); + raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); return desc; } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index a0bb09e7986..0d4005d85b0 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -28,9 +28,9 @@ void suspend_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, true); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } for_each_irq_desc(irq, desc) @@ -56,9 +56,9 @@ void resume_device_irqs(void) if (!(desc->status & IRQ_SUSPENDED)) continue; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); } } EXPORT_SYMBOL_GPL(resume_device_irqs); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 692363dd591..6f50eccc79c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -136,7 +136,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, NULL); + return single_open(file, default_affinity_show, PDE(inode)->data); } static const struct file_operations default_affinity_proc_fops = { @@ -148,18 +148,28 @@ static const struct file_operations default_affinity_proc_fops = { }; #endif -static int irq_spurious_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int irq_spurious_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_to_desc((long) data); - return sprintf(page, "count %u\n" - "unhandled %u\n" - "last_unhandled %u ms\n", - desc->irq_count, - desc->irqs_unhandled, - jiffies_to_msecs(desc->last_unhandled)); + struct irq_desc *desc = irq_to_desc((long) m->private); + + seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", + desc->irq_count, desc->irqs_unhandled, + jiffies_to_msecs(desc->last_unhandled)); + return 0; +} + +static int irq_spurious_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_spurious_proc_show, NULL); } +static const struct file_operations irq_spurious_proc_fops = { + .open = irq_spurious_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) @@ -169,7 +179,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) unsigned long flags; int ret = 1; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); for (action = desc->action ; action; action = action->next) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { @@ -177,7 +187,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) break; } } - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -204,7 +214,6 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; - struct proc_dir_entry *entry; if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) return; @@ -214,6 +223,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) /* create /proc/irq/1234 */ desc->dir = proc_mkdir(name, root_irq_dir); + if (!desc->dir) + return; #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ @@ -221,11 +232,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) &irq_affinity_proc_fops, (void *)(long)irq); #endif - entry = create_proc_entry("spurious", 0444, desc->dir); - if (entry) { - entry->data = (void *)(long)irq; - entry->read_proc = irq_spurious_read; - } + proc_create_data("spurious", 0444, desc->dir, + &irq_spurious_proc_fops, (void *)(long)irq); } #undef MAX_NAMELEN diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 114e704760f..89fb90ae534 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) struct irqaction *action; int ok = 0, work = 0; - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); /* Already running on another processor */ if (desc->status & IRQ_INPROGRESS) { /* @@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc) */ if (desc->action && (desc->action->flags & IRQF_SHARED)) desc->status |= IRQ_PENDING; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); return ok; } /* Honour the normal IRQ locking */ desc->status |= IRQ_INPROGRESS; action = desc->action; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); while (action) { /* Only shared IRQ handlers are safe to call */ @@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) } local_irq_disable(); /* Now clean up the flags */ - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); action = desc->action; /* @@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc) * Perform real IRQ processing for the IRQ we deferred */ work = 1; - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); handle_IRQ_event(irq, action); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); desc->status &= ~IRQ_PENDING; } desc->status &= ~IRQ_INPROGRESS; @@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc) */ if (work && desc->chip && desc->chip->end) desc->chip->end(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); return ok; } @@ -104,7 +104,7 @@ static int misrouted_irq(int irq) return ok; } -static void poll_all_shared_irqs(void) +static void poll_spurious_irqs(unsigned long dummy) { struct irq_desc *desc; int i; @@ -121,25 +121,15 @@ static void poll_all_shared_irqs(void) if (!(status & IRQ_SPURIOUS_DISABLED)) continue; + local_irq_disable(); try_one_irq(i, desc); + local_irq_enable(); } -} - -static void poll_spurious_irqs(unsigned long dummy) -{ - poll_all_shared_irqs(); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } -#ifdef CONFIG_DEBUG_SHIRQ -void debug_poll_all_shared_irqs(void) -{ - poll_all_shared_irqs(); -} -#endif - /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic @@ -230,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, /* * If we are seeing only the odd spurious IRQ caused by * bus asynchronicity then don't eventually trigger an error, - * otherwise the couter becomes a doomsday timer for otherwise + * otherwise the counter becomes a doomsday timer for otherwise * working systems */ if (time_after(jiffies, desc->last_unhandled + HZ/10)) diff --git a/kernel/itimer.c b/kernel/itimer.c index b03451ede52..d802883153d 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, { cputime_t cval, nval, cinterval, ninterval; s64 ns_ninterval, ns_nval; + u32 error, incr_error; struct cpu_itimer *it = &tsk->signal->it[clock_id]; nval = timeval_to_cputime(&value->it_value); @@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, ninterval = timeval_to_cputime(&value->it_interval); ns_ninterval = timeval_to_ns(&value->it_interval); - it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); - it->error = cputime_sub_ns(nval, ns_nval); + error = cputime_sub_ns(nval, ns_nval); + incr_error = cputime_sub_ns(ninterval, ns_ninterval); spin_lock_irq(&tsk->sighand->siglock); @@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, } it->expires = nval; it->incr = ninterval; + it->error = error; + it->incr_error = incr_error; trace_itimer_state(clock_id == CPUCLOCK_VIRT ? ITIMER_VIRTUAL : ITIMER_PROF, value, nval); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 8b6b8b697c6..8e5288a8a35 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -181,6 +181,7 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } +EXPORT_SYMBOL_GPL(kallsyms_lookup_name); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), diff --git a/kernel/kexec.c b/kernel/kexec.c index f336e2107f9..a9a93d9ee7a 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -21,7 +21,7 @@ #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> -#include <linux/utsrelease.h> +#include <generated/utsrelease.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> @@ -31,6 +31,7 @@ #include <linux/cpu.h> #include <linux/console.h> #include <linux/vmalloc.h> +#include <linux/swap.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -1082,6 +1083,64 @@ void crash_kexec(struct pt_regs *regs) } } +size_t crash_get_memory_size(void) +{ + size_t size; + mutex_lock(&kexec_mutex); + size = crashk_res.end - crashk_res.start + 1; + mutex_unlock(&kexec_mutex); + return size; +} + +static void free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); + init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); + free_page((unsigned long)__va(addr)); + totalram_pages++; + } +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long start, end; + + mutex_lock(&kexec_mutex); + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + start = crashk_res.start; + end = crashk_res.end; + + if (new_size >= end - start + 1) { + ret = -EINVAL; + if (new_size == end - start + 1) + ret = 0; + goto unlock; + } + + start = roundup(start, PAGE_SIZE); + end = roundup(start + new_size, PAGE_SIZE); + + free_reserved_phys_range(end, crashk_res.end); + + if (start == end) { + crashk_res.end = end; + release_resource(&crashk_res); + } else + crashk_res.end = end - 1; + +unlock: + mutex_unlock(&kexec_mutex); + return ret; +} + static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, size_t data_len) { diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 9147a3190c9..2eb517e2351 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread; struct task_struct *kgdb_contthread; int kgdb_single_step; +pid_t kgdb_sstep_pid; /* Our I/O buffers. */ static char remcom_in_buffer[BUFMAX]; @@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid) */ if (tid == 0 || tid == -1) tid = -atomic_read(&kgdb_active) - 2; - if (tid < 0) { + if (tid < -1 && tid > -NR_CPUS - 2) { if (kgdb_info[-tid - 2].task) return kgdb_info[-tid - 2].task; else return idle_task(-tid - 2); } + if (tid <= 0) { + printk(KERN_ERR "KGDB: Internal thread select error\n"); + dump_stack(); + return NULL; + } /* * find_task_by_pid_ns() does not take the tasklist lock anymore @@ -619,7 +625,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) static int kgdb_activate_sw_breakpoints(void) { unsigned long addr; - int error = 0; + int error; + int ret = 0; int i; for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { @@ -629,13 +636,16 @@ static int kgdb_activate_sw_breakpoints(void) addr = kgdb_break[i].bpt_addr; error = kgdb_arch_set_breakpoint(addr, kgdb_break[i].saved_instr); - if (error) - return error; + if (error) { + ret = error; + printk(KERN_INFO "KGDB: BP install failed: %lx", addr); + continue; + } kgdb_flush_swbreak_addr(addr); kgdb_break[i].state = BP_ACTIVE; } - return 0; + return ret; } static int kgdb_set_sw_break(unsigned long addr) @@ -682,7 +692,8 @@ static int kgdb_set_sw_break(unsigned long addr) static int kgdb_deactivate_sw_breakpoints(void) { unsigned long addr; - int error = 0; + int error; + int ret = 0; int i; for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { @@ -691,13 +702,15 @@ static int kgdb_deactivate_sw_breakpoints(void) addr = kgdb_break[i].bpt_addr; error = kgdb_arch_remove_breakpoint(addr, kgdb_break[i].saved_instr); - if (error) - return error; + if (error) { + printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr); + ret = error; + } kgdb_flush_swbreak_addr(addr); kgdb_break[i].state = BP_SET; } - return 0; + return ret; } static int kgdb_remove_sw_break(unsigned long addr) @@ -870,7 +883,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) /* * All threads that don't have debuggerinfo should be - * in __schedule() sleeping, since all other CPUs + * in schedule() sleeping, since all other CPUs * are in kgdb_wait, and thus have debuggerinfo. */ if (local_debuggerinfo) { @@ -1204,8 +1217,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks) return 1; } else { - error_packet(remcom_out_buffer, -EINVAL); - return 0; + kgdb_msg_write("KGDB only knows signal 9 (pass)" + " and 15 (pass and disconnect)\n" + "Executing a continue without signal passing\n", 0); + remcom_in_buffer[0] = 'c'; } /* Indicate fall through */ @@ -1395,6 +1410,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; unsigned long flags; + int sstep_tries = 100; int error = 0; int i, cpu; @@ -1425,13 +1441,14 @@ acquirelock: cpu_relax(); /* - * Do not start the debugger connection on this CPU if the last - * instance of the exception handler wanted to come into the - * debugger on a different CPU via a single step + * For single stepping, try to only enter on the processor + * that was single stepping. To gaurd against a deadlock, the + * kernel will only try for the value of sstep_tries before + * giving up and continuing on. */ if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && - atomic_read(&kgdb_cpu_doing_single_step) != cpu) { - + (kgdb_info[cpu].task && + kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); touch_softlockup_watchdog(); clocksource_touch_watchdog(); @@ -1524,6 +1541,13 @@ acquirelock: } kgdb_restore: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); + if (kgdb_info[sstep_cpu].task) + kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; + else + kgdb_sstep_pid = 0; + } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); touch_softlockup_watchdog(); diff --git a/kernel/kmod.c b/kernel/kmod.c index 9fcb53a11f8..25b10319036 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -80,16 +80,16 @@ int __request_module(bool wait, const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; - ret = security_kernel_module_request(); - if (ret) - return ret; - va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); if (ret >= MODULE_NAME_LEN) return -ENAMETOOLONG; + ret = security_kernel_module_request(module_name); + if (ret) + return ret; + /* If modprobe needs a service that is in a module, we get a recursive * loop. Limit the number of running kmod threads to max_threads/2 or * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5240d75f4c6..e5342a344c4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) */ static struct kprobe_blackpoint kprobe_blacklist[] = { {"preempt_schedule",}, + {"native_get_debugreg",}, + {"irq_entries_start",}, + {"common_interrupt",}, {NULL} /* Terminator */ }; @@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) return (kprobe_opcode_t *)(((char *)addr) + p->offset); } +/* Check passed kprobe is valid and return kprobe in kprobe_table. */ +static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) +{ + struct kprobe *old_p, *list_p; + + old_p = get_kprobe(p->addr); + if (unlikely(!old_p)) + return NULL; + + if (p != old_p) { + list_for_each_entry_rcu(list_p, &old_p->list, list) + if (list_p == p) + /* kprobe p is a valid probe */ + goto valid; + return NULL; + } +valid: + return old_p; +} + +/* Return error if the kprobe is being re-registered */ +static inline int check_kprobe_rereg(struct kprobe *p) +{ + int ret = 0; + struct kprobe *old_p; + + mutex_lock(&kprobe_mutex); + old_p = __get_valid_kprobe(p); + if (old_p) + ret = -EINVAL; + mutex_unlock(&kprobe_mutex); + return ret; +} + int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; @@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p) return -EINVAL; p->addr = addr; + ret = check_kprobe_rereg(p); + if (ret) + return ret; + preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) { @@ -754,26 +795,6 @@ out: } EXPORT_SYMBOL_GPL(register_kprobe); -/* Check passed kprobe is valid and return kprobe in kprobe_table. */ -static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) -{ - struct kprobe *old_p, *list_p; - - old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) - return NULL; - - if (p != old_p) { - list_for_each_entry_rcu(list_p, &old_p->list, list) - if (list_p == p) - /* kprobe p is a valid probe */ - goto valid; - return NULL; - } -valid: - return old_p; -} - /* * Unregister a kprobe without a scheduler synchronization. */ @@ -1014,9 +1035,9 @@ int __kprobes register_kretprobe(struct kretprobe *rp) /* Pre-allocate memory for max kretprobe instances */ if (rp->maxactive <= 0) { #ifdef CONFIG_PREEMPT - rp->maxactive = max(10, 2 * NR_CPUS); + rp->maxactive = max(10, 2 * num_possible_cpus()); #else - rp->maxactive = NR_CPUS; + rp->maxactive = num_possible_cpus(); #endif } spin_lock_init(&rp->lock); @@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +void __kprobes dump_kprobe(struct kprobe *kp) +{ + printk(KERN_WARNING "Dumping kprobe:\n"); + printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", + kp->symbol_name, kp->addr, kp->offset); +} + /* Module notifier call back, checking kprobes on the module */ static int __kprobes kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 528dd78e7e7..3feaf5a7451 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj, } KERNEL_ATTR_RO(kexec_crash_loaded); +static ssize_t kexec_crash_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%zu\n", crash_get_memory_size()); +} +static ssize_t kexec_crash_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long cnt; + int ret; + + if (strict_strtoul(buf, 0, &cnt)) + return -EINVAL; + + ret = crash_shrink_memory(cnt); + return ret < 0 ? ret : count; +} +KERNEL_ATTR_RW(kexec_crash_size); + static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = { #ifdef CONFIG_KEXEC &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, + &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif NULL diff --git a/kernel/kthread.c b/kernel/kthread.c index 5fe709982ca..ab7ae57773e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), EXPORT_SYMBOL(kthread_create); /** - * kthread_bind - bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - */ -void kthread_bind(struct task_struct *k, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - set_task_cpu(k, cpu); - k->cpus_allowed = cpumask_of_cpu(cpu); - k->rt.nr_cpus_allowed = 1; - k->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - -/** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 9af56723c09..5feaddcdbe4 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -49,7 +49,7 @@ #include "lockdep_internals.h" #define CREATE_TRACE_POINTS -#include <trace/events/lockdep.h> +#include <trace/events/lock.h> #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; @@ -73,11 +73,11 @@ module_param(lock_stat, int, 0644); * to use a raw spinlock - we really dont want the spinlock * code to recurse back into the lockdep code... */ -static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static int graph_lock(void) { - __raw_spin_lock(&lockdep_lock); + arch_spin_lock(&lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -85,7 +85,7 @@ static int graph_lock(void) * dropped already) */ if (!debug_locks) { - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); return 0; } /* prevent any recursions within lockdep from causing deadlocks */ @@ -95,11 +95,11 @@ static int graph_lock(void) static inline int graph_unlock(void) { - if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) + if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); current->lockdep_recursion--; - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); return 0; } @@ -111,7 +111,7 @@ static inline int debug_locks_off_graph_unlock(void) { int ret = debug_locks_off(); - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); return ret; } @@ -140,7 +140,8 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) } #ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], + cpu_lock_stats); static inline u64 lockstat_clock(void) { @@ -168,7 +169,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time) if (time > lt->max) lt->max = time; - if (time < lt->min || !lt->min) + if (time < lt->min || !lt->nr) lt->min = time; lt->total += time; @@ -177,8 +178,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time) static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) { - dst->min += src->min; - dst->max += src->max; + if (!src->nr) + return; + + if (src->max > dst->max) + dst->max = src->max; + + if (src->min < dst->min || !dst->nr) + dst->min = src->min; + dst->total += src->total; dst->nr += src->nr; } @@ -191,7 +199,7 @@ struct lock_class_stats lock_stats(struct lock_class *class) memset(&stats, 0, sizeof(struct lock_class_stats)); for_each_possible_cpu(cpu) { struct lock_class_stats *pcs = - &per_cpu(lock_stats, cpu)[class - lock_classes]; + &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) stats.contention_point[i] += pcs->contention_point[i]; @@ -218,7 +226,7 @@ void clear_lock_stats(struct lock_class *class) for_each_possible_cpu(cpu) { struct lock_class_stats *cpu_stats = - &per_cpu(lock_stats, cpu)[class - lock_classes]; + &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; memset(cpu_stats, 0, sizeof(struct lock_class_stats)); } @@ -228,12 +236,12 @@ void clear_lock_stats(struct lock_class *class) static struct lock_class_stats *get_lock_stats(struct lock_class *class) { - return &get_cpu_var(lock_stats)[class - lock_classes]; + return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; } static void put_lock_stats(struct lock_class_stats *stats) { - put_cpu_var(lock_stats); + put_cpu_var(cpu_lock_stats); } static void lock_release_holdtime(struct held_lock *hlock) @@ -379,7 +387,8 @@ static int save_trace(struct stack_trace *trace) * complete trace that maxes out the entries provided will be reported * as incomplete, friggin useless </rant> */ - if (trace->entries[trace->nr_entries-1] == ULONG_MAX) + if (trace->nr_entries != 0 && + trace->entries[trace->nr_entries-1] == ULONG_MAX) trace->nr_entries--; trace->max_entries = trace->nr_entries; @@ -1161,9 +1170,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) this.class = class; local_irq_save(flags); - __raw_spin_lock(&lockdep_lock); + arch_spin_lock(&lockdep_lock); ret = __lockdep_count_forward_deps(&this); - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; @@ -1188,9 +1197,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) this.class = class; local_irq_save(flags); - __raw_spin_lock(&lockdep_lock); + arch_spin_lock(&lockdep_lock); ret = __lockdep_count_backward_deps(&this); - __raw_spin_unlock(&lockdep_lock); + arch_spin_unlock(&lockdep_lock); local_irq_restore(flags); return ret; diff --git a/kernel/module.c b/kernel/module.c index dd29ba43c34..e96b8ed1cb6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -370,8 +370,6 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA - static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -395,154 +393,6 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ - -/* Number of blocks used and allocated. */ -static unsigned int pcpu_num_used, pcpu_num_allocated; -/* Size of each block. -ve means used. */ -static int *pcpu_size; - -static int split_block(unsigned int i, unsigned short size) -{ - /* Reallocation required? */ - if (pcpu_num_used + 1 > pcpu_num_allocated) { - int *new; - - new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2, - GFP_KERNEL); - if (!new) - return 0; - - pcpu_num_allocated *= 2; - pcpu_size = new; - } - - /* Insert a new subblock */ - memmove(&pcpu_size[i+1], &pcpu_size[i], - sizeof(pcpu_size[0]) * (pcpu_num_used - i)); - pcpu_num_used++; - - pcpu_size[i+1] -= size; - pcpu_size[i] = size; - return 1; -} - -static inline unsigned int block_size(int val) -{ - if (val < 0) - return -val; - return val; -} - -static void *percpu_modalloc(unsigned long size, unsigned long align, - const char *name) -{ - unsigned long extra; - unsigned int i; - void *ptr; - int cpu; - - if (align > PAGE_SIZE) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - name, align, PAGE_SIZE); - align = PAGE_SIZE; - } - - ptr = __per_cpu_start; - for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { - /* Extra for alignment requirement. */ - extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr; - BUG_ON(i == 0 && extra != 0); - - if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size) - continue; - - /* Transfer extra to previous block. */ - if (pcpu_size[i-1] < 0) - pcpu_size[i-1] -= extra; - else - pcpu_size[i-1] += extra; - pcpu_size[i] -= extra; - ptr += extra; - - /* Split block if warranted */ - if (pcpu_size[i] - size > sizeof(unsigned long)) - if (!split_block(i, size)) - return NULL; - - /* add the per-cpu scanning areas */ - for_each_possible_cpu(cpu) - kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0, - GFP_KERNEL); - - /* Mark allocated */ - pcpu_size[i] = -pcpu_size[i]; - return ptr; - } - - printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", - size); - return NULL; -} - -static void percpu_modfree(void *freeme) -{ - unsigned int i; - void *ptr = __per_cpu_start + block_size(pcpu_size[0]); - int cpu; - - /* First entry is core kernel percpu data. */ - for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { - if (ptr == freeme) { - pcpu_size[i] = -pcpu_size[i]; - goto free; - } - } - BUG(); - - free: - /* remove the per-cpu scanning areas */ - for_each_possible_cpu(cpu) - kmemleak_free(freeme + per_cpu_offset(cpu)); - - /* Merge with previous? */ - if (pcpu_size[i-1] >= 0) { - pcpu_size[i-1] += pcpu_size[i]; - pcpu_num_used--; - memmove(&pcpu_size[i], &pcpu_size[i+1], - (pcpu_num_used - i) * sizeof(pcpu_size[0])); - i--; - } - /* Merge with next? */ - if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) { - pcpu_size[i] += pcpu_size[i+1]; - pcpu_num_used--; - memmove(&pcpu_size[i+1], &pcpu_size[i+2], - (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0])); - } -} - -static int percpu_modinit(void) -{ - pcpu_num_used = 2; - pcpu_num_allocated = 2; - pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, - GFP_KERNEL); - /* Static in-kernel percpu data (used). */ - pcpu_size[0] = -(__per_cpu_end-__per_cpu_start); - /* Free room. */ - pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; - if (pcpu_size[1] < 0) { - printk(KERN_ERR "No per-cpu room for modules.\n"); - pcpu_num_used = 1; - } - - return 0; -} -__initcall(percpu_modinit); - -#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ - static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) @@ -1030,11 +880,23 @@ static int try_to_force_load(struct module *mod, const char *reason) } #ifdef CONFIG_MODVERSIONS +/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */ +static unsigned long maybe_relocated(unsigned long crc, + const struct module *crc_owner) +{ +#ifdef ARCH_RELOCATES_KCRCTAB + if (crc_owner == NULL) + return crc - (unsigned long)reloc_start; +#endif + return crc; +} + static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, struct module *mod, - const unsigned long *crc) + const unsigned long *crc, + const struct module *crc_owner) { unsigned int i, num_versions; struct modversion_info *versions; @@ -1055,10 +917,10 @@ static int check_version(Elf_Shdr *sechdrs, if (strcmp(versions[i].name, symname) != 0) continue; - if (versions[i].crc == *crc) + if (versions[i].crc == maybe_relocated(*crc, crc_owner)) return 1; DEBUGP("Found checksum %lX vs module %lX\n", - *crc, versions[i].crc); + maybe_relocated(*crc, crc_owner), versions[i].crc); goto bad_version; } @@ -1081,7 +943,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, &crc, true, false)) BUG(); - return check_version(sechdrs, versindex, "module_layout", mod, crc); + return check_version(sechdrs, versindex, "module_layout", mod, crc, + NULL); } /* First part is kernel version, which we ignore if module has crcs. */ @@ -1099,7 +962,8 @@ static inline int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, struct module *mod, - const unsigned long *crc) + const unsigned long *crc, + const struct module *crc_owner) { return 1; } @@ -1134,8 +998,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, /* use_module can fail due to OOM, or module initialization or unloading */ if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc) || - !use_module(mod, owner)) + if (!check_version(sechdrs, versindex, name, mod, crc, owner) + || !use_module(mod, owner)) sym = NULL; } return sym; @@ -1187,7 +1051,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, /* Count loaded sections and allocate structures */ for (i = 0; i < nsect; i++) - if (sechdrs[i].sh_flags & SHF_ALLOC) + if (sechdrs[i].sh_flags & SHF_ALLOC + && sechdrs[i].sh_size) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1207,6 +1072,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, for (i = 0; i < nsect; i++) { if (! (sechdrs[i].sh_flags & SHF_ALLOC)) continue; + if (!sechdrs[i].sh_size) + continue; sattr->address = sechdrs[i].sh_addr; sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, GFP_KERNEL); diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 6b2d735846a..57d527a16f9 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock) \ DEBUG_LOCKS_WARN_ON(in_interrupt()); \ local_irq_save(flags); \ - __raw_spin_lock(&(lock)->raw_lock); \ + arch_spin_lock(&(lock)->rlock.raw_lock);\ DEBUG_LOCKS_WARN_ON(l->magic != l); \ } while (0) -#define spin_unlock_mutex(lock, flags) \ - do { \ - __raw_spin_unlock(&(lock)->raw_lock); \ - local_irq_restore(flags); \ - preempt_check_resched(); \ +#define spin_unlock_mutex(lock, flags) \ + do { \ + arch_spin_unlock(&(lock)->rlock.raw_lock); \ + local_irq_restore(flags); \ + preempt_check_resched(); \ } while (0) diff --git a/kernel/mutex.c b/kernel/mutex.c index 947b3ad551f..632f04c57d8 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -148,8 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire(&lock->dep_map, subclass, 0, ip); -#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ - !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Optimistic spinning. * diff --git a/kernel/notifier.c b/kernel/notifier.c index 61d5aa5eced..acd24e7643e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notrace notify_die(enum die_val val, const char *str, +int notrace __kprobes notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { diff --git a/kernel/panic.c b/kernel/panic.c index 96b45d0b4ba..5827f7b9725 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -10,6 +10,7 @@ */ #include <linux/debug_locks.h> #include <linux/interrupt.h> +#include <linux/kmsg_dump.h> #include <linux/kallsyms.h> #include <linux/notifier.h> #include <linux/module.h> @@ -74,6 +75,7 @@ NORET_TYPE void panic(const char * fmt, ...) dump_stack(); #endif + kmsg_dump(KMSG_DUMP_PANIC); /* * If we have crashed and we have a crash kernel loaded let it handle * everything else. @@ -339,6 +341,7 @@ void oops_exit(void) { do_oops_enter_exit(); print_oops_end_marker(); + kmsg_dump(KMSG_DUMP_OOPS); } #ifdef WANT_WARN_ON_SLOWPATH diff --git a/kernel/params.c b/kernel/params.c index 9da58eabdcb..cf1b6918312 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,6 +24,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/ctype.h> +#include <linux/string.h> #if 0 #define DEBUGP printk @@ -122,9 +123,7 @@ static char *next_arg(char *args, char **param, char **val) next = args + i; /* Chew up trailing spaces. */ - while (isspace(*next)) - next++; - return next; + return skip_spaces(next); } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ @@ -139,8 +138,7 @@ int parse_args(const char *name, DEBUGP("Parsing ARGS: %s\n", args); /* Chew leading spaces */ - while (isspace(*args)) - args++; + args = skip_spaces(args); while (*args) { int ret; @@ -218,15 +216,11 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - if (kp->flags & KPARAM_KMALLOCED) - kfree(*(char **)kp->arg); - /* This is a hack. We can't need to strdup in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - kp->flags |= KPARAM_KMALLOCED; *(char **)kp->arg = kstrdup(val, GFP_KERNEL); - if (!kp->arg) + if (!*(char **)kp->arg) return -ENOMEM; } else *(const char **)kp->arg = val; @@ -304,6 +298,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, struct kernel_param *kp), + u16 flags, unsigned int *num) { int ret; @@ -313,6 +308,7 @@ static int param_array(const char *name, /* Get the name right for errors. */ kp.name = name; kp.arg = elem; + kp.flags = flags; /* No equals sign? */ if (!val) { @@ -358,7 +354,8 @@ int param_array_set(const char *val, struct kernel_param *kp) unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->set, arr->num ?: &temp_num); + arr->elemsize, arr->set, kp->flags, + arr->num ?: &temp_num); } int param_array_get(char *buffer, struct kernel_param *kp) @@ -605,11 +602,7 @@ void module_param_sysfs_remove(struct module *mod) void destroy_params(const struct kernel_param *params, unsigned num) { - unsigned int i; - - for (i = 0; i < num; i++) - if (params[i].flags & KPARAM_KMALLOCED) - kfree(*(char **)params[i].arg); + /* FIXME: This should free kmalloced charp parameters. It doesn't. */ } static void __init kernel_add_sysfs_param(const char *name, diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9d0b5c66588..8ab86988bd2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -28,13 +28,15 @@ #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> #include <linux/perf_event.h> +#include <linux/ftrace_event.h> +#include <linux/hw_breakpoint.h> #include <asm/irq_regs.h> /* * Each CPU has a list of per CPU events: */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); +static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); int perf_max_events __read_mostly = 1; static int perf_reserved_percpu __read_mostly; @@ -201,14 +203,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * if so. If we locked the right context, then it * can't get swapped on us any more. */ - spin_lock_irqsave(&ctx->lock, *flags); + raw_spin_lock_irqsave(&ctx->lock, *flags); if (ctx != rcu_dereference(task->perf_event_ctxp)) { - spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } if (!atomic_inc_not_zero(&ctx->refcount)) { - spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock_irqrestore(&ctx->lock, *flags); ctx = NULL; } } @@ -229,7 +231,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } @@ -238,12 +240,55 @@ static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; - spin_lock_irqsave(&ctx->lock, flags); + raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); put_ctx(ctx); } +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + if (ctx->is_active) + run_end = ctx->time; + else + run_end = event->tstamp_stopped; + + event->total_time_enabled = run_end - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -292,6 +337,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader != event) event->group_leader->nr_siblings--; + update_event_times(event); + + /* + * If event was in error state, then keep it + * that way, otherwise bogus counts will be + * returned on read(). The only way to get out + * of error state is by explicit re-enabling + * of the event + */ + if (event->state > PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_OFF; + /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them @@ -370,7 +427,7 @@ static void __perf_event_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the * events on a global level. @@ -392,7 +449,7 @@ static void __perf_event_remove_from_context(void *info) } perf_enable(); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } @@ -419,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event) if (!task) { /* * Per cpu events are removed via an smp call and - * the removal is always sucessful. + * the removal is always successful. */ smp_call_function_single(event->cpu, __perf_event_remove_from_context, @@ -431,12 +488,12 @@ retry: task_oncpu_function_call(task, __perf_event_remove_from_context, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. */ if (ctx->nr_active && !list_empty(&event->group_entry)) { - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); goto retry; } @@ -445,48 +502,9 @@ retry: * can remove the event safely, if the call above did not * succeed. */ - if (!list_empty(&event->group_entry)) { + if (!list_empty(&event->group_entry)) list_del_event(event, ctx); - } - spin_unlock_irq(&ctx->lock); -} - -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_event_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a event. - */ -static void update_event_times(struct perf_event *event) -{ - struct perf_event_context *ctx = event->ctx; - u64 run_end; - - if (event->state < PERF_EVENT_STATE_INACTIVE || - event->group_leader->state < PERF_EVENT_STATE_INACTIVE) - return; - - event->total_time_enabled = ctx->time - event->tstamp_enabled; - - if (event->state == PERF_EVENT_STATE_INACTIVE) - run_end = event->tstamp_stopped; - else - run_end = ctx->time; - - event->total_time_running = run_end - event->tstamp_running; + raw_spin_unlock_irq(&ctx->lock); } /* @@ -517,7 +535,7 @@ static void __perf_event_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); /* * If the event is on, turn it off. @@ -533,7 +551,7 @@ static void __perf_event_disable(void *info) event->state = PERF_EVENT_STATE_OFF; } - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -549,7 +567,7 @@ static void __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ -static void perf_event_disable(struct perf_event *event) +void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -566,12 +584,12 @@ static void perf_event_disable(struct perf_event *event) retry: task_oncpu_function_call(task, __perf_event_disable, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * If the event is still active, we need to retry the cross-call. */ if (event->state == PERF_EVENT_STATE_ACTIVE) { - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); goto retry; } @@ -584,7 +602,7 @@ static void perf_event_disable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; } - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); } static int @@ -752,7 +770,7 @@ static void __perf_install_in_context(void *info) cpuctx->task_ctx = ctx; } - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); @@ -764,6 +782,9 @@ static void __perf_install_in_context(void *info) add_event_to_ctx(event, ctx); + if (event->cpu != -1 && event->cpu != smp_processor_id()) + goto unlock; + /* * Don't put the event on if it is disabled or if * it is in a group and the group isn't on. @@ -802,7 +823,7 @@ static void __perf_install_in_context(void *info) unlock: perf_enable(); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -827,7 +848,7 @@ perf_install_in_context(struct perf_event_context *ctx, if (!task) { /* * Per cpu events are installed via an smp call and - * the install is always sucessful. + * the install is always successful. */ smp_call_function_single(cpu, __perf_install_in_context, event, 1); @@ -838,12 +859,12 @@ retry: task_oncpu_function_call(task, __perf_install_in_context, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * we need to retry the smp call. */ if (ctx->is_active && list_empty(&event->group_entry)) { - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); goto retry; } @@ -854,7 +875,7 @@ retry: */ if (list_empty(&event->group_entry)) add_event_to_ctx(event, ctx); - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); } /* @@ -899,7 +920,7 @@ static void __perf_event_enable(void *info) cpuctx->task_ctx = ctx; } - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); @@ -907,6 +928,9 @@ static void __perf_event_enable(void *info) goto unlock; __perf_event_mark_enabled(event, ctx); + if (event->cpu != -1 && event->cpu != smp_processor_id()) + goto unlock; + /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. @@ -941,7 +965,7 @@ static void __perf_event_enable(void *info) } unlock: - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -953,7 +977,7 @@ static void __perf_event_enable(void *info) * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ -static void perf_event_enable(struct perf_event *event) +void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -967,7 +991,7 @@ static void perf_event_enable(struct perf_event *event) return; } - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE) goto out; @@ -982,10 +1006,10 @@ static void perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; retry: - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); task_oncpu_function_call(task, __perf_event_enable, event); - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); /* * If the context is active and the event is still off, @@ -1002,7 +1026,7 @@ static void perf_event_enable(struct perf_event *event) __perf_event_mark_enabled(event, ctx); out: - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); } static int perf_event_refresh(struct perf_event *event, int refresh) @@ -1024,20 +1048,20 @@ void __perf_event_sched_out(struct perf_event_context *ctx, { struct perf_event *event; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 0; if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); perf_disable(); - if (ctx->nr_active) + if (ctx->nr_active) { list_for_each_entry(event, &ctx->group_list, group_entry) group_sched_out(event, cpuctx, ctx); - + } perf_enable(); out: - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -1059,8 +1083,6 @@ static int context_equiv(struct perf_event_context *ctx1, && !ctx1->pin_count && !ctx2->pin_count; } -static void __perf_event_read(void *event); - static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { @@ -1078,8 +1100,8 @@ static void __perf_event_sync_stat(struct perf_event *event, */ switch (event->state) { case PERF_EVENT_STATE_ACTIVE: - __perf_event_read(event); - break; + event->pmu->read(event); + /* fall-through */ case PERF_EVENT_STATE_INACTIVE: update_event_times(event); @@ -1118,6 +1140,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, if (!ctx->nr_stat) return; + update_context_time(ctx); + event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); @@ -1161,8 +1185,6 @@ void perf_event_task_sched_out(struct task_struct *task, if (likely(!ctx || !cpuctx->task_ctx)) return; - update_context_time(ctx); - rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_event_ctxp; @@ -1177,8 +1199,8 @@ void perf_event_task_sched_out(struct task_struct *task, * order we take the locks because no other cpu could * be trying to lock both of these tasks. */ - spin_lock(&ctx->lock); - spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&ctx->lock); + raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { /* * XXX do we need a memory barrier of sorts @@ -1192,8 +1214,8 @@ void perf_event_task_sched_out(struct task_struct *task, perf_event_sync_stat(ctx, next_ctx); } - spin_unlock(&next_ctx->lock); - spin_unlock(&ctx->lock); + raw_spin_unlock(&next_ctx->lock); + raw_spin_unlock(&ctx->lock); } rcu_read_unlock(); @@ -1235,7 +1257,7 @@ __perf_event_sched_in(struct perf_event_context *ctx, struct perf_event *event; int can_add_hw = 1; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) goto out; @@ -1290,7 +1312,7 @@ __perf_event_sched_in(struct perf_event_context *ctx, } perf_enable(); out: - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -1354,8 +1376,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) struct hw_perf_event *hwc; u64 interrupts, freq; - spin_lock(&ctx->lock); - list_for_each_entry(event, &ctx->group_list, group_entry) { + raw_spin_lock(&ctx->lock); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -1409,7 +1431,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) perf_enable(); } } - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } /* @@ -1422,7 +1444,7 @@ static void rotate_ctx(struct perf_event_context *ctx) if (!ctx->nr_events) return; - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); /* * Rotate the first entry last (works just fine for group events too): */ @@ -1433,7 +1455,7 @@ static void rotate_ctx(struct perf_event_context *ctx) } perf_enable(); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); } void perf_event_task_tick(struct task_struct *curr, int cpu) @@ -1482,7 +1504,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) __perf_event_task_sched_out(ctx); - spin_lock(&ctx->lock); + raw_spin_lock(&ctx->lock); list_for_each_entry(event, &ctx->group_list, group_entry) { if (!event->attr.enable_on_exec) @@ -1500,7 +1522,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) if (enabled) unclone_ctx(ctx); - spin_unlock(&ctx->lock); + raw_spin_unlock(&ctx->lock); perf_event_task_sched_in(task, smp_processor_id()); out: @@ -1515,7 +1537,6 @@ static void __perf_event_read(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - unsigned long flags; /* * If this is a task context, we need to check whether it is @@ -1527,12 +1548,12 @@ static void __perf_event_read(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - event->pmu->read(event); + raw_spin_lock(&ctx->lock); + update_context_time(ctx); update_event_times(event); - local_irq_restore(flags); + raw_spin_unlock(&ctx->lock); + + event->pmu->read(event); } static u64 perf_event_read(struct perf_event *event) @@ -1545,7 +1566,13 @@ static u64 perf_event_read(struct perf_event *event) smp_call_function_single(event->oncpu, __perf_event_read, event, 1); } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + raw_spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); update_event_times(event); + raw_spin_unlock_irqrestore(&ctx->lock, flags); } return atomic64_read(&event->count); @@ -1558,8 +1585,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx, struct task_struct *task) { - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); + raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->group_list); INIT_LIST_HEAD(&ctx->event_list); @@ -1575,15 +1601,12 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) unsigned long flags; int err; - /* - * If cpu is not a wildcard then this is a percpu event: - */ - if (cpu != -1) { + if (pid == -1 && cpu != -1) { /* Must be root to operate on a CPU event: */ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - if (cpu < 0 || cpu > num_possible_cpus()) + if (cpu < 0 || cpu >= nr_cpumask_bits) return ERR_PTR(-EINVAL); /* @@ -1629,11 +1652,11 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) ctx = perf_lock_task_context(task, &flags); if (ctx) { unclone_ctx(ctx); - spin_unlock_irqrestore(&ctx->lock, flags); + raw_spin_unlock_irqrestore(&ctx->lock, flags); } if (!ctx) { - ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); err = -ENOMEM; if (!ctx) goto errout; @@ -1658,6 +1681,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(err); } +static void perf_event_free_filter(struct perf_event *event); + static void free_event_rcu(struct rcu_head *head) { struct perf_event *event; @@ -1665,6 +1690,7 @@ static void free_event_rcu(struct rcu_head *head) event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); + perf_event_free_filter(event); kfree(event); } @@ -1696,16 +1722,10 @@ static void free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) +int perf_event_release_kernel(struct perf_event *event) { - struct perf_event *event = file->private_data; struct perf_event_context *ctx = event->ctx; - file->private_data = NULL; - WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_event_remove_from_context(event); @@ -1720,6 +1740,19 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +EXPORT_SYMBOL_GPL(perf_event_release_kernel); + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + + file->private_data = NULL; + + return perf_event_release_kernel(event); +} static int perf_event_read_size(struct perf_event *event) { @@ -1746,91 +1779,94 @@ static int perf_event_read_size(struct perf_event *event) return size; } -static u64 perf_event_read_value(struct perf_event *event) +u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; u64 total = 0; + *enabled = 0; + *running = 0; + + mutex_lock(&event->child_mutex); total += perf_event_read(event); - list_for_each_entry(child, &event->child_list, child_list) + *enabled += event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + *running += event->total_time_running + + atomic64_read(&event->child_total_time_running); + + list_for_each_entry(child, &event->child_list, child_list) { total += perf_event_read(child); + *enabled += child->total_time_enabled; + *running += child->total_time_running; + } + mutex_unlock(&event->child_mutex); return total; } - -static int perf_event_read_entry(struct perf_event *event, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_event_read_value(event); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_event_id(event); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} +EXPORT_SYMBOL_GPL(perf_event_read_value); static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { struct perf_event *leader = event->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; + int n = 0, size = 0, ret = -EFAULT; + struct perf_event_context *ctx = leader->ctx; + u64 values[5]; + u64 count, enabled, running; + + mutex_lock(&ctx->mutex); + count = perf_event_read_value(leader, &enabled, &running); values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + values[n++] = count; + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); size = n * sizeof(u64); if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_event_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; + goto unlock; - size += err; + ret = size; list_for_each_entry(sub, &leader->sibling_list, group_entry) { - err = perf_event_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; + n = 0; + + values[n++] = perf_event_read_value(sub, &enabled, &running); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + size = n * sizeof(u64); - size += err; + if (copy_to_user(buf + ret, values, size)) { + ret = -EFAULT; + goto unlock; + } + + ret += size; } +unlock: + mutex_unlock(&ctx->mutex); - return size; + return ret; } static int perf_event_read_one(struct perf_event *event, u64 read_format, char __user *buf) { + u64 enabled, running; u64 values[4]; int n = 0; - values[n++] = perf_event_read_value(event); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = event->total_time_enabled + - atomic64_read(&event->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = event->total_time_running + - atomic64_read(&event->child_total_time_running); - } + values[n++] = perf_event_read_value(event, &enabled, &running); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); @@ -1861,12 +1897,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); - mutex_lock(&event->child_mutex); if (read_format & PERF_FORMAT_GROUP) ret = perf_event_read_group(event, read_format, buf); else ret = perf_event_read_one(event, read_format, buf); - mutex_unlock(&event->child_mutex); return ret; } @@ -1956,7 +1990,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (!value) return -EINVAL; - spin_lock_irq(&ctx->lock); + raw_spin_lock_irq(&ctx->lock); if (event->attr.freq) { if (value > sysctl_perf_event_sample_rate) { ret = -EINVAL; @@ -1969,12 +2003,13 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) event->hw.sample_period = value; } unlock: - spin_unlock_irq(&ctx->lock); + raw_spin_unlock_irq(&ctx->lock); return ret; } -int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_output(struct perf_event *event, int output_fd); +static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2002,6 +2037,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: return perf_event_set_output(event, arg); + case PERF_EVENT_IOC_SET_FILTER: + return perf_event_set_filter(event, (void __user *)arg); + default: return -ENOTTY; } @@ -2174,6 +2212,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data) perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) perf_mmap_free_page((unsigned long)data->data_pages[i]); + kfree(data); } #else @@ -2214,6 +2253,7 @@ static void perf_mmap_data_free_work(struct work_struct *work) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); + kfree(data); } static void perf_mmap_data_free(struct perf_mmap_data *data) @@ -2307,7 +2347,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) } if (!data->watermark) - data->watermark = max_t(long, PAGE_SIZE, max_size / 2); + data->watermark = max_size / 2; rcu_assign_pointer(event->data, data); @@ -2319,7 +2359,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) data = container_of(rcu_head, struct perf_mmap_data, rcu_head); perf_mmap_data_free(data); - kfree(data); } static void perf_mmap_data_release(struct perf_event *event) @@ -2666,20 +2705,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle) static void perf_output_lock(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int cpu; + int cur, cpu = get_cpu(); handle->locked = 0; - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; + for (;;) { + cur = atomic_cmpxchg(&data->lock, -1, cpu); + if (cur == -1) { + handle->locked = 1; + break; + } + if (cur == cpu) + break; - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) cpu_relax(); - - handle->locked = 1; + } } static void perf_output_unlock(struct perf_output_handle *handle) @@ -2725,7 +2765,7 @@ again: if (atomic_xchg(&data->wakeup, 0)) perf_output_wakeup(handle); out: - local_irq_restore(handle->flags); + put_cpu(); } void perf_output_copy(struct perf_output_handle *handle, @@ -3236,15 +3276,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_task_match(event)) perf_event_task_output(event, task_event); } - rcu_read_unlock(); } static void perf_event_task_event(struct perf_task_event *task_event) @@ -3252,11 +3287,11 @@ static void perf_event_task_event(struct perf_task_event *task_event) struct perf_cpu_context *cpuctx; struct perf_event_context *ctx = task_event->task_ctx; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); if (!ctx) ctx = rcu_dereference(task_event->task->perf_event_ctxp); if (ctx) @@ -3348,15 +3383,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_comm_match(event)) perf_event_comm_output(event, comm_event); } - rcu_read_unlock(); } static void perf_event_comm_event(struct perf_comm_event *comm_event) @@ -3367,7 +3397,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) char comm[TASK_COMM_LEN]; memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); + strlcpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -3375,11 +3405,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3472,15 +3502,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (perf_event_mmap_match(event, mmap_event)) perf_event_mmap_output(event, mmap_event); } - rcu_read_unlock(); } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -3536,11 +3561,11 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3679,7 +3704,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, perf_event_disable(event); } - perf_event_output(event, nmi, data, regs); + if (event->overflow_handler) + event->overflow_handler(event, nmi, data, regs); + else + perf_event_output(event, nmi, data, regs); + return ret; } @@ -3724,16 +3753,16 @@ again: return nr; } -static void perf_swevent_overflow(struct perf_event *event, +static void perf_swevent_overflow(struct perf_event *event, u64 overflow, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; int throttle = 0; - u64 overflow; data->period = event->hw.last_period; - overflow = perf_swevent_set_period(event); + if (!overflow) + overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; @@ -3766,14 +3795,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, atomic64_add(nr, &event->count); + if (!regs) + return; + if (!hwc->sample_period) return; - if (!regs) + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) + return perf_swevent_overflow(event, 1, nmi, data, regs); + + if (atomic64_add_negative(nr, &hwc->period_left)) return; - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swevent_overflow(event, nmi, data, regs); + perf_swevent_overflow(event, 0, nmi, data, regs); } static int perf_swevent_is_counting(struct perf_event *event) @@ -3806,25 +3840,44 @@ static int perf_swevent_is_counting(struct perf_event *event) return 1; } +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data); + +static int perf_exclude_event(struct perf_event *event, + struct pt_regs *regs) +{ + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 1; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + } + + return 0; +} + static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, - u32 event_id, struct pt_regs *regs) + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) { if (!perf_swevent_is_counting(event)) return 0; if (event->attr.type != type) return 0; + if (event->attr.config != event_id) return 0; - if (regs) { - if (event->attr.exclude_user && user_mode(regs)) - return 0; + if (perf_exclude_event(event, regs)) + return 0; - if (event->attr.exclude_kernel && !user_mode(regs)) - return 0; - } + if (event->attr.type == PERF_TYPE_TRACEPOINT && + !perf_tp_event_match(event, data)) + return 0; return 1; } @@ -3837,49 +3890,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx, { struct perf_event *event; - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_swevent_match(event, type, event_id, regs)) + if (perf_swevent_match(event, type, event_id, data, regs)) perf_swevent_add(event, nr, nmi, data, regs); } - rcu_read_unlock(); } -static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +int perf_swevent_get_recursion_context(void) { + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int rctx; + if (in_nmi()) - return &cpuctx->recursion[3]; + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; - if (in_irq()) - return &cpuctx->recursion[2]; + if (cpuctx->recursion[rctx]) { + put_cpu_var(perf_cpu_context); + return -1; + } - if (in_softirq()) - return &cpuctx->recursion[1]; + cpuctx->recursion[rctx]++; + barrier(); + + return rctx; +} +EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); - return &cpuctx->recursion[0]; +void perf_swevent_put_recursion_context(int rctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + barrier(); + cpuctx->recursion[rctx]--; + put_cpu_var(perf_cpu_context); } +EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - + cpuctx = &__get_cpu_var(perf_cpu_context); + rcu_read_lock(); perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, nr, nmi, data, regs); - rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. @@ -3888,23 +3951,24 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, if (ctx) perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - struct perf_sample_data data = { - .addr = addr, - }; + struct perf_sample_data data; + int rctx; - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, - &data, regs); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + return; + + data.addr = addr; + data.raw = NULL; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); + + perf_swevent_put_recursion_context(rctx); } static void perf_swevent_read(struct perf_event *event) @@ -3949,6 +4013,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); data.addr = 0; + data.raw = NULL; + data.period = event->hw.last_period; regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt @@ -3959,8 +4025,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) { - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); @@ -3969,6 +4036,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) return ret; } +static void perf_swevent_start_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period; + + if (hwc->remaining) { + if (hwc->remaining < 0) + period = 10000; + else + period = hwc->remaining; + hwc->remaining = 0; + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } +} + +static void perf_swevent_cancel_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + hwc->remaining = ktime_to_ns(remaining); + + hrtimer_cancel(&hwc->hrtimer); + } +} + /* * Software event: cpu wall time clock */ @@ -3980,8 +4083,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_read(&event->hw.prev_count); - atomic64_set(&event->hw.prev_count, now); + prev = atomic64_xchg(&event->hw.prev_count, now); atomic64_add(now - prev, &event->count); } @@ -3991,22 +4093,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) int cpu = raw_smp_processor_id(); atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } + perf_swevent_start_hrtimer(event); return 0; } static void cpu_clock_perf_event_disable(struct perf_event *event) { - if (event->hw.sample_period) - hrtimer_cancel(&event->hw.hrtimer); + perf_swevent_cancel_hrtimer(event); cpu_clock_perf_event_update(event); } @@ -4043,22 +4137,15 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } + + perf_swevent_start_hrtimer(event); return 0; } static void task_clock_perf_event_disable(struct perf_event *event) { - if (event->hw.sample_period) - hrtimer_cancel(&event->hw.hrtimer); + perf_swevent_cancel_hrtimer(event); task_clock_perf_event_update(event, event->ctx->time); } @@ -4086,6 +4173,7 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE + void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { @@ -4104,13 +4192,21 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, if (!regs) regs = task_pt_regs(current); + /* Trace events already protected against recursion */ do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data, regs); } EXPORT_SYMBOL_GPL(perf_tp_event); -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} static void tp_perf_event_destroy(struct perf_event *event) { @@ -4135,11 +4231,93 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) return &perf_ops_generic; } + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + char *filter_str; + int ret; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + filter_str = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(filter_str)) + return PTR_ERR(filter_str); + + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + + kfree(filter_str); + return ret; +} + +static void perf_event_free_filter(struct perf_event *event) +{ + ftrace_profile_free_filter(event); +} + #else + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data) +{ + return 1; +} + static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; } + +static int perf_event_set_filter(struct perf_event *event, void __user *arg) +{ + return -ENOENT; +} + +static void perf_event_free_filter(struct perf_event *event) +{ +} + +#endif /* CONFIG_EVENT_PROFILE */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + int err; + + err = register_perf_hw_breakpoint(bp); + if (err) + return ERR_PTR(err); + + bp->destroy = bp_perf_event_destroy; + + return &perf_ops_bp; +} + +void perf_bp_event(struct perf_event *bp, void *data) +{ + struct perf_sample_data sample; + struct pt_regs *regs = data; + + sample.raw = NULL; + sample.addr = bp->attr.bp_addr; + + if (!perf_exclude_event(bp, regs)) + perf_swevent_add(bp, 1, 1, &sample, regs); +} +#else +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + return NULL; +} + +void perf_bp_event(struct perf_event *bp, void *regs) +{ +} #endif atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; @@ -4186,6 +4364,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: + case PERF_COUNT_SW_ALIGNMENT_FAULTS: + case PERF_COUNT_SW_EMULATION_FAULTS: if (!event->parent) { atomic_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; @@ -4206,6 +4386,7 @@ perf_event_alloc(struct perf_event_attr *attr, struct perf_event_context *ctx, struct perf_event *group_leader, struct perf_event *parent_event, + perf_overflow_handler_t overflow_handler, gfp_t gfpflags) { const struct pmu *pmu; @@ -4248,6 +4429,11 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; + if (!overflow_handler && parent_event) + overflow_handler = parent_event->overflow_handler; + + event->overflow_handler = overflow_handler; + if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -4282,6 +4468,11 @@ perf_event_alloc(struct perf_event_attr *attr, pmu = tp_perf_event_init(event); break; + case PERF_TYPE_BREAKPOINT: + pmu = bp_perf_event_init(event); + break; + + default: break; } @@ -4376,7 +4567,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL; - if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + if (attr->__reserved_1 || attr->__reserved_2) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -4394,7 +4585,7 @@ err_size: goto out; } -int perf_event_set_output(struct perf_event *event, int output_fd) +static int perf_event_set_output(struct perf_event *event, int output_fd) { struct perf_event *output_event = NULL; struct file *output_file = NULL; @@ -4524,7 +4715,7 @@ SYSCALL_DEFINE5(perf_event_open, } event = perf_event_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); + NULL, NULL, GFP_KERNEL); err = PTR_ERR(event); if (IS_ERR(event)) goto err_put_context; @@ -4572,6 +4763,61 @@ err_put_context: return err; } +/** + * perf_event_create_kernel_counter + * + * @attr: attributes of the counter to create + * @cpu: cpu in which the counter is bound + * @pid: task to profile + */ +struct perf_event * +perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, + pid_t pid, + perf_overflow_handler_t overflow_handler) +{ + struct perf_event *event; + struct perf_event_context *ctx; + int err; + + /* + * Get the target context (task or percpu): + */ + + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_exit; + } + + event = perf_event_alloc(attr, cpu, ctx, NULL, + NULL, overflow_handler, GFP_KERNEL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_put_context; + } + + event->filp = NULL; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + + return event; + + err_put_context: + put_ctx(ctx); + err_exit: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); + /* * inherit a event from parent task to child task: */ @@ -4597,7 +4843,7 @@ inherit_event(struct perf_event *parent_event, child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, child_ctx, group_leader, parent_event, - GFP_KERNEL); + NULL, GFP_KERNEL); if (IS_ERR(child_event)) return child_event; get_ctx(child_ctx); @@ -4615,6 +4861,8 @@ inherit_event(struct perf_event *parent_event, if (parent_event->attr.freq) child_event->hw.sample_period = parent_event->hw.sample_period; + child_event->overflow_handler = parent_event->overflow_handler; + /* * Link it up in the child's context: */ @@ -4704,7 +4952,6 @@ __perf_event_exit_task(struct perf_event *child_event, { struct perf_event *parent_event; - update_event_times(child_event); perf_event_remove_from_context(child_event); parent_event = child_event->parent; @@ -4748,7 +4995,7 @@ void perf_event_exit_task(struct task_struct *child) * reading child->perf_event_ctxp, we wait until it has * incremented the context's refcount before we do put_ctx below. */ - spin_lock(&child_ctx->lock); + raw_spin_lock(&child_ctx->lock); child->perf_event_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get @@ -4756,7 +5003,8 @@ void perf_event_exit_task(struct task_struct *child) * the events from it. */ unclone_ctx(child_ctx); - spin_unlock_irqrestore(&child_ctx->lock, flags); + update_context_time(child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Report the task dead after unscheduling the events so that we @@ -4839,7 +5087,7 @@ again: */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *child_ctx = NULL, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -4855,20 +5103,6 @@ int perf_event_init_task(struct task_struct *child) return 0; /* - * This is executed from the parent task context, so inherit - * events that have been marked for cloning. - * First allocate and initialize a context for the child. - */ - - child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); - - /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ @@ -4898,6 +5132,26 @@ int perf_event_init_task(struct task_struct *child) continue; } + if (!child->perf_event_ctxp) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ + + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) { + ret = -ENOMEM; + goto exit; + } + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) { @@ -4926,6 +5180,7 @@ int perf_event_init_task(struct task_struct *child) get_ctx(child_ctx->parent_ctx); } +exit: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); @@ -5040,11 +5295,11 @@ perf_set_reserve_percpu(struct sysdev_class *class, perf_reserved_percpu = val; for_each_online_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_irq(&cpuctx->ctx.lock); + raw_spin_lock_irq(&cpuctx->ctx.lock); mpt = min(perf_max_events - cpuctx->ctx.nr_events, perf_max_events - perf_reserved_percpu); cpuctx->max_pertask = mpt; - spin_unlock_irq(&cpuctx->ctx.lock); + raw_spin_unlock_irq(&cpuctx->ctx.lock); } spin_unlock(&perf_resource_lock); diff --git a/kernel/pid.c b/kernel/pid.c index d3f722d20f9..2e17c9c92cb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) * installing it: */ spin_lock_irq(&pidmap_lock); - if (map->page) - kfree(page); - else + if (!map->page) { map->page = page; + page = NULL; + } spin_unlock_irq(&pidmap_lock); + kfree(page); if (unlikely(!map->page)) break; } @@ -268,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns) for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); + upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for (i = ns->level; i >= 0; i--) { - upid = &pid->numbers[i]; + for ( ; upid >= pid->numbers; --upid) hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); - } spin_unlock_irq(&pidmap_lock); out: diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index dfdec524d1b..3db49b9ca37 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -29,7 +29,6 @@ #include <linux/pm_qos_params.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/time.h> @@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) } EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); -#define PID_NAME_LEN sizeof("process_1234567890") -static char name[PID_NAME_LEN]; +#define PID_NAME_LEN 32 static int pm_qos_power_open(struct inode *inode, struct file *filp) { int ret; long pm_qos_class; + char name[PID_NAME_LEN]; - lock_kernel(); pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { filp->private_data = (void *)pm_qos_class; - sprintf(name, "process_%d", current->pid); + snprintf(name, PID_NAME_LEN, "process_%d", current->pid); ret = pm_qos_add_requirement(pm_qos_class, name, PM_QOS_DEFAULT_VALUE); - if (ret >= 0) { - unlock_kernel(); + if (ret >= 0) return 0; - } } - unlock_kernel(); - return -EPERM; } static int pm_qos_power_release(struct inode *inode, struct file *filp) { int pm_qos_class; + char name[PID_NAME_LEN]; pm_qos_class = (long)filp->private_data; - sprintf(name, "process_%d", current->pid); + snprintf(name, PID_NAME_LEN, "process_%d", current->pid); pm_qos_remove_requirement(pm_qos_class, name); return 0; @@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, { s32 value; int pm_qos_class; + char name[PID_NAME_LEN]; pm_qos_class = (long)filp->private_data; if (count != sizeof(s32)) return -EINVAL; if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; - sprintf(name, "process_%d", current->pid); + snprintf(name, PID_NAME_LEN, "process_%d", current->pid); pm_qos_update_requirement(pm_qos_class, name, value); return sizeof(s32); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 5c9dc228747..438ff452351 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) /* * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. - * This is called from sys_timer_create with the new timer already locked. + * This is called from sys_timer_create() and do_cpu_nanosleep() with the + * new timer already all-zeros initialized. */ int posix_cpu_timer_create(struct k_itimer *new_timer) { @@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) return -EINVAL; INIT_LIST_HEAD(&new_timer->it.cpu.entry); - new_timer->it.cpu.incr.sched = 0; - new_timer->it.cpu.expires.sched = 0; read_lock(&tasklist_lock); if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c3b81c30e5d..43191815f87 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/console.c b/kernel/power/console.c index 5187136fe1d..218e5af9015 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -6,7 +6,7 @@ #include <linux/vt_kern.h> #include <linux/kbd_kern.h> -#include <linux/console.h> +#include <linux/vt.h> #include <linux/module.h> #include "power.h" @@ -21,8 +21,7 @@ int pm_prepare_console(void) if (orig_fgconsole < 0) return 1; - orig_kmsg = kmsg_redirect; - kmsg_redirect = SUSPEND_CONSOLE; + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return 0; } @@ -30,7 +29,7 @@ void pm_restore_console(void) { if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); - kmsg_redirect = orig_kmsg; + vt_kmsg_redirect(orig_kmsg); } } #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 04b3a83d686..bbfe472d752 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -32,6 +32,7 @@ static int noresume = 0; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; +int in_suspend __nosavedata = 0; enum { HIBERNATION_INVALID, @@ -202,6 +203,35 @@ static void platform_recover(int platform_mode) } /** + * swsusp_show_speed - print the time elapsed between two events. + * @start: Starting event. + * @stop: Final event. + * @nr_pages - number of pages processed between @start and @stop + * @msg - introductory message to print + */ + +void swsusp_show_speed(struct timeval *start, struct timeval *stop, + unsigned nr_pages, char *msg) +{ + s64 elapsed_centisecs64; + int centisecs; + int k; + int kps; + + elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + centisecs = elapsed_centisecs64; + if (centisecs == 0) + centisecs = 1; /* avoid div-by-zero */ + k = nr_pages * (PAGE_SIZE / 1024); + kps = (k * 100) / centisecs; + printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", + msg, k, + centisecs / 100, centisecs % 100, + kps / 1000, (kps % 1000) / 10); +} + +/** * create_image - freeze devices that need to be frozen with interrupts * off, create the hibernation image and thaw those devices. Control * reappears in this routine after a restore. @@ -693,21 +723,22 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; + swsusp_close(FMODE_READ); goto Unlock; } pm_prepare_console(); error = pm_notifier_call_chain(PM_RESTORE_PREPARE); if (error) - goto Finish; + goto close_finish; error = usermodehelper_disable(); if (error) - goto Finish; + goto close_finish; error = create_basic_memory_bitmaps(); if (error) - goto Finish; + goto close_finish; pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); @@ -719,6 +750,7 @@ static int software_resume(void) pr_debug("PM: Reading hibernation image.\n"); error = swsusp_read(&flags); + swsusp_close(FMODE_READ); if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); @@ -737,6 +769,9 @@ static int software_resume(void) mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return error; +close_finish: + swsusp_close(FMODE_READ); + goto Finish; } late_initcall(software_resume); diff --git a/kernel/power/main.c b/kernel/power/main.c index 347d2cc88cd..0998c713905 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -220,6 +220,7 @@ static struct attribute_group attr_group = { #ifdef CONFIG_PM_RUNTIME struct workqueue_struct *pm_wq; +EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { diff --git a/kernel/power/process.c b/kernel/power/process.c index cc2e55373b6..5ade1bdcf36 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/syscalls.h> #include <linux/freezer.h> +#include <linux/delay.h> /* * Timeout for stopping processes @@ -41,7 +42,7 @@ static int try_to_freeze_tasks(bool sig_only) do_gettimeofday(&start); end_time = jiffies + TIMEOUT; - do { + while (true) { todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -62,10 +63,15 @@ static int try_to_freeze_tasks(bool sig_only) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); - yield(); /* Yield is okay here */ - if (time_after(jiffies, end_time)) + if (!todo || time_after(jiffies, end_time)) break; - } while (todo); + + /* + * We need to retry, but first give the freezing tasks some + * time to enter the regrigerator. + */ + msleep(10); + } do_gettimeofday(&end); elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 17d8bb1acf9..25596e450ac 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -19,7 +19,7 @@ * The time it takes is system-specific though, so when we test this * during system bootup we allow a LOT of time. */ -#define TEST_SUSPEND_SECONDS 5 +#define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; @@ -49,7 +49,8 @@ void suspend_test_finish(const char *label) * has some performance issues. The stack dump of a WARN_ON * is more likely to get the right attention than a printk... */ - WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), + "Component: %s, time: %u\n", label, msec); } /* diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b101cdc4df3..09b2b0ae9e9 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -38,6 +38,107 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; +/** + * The following functions are used for tracing the allocated + * swap pages, so that they can be freed in case of an error. + */ + +struct swsusp_extent { + struct rb_node node; + unsigned long start; + unsigned long end; +}; + +static struct rb_root swsusp_extents = RB_ROOT; + +static int swsusp_extents_insert(unsigned long swap_offset) +{ + struct rb_node **new = &(swsusp_extents.rb_node); + struct rb_node *parent = NULL; + struct swsusp_extent *ext; + + /* Figure out where to put the new node */ + while (*new) { + ext = container_of(*new, struct swsusp_extent, node); + parent = *new; + if (swap_offset < ext->start) { + /* Try to merge */ + if (swap_offset == ext->start - 1) { + ext->start--; + return 0; + } + new = &((*new)->rb_left); + } else if (swap_offset > ext->end) { + /* Try to merge */ + if (swap_offset == ext->end + 1) { + ext->end++; + return 0; + } + new = &((*new)->rb_right); + } else { + /* It already is in the tree */ + return -EINVAL; + } + } + /* Add the new node and rebalance the tree. */ + ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); + if (!ext) + return -ENOMEM; + + ext->start = swap_offset; + ext->end = swap_offset; + rb_link_node(&ext->node, parent, new); + rb_insert_color(&ext->node, &swsusp_extents); + return 0; +} + +/** + * alloc_swapdev_block - allocate a swap page and register that it has + * been allocated, so that it can be freed in case of an error. + */ + +sector_t alloc_swapdev_block(int swap) +{ + unsigned long offset; + + offset = swp_offset(get_swap_page_of_type(swap)); + if (offset) { + if (swsusp_extents_insert(offset)) + swap_free(swp_entry(swap, offset)); + else + return swapdev_block(swap, offset); + } + return 0; +} + +/** + * free_all_swap_pages - free swap pages allocated for saving image data. + * It also frees the extents used to register which swap entres had been + * allocated. + */ + +void free_all_swap_pages(int swap) +{ + struct rb_node *node; + + while ((node = swsusp_extents.rb_node)) { + struct swsusp_extent *ext; + unsigned long offset; + + ext = container_of(node, struct swsusp_extent, node); + rb_erase(node, &swsusp_extents); + for (offset = ext->start; offset <= ext->end; offset++) + swap_free(swp_entry(swap, offset)); + + kfree(ext); + } +} + +int swsusp_swap_in_use(void) +{ + return (swsusp_extents.rb_node != NULL); +} + /* * General things */ @@ -314,7 +415,6 @@ static int save_image(struct swap_map_handle *handle, { unsigned int m; int ret; - int error = 0; int nr_pages; int err2; struct bio *bio; @@ -329,26 +429,27 @@ static int save_image(struct swap_map_handle *handle, nr_pages = 0; bio = NULL; do_gettimeofday(&start); - do { + while (1) { ret = snapshot_read_next(snapshot, PAGE_SIZE); - if (ret > 0) { - error = swap_write_page(handle, data_of(*snapshot), - &bio); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - } while (ret > 0); + if (ret <= 0) + break; + ret = swap_write_page(handle, data_of(*snapshot), &bio); + if (ret) + break; + if (!(nr_pages % m)) + printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } err2 = wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) - printk("\b\b\b\bdone\n"); + if (!ret) + ret = err2; + if (!ret) + printk(KERN_CONT "\b\b\b\bdone\n"); + else + printk(KERN_CONT "\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - return error; + return ret; } /** @@ -536,7 +637,8 @@ static int load_image(struct swap_map_handle *handle, snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; - } + } else + printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return error; } @@ -572,8 +674,6 @@ int swsusp_read(unsigned int *flags_p) error = load_image(&handle, &snapshot, header->pages - 1); release_swap_reader(&handle); - blkdev_put(resume_bdev, FMODE_READ); - if (!error) pr_debug("PM: Image successfully loaded\n"); else @@ -596,7 +696,7 @@ int swsusp_check(void) error = bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) - return error; + goto put; if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); @@ -604,8 +704,10 @@ int swsusp_check(void) error = bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { - return -EINVAL; + error = -EINVAL; } + +put: if (error) blkdev_put(resume_bdev, FMODE_READ); else diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 6a07f4dbf2f..5b3601bd189 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -56,133 +56,3 @@ #include "power.h" int in_suspend __nosavedata = 0; - -/** - * The following functions are used for tracing the allocated - * swap pages, so that they can be freed in case of an error. - */ - -struct swsusp_extent { - struct rb_node node; - unsigned long start; - unsigned long end; -}; - -static struct rb_root swsusp_extents = RB_ROOT; - -static int swsusp_extents_insert(unsigned long swap_offset) -{ - struct rb_node **new = &(swsusp_extents.rb_node); - struct rb_node *parent = NULL; - struct swsusp_extent *ext; - - /* Figure out where to put the new node */ - while (*new) { - ext = container_of(*new, struct swsusp_extent, node); - parent = *new; - if (swap_offset < ext->start) { - /* Try to merge */ - if (swap_offset == ext->start - 1) { - ext->start--; - return 0; - } - new = &((*new)->rb_left); - } else if (swap_offset > ext->end) { - /* Try to merge */ - if (swap_offset == ext->end + 1) { - ext->end++; - return 0; - } - new = &((*new)->rb_right); - } else { - /* It already is in the tree */ - return -EINVAL; - } - } - /* Add the new node and rebalance the tree. */ - ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); - if (!ext) - return -ENOMEM; - - ext->start = swap_offset; - ext->end = swap_offset; - rb_link_node(&ext->node, parent, new); - rb_insert_color(&ext->node, &swsusp_extents); - return 0; -} - -/** - * alloc_swapdev_block - allocate a swap page and register that it has - * been allocated, so that it can be freed in case of an error. - */ - -sector_t alloc_swapdev_block(int swap) -{ - unsigned long offset; - - offset = swp_offset(get_swap_page_of_type(swap)); - if (offset) { - if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); - else - return swapdev_block(swap, offset); - } - return 0; -} - -/** - * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entres had been - * allocated. - */ - -void free_all_swap_pages(int swap) -{ - struct rb_node *node; - - while ((node = swsusp_extents.rb_node)) { - struct swsusp_extent *ext; - unsigned long offset; - - ext = container_of(node, struct swsusp_extent, node); - rb_erase(node, &swsusp_extents); - for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); - - kfree(ext); - } -} - -int swsusp_swap_in_use(void) -{ - return (swsusp_extents.rb_node != NULL); -} - -/** - * swsusp_show_speed - print the time elapsed between two events represented by - * @start and @stop - * - * @nr_pages - number of pages processed between @start and @stop - * @msg - introductory message to print - */ - -void swsusp_show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) -{ - s64 elapsed_centisecs64; - int centisecs; - int k; - int kps; - - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); - centisecs = elapsed_centisecs64; - if (centisecs == 0) - centisecs = 1; /* avoid div-by-zero */ - k = nr_pages * (PAGE_SIZE / 1024); - kps = (k * 100) / centisecs; - printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", - msg, k, - centisecs / 100, centisecs % 100, - kps / 1000, (kps % 1000) / 10); -} diff --git a/kernel/printk.c b/kernel/printk.c index f38b07f78a4..17463ca2e22 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -33,6 +33,8 @@ #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/kexec.h> +#include <linux/ratelimit.h> +#include <linux/kmsg_dump.h> #include <asm/uaccess.h> @@ -1376,11 +1378,11 @@ late_initcall(disable_boot_consoles); */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); -int printk_ratelimit(void) +int __printk_ratelimit(const char *func) { - return __ratelimit(&printk_ratelimit_state); + return ___ratelimit(&printk_ratelimit_state, func); } -EXPORT_SYMBOL(printk_ratelimit); +EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting @@ -1404,4 +1406,122 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } EXPORT_SYMBOL(printk_timed_ratelimit); + +static DEFINE_SPINLOCK(dump_list_lock); +static LIST_HEAD(dump_list); + +/** + * kmsg_dump_register - register a kernel log dumper. + * @dumper: pointer to the kmsg_dumper structure + * + * Adds a kernel log dumper to the system. The dump callback in the + * structure will be called when the kernel oopses or panics and must be + * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. + */ +int kmsg_dump_register(struct kmsg_dumper *dumper) +{ + unsigned long flags; + int err = -EBUSY; + + /* The dump callback needs to be set */ + if (!dumper->dump) + return -EINVAL; + + spin_lock_irqsave(&dump_list_lock, flags); + /* Don't allow registering multiple times */ + if (!dumper->registered) { + dumper->registered = 1; + list_add_tail(&dumper->list, &dump_list); + err = 0; + } + spin_unlock_irqrestore(&dump_list_lock, flags); + + return err; +} +EXPORT_SYMBOL_GPL(kmsg_dump_register); + +/** + * kmsg_dump_unregister - unregister a kmsg dumper. + * @dumper: pointer to the kmsg_dumper structure + * + * Removes a dump device from the system. Returns zero on success and + * %-EINVAL otherwise. + */ +int kmsg_dump_unregister(struct kmsg_dumper *dumper) +{ + unsigned long flags; + int err = -EINVAL; + + spin_lock_irqsave(&dump_list_lock, flags); + if (dumper->registered) { + dumper->registered = 0; + list_del(&dumper->list); + err = 0; + } + spin_unlock_irqrestore(&dump_list_lock, flags); + + return err; +} +EXPORT_SYMBOL_GPL(kmsg_dump_unregister); + +static const char const *kmsg_reasons[] = { + [KMSG_DUMP_OOPS] = "oops", + [KMSG_DUMP_PANIC] = "panic", +}; + +static const char *kmsg_to_str(enum kmsg_dump_reason reason) +{ + if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) + return "unknown"; + + return kmsg_reasons[reason]; +} + +/** + * kmsg_dump - dump kernel log to kernel message dumpers. + * @reason: the reason (oops, panic etc) for dumping + * + * Iterate through each of the dump devices and call the oops/panic + * callbacks with the log buffer. + */ +void kmsg_dump(enum kmsg_dump_reason reason) +{ + unsigned long end; + unsigned chars; + struct kmsg_dumper *dumper; + const char *s1, *s2; + unsigned long l1, l2; + unsigned long flags; + + /* Theoretically, the log could move on after we do this, but + there's not a lot we can do about that. The new messages + will overwrite the start of what we dump. */ + spin_lock_irqsave(&logbuf_lock, flags); + end = log_end & LOG_BUF_MASK; + chars = logged_chars; + spin_unlock_irqrestore(&logbuf_lock, flags); + + if (logged_chars > end) { + s1 = log_buf + log_buf_len - logged_chars + end; + l1 = logged_chars - end; + + s2 = log_buf; + l2 = end; + } else { + s1 = ""; + l1 = 0; + + s2 = log_buf + end - logged_chars; + l2 = logged_chars; + } + + if (!spin_trylock_irqsave(&dump_list_lock, flags)) { + printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", + kmsg_to_str(reason)); + return; + } + list_for_each_entry(dumper, &dump_list, list) + dumper->dump(dumper, reason, s1, l1, s2, l2); + spin_unlock_irqrestore(&dump_list_lock, flags); +} #endif diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 400183346ad..9b7fd472387 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,7 +44,6 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/module.h> -#include <linux/kernel_stat.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -53,8 +52,6 @@ struct lockdep_map rcu_lock_map = EXPORT_SYMBOL_GPL(rcu_lock_map); #endif -int rcu_scheduler_active __read_mostly; - /* * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. @@ -66,122 +63,3 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } - -#ifdef CONFIG_TREE_PREEMPT_RCU - -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. - */ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - if (!rcu_scheduler_active) - return; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -/** - * synchronize_sched - wait until an rcu-sched grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-sched - * grace period has elapsed, in other words after all currently executing - * rcu-sched read-side critical sections have completed. These read-side - * critical sections are delimited by rcu_read_lock_sched() and - * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), - * local_irq_disable(), and so on may be used in place of - * rcu_read_lock_sched(). - * - * This means that all preempt_disable code sequences, including NMI and - * hardware-interrupt handlers, in progress on entry will have completed - * before this primitive returns. However, this does not guarantee that - * softirq handlers will have completed, since in some kernels, these - * handlers can run in process context, and can block. - * - * This primitive provides the guarantees made by the (now removed) - * synchronize_kernel() API. In contrast, synchronize_rcu() only - * guarantees that rcu_read_lock() sections will have completed. - * In "classic RCU", these two guarantees happen to be one and - * the same, but can differ in realtime RCU implementations. - */ -void synchronize_sched(void) -{ - struct rcu_synchronize rcu; - - if (rcu_blocking_is_gp()) - return; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(synchronize_sched); - -/** - * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. - * - * Control will return to the caller some time after a full rcu_bh grace - * period has elapsed, in other words after all currently executing rcu_bh - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), - * and may be nested. - */ -void synchronize_rcu_bh(void) -{ - struct rcu_synchronize rcu; - - if (rcu_blocking_is_gp()) - return; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_bh(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_bh); - -static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - return rcu_cpu_notify(self, action, hcpu); -} - -void __init rcu_init(void) -{ - int i; - - __rcu_init(); - cpu_notifier(rcu_barrier_cpu_hotplug, 0); - - /* - * We don't need protection against CPU-hotplug here because - * this is called early in boot, before either interrupts - * or the scheduler are operational. - */ - for_each_online_cpu(i) - rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i); -} - -void rcu_scheduler_starting(void) -{ - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c new file mode 100644 index 00000000000..9f6d9ff2572 --- /dev/null +++ b/kernel/rcutiny.c @@ -0,0 +1,282 @@ +/* + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ +#include <linux/moduleparam.h> +#include <linux/completion.h> +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/rcupdate.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/time.h> +#include <linux/cpu.h> + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_ctrlblk = { + .donetail = &rcu_ctrlblk.rcucblist, + .curtail = &rcu_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; + +#ifdef CONFIG_NO_HZ + +static long rcu_dynticks_nesting = 1; + +/* + * Enter dynticks-idle mode, which is an extended quiescent state + * if we have fully entered that mode (i.e., if the new value of + * dynticks_nesting is zero). + */ +void rcu_enter_nohz(void) +{ + if (--rcu_dynticks_nesting == 0) + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ +} + +/* + * Exit dynticks-idle mode, so that we are no longer in an extended + * quiescent state. + */ +void rcu_exit_nohz(void) +{ + rcu_dynticks_nesting++; +} + +#endif /* #ifdef CONFIG_NO_HZ */ + +/* + * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). + * Also disable irqs to avoid confusion due to interrupt handlers + * invoking call_rcu(). + */ +static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + local_irq_save(flags); + if (rcp->rcucblist != NULL && + rcp->donetail != rcp->curtail) { + rcp->donetail = rcp->curtail; + local_irq_restore(flags); + return 1; + } + local_irq_restore(flags); + + return 0; +} + +/* + * Record an rcu quiescent state. And an rcu_bh quiescent state while we + * are at it, given that any rcu quiescent state is also an rcu_bh + * quiescent state. Use "+" instead of "||" to defeat short circuiting. + */ +void rcu_sched_qs(int cpu) +{ + if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Record an rcu_bh quiescent state. + */ +void rcu_bh_qs(int cpu) +{ + if (rcu_qsctr_help(&rcu_bh_ctrlblk)) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Check to see if the scheduling-clock interrupt came from an extended + * quiescent state, and, if so, tell RCU about it. + */ +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && + !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) + rcu_sched_qs(cpu); + else if (!in_softirq()) + rcu_bh_qs(cpu); +} + +/* + * Helper function for rcu_process_callbacks() that operates on the + * specified rcu_ctrlkblk structure. + */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) +{ + struct rcu_head *next, *list; + unsigned long flags; + + /* If no RCU callbacks ready to invoke, just return. */ + if (&rcp->rcucblist == rcp->donetail) + return; + + /* Move the ready-to-invoke callbacks to a local list. */ + local_irq_save(flags); + list = rcp->rcucblist; + rcp->rcucblist = *rcp->donetail; + *rcp->donetail = NULL; + if (rcp->curtail == rcp->donetail) + rcp->curtail = &rcp->rcucblist; + rcp->donetail = &rcp->rcucblist; + local_irq_restore(flags); + + /* Invoke the callbacks on the local list. */ + while (list) { + next = list->next; + prefetch(next); + list->func(list); + list = next; + } +} + +/* + * Invoke any callbacks whose grace period has completed. + */ +static void rcu_process_callbacks(struct softirq_action *unused) +{ + __rcu_process_callbacks(&rcu_ctrlblk); + __rcu_process_callbacks(&rcu_bh_ctrlblk); +} + +/* + * Wait for a grace period to elapse. But it is illegal to invoke + * synchronize_sched() from within an RCU read-side critical section. + * Therefore, any legal call to synchronize_sched() is a quiescent + * state, and so on a UP system, synchronize_sched() need do nothing. + * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the + * benefits of doing might_sleep() to reduce latency.) + * + * Cool, huh? (Due to Josh Triplett.) + * + * But we want to make this a static inline later. + */ +void synchronize_sched(void) +{ + cond_resched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +void synchronize_rcu_bh(void) +{ + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + +/* + * Helper function for call_rcu() and call_rcu_bh(). + */ +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcp->curtail = head; + rcp->curtail = &head->next; + local_irq_restore(flags); +} + +/* + * Post an RCU callback to be invoked after the end of an RCU grace + * period. But since we have but one CPU, that would be after any + * quiescent state. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_ctrlblk); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Post an RCU bottom-half callback to be invoked after any subsequent + * quiescent state. + */ +void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_bh_ctrlblk); +} +EXPORT_SYMBOL_GPL(call_rcu_bh); + +void rcu_barrier(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +void rcu_barrier_bh(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +void rcu_barrier_sched(void) +{ + struct rcu_synchronize rcu; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + +void __init rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 697c0a0229d..9bb52177af0 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -327,6 +327,11 @@ rcu_torture_cb(struct rcu_head *p) cur_ops->deferred_free(rp); } +static int rcu_no_completed(void) +{ + return 0; +} + static void rcu_torture_deferred_free(struct rcu_torture *p) { call_rcu(&p->rtort_rcu, rcu_torture_cb); @@ -388,6 +393,21 @@ static struct rcu_torture_ops rcu_sync_ops = { .name = "rcu_sync" }; +static struct rcu_torture_ops rcu_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu_expedited, + .cb_barrier = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_expedited" +}; + /* * Definitions for rcu_bh torture testing. */ @@ -547,6 +567,25 @@ static struct rcu_torture_ops srcu_ops = { .name = "srcu" }; +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + +static struct rcu_torture_ops srcu_expedited_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize_expedited, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu_expedited" +}; + /* * Definitions for sched torture testing. */ @@ -562,11 +601,6 @@ static void sched_torture_read_unlock(int idx) preempt_enable(); } -static int sched_torture_completed(void) -{ - return 0; -} - static void rcu_sched_torture_deferred_free(struct rcu_torture *p) { call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); @@ -583,7 +617,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, + .completed = rcu_no_completed, .deferred_free = rcu_sched_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = rcu_barrier_sched, @@ -592,13 +626,13 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -static struct rcu_torture_ops sched_ops_sync = { +static struct rcu_torture_ops sched_sync_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, + .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = NULL, @@ -612,7 +646,7 @@ static struct rcu_torture_ops sched_expedited_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, - .completed = sched_torture_completed, + .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_sched_expedited, .cb_barrier = NULL, @@ -729,13 +763,13 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_count)[pipe_count]; + __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_batch)[completed]; + __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); preempt_enable(); cur_ops->readunlock(idx); } @@ -784,13 +818,13 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_count)[pipe_count]; + __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - ++__get_cpu_var(rcu_torture_batch)[completed]; + __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); preempt_enable(); cur_ops->readunlock(idx); schedule(); @@ -1097,9 +1131,10 @@ rcu_torture_init(void) int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, - &sched_expedited_ops, - &srcu_ops, &sched_ops, &sched_ops_sync, }; + { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, + &rcu_bh_ops, &rcu_bh_sync_ops, + &srcu_ops, &srcu_expedited_ops, + &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; mutex_lock(&fullstop_mutex); @@ -1110,8 +1145,12 @@ rcu_torture_init(void) break; } if (i == ARRAY_SIZE(torture_ops)) { - printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", + printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", torture_type); + printk(KERN_ALERT "rcu-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + printk(KERN_ALERT " %s", torture_ops[i]->name); + printk(KERN_ALERT "\n"); mutex_unlock(&fullstop_mutex); return -EINVAL; } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 705f02ac743..53ae9598f79 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -46,20 +46,24 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/time.h> +#include <linux/kernel_stat.h> #include "rcutree.h" /* Data structures. */ +static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; + #define RCU_STATE_INITIALIZER(name) { \ .level = { &name.node[0] }, \ .levelcnt = { \ NUM_RCU_LVL_0, /* root of hierarchy. */ \ NUM_RCU_LVL_1, \ NUM_RCU_LVL_2, \ - NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ + NUM_RCU_LVL_3, \ + NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ }, \ - .signaled = RCU_SIGNAL_INIT, \ + .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ @@ -77,6 +81,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +static int rcu_scheduler_active __read_mostly; + /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s @@ -98,7 +104,7 @@ void rcu_sched_qs(int cpu) struct rcu_data *rdp; rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesc_completed = rdp->completed; + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; rcu_preempt_note_context_switch(cpu); @@ -109,7 +115,7 @@ void rcu_bh_qs(int cpu) struct rcu_data *rdp; rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc_completed = rdp->completed; + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; } @@ -335,28 +341,9 @@ void rcu_irq_exit(void) set_need_resched(); } -/* - * Record the specified "completed" value, which is later used to validate - * dynticks counter manipulations. Specify "rsp->completed - 1" to - * unconditionally invalidate any future dynticks manipulations (which is - * useful at the beginning of a grace period). - */ -static void dyntick_record_completed(struct rcu_state *rsp, long comp) -{ - rsp->dynticks_completed = comp; -} - #ifdef CONFIG_SMP /* - * Recall the previously recorded value of the completion for dynticks. - */ -static long dyntick_recall_completed(struct rcu_state *rsp) -{ - return rsp->dynticks_completed; -} - -/* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. @@ -419,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #else /* #ifdef CONFIG_NO_HZ */ -static void dyntick_record_completed(struct rcu_state *rsp, long comp) -{ -} - #ifdef CONFIG_SMP -/* - * If there are no dynticks, then the only way that a CPU can passively - * be in a quiescent state is to be offline. Unlike dynticks idle, which - * is a point in time during the prior (already finished) grace period, - * an offline CPU is always in a quiescent state, and thus can be - * unconditionally applied. So just return the current value of completed. - */ -static long dyntick_recall_completed(struct rcu_state *rsp) -{ - return rsp->completed; -} - static int dyntick_save_progress_counter(struct rcu_data *rdp) { return 0; @@ -553,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) /* * Update CPU-local rcu_data state to record the newly noticed grace period. * This is used both when we started the grace period and when we notice - * that someone else started the grace period. + * that someone else started the grace period. The caller must hold the + * ->lock of the leaf rcu_node structure corresponding to the current CPU, + * and must have irqs disabled. */ +static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + if (rdp->gpnum != rnp->gpnum) { + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + rdp->gpnum = rnp->gpnum; + } +} + static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) { - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; - rdp->gpnum = rsp->gpnum; + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ + !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + local_irq_restore(flags); + return; + } + __note_new_gpnum(rsp, rnp, rdp); + spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -583,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) } /* + * Advance this CPU's callbacks, but only if the current grace period + * has ended. This may be called only from the CPU to whom the rdp + * belongs. In addition, the corresponding leaf rcu_node structure's + * ->lock must be held by the caller, with irqs disabled. + */ +static void +__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + /* Did another grace period end? */ + if (rdp->completed != rnp->completed) { + + /* Advance callbacks. No harm if list empty. */ + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Remember that we saw this grace-period completion. */ + rdp->completed = rnp->completed; + } +} + +/* + * Advance this CPU's callbacks, but only if the current grace period + * has ended. This may be called only from the CPU to whom the rdp + * belongs. + */ +static void +rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ + !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + local_irq_restore(flags); + return; + } + __rcu_process_gp_end(rsp, rnp, rdp); + spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Do per-CPU grace-period initialization for running CPU. The caller + * must hold the lock of the leaf rcu_node structure corresponding to + * this CPU. + */ +static void +rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + /* Prior grace period ended, so advance callbacks for current CPU. */ + __rcu_process_gp_end(rsp, rnp, rdp); + + /* + * Because this CPU just now started the new grace period, we know + * that all of its callbacks will be covered by this upcoming grace + * period, even the ones that were registered arbitrarily recently. + * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. + * + * Other CPUs cannot be sure exactly when the grace period started. + * Therefore, their recently registered callbacks must pass through + * an additional RCU_NEXT_READY stage, so that they will be handled + * by the next RCU grace period. + */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Set state so that this CPU will detect the next quiescent state. */ + __note_new_gpnum(rsp, rnp, rdp); +} + +/* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold * the root node's ->lock, which is released before return. Hard irqs must @@ -596,7 +660,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_node *rnp = rcu_get_root(rsp); if (!cpu_needs_another_gp(rsp, rdp)) { - spin_unlock_irqrestore(&rnp->lock, flags); + if (rnp->completed == rsp->completed) { + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * Propagate new ->completed value to rcu_node structures + * so that other CPUs don't have to wait until the start + * of the next grace period to process their callbacks. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->completed; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + local_irq_restore(flags); return; } @@ -606,29 +686,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - dyntick_record_completed(rsp, rsp->completed - 1); - note_new_gpnum(rsp, rdp); - - /* - * Because this CPU just now started the new grace period, we know - * that all of its callbacks will be covered by this upcoming grace - * period, even the ones that were registered arbitrarily recently. - * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. - * - * Other CPUs cannot be sure exactly when the grace period started. - * Therefore, their recently registered callbacks must pass through - * an additional RCU_NEXT_READY stage, so that they will be handled - * by the next RCU grace period. - */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; /* Special-case the common single-level case. */ if (NUM_RCU_NODES == 1) { rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ + rcu_start_gp_per_cpu(rsp, rnp, rdp); spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -657,69 +723,50 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * irqs disabled. */ rcu_for_each_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; - spin_unlock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->completed; + if (rnp == rdp->mynode) + rcu_start_gp_per_cpu(rsp, rnp, rdp); + spin_unlock(&rnp->lock); /* irqs remain disabled. */ } + rnp = rcu_get_root(rsp); + spin_lock(&rnp->lock); /* irqs already disabled. */ rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + spin_unlock(&rnp->lock); /* irqs remain disabled. */ spin_unlock_irqrestore(&rsp->onofflock, flags); } /* - * Advance this CPU's callbacks, but only if the current grace period - * has ended. This may be called only from the CPU to whom the rdp - * belongs. - */ -static void -rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) -{ - long completed_snap; - unsigned long flags; - - local_irq_save(flags); - completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */ - - /* Did another grace period end? */ - if (rdp->completed != completed_snap) { - - /* Advance callbacks. No harm if list empty. */ - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - - /* Remember that we saw this grace-period completion. */ - rdp->completed = completed_snap; - } - local_irq_restore(flags); -} - -/* - * Clean up after the prior grace period and let rcu_start_gp() start up - * the next grace period if one is needed. Note that the caller must - * hold rnp->lock, as required by rcu_start_gp(), which will release it. + * Report a full set of quiescent states to the specified rcu_state + * data structure. This involves cleaning up after the prior grace + * period and letting rcu_start_gp() start up the next grace period + * if one is needed. Note that the caller must hold rnp->lock, as + * required by rcu_start_gp(), which will release it. */ -static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) +static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); rsp->completed = rsp->gpnum; - rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); + rsp->signaled = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } /* - * Similar to cpu_quiet(), for which it is a helper function. Allows - * a group of CPUs to be quieted at one go, though all the CPUs in the - * group must be represented by the same leaf rcu_node structure. - * That structure's lock must be held upon entry, and it is released - * before return. + * Similar to rcu_report_qs_rdp(), for which it is a helper function. + * Allows quiescent states for a group of CPUs to be reported at one go + * to the specified rcu_node structure, though all the CPUs in the group + * must be represented by the same rcu_node structure (which need not be + * a leaf rcu_node structure, though it often will be). That structure's + * lock must be held upon entry, and it is released before return. */ static void -cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, - unsigned long flags) +rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { struct rcu_node *rnp_c; @@ -755,21 +802,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, /* * Get here if we are the last CPU to pass through a quiescent - * state for this grace period. Invoke cpu_quiet_msk_finish() + * state for this grace period. Invoke rcu_report_qs_rsp() * to clean up and start the next grace period if one is needed. */ - cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ + rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ } /* - * Record a quiescent state for the specified CPU, which must either be - * the current CPU. The lastcomp argument is used to make sure we are - * still in the grace period of interest. We don't want to end the current - * grace period based on quiescent states detected in an earlier grace - * period! + * Record a quiescent state for the specified CPU to that CPU's rcu_data + * structure. This must be either called from the specified CPU, or + * called when the specified CPU is known to be offline (and when it is + * also known that no other CPU is concurrently trying to help the offline + * CPU). The lastcomp argument is used to make sure we are still in the + * grace period of interest. We don't want to end the current grace period + * based on quiescent states detected in an earlier grace period! */ static void -cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) { unsigned long flags; unsigned long mask; @@ -777,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) rnp = rdp->mynode; spin_lock_irqsave(&rnp->lock, flags); - if (lastcomp != ACCESS_ONCE(rsp->completed)) { + if (lastcomp != rnp->completed) { /* * Someone beat us to it for this grace period, so leave. * The race with GP start is resolved by the fact that we * hold the leaf rcu_node lock, so that the per-CPU bits * cannot yet be initialized -- so we would simply find our - * CPU's bit already cleared in cpu_quiet_msk() if this race - * occurred. + * CPU's bit already cleared in rcu_report_qs_rnp() if this + * race occurred. */ rdp->passed_quiesc = 0; /* try again later! */ spin_unlock_irqrestore(&rnp->lock, flags); @@ -803,7 +852,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) */ rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ + rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ } } @@ -834,8 +883,11 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) if (!rdp->passed_quiesc) return; - /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ - cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); + /* + * Tell RCU we are done (but rcu_report_qs_rdp() will be the + * judge of that). + */ + rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); } #ifdef CONFIG_HOTPLUG_CPU @@ -895,8 +947,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - long lastcomp; unsigned long mask; + int need_report = 0; struct rcu_data *rdp = rsp->rda[cpu]; struct rcu_node *rnp; @@ -910,17 +962,32 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + if (rnp != rdp->mynode) + spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp, rdp); + if (rnp == rdp->mynode) + need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + else + spin_unlock(&rnp->lock); /* irqs remain disabled. */ mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; } while (rnp != NULL); - lastcomp = rsp->completed; - spin_unlock_irqrestore(&rsp->onofflock, flags); + /* + * We still hold the leaf rcu_node structure lock here, and + * irqs are still disabled. The reason for this subterfuge is + * because invoking rcu_report_unblock_qs_rnp() with ->onofflock + * held leads to deadlock. + */ + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + rnp = rdp->mynode; + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp); rcu_adopt_orphan_cbs(rsp); } @@ -958,7 +1025,7 @@ static void rcu_offline_cpu(int cpu) * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. */ -static void rcu_do_batch(struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; @@ -1011,6 +1078,13 @@ static void rcu_do_batch(struct rcu_data *rdp) if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; + /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ + if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = rdp->qlen; + local_irq_restore(flags); /* Re-raise the RCU softirq if there are callbacks remaining. */ @@ -1085,7 +1159,7 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, rcu_for_each_leaf_node(rsp, rnp) { mask = 0; spin_lock_irqsave(&rnp->lock, flags); - if (rsp->completed != lastcomp) { + if (rnp->completed != lastcomp) { spin_unlock_irqrestore(&rnp->lock, flags); return 1; } @@ -1099,10 +1173,10 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) mask |= bit; } - if (mask != 0 && rsp->completed == lastcomp) { + if (mask != 0 && rnp->completed == lastcomp) { - /* cpu_quiet_msk() releases rnp->lock. */ - cpu_quiet_msk(mask, rsp, rnp, flags); + /* rcu_report_qs_rnp() releases rnp->lock. */ + rcu_report_qs_rnp(mask, rsp, rnp, flags); continue; } spin_unlock_irqrestore(&rnp->lock, flags); @@ -1120,6 +1194,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) long lastcomp; struct rcu_node *rnp = rcu_get_root(rsp); u8 signaled; + u8 forcenow; if (!rcu_gp_in_progress(rsp)) return; /* No grace period in progress, nothing to force. */ @@ -1132,19 +1207,20 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) goto unlock_ret; /* no emergency and done recently. */ rsp->n_force_qs++; spin_lock(&rnp->lock); - lastcomp = rsp->completed; + lastcomp = rsp->gpnum - 1; signaled = rsp->signaled; rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - if (lastcomp == rsp->gpnum) { + if(!rcu_gp_in_progress(rsp)) { rsp->n_force_qs_ngp++; spin_unlock(&rnp->lock); goto unlock_ret; /* no GP in progress, time updated. */ } spin_unlock(&rnp->lock); switch (signaled) { + case RCU_GP_IDLE: case RCU_GP_INIT: - break; /* grace period still initializing, ignore. */ + break; /* grace period idle or initializing, ignore. */ case RCU_SAVE_DYNTICK: @@ -1155,20 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) if (rcu_process_dyntick(rsp, lastcomp, dyntick_save_progress_counter)) goto unlock_ret; + /* fall into next case. */ + + case RCU_SAVE_COMPLETED: /* Update state, record completion counter. */ + forcenow = 0; spin_lock(&rnp->lock); - if (lastcomp == rsp->completed) { + if (lastcomp + 1 == rsp->gpnum && + lastcomp == rsp->completed && + rsp->signaled == signaled) { rsp->signaled = RCU_FORCE_QS; - dyntick_record_completed(rsp, lastcomp); + rsp->completed_fqs = lastcomp; + forcenow = signaled == RCU_SAVE_COMPLETED; } spin_unlock(&rnp->lock); - break; + if (!forcenow) + break; + /* fall into next case. */ case RCU_FORCE_QS: /* Check dyntick-idle state, send IPI to laggarts. */ - if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), + if (rcu_process_dyntick(rsp, rsp->completed_fqs, rcu_implicit_dynticks_qs)) goto unlock_ret; @@ -1224,7 +1309,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* If there are callbacks ready, invoke them. */ - rcu_do_batch(rdp); + rcu_do_batch(rsp, rdp); } /* @@ -1288,10 +1373,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ } - /* Force the grace period if too many callbacks or too long waiting. */ - if (unlikely(++rdp->qlen > qhimark)) { + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { rdp->blimit = LONG_MAX; - force_quiescent_state(rsp, 0); + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); local_irq_restore(flags); @@ -1315,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); +/** + * synchronize_sched - wait until an rcu-sched grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-sched + * grace period has elapsed, in other words after all currently executing + * rcu-sched read-side critical sections have completed. These read-side + * critical sections are delimited by rcu_read_lock_sched() and + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), + * local_irq_disable(), and so on may be used in place of + * rcu_read_lock_sched(). + * + * This means that all preempt_disable code sequences, including NMI and + * hardware-interrupt handlers, in progress on entry will have completed + * before this primitive returns. However, this does not guarantee that + * softirq handlers will have completed, since in some kernels, these + * handlers can run in process context, and can block. + * + * This primitive provides the guarantees made by the (now removed) + * synchronize_kernel() API. In contrast, synchronize_rcu() only + * guarantees that rcu_read_lock() sections will have completed. + * In "classic RCU", these two guarantees happen to be one and + * the same, but can differ in realtime RCU implementations. + */ +void synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +/** + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. + * + * Control will return to the caller some time after a full rcu_bh grace + * period has elapsed, in other words after all currently executing rcu_bh + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), + * and may be nested. + */ +void synchronize_rcu_bh(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -1324,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); */ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) { + struct rcu_node *rnp = rdp->mynode; + rdp->n_rcu_pending++; /* Check for CPU stalls, if enabled. */ @@ -1348,13 +1507,13 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ + if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ rdp->n_rp_gp_completed++; return 1; } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ rdp->n_rp_gp_started++; return 1; } @@ -1397,6 +1556,21 @@ int rcu_needs_cpu(int cpu) rcu_preempt_needs_cpu(cpu); } +/* + * This function is invoked towards the end of the scheduler's initialization + * process. Before this is called, the idle task might contain + * RCU read-side critical sections (during which time, this idle + * task is booting the system). After this function is called, the + * idle tasks are prohibited from containing RCU read-side critical + * sections. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -1508,21 +1682,18 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) { unsigned long flags; - long lastcomp; unsigned long mask; struct rcu_data *rdp = rsp->rda[cpu]; struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ spin_lock_irqsave(&rnp->lock, flags); - lastcomp = rsp->completed; - rdp->completed = lastcomp; - rdp->gpnum = lastcomp; rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ rdp->preemptable = preemptable; - rdp->passed_quiesc_completed = lastcomp - 1; + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -1542,6 +1713,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit |= mask; mask = rnp->grpmask; + if (rnp == rdp->mynode) { + rdp->gpnum = rnp->completed; /* if GP in progress... */ + rdp->completed = rnp->completed; + rdp->passed_quiesc_completed = rnp->completed - 1; + } spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); @@ -1559,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu) /* * Handle CPU online/offline notification events. */ -int __cpuinit rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1647,8 +1823,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { - if (rnp != rcu_get_root(rsp)) - spin_lock_init(&rnp->lock); + spin_lock_init(&rnp->lock); + lockdep_set_class(&rnp->lock, &rcu_node_class[i]); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; @@ -1669,9 +1845,10 @@ static void __init rcu_init_one(struct rcu_state *rsp) rnp->level = i; INIT_LIST_HEAD(&rnp->blocked_tasks[0]); INIT_LIST_HEAD(&rnp->blocked_tasks[1]); + INIT_LIST_HEAD(&rnp->blocked_tasks[2]); + INIT_LIST_HEAD(&rnp->blocked_tasks[3]); } } - spin_lock_init(&rcu_get_root(rsp)->lock); } /* @@ -1697,16 +1874,30 @@ do { \ } \ } while (0) -void __init __rcu_init(void) +void __init rcu_init(void) { + int i; + rcu_bootup_announce(); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#if NUM_RCU_LVL_4 != 0 + printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n"); +#endif /* #if NUM_RCU_LVL_4 != 0 */ RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + + /* + * We don't need protection against CPU-hotplug here because + * this is called early in boot, before either interrupts + * or the scheduler are operational. + */ + cpu_notifier(rcu_cpu_notify, 0); + for_each_online_cpu(i) + rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); } #include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b40ac570604..d2a0046f63b 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -34,10 +34,11 @@ * In practice, this has not been tested, so there is probably some * bug somewhere. */ -#define MAX_RCU_LVLS 3 +#define MAX_RCU_LVLS 4 #define RCU_FANOUT (CONFIG_RCU_FANOUT) #define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) #define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) +#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) #if NR_CPUS <= RCU_FANOUT # define NUM_RCU_LVLS 1 @@ -45,23 +46,33 @@ # define NUM_RCU_LVL_1 (NR_CPUS) # define NUM_RCU_LVL_2 0 # define NUM_RCU_LVL_3 0 +# define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_SQ # define NUM_RCU_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_2 (NR_CPUS) # define NUM_RCU_LVL_3 0 +# define NUM_RCU_LVL_4 0 #elif NR_CPUS <= RCU_FANOUT_CUBE # define NUM_RCU_LVLS 3 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) # define NUM_RCU_LVL_3 NR_CPUS +# define NUM_RCU_LVL_4 0 +#elif NR_CPUS <= RCU_FANOUT_FOURTH +# define NUM_RCU_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) +# define NUM_RCU_LVL_4 NR_CPUS #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT */ -#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) /* @@ -84,14 +95,21 @@ struct rcu_node { long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ + long completed; /* Last grace period completed for this node. */ + /* This will either be equal to or one */ + /* behind the root rcu_node's gpnum. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ /* order for current grace period to proceed.*/ /* In leaf rcu_node, each bit corresponds to */ /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ + unsigned long expmask; /* Groups that have ->blocked_tasks[] */ + /* elements that need to drain to allow the */ + /* current expedited grace period to */ + /* complete (only for TREE_PREEMPT_RCU). */ unsigned long qsmaskinit; - /* Per-GP initialization for qsmask. */ + /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -99,7 +117,7 @@ struct rcu_node { u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ struct rcu_node *parent; - struct list_head blocked_tasks[2]; + struct list_head blocked_tasks[4]; /* Tasks blocked in RCU read-side critsect. */ /* Grace period number (->gpnum) x blocked */ /* by tasks on the (x & 0x1) element of the */ @@ -114,6 +132,21 @@ struct rcu_node { for ((rnp) = &(rsp)->node[0]; \ (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) +/* + * Do a breadth-first scan of the non-leaf rcu_node structures for the + * specified rcu_state structure. Note that if there is a singleton + * rcu_node tree with but one rcu_node structure, this loop is a no-op. + */ +#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) + +/* + * Scan the leaves of the rcu_node hierarchy for the specified rcu_state + * structure. Note that if there is a singleton rcu_node tree with but + * one rcu_node structure, this loop -will- visit the rcu_node structure. + * It is still a leaf node, even if it is also the root node. + */ #define rcu_for_each_leaf_node(rsp, rnp) \ for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) @@ -167,6 +200,10 @@ struct rcu_data { struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; long qlen; /* # of queued callbacks */ + long qlen_last_fqs_check; + /* qlen at last check for QS forcing */ + unsigned long n_force_qs_snap; + /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ #ifdef CONFIG_NO_HZ @@ -197,13 +234,15 @@ struct rcu_data { }; /* Values for signaled field in struct rcu_state. */ -#define RCU_GP_INIT 0 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#define RCU_GP_IDLE 0 /* No grace period in progress. */ +#define RCU_GP_INIT 1 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ +#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ +#define RCU_FORCE_QS 4 /* Need to force quiescent state. */ #ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK #else /* #ifdef CONFIG_NO_HZ */ -#define RCU_SIGNAL_INIT RCU_FORCE_QS +#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED #endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ @@ -241,7 +280,7 @@ struct rcu_state { long gpnum; /* Current gp number. */ long completed; /* # of last completed gp. */ - /* End of fields guarded by root rcu_node's lock. */ + /* End of fields guarded by root rcu_node's lock. */ spinlock_t onofflock; /* exclude on/offline and */ /* starting new GP. Also */ @@ -255,6 +294,8 @@ struct rcu_state { long orphan_qlen; /* Number of orphaned cbs. */ spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ + long completed_fqs; /* Value of completed @ snap. */ + /* Protected by fqslock. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ @@ -269,11 +310,15 @@ struct rcu_state { unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -#ifdef CONFIG_NO_HZ - long dynticks_completed; /* Value of completed @ snap. */ -#endif /* #ifdef CONFIG_NO_HZ */ }; +/* Return values for rcu_preempt_offline_tasks(). */ + +#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ + /* GP were moved to root. */ +#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ + /* GP were moved to root. */ + #ifdef RCU_TREE_NONCORE /* @@ -293,23 +338,30 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #else /* #ifdef RCU_TREE_NONCORE */ /* Forward declarations for rcutree_plugin.h */ -static inline void rcu_bootup_announce(void); +static void rcu_bootup_announce(void); long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempted_readers(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, + unsigned long flags); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_RCU_CPU_STALL_DETECTOR static void rcu_print_task_stall(struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); +#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c0cb783aa16..37fbccdf41d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -24,16 +24,19 @@ * Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/delay.h> #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +static int rcu_preempted_readers_exp(struct rcu_node *rnp); + /* * Tell them what RCU they are running. */ -static inline void rcu_bootup_announce(void) +static void __init rcu_bootup_announce(void) { printk(KERN_INFO "Experimental preemptable hierarchical RCU implementation.\n"); @@ -67,7 +70,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesc_completed = rdp->completed; + rdp->passed_quiesc_completed = rdp->gpnum - 1; barrier(); rdp->passed_quiesc = 1; } @@ -157,14 +160,58 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); */ static int rcu_preempted_readers(struct rcu_node *rnp) { - return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); + int phase = rnp->gpnum & 0x1; + + return !list_empty(&rnp->blocked_tasks[phase]) || + !list_empty(&rnp->blocked_tasks[phase + 2]); +} + +/* + * Record a quiescent state for all tasks that were previously queued + * on the specified rcu_node structure and that were blocking the current + * RCU grace period. The caller must hold the specified rnp->lock with + * irqs disabled, and this lock is released upon return, but irqs remain + * disabled. + */ +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + struct rcu_node *rnp_p; + + if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { + spin_unlock_irqrestore(&rnp->lock, flags); + return; /* Still need more quiescent states! */ + } + + rnp_p = rnp->parent; + if (rnp_p == NULL) { + /* + * Either there is only one rcu_node in the tree, + * or tasks were kicked up to root rcu_node due to + * CPUs going offline. + */ + rcu_report_qs_rsp(&rcu_preempt_state, flags); + return; + } + + /* Report up the rest of the hierarchy. */ + mask = rnp->grpmask; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + spin_lock(&rnp_p->lock); /* irqs already disabled. */ + rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); } +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ static void rcu_read_unlock_special(struct task_struct *t) { int empty; + int empty_exp; unsigned long flags; - unsigned long mask; struct rcu_node *rnp; int special; @@ -207,36 +254,30 @@ static void rcu_read_unlock_special(struct task_struct *t) spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty = !rcu_preempted_readers(rnp); + empty_exp = !rcu_preempted_readers_exp(rnp); + smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. - * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk() - * drop rnp->lock and restore irq. + * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. */ - if (!empty && rnp->qsmask == 0 && - !rcu_preempted_readers(rnp)) { - struct rcu_node *rnp_p; - - if (rnp->parent == NULL) { - /* Only one rcu_node in the tree. */ - cpu_quiet_msk_finish(&rcu_preempt_state, flags); - return; - } - /* Report up the rest of the hierarchy. */ - mask = rnp->grpmask; + if (empty) spin_unlock_irqrestore(&rnp->lock, flags); - rnp_p = rnp->parent; - spin_lock_irqsave(&rnp_p->lock, flags); - WARN_ON_ONCE(rnp->qsmask); - cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags); - return; - } - spin_unlock(&rnp->lock); + else + rcu_report_unblock_qs_rnp(rnp, flags); + + /* + * If this was the last task on the expedited lists, + * then we need to report up the rcu_node hierarchy. + */ + if (!empty_exp && !rcu_preempted_readers_exp(rnp)) + rcu_report_exp_rnp(&rcu_preempt_state, rnp); + } else { + local_irq_restore(flags); } - local_irq_restore(flags); } /* @@ -303,26 +344,34 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) * rcu_node. The reason for not just moving them to the immediate * parent is to remove the need for rcu_read_unlock_special() to * make more than two attempts to acquire the target rcu_node's lock. + * Returns true if there were tasks blocking the current RCU grace + * period. + * + * Returns 1 if there was previously a task blocking the current grace + * period on the specified rcu_node structure. * * The caller must hold rnp->lock with irqs disabled. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { int i; struct list_head *lp; struct list_head *lp_root; + int retval = 0; struct rcu_node *rnp_root = rcu_get_root(rsp); struct task_struct *tp; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); - return; /* Shouldn't happen: at least one CPU online. */ + return 0; /* Shouldn't happen: at least one CPU online. */ } WARN_ON_ONCE(rnp != rdp->mynode && (!list_empty(&rnp->blocked_tasks[0]) || - !list_empty(&rnp->blocked_tasks[1]))); + !list_empty(&rnp->blocked_tasks[1]) || + !list_empty(&rnp->blocked_tasks[2]) || + !list_empty(&rnp->blocked_tasks[3]))); /* * Move tasks up to root rcu_node. Rely on the fact that the @@ -330,7 +379,11 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, * rcu_nodes in terms of gp_num value. This fact allows us to * move the blocked_tasks[] array directly, element by element. */ - for (i = 0; i < 2; i++) { + if (rcu_preempted_readers(rnp)) + retval |= RCU_OFL_TASKS_NORM_GP; + if (rcu_preempted_readers_exp(rnp)) + retval |= RCU_OFL_TASKS_EXP_GP; + for (i = 0; i < 4; i++) { lp = &rnp->blocked_tasks[i]; lp_root = &rnp_root->blocked_tasks[i]; while (!list_empty(lp)) { @@ -342,6 +395,7 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, spin_unlock(&rnp_root->lock); /* irqs remain disabled */ } } + return retval; } /* @@ -392,6 +446,186 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void synchronize_rcu(void) +{ + struct rcu_synchronize rcu; + + if (!rcu_scheduler_active) + return; + + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(struct rcu_node *rnp) +{ + return !list_empty(&rnp->blocked_tasks[2]) || + !list_empty(&rnp->blocked_tasks[3]); +} + +/* + * return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return !rcu_preempted_readers_exp(rnp) && + ACCESS_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +{ + unsigned long flags; + unsigned long mask; + + spin_lock_irqsave(&rnp->lock, flags); + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) + break; + if (rnp->parent == NULL) { + wake_up(&sync_rcu_preempt_exp_wq); + break; + } + mask = rnp->grpmask; + spin_unlock(&rnp->lock); /* irqs remain disabled */ + rnp = rnp->parent; + spin_lock(&rnp->lock); /* irqs already disabled */ + rnp->expmask &= ~mask; + } + spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Snapshot the tasks blocking the newly started preemptible-RCU expedited + * grace period for the specified rcu_node structure. If there are no such + * tasks, report it up the rcu_node hierarchy. + * + * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. + */ +static void +sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) +{ + int must_wait; + + spin_lock(&rnp->lock); /* irqs already disabled */ + list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); + list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); + must_wait = rcu_preempted_readers_exp(rnp); + spin_unlock(&rnp->lock); /* irqs remain disabled */ + if (!must_wait) + rcu_report_exp_rnp(rsp, rnp); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to invoke synchronize_sched_expedited() to push all the tasks to + * the ->blocked_tasks[] lists, move all entries from the first set of + * ->blocked_tasks[] lists to the second set, and finally wait for this + * second set to drain. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_preempt_state; + long snap; + int trycount = 0; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; + smp_mb(); /* Above access cannot bleed into critical section. */ + + /* + * Acquire lock, falling back to synchronize_rcu() if too many + * lock-acquisition failures. Of course, if someone does the + * expedited grace period for us, just leave. + */ + while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_rcu(); + return; + } + if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + goto mb_ret; /* Others did our work for us. */ + } + if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + goto unlock_mb_ret; /* Others did our work for us. */ + + /* force all RCU readers onto blocked_tasks[]. */ + synchronize_sched_expedited(); + + spin_lock_irqsave(&rsp->onofflock, flags); + + /* Initialize ->expmask for all non-leaf rcu_node structures. */ + rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->expmask = rnp->qsmaskinit; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + + /* Snapshot current state of ->blocked_tasks[] lists. */ + rcu_for_each_leaf_node(rsp, rnp) + sync_rcu_preempt_exp_init(rsp, rnp); + if (NUM_RCU_NODES > 1) + sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); + + spin_unlock_irqrestore(&rsp->onofflock, flags); + + /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ + rnp = rcu_get_root(rsp); + wait_event(sync_rcu_preempt_exp_wq, + sync_rcu_preempt_exp_done(rnp)); + + /* Clean up and exit. */ + smp_mb(); /* ensure expedited GP seen before counter increment. */ + ACCESS_ONCE(sync_rcu_preempt_exp_count)++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); +mb_ret: + smp_mb(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + /* * Check to see if there is any immediate preemptable-RCU-related work * to be done. @@ -464,7 +698,7 @@ void exit_rcu(void) /* * Tell them what RCU they are running. */ -static inline void rcu_bootup_announce(void) +static void __init rcu_bootup_announce(void) { printk(KERN_INFO "Hierarchical RCU implementation.\n"); } @@ -495,6 +729,16 @@ static int rcu_preempted_readers(struct rcu_node *rnp) return 0; } +#ifdef CONFIG_HOTPLUG_CPU + +/* Because preemptible RCU does not exist, no quieting of tasks. */ +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) +{ + spin_unlock_irqrestore(&rnp->lock, flags); +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + #ifdef CONFIG_RCU_CPU_STALL_DETECTOR /* @@ -521,12 +765,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) /* * Because preemptable RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections. + * tasks that were blocked within RCU read-side critical sections, and + * such non-existent tasks cannot possibly have been blocking the current + * grace period. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { + return 0; } /* @@ -565,6 +812,30 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) EXPORT_SYMBOL_GPL(call_rcu); /* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptable RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Because preemptable RCU does not exist, there is never any need to + * report on tasks preempted in RCU read-side critical sections during + * expedited RCU grace periods. + */ +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +{ + return; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* * Because preemptable RCU does not exist, it never has any work to do. */ static int rcu_preempt_pending(int cpu) diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4b31c779e62..9d2c88423b3 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -155,12 +155,15 @@ static const struct file_operations rcudata_csv_fops = { static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { + long gpnum; int level = 0; + int phase; struct rcu_node *rnp; + gpnum = rsp->gpnum; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", - rsp->completed, rsp->gpnum, rsp->signaled, + rsp->completed, gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff), rsp->n_force_qs, rsp->n_force_qs_ngp, @@ -171,8 +174,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); level = rnp->level; } - seq_printf(m, "%lx/%lx %d:%d ^%d ", + phase = gpnum & 0x1; + seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ", rnp->qsmask, rnp->qsmaskinit, + "T."[list_empty(&rnp->blocked_tasks[phase])], + "E."[list_empty(&rnp->blocked_tasks[phase + 2])], + "T."[list_empty(&rnp->blocked_tasks[!phase])], + "E."[list_empty(&rnp->blocked_tasks[!phase + 2])], rnp->grplo, rnp->grphi, rnp->grpnum); } seq_puts(m, "\n"); diff --git a/kernel/relay.c b/kernel/relay.c index 760c26209a3..c705a41b4ba 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, relay_consume_bytes(rbuf, buf->private); } -static struct pipe_buf_operations relay_pipe_buf_ops = { +static const struct pipe_buf_operations relay_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, diff --git a/kernel/resource.c b/kernel/resource.c index fb11a58b959..dc15686b7a7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; + resource_size_t start, end; - new->start = root->start; + start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment * of this->start - 1 to new->end below would cause an underflow. */ if (this && this->start == 0) { - new->start = this->end + 1; + start = this->end + 1; this = this->sibling; } for(;;) { if (this) - new->end = this->start - 1; + end = this->start - 1; else - new->end = root->end; - if (new->start < min) - new->start = min; - if (new->end > max) - new->end = max; - new->start = ALIGN(new->start, align); + end = root->end; + if (start < min) + start = min; + if (end > max) + end = max; + start = ALIGN(start, align); if (alignf) alignf(alignf_data, new, size, align); - if (new->start < new->end && new->end - new->start >= size - 1) { - new->end = new->start + size - 1; + if (start < end && end - start >= size - 1) { + new->start = start; + new->end = start + size - 1; return 0; } if (!this) break; - new->start = this->end + 1; + start = this->end + 1; this = this->sibling; } return -EBUSY; diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 5fcb4fe645e..ddabb54bb5c 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -37,8 +37,8 @@ do { \ if (rt_trace_on) { \ rt_trace_on = 0; \ console_verbose(); \ - if (spin_is_locked(¤t->pi_lock)) \ - spin_unlock(¤t->pi_lock); \ + if (raw_spin_is_locked(¤t->pi_lock)) \ + raw_spin_unlock(¤t->pi_lock); \ } \ } while (0) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 29bd4baf9e7..a9604815786 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task) { unsigned long flags; - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); } /* @@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Task can not go away as we did a get_task() before ! */ - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; /* @@ -231,8 +231,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto out_unlock_pi; lock = waiter->lock; - if (!spin_trylock(&lock->wait_lock)) { - spin_unlock_irqrestore(&task->pi_lock, flags); + if (!raw_spin_trylock(&lock->wait_lock)) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); cpu_relax(); goto retry; } @@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Deadlock detection */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); ret = deadlock_detect ? -EDEADLK : 0; goto out_unlock_pi; } @@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, plist_add(&waiter->list_entry, &lock->wait_list); /* Release the task */ - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); put_task_struct(task); /* Grab the next task */ task = rt_mutex_owner(lock); get_task_struct(task); - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ @@ -277,10 +277,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, __rt_mutex_adjust_prio(task); } - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); top_waiter = rt_mutex_top_waiter(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); if (!detect_deadlock && waiter != top_waiter) goto out_put_task; @@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); out_put_task: put_task_struct(task); @@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, if (pendowner == task) return 1; - spin_lock_irqsave(&pendowner->pi_lock, flags); + raw_spin_lock_irqsave(&pendowner->pi_lock, flags); if (task->prio >= pendowner->prio) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 0; } @@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * priority. */ if (likely(!rt_mutex_has_waiters(lock))) { - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 1; } @@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, next = rt_mutex_top_waiter(lock); plist_del(&next->pi_list_entry, &pendowner->pi_waiters); __rt_mutex_adjust_prio(pendowner); - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); /* * We are going to steal the lock and a waiter was @@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock, * might be task: */ if (likely(next->task != task)) { - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); plist_add(&next->pi_list_entry, &task->pi_waiters); __rt_mutex_adjust_prio(task); - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); } return 1; } @@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { - spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); plist_add(&waiter->pi_list_entry, &owner->pi_waiters); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) chain_walk = 1; @@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, task); - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); return res; } @@ -483,7 +483,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) struct task_struct *pendowner; unsigned long flags; - spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock_irqsave(¤t->pi_lock, flags); waiter = rt_mutex_top_waiter(lock); plist_del(&waiter->list_entry, &lock->wait_list); @@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); - spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); /* * Clear the pi_blocked_on variable and enqueue a possible @@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * waiter with higher priority than pending-owner->normal_prio * is blocked on the unboosted (pending) owner. */ - spin_lock_irqsave(&pendowner->pi_lock, flags); + raw_spin_lock_irqsave(&pendowner->pi_lock, flags); WARN_ON(!pendowner->pi_blocked_on); WARN_ON(pendowner->pi_blocked_on != waiter); @@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) next = rt_mutex_top_waiter(lock); plist_add(&next->pi_list_entry, &pendowner->pi_waiters); } - spin_unlock_irqrestore(&pendowner->pi_lock, flags); + raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); wake_up_process(pendowner); } @@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0; - spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); waiter->task = NULL; current->pi_blocked_on = NULL; - spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); if (first && owner != current) { - spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&waiter->pi_list_entry, &owner->pi_waiters); @@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock, if (owner->pi_blocked_on) chain_walk = 1; - spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); @@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); } /* @@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task) struct rt_mutex_waiter *waiter; unsigned long flags; - spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; if (!waiter || waiter->list_entry.prio == task->prio) { - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } - spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); @@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); if (waiter->task) schedule_rt_mutex(lock); - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); set_current_state(state); } @@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, debug_rt_mutex_init_waiter(&waiter); waiter.task = NULL; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock)) { - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); return 0; } @@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); /* Remove pending timer: */ if (unlikely(timeout)) @@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) { int ret = 0; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); if (likely(rt_mutex_owner(lock) != current)) { @@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) fixup_rt_mutex_waiters(lock); } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); return ret; } @@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) static void __sched rt_mutex_slowunlock(struct rt_mutex *lock) { - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); debug_rt_mutex_unlock(lock); @@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock) if (!rt_mutex_has_waiters(lock)) { lock->owner = NULL; - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); return; } wakeup_next_waiter(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); /* Undo pi boosting if necessary: */ rt_mutex_adjust_prio(current); @@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; - spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list, &lock->wait_lock); + raw_spin_lock_init(&lock->wait_lock); + plist_head_init_raw(&lock->wait_list, &lock->wait_lock); debug_rt_mutex_init(lock, name); } @@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); mark_rt_mutex_waiters(lock); @@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, /* We got the lock for task. */ debug_rt_mutex_lock(lock); rt_mutex_set_owner(lock, task, 0); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); rt_mutex_deadlock_account_lock(lock, task); return 1; } @@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - spin_lock(&lock->wait_lock); + raw_spin_lock(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - spin_unlock(&lock->wait_lock); + raw_spin_unlock(&lock->wait_lock); /* * Readjust priority, when we did not get the lock. We might have been diff --git a/kernel/sched.c b/kernel/sched.c index e88689522e6..18cceeecce3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -141,7 +141,7 @@ struct rt_prio_array { struct rt_bandwidth { /* nests inside the rq lock: */ - spinlock_t rt_runtime_lock; + raw_spinlock_t rt_runtime_lock; ktime_t rt_period; u64 rt_runtime; struct hrtimer rt_period_timer; @@ -178,7 +178,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period = ns_to_ktime(period); rt_b->rt_runtime = runtime; - spin_lock_init(&rt_b->rt_runtime_lock); + raw_spin_lock_init(&rt_b->rt_runtime_lock); hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -200,7 +200,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) if (hrtimer_active(&rt_b->rt_period_timer)) return; - spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); for (;;) { unsigned long delta; ktime_t soft, hard; @@ -217,7 +217,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, HRTIMER_MODE_ABS_PINNED, 0); } - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); } #ifdef CONFIG_RT_GROUP_SCHED @@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group @@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); */ static DEFINE_SPINLOCK(task_group_lock); +#ifdef CONFIG_FAIR_GROUP_SCHED + #ifdef CONFIG_SMP static int root_task_group_empty(void) { @@ -316,7 +318,6 @@ static int root_task_group_empty(void) } #endif -#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else /* !CONFIG_USER_SCHED */ @@ -469,7 +470,7 @@ struct rt_rq { u64 rt_time; u64 rt_runtime; /* Nests inside the rq lock: */ - spinlock_t rt_runtime_lock; + raw_spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -524,7 +525,7 @@ static struct root_domain def_root_domain; */ struct rq { /* runqueue lock: */ - spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -534,14 +535,12 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ - unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; - u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -590,6 +589,8 @@ struct rq { u64 rt_avg; u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; #endif /* calc_load related fields */ @@ -684,7 +685,7 @@ inline void update_rq_clock(struct rq *rq) */ int runqueue_is_locked(int cpu) { - return spin_is_locked(&cpu_rq(cpu)->lock); + return raw_spin_is_locked(&cpu_rq(cpu)->lock); } /* @@ -771,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, if (!sched_feat_names[i]) return -EINVAL; - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -813,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; * default: 0.25ms */ unsigned int sysctl_sched_shares_ratelimit = 250000; +unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; /* * Inject some fuzzyness into changing the per-cpu group shares @@ -891,7 +893,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ @@ -915,9 +917,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) next->oncpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); #else - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); #endif } @@ -947,10 +949,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) { for (;;) { struct rq *rq = task_rq(p); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } } @@ -967,10 +969,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) for (;;) { local_irq_save(*flags); rq = task_rq(p); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock_irqrestore(&rq->lock, *flags); } } @@ -979,19 +981,19 @@ void task_rq_unlock_wait(struct task_struct *p) struct rq *rq = task_rq(p); smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - spin_unlock_wait(&rq->lock); + raw_spin_unlock_wait(&rq->lock); } static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) __releases(rq->lock) { - spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock_irqrestore(&rq->lock, *flags); } /* @@ -1004,7 +1006,7 @@ static struct rq *this_rq_lock(void) local_irq_disable(); rq = this_rq(); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); return rq; } @@ -1051,10 +1053,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); return HRTIMER_NORESTART; } @@ -1067,10 +1069,10 @@ static void __hrtick_start(void *arg) { struct rq *rq = arg; - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); hrtimer_restart(&rq->hrtick_timer); rq->hrtick_csd_pending = 0; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } /* @@ -1177,7 +1179,7 @@ static void resched_task(struct task_struct *p) { int cpu; - assert_spin_locked(&task_rq(p)->lock); + assert_raw_spin_locked(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; @@ -1199,10 +1201,10 @@ static void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!spin_trylock_irqsave(&rq->lock, flags)) + if (!raw_spin_trylock_irqsave(&rq->lock, flags)) return; resched_task(cpu_curr(cpu)); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } #ifdef CONFIG_NO_HZ @@ -1271,7 +1273,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) #else /* !CONFIG_SMP */ static void resched_task(struct task_struct *p) { - assert_spin_locked(&task_rq(p)->lock); + assert_raw_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } @@ -1564,11 +1566,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -struct update_shares_data { - unsigned long rq_weight[NR_CPUS]; -}; - -static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); +static __read_mostly unsigned long *update_shares_data; static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1578,12 +1576,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight, - struct update_shares_data *usd) + unsigned long *usd_rq_weight) { unsigned long shares, rq_weight; int boost = 0; - rq_weight = usd->rq_weight[cpu]; + rq_weight = usd_rq_weight[cpu]; if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; @@ -1602,11 +1600,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, struct rq *rq = cpu_rq(cpu); unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } } @@ -1617,8 +1615,8 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, shares = 0; - struct update_shares_data *usd; + unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; + unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; int i; @@ -1627,12 +1625,13 @@ static int tg_shares_up(struct task_group *tg, void *data) return 0; local_irq_save(flags); - usd = &__get_cpu_var(update_shares_data); + usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); for_each_cpu(i, sched_domain_span(sd)) { weight = tg->cfs_rq[i]->load.weight; - usd->rq_weight[i] = weight; + usd_rq_weight[i] = weight; + rq_weight += weight; /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1641,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!weight) weight = NICE_0_LOAD; - rq_weight += weight; + sum_weight += weight; shares += tg->cfs_rq[i]->shares; } + if (!rq_weight) + rq_weight = sum_weight; + if ((!shares && rq_weight) || shares > tg->shares) shares = tg->shares; @@ -1652,7 +1654,7 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd); + update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); local_irq_restore(flags); @@ -1704,9 +1706,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) if (root_task_group_empty()) return; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); update_shares(sd); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); } static void update_h_load(long cpu) @@ -1746,7 +1748,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->lock); double_rq_lock(this_rq, busiest); return 1; @@ -1767,14 +1769,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) { int ret = 0; - if (unlikely(!spin_trylock(&busiest->lock))) { + if (unlikely(!raw_spin_trylock(&busiest->lock))) { if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + raw_spin_unlock(&this_rq->lock); + raw_spin_lock(&busiest->lock); + raw_spin_lock_nested(&this_rq->lock, + SINGLE_DEPTH_NESTING); ret = 1; } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_nested(&busiest->lock, + SINGLE_DEPTH_NESTING); } return ret; } @@ -1788,7 +1792,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) { if (unlikely(!irqs_disabled())) { /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->lock); BUG_ON(1); } @@ -1798,7 +1802,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - spin_unlock(&busiest->lock); + raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } #endif @@ -1813,6 +1817,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif static void calc_load_account_active(struct rq *this_rq); +static void update_sysctl(void); +static int get_update_sysctl_factor(void); + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} #include "sched_stats.h" #include "sched_idletask.c" @@ -1970,20 +1990,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio, int running) @@ -1996,6 +2002,39 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + * + * Function lives here instead of kthread.c because it messes with + * scheduler internals which require locking. + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + set_task_cpu(p, cpu); + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} +EXPORT_SYMBOL(kthread_bind); + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: @@ -2008,7 +2047,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (sched_feat(CACHE_HOT_BUDDY) && + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; @@ -2030,30 +2069,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); - u64 clock_offset; - - clock_offset = old_rq->clock - new_rq->clock; trace_sched_migrate_task(p, new_cpu); -#ifdef CONFIG_SCHEDSTATS - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; -#endif if (old_cpu != new_cpu) { p->se.nr_migrations++; - new_rq->nr_migrations_in++; -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, old_rq->clock, NULL)) - schedstat_inc(p, se.nr_forced2_migrations); -#endif perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } @@ -2086,6 +2108,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * it is sufficient to simply update the task's cpu field. */ if (!p->se.on_rq && !task_running(rq, p)) { + update_rq_clock(rq); set_task_cpu(p, dest_cpu); return 0; } @@ -2293,6 +2316,14 @@ void task_oncpu_function_call(struct task_struct *p, preempt_enable(); } +#ifdef CONFIG_SMP +static inline +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +{ + return p->sched_class->select_task_rq(p, sd_flags, wake_flags); +} +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2344,16 +2375,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); - rq = task_rq_lock(p, &flags); - - if (rq != orig_rq) - update_rq_clock(rq); + rq = __task_rq_lock(p); + update_rq_clock(rq); WARN_ON(p->state != TASK_WAKING); cpu = task_cpu(p); @@ -2411,6 +2440,17 @@ out_running: #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } #endif out: task_rq_unlock(rq, &flags); @@ -2457,7 +2497,6 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; - p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -2479,7 +2518,6 @@ static void __sched_fork(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; @@ -2549,8 +2587,11 @@ void sched_fork(struct task_struct *p, int clone_flags) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + #ifdef CONFIG_SMP - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); #endif set_task_cpu(p, cpu); @@ -2585,17 +2626,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - - if (!p->sched_class->task_new || !current->se.on_rq) { - activate_task(rq, p, 0); - } else { - /* - * Let the scheduling class do new task startup - * management (if any): - */ - p->sched_class->task_new(rq, p); - inc_nr_running(rq); - } + activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -2752,10 +2783,10 @@ static inline void post_schedule(struct rq *rq) if (rq->post_schedule) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->curr->sched_class->post_schedule) rq->curr->sched_class->post_schedule(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->post_schedule = 0; } @@ -2819,14 +2850,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (unlikely(!mm)) { + if (likely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (unlikely(!prev->mm)) { + if (likely(!prev->mm)) { prev->active_mm = NULL; rq->prev_mm = oldmm; } @@ -2989,15 +3020,6 @@ static void calc_load_account_active(struct rq *this_rq) } /* - * Externally visible per-cpu scheduler statistics: - * cpu_nr_migrations(cpu) - number of migrations into that cpu - */ -u64 cpu_nr_migrations(int cpu) -{ - return cpu_rq(cpu)->nr_migrations_in; -} - -/* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). */ @@ -3046,15 +3068,15 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) { BUG_ON(!irqs_disabled()); if (rq1 == rq2) { - spin_lock(&rq1->lock); + raw_spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { if (rq1 < rq2) { - spin_lock(&rq1->lock); - spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); } else { - spin_lock(&rq2->lock); - spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } update_rq_clock(rq1); @@ -3071,9 +3093,9 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - spin_unlock(&rq1->lock); + raw_spin_unlock(&rq1->lock); if (rq1 != rq2) - spin_unlock(&rq2->lock); + raw_spin_unlock(&rq2->lock); else __release(rq2->lock); } @@ -3119,7 +3141,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); + new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); @@ -3135,10 +3157,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ check_preempt_curr(this_rq, p, 0); } @@ -4097,7 +4115,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4170,14 +4188,15 @@ redo: if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - spin_lock_irqsave(&busiest->lock, flags); + raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { - spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_unlock_irqrestore(&busiest->lock, + flags); all_pinned = 1; goto out_one_pinned; } @@ -4187,7 +4206,7 @@ redo: busiest->push_cpu = this_cpu; active_balance = 1; } - spin_unlock_irqrestore(&busiest->lock, flags); + raw_spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); @@ -4260,7 +4279,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4369,10 +4388,10 @@ redo: /* * Should not call ttwu while holding a rq->lock */ - spin_unlock(&this_rq->lock); + raw_spin_unlock(&this_rq->lock); if (active_balance) wake_up_process(busiest->migration_thread); - spin_lock(&this_rq->lock); + raw_spin_lock(&this_rq->lock); } else sd->nr_balance_failed = 0; @@ -4400,6 +4419,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + this_rq->idle_stamp = this_rq->clock; + + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4414,8 +4438,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -4650,7 +4676,7 @@ int select_nohz_load_balancer(int stop_tick) cpumask_set_cpu(cpu, nohz.cpu_mask); /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -5017,8 +5043,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime = cputime_add(p->gtime, cputime); /* Add guest time to cpustat. */ - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); + if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); + cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + } else { + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); + } } /* @@ -5133,60 +5164,86 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -cputime_t task_utime(struct task_struct *p) +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - return p->utime; + *ut = p->utime; + *st = p->stime; } -cputime_t task_stime(struct task_struct *p) +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - return p->stime; + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; } #else -cputime_t task_utime(struct task_struct *p) + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; + cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); /* * Use CFS's precise accounting: */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - temp *= utime; + u64 temp; + + temp = (u64)(rtime * utime); do_div(temp, total); - } - utime = (clock_t)temp; + utime = (cputime_t)temp; + } else + utime = rtime; + + /* + * Compare with previous values, to keep monotonicity: + */ + p->prev_utime = max(p->prev_utime, utime); + p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); - return p->prev_utime; + *ut = p->prev_utime; + *st = p->prev_stime; } -cputime_t task_stime(struct task_struct *p) +/* + * Must be called with siglock held. + */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - clock_t stime; + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); + thread_group_cputime(p, &cputime); - if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + total = cputime_add(cputime.utime, cputime.stime); + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - return p->prev_stime; -} -#endif + if (total) { + u64 temp; -inline cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; + temp = (u64)(rtime * cputime.utime); + do_div(temp, total); + utime = (cputime_t)temp; + } else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, + cputime_sub(rtime, sig->prev_utime)); + + *ut = sig->prev_utime; + *st = sig->prev_stime; } +#endif /* * This function gets called by the timer code, with HZ frequency. @@ -5203,11 +5260,11 @@ void scheduler_tick(void) sched_clock_tick(); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); perf_event_task_tick(curr, cpu); @@ -5321,13 +5378,14 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *p) +static void put_prev_task(struct rq *rq, struct task_struct *prev) { - u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; + if (prev->state == TASK_RUNNING) { + u64 runtime = prev->se.sum_exec_runtime; - update_avg(&p->se.avg_running, runtime); + runtime -= prev->se.prev_sum_exec_runtime; + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5337,12 +5395,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p) * correlates to the amount of cache footprint a task can * build up. */ - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); - update_avg(&p->se.avg_overlap, runtime); - } else { - update_avg(&p->se.avg_running, 0); + update_avg(&prev->se.avg_overlap, runtime); } - p->sched_class->put_prev_task(rq, p); + prev->sched_class->put_prev_task(rq, prev); } /* @@ -5403,7 +5458,7 @@ need_resched_nonpreemptible: if (sched_feat(HRTICK)) hrtick_clear(rq); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); clear_tsk_need_resched(prev); @@ -5439,7 +5494,7 @@ need_resched_nonpreemptible: cpu = smp_processor_id(); rq = cpu_rq(cpu); } else - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); post_schedule(rq); @@ -5452,7 +5507,7 @@ need_resched_nonpreemptible: } EXPORT_SYMBOL(schedule); -#ifdef CONFIG_SMP +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * Look out! "owner" is an entirely speculative pointer * access and not reliable. @@ -6146,22 +6201,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) BUG_ON(p->se.on_rq); p->policy = policy; - switch (p->policy) { - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: - p->sched_class = &fair_sched_class; - break; - case SCHED_FIFO: - case SCHED_RR: - p->sched_class = &rt_sched_class; - break; - } - p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; set_load_weight(p); } @@ -6276,7 +6323,7 @@ recheck: * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: */ - spin_lock_irqsave(&p->pi_lock, flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. @@ -6286,7 +6333,7 @@ recheck: if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } update_rq_clock(rq); @@ -6310,7 +6357,7 @@ recheck: check_class_changed(rq, p, prev_class, oldprio, running); } __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); rt_mutex_adjust_pi(p); @@ -6564,6 +6611,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; + unsigned long flags; + struct rq *rq; int retval; get_online_cpus(); @@ -6578,7 +6627,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; + rq = task_rq_lock(p, &flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + task_rq_unlock(rq, &flags); out_unlock: read_unlock(&tasklist_lock); @@ -6636,7 +6687,7 @@ SYSCALL_DEFINE0(sched_yield) */ __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - _raw_spin_unlock(&rq->lock); + do_raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); schedule(); @@ -6816,6 +6867,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; + unsigned long flags; + struct rq *rq; int retval; struct timespec t; @@ -6832,7 +6885,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - time_slice = p->sched_class->get_rr_interval(p); + rq = task_rq_lock(p, &flags); + time_slice = p->sched_class->get_rr_interval(rq, p); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); jiffies_to_timespec(time_slice, &t); @@ -6906,7 +6961,7 @@ void show_state_filter(unsigned long state_filter) /* * Only show locks if all tasks are dumped: */ - if (state_filter == -1) + if (!state_filter) debug_show_all_locks(); } @@ -6928,12 +6983,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __sched_fork(idle); idle->se.exec_start = sched_clock(); - idle->prio = idle->normal_prio = MAX_PRIO; cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); @@ -6941,7 +6995,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; #endif - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ #if defined(CONFIG_PREEMPT) @@ -6974,22 +7028,43 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static inline void sched_init_granularity(void) +static int get_update_sysctl_factor(void) { - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 200000000; + unsigned int cpus = min_t(int, num_online_cpus(), 8); + unsigned int factor; + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; + return factor; +} - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; +static void update_sysctl(void) +{ + unsigned int factor = get_update_sysctl_factor(); - sysctl_sched_wakeup_granularity *= factor; +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_shares_ratelimit); +#undef SET_SYSCTL +} - sysctl_sched_shares_ratelimit *= factor; +static inline void sched_init_granularity(void) +{ + update_sysctl(); } #ifdef CONFIG_SMP @@ -7026,7 +7101,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpumask_intersects(new_mask, cpu_online_mask)) { + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; } @@ -7048,7 +7123,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ struct task_struct *mt = rq->migration_thread; @@ -7137,10 +7212,10 @@ static int migration_thread(void *data) struct migration_req *req; struct list_head *head; - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); if (cpu_is_offline(cpu)) { - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); break; } @@ -7152,7 +7227,7 @@ static int migration_thread(void *data) head = &rq->migration_queue; if (list_empty(head)) { - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); schedule(); set_current_state(TASK_INTERRUPTIBLE); continue; @@ -7161,14 +7236,14 @@ static int migration_thread(void *data) list_del_init(head->next); if (req->task != NULL) { - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); __migrate_task(req->task, cpu, req->dest_cpu); } else if (likely(cpu == (badcpu = smp_processor_id()))) { req->dest_cpu = RCU_MIGRATION_GOT_QS; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } else { req->dest_cpu = RCU_MIGRATION_MUST_SYNC; - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); } local_irq_enable(); @@ -7202,19 +7277,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) again: /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto move; /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); if (dest_cpu < nr_cpu_ids) goto move; /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); /* * Don't tell them about moving exiting tasks or @@ -7243,7 +7318,7 @@ move: */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); unsigned long flags; local_irq_save(flags); @@ -7291,14 +7366,14 @@ void sched_idle_next(void) * Strictly not necessary since rest of the CPUs are stopped by now * and interrupts disabled on the current cpu. */ - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); update_rq_clock(rq); activate_task(rq, p, 0); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -7334,9 +7409,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) * that's OK. No task can be added to this CPU, so iteration is * fine. */ - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); move_task_off_dead_cpu(dead_cpu, p); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); put_task_struct(p); } @@ -7377,17 +7452,16 @@ static struct ctl_table sd_ctl_dir[] = { .procname = "sched_domain", .mode = 0555, }, - {0, }, + {} }; static struct ctl_table sd_ctl_root[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = sd_ctl_dir, }, - {0, }, + {} }; static struct ctl_table *sd_alloc_ctl_entry(int n) @@ -7497,7 +7571,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) static struct ctl_table_header *sd_sysctl_header; static void register_sched_domain_sysctl(void) { - int i, cpu_num = num_online_cpus(); + int i, cpu_num = num_possible_cpus(); struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); char buf[32]; @@ -7507,7 +7581,7 @@ static void register_sched_domain_sysctl(void) if (entry == NULL) return; - for_each_online_cpu(i) { + for_each_possible_cpu(i) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0555; @@ -7603,13 +7677,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); break; #ifdef CONFIG_HOTPLUG_CPU @@ -7634,14 +7708,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) put_task_struct(rq->migration_thread); rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); - rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); @@ -7651,30 +7724,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) * they didn't take sched_hotcpu_mutex. Just wake up * the requestors. */ - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { struct migration_req *req; req = list_entry(rq->migration_queue.next, struct migration_req, list); list_del_init(&req->list); - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); complete(&req->done); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); } - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); break; case CPU_DYING: case CPU_DYING_FROZEN: /* Update our root-domain */ rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); break; #endif } @@ -7711,6 +7784,16 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ + sched_domain_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_domain_debug_setup); + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { @@ -7797,6 +7880,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) cpumask_var_t groupmask; int level = 0; + if (!sched_domain_debug_enabled) + return; + if (!sd) { printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); return; @@ -7876,6 +7962,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + synchronize_sched(); + cpupri_cleanup(&rd->cpupri); free_cpumask_var(rd->rto_mask); @@ -7889,7 +7977,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) struct root_domain *old_rd = NULL; unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { old_rd = rq->rd; @@ -7915,7 +8003,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) free_rootdomain(old_rd); @@ -8016,6 +8104,7 @@ static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + alloc_bootmem_cpumask_var(&cpu_isolated_map); cpulist_parse(str, cpu_isolated_map); return 1; } @@ -8200,14 +8289,14 @@ enum s_alloc { */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); +static DEFINE_PER_CPU(struct static_sched_group, sched_groups); static int cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *unused) { if (sg) - *sg = &per_cpu(sched_group_cpus, cpu).sg; + *sg = &per_cpu(sched_groups, cpu).sg; return cpu; } #endif /* CONFIG_SCHED_SMT */ @@ -8852,7 +8941,7 @@ static int build_sched_domains(const struct cpumask *cpu_map) return __build_sched_domains(cpu_map, NULL); } -static struct cpumask *doms_cur; /* current sched domains */ +static cpumask_var_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static struct sched_domain_attr *dattr_cur; /* attribues of custom domains in 'doms_cur' */ @@ -8874,6 +8963,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void) return 0; } +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); + return NULL; + } + } + return doms; +} + +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -8885,12 +8999,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) arch_update_cpu_topology(); ndoms_cur = 1; - doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); + doms_cur = alloc_sched_domains(ndoms_cur); if (!doms_cur) - doms_cur = fallback_doms; - cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); + doms_cur = &fallback_doms; + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); dattr_cur = NULL; - err = build_sched_domains(doms_cur); + err = build_sched_domains(doms_cur[0]); register_sched_domain_sysctl(); return err; @@ -8940,19 +9054,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * - * 'doms_new' is an array of cpumask's of length 'ndoms_new'. + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. * The masks don't intersect (don't overlap.) We should setup one * sched domain for each mask. CPUs not in any of the cpumasks will * not be load balanced. If the same cpumask appears both in the * current 'doms_cur' domains and in the new 'doms_new', we can leave * it as it is. * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL && - * ndoms_new == 1, and partition_sched_domains() will fallback to - * the single partition 'fallback_doms', it also forces the domains - * to be rebuilt. + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. * * If doms_new == NULL it will be replaced with cpu_online_mask. * ndoms_new == 0 is a special case for destroying existing domains, @@ -8960,8 +9074,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock held */ -/* FIXME: Change to struct cpumask *doms_new[] */ -void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { int i, j, n; @@ -8980,40 +9093,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(&doms_cur[i], &doms_new[j]) + if (cpumask_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ - detach_destroy_domains(doms_cur + i); + detach_destroy_domains(doms_cur[i]); match1: ; } if (doms_new == NULL) { ndoms_cur = 0; - doms_new = fallback_doms; - cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur && !new_topology; j++) { - if (cpumask_equal(&doms_new[i], &doms_cur[j]) + if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - __build_sched_domains(doms_new + i, + __build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); match2: ; } /* Remember the new sched domains */ - if (doms_cur != fallback_doms) - kfree(doms_cur); + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; dattr_cur = dattr_new; @@ -9124,8 +9237,10 @@ static int update_sched_domains(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; @@ -9172,7 +9287,7 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_online_mask); + arch_init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -9245,13 +9360,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks, &rq->lock); + plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; - spin_lock_init(&rt_rq->rt_runtime_lock); + raw_spin_lock_init(&rt_rq->rt_runtime_lock); #ifdef CONFIG_RT_GROUP_SCHED rt_rq->rt_nr_boosted = 0; @@ -9335,10 +9450,6 @@ void __init sched_init(void) #ifdef CONFIG_CPUMASK_OFFSTACK alloc_size += num_possible_cpus() * cpumask_size(); #endif - /* - * As sched_init() is called before page_alloc is setup, - * we use alloc_bootmem(). - */ if (alloc_size) { ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); @@ -9407,11 +9518,15 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_GROUP_SCHED */ +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP + update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), + __alignof__(unsigned long)); +#endif for_each_possible_cpu(i) { struct rq *rq; rq = cpu_rq(i); - spin_lock_init(&rq->lock); + raw_spin_lock_init(&rq->lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -9471,7 +9586,7 @@ void __init sched_init(void) #elif defined CONFIG_USER_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); init_tg_rt_entry(&init_task_group, - &per_cpu(init_rt_rq, i), + &per_cpu(init_rt_rq_var, i), &per_cpu(init_sched_rt_entity, i), i, 1, root_task_group.rt_se[i]); #endif @@ -9489,6 +9604,8 @@ void __init sched_init(void) rq->cpu = i; rq->online = 0; rq->migration_thread = NULL; + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif @@ -9507,7 +9624,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); + plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); #endif /* @@ -9532,13 +9649,15 @@ void __init sched_init(void) current->sched_class = &fair_sched_class; /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); @@ -9630,13 +9749,13 @@ void normalize_rt_tasks(void) continue; } - spin_lock(&p->pi_lock); + raw_spin_lock(&p->pi_lock); rq = __task_rq_lock(p); normalize_task(rq, p); __task_rq_unlock(rq); - spin_unlock(&p->pi_lock); + raw_spin_unlock(&p->pi_lock); } while_each_thread(g, p); read_unlock_irqrestore(&tasklist_lock, flags); @@ -9732,13 +9851,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) se = kzalloc_node(sizeof(struct sched_entity), GFP_KERNEL, cpu_to_node(i)); if (!se) - goto err; + goto err_free_rq; init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; + err_free_rq: + kfree(cfs_rq); err: return 0; } @@ -9820,13 +9941,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) rt_se = kzalloc_node(sizeof(struct sched_rt_entity), GFP_KERNEL, cpu_to_node(i)); if (!rt_se) - goto err; + goto err_free_rq; init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; + err_free_rq: + kfree(rt_rq); err: return 0; } @@ -9995,9 +10118,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) struct rq *rq = cfs_rq->rq; unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __set_se_shares(se, shares); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -10182,18 +10305,18 @@ static int tg_set_bandwidth(struct task_group *tg, if (err) goto unlock; - spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); tg->rt_bandwidth.rt_runtime = rt_runtime; for_each_possible_cpu(i) { struct rt_rq *rt_rq = tg->rt_rq[i]; - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_runtime; - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); + raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -10298,15 +10421,15 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_runtime == 0) return -EBUSY; - spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = global_rt_runtime(); - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); return 0; } @@ -10597,9 +10720,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) /* * Take rq->lock to make 64-bit read safe on 32-bit platforms. */ - spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_lock_irq(&cpu_rq(cpu)->lock); data = *cpuusage; - spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); #else data = *cpuusage; #endif @@ -10615,9 +10738,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* * Take rq->lock to make 64-bit write safe on 32-bit platforms. */ - spin_lock_irq(&cpu_rq(cpu)->lock); + raw_spin_lock_irq(&cpu_rq(cpu)->lock); *cpuusage = val; - spin_unlock_irq(&cpu_rq(cpu)->lock); + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); #else *cpuusage = val; #endif @@ -10851,9 +10974,9 @@ void synchronize_sched_expedited(void) init_completion(&req->done); req->task = NULL; req->dest_cpu = RCU_MIGRATION_NEED_QS; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); list_add(&req->list, &rq->migration_queue); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); wake_up_process(rq->migration_thread); } for_each_online_cpu(cpu) { @@ -10861,13 +10984,14 @@ void synchronize_sched_expedited(void) req = &per_cpu(rcu_migration_req, cpu); rq = cpu_rq(cpu); wait_for_completion(&req->done); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) need_full_sync = 1; req->dest_cpu = RCU_MIGRATION_IDLE; - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; + synchronize_sched_expedited_count++; mutex_unlock(&rcu_sched_expedited_mutex); put_online_cpus(); if (need_full_sync) diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 0f052fc674d..597b33099df 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -135,26 +135,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; - spin_lock_irqsave(&vec->lock, flags); + raw_spin_lock_irqsave(&vec->lock, flags); cpumask_set_cpu(cpu, vec->mask); vec->count++; if (vec->count == 1) set_bit(newpri, cp->pri_active); - spin_unlock_irqrestore(&vec->lock, flags); + raw_spin_unlock_irqrestore(&vec->lock, flags); } if (likely(oldpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - spin_lock_irqsave(&vec->lock, flags); + raw_spin_lock_irqsave(&vec->lock, flags); vec->count--; if (!vec->count) clear_bit(oldpri, cp->pri_active); cpumask_clear_cpu(cpu, vec->mask); - spin_unlock_irqrestore(&vec->lock, flags); + raw_spin_unlock_irqrestore(&vec->lock, flags); } *currpri = newpri; @@ -180,7 +180,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; - spin_lock_init(&vec->lock); + raw_spin_lock_init(&vec->lock); vec->count = 0; if (!zalloc_cpumask_var(&vec->mask, gfp)) goto cleanup; diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 9a7e859b8fb..7cb5bb6b95b 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -12,7 +12,7 @@ /* values 2-101 are RT priorities 0-99 */ struct cpupri_vec { - spinlock_t lock; + raw_spinlock_t lock; int count; cpumask_var_t mask; }; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index efb84409bc4..67f95aada4b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -184,7 +184,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); @@ -192,7 +192,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) max_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", @@ -285,12 +285,16 @@ static void print_cpu(struct seq_file *m, int cpu) #ifdef CONFIG_SCHEDSTATS #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P(yld_count); P(sched_switch); P(sched_count); P(sched_goidle); +#ifdef CONFIG_SMP + P64(avg_idle); +#endif P(ttwu_count); P(ttwu_local); @@ -305,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu) print_rq(m, rq, cpu); } +static const char *sched_tunable_scaling_names[] = { + "none", + "logaritmic", + "linear" +}; + static int sched_debug_show(struct seq_file *m, void *v) { u64 now = ktime_to_ns(ktime_get()); @@ -330,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P + SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + sysctl_sched_tunable_scaling, + sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + for_each_online_cpu(cpu) print_cpu(m, cpu); @@ -395,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.sum_exec_runtime); PN(se.avg_overlap); PN(se.avg_wakeup); - PN(se.avg_running); nr_switches = p->nvcsw + p->nivcsw; @@ -419,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.nr_failed_migrations_running); P(se.nr_failed_migrations_hot); P(se.nr_forced_migrations); - P(se.nr_forced2_migrations); P(se.nr_wakeups); P(se.nr_wakeups_sync); P(se.nr_wakeups_migrate); @@ -495,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; p->se.nr_wakeups_migrate = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4e777b47eed..5bedf6e3ebf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,6 +21,7 @@ */ #include <linux/latencytop.h> +#include <linux/sched.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -35,12 +36,26 @@ * run vmstat and monitor the context-switches (cs) field) */ unsigned int sysctl_sched_latency = 5000000ULL; +unsigned int normalized_sysctl_sched_latency = 5000000ULL; + +/* + * The initial- and re-scaling of tunables is configurable + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * + * Options are: + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + */ +enum sched_tunable_scaling sysctl_sched_tunable_scaling + = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 1000000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity @@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield; * have immediate wakeup/sleep latencies. */ unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ #ifdef CONFIG_SCHED_DEBUG -int sched_nr_latency_handler(struct ctl_table *table, int write, +int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int factor = get_update_sysctl_factor(); if (ret || !write) return ret; @@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity); +#define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) + WRT_SYSCTL(sched_min_granularity); + WRT_SYSCTL(sched_latency); + WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_shares_ratelimit); +#undef WRT_SYSCTL + return 0; } #endif @@ -822,6 +847,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); + return; + } + + /* + * Ensure that a task that missed wakeup preemption by a + * narrow margin doesn't have to wait for a full slice. + * This also mitigates buddy induced latencies under load. + */ + if (!sched_feat(WAKEUP_PREEMPT)) + return; + + if (delta_exec < sysctl_sched_min_granularity) + return; + + if (cfs_rq->nr_running > 1) { + struct sched_entity *se = __pick_next_entity(cfs_rq); + s64 delta = curr->vruntime - se->vruntime; + + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); } } @@ -861,12 +906,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) - return cfs_rq->next; + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) - return cfs_rq->last; + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) + se = cfs_rq->last; + + clear_buddies(cfs_rq, se); return se; } @@ -1319,6 +1370,37 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } /* + * Try and locate an idle CPU in the sched_domain. + */ +static int +select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) +{ + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int i; + + /* + * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE + * test in select_task_rq_fair) and the prev_cpu is idle then that's + * always a better target than the current cpu. + */ + if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) + return prev_cpu; + + /* + * Otherwise, iterate the domain and find an elegible idle cpu. + */ + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (!cpu_rq(i)->cfs.nr_running) { + target = i; + break; + } + } + + return target; +} + +/* * sched_balance_self: balance the current task (running on cpu) in domains * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and * SD_BALANCE_EXEC. @@ -1346,7 +1428,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag new_cpu = prev_cpu; } - rcu_read_lock(); for_each_domain(cpu, tmp) { /* * If power savings logic is enabled for a domain, see if we @@ -1372,11 +1453,35 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag want_sd = 0; } - if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && - cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + /* + * While iterating the domains looking for a spanning + * WAKE_AFFINE domain, adjust the affine target to any idle cpu + * in cache sharing domains along the way. + */ + if (want_affine) { + int target = -1; - affine_sd = tmp; - want_affine = 0; + /* + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. + */ + if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) + target = cpu; + + /* + * If there's an idle sibling in this domain, make that + * the wake_affine target instead of the current cpu. + */ + if (tmp->flags & SD_PREFER_SIBLING) + target = select_idle_sibling(p, tmp, target); + + if (target >= 0) { + if (tmp->flags & SD_WAKE_AFFINE) { + affine_sd = tmp; + want_affine = 0; + } + cpu = target; + } } if (!want_sd && !want_affine) @@ -1403,10 +1508,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag update_shares(tmp); } - if (affine_sd && wake_affine(affine_sd, p, sync)) { - new_cpu = cpu; - goto out; - } + if (affine_sd && wake_affine(affine_sd, p, sync)) + return cpu; while (sd) { int load_idx = sd->forkexec_idx; @@ -1447,8 +1550,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag /* while loop will break here if sd == NULL */ } -out: - rcu_read_unlock(); return new_cpu; } #endif /* CONFIG_SMP */ @@ -1568,13 +1669,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); int sync = wake_flags & WF_SYNC; + int scale = cfs_rq->nr_running >= sched_nr_latency; - update_curr(cfs_rq); - - if (unlikely(rt_prio(p->prio))) { - resched_task(curr); - return; - } + if (unlikely(rt_prio(p->prio))) + goto preempt; if (unlikely(p->sched_class != &fair_sched_class)) return; @@ -1582,18 +1680,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(se == pse)) return; - /* - * Only set the backward buddy when the current task is still on the - * rq. This can happen when a wakeup gets interleaved with schedule on - * the ->pre_schedule() or idle_balance() point, either of which can - * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, for - * obvious reasons its a bad idea to schedule back to the idle thread. - */ - if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) - set_last_buddy(se); - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) set_next_buddy(pse); /* @@ -1611,36 +1698,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* Idle tasks are by definition preempted by everybody. */ - if (unlikely(curr->policy == SCHED_IDLE)) { - resched_task(curr); - return; - } + if (unlikely(curr->policy == SCHED_IDLE)) + goto preempt; - if ((sched_feat(WAKEUP_SYNC) && sync) || - (sched_feat(WAKEUP_OVERLAP) && - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { - resched_task(curr); - return; - } + if (sched_feat(WAKEUP_SYNC) && sync) + goto preempt; - if (sched_feat(WAKEUP_RUNNING)) { - if (pse->avg_running < se->avg_running) { - set_next_buddy(pse); - resched_task(curr); - return; - } - } + if (sched_feat(WAKEUP_OVERLAP) && + se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost) + goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; + update_curr(cfs_rq); find_matching_se(&se, &pse); - BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) - resched_task(curr); + goto preempt; + + return; + +preempt: + resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); } static struct task_struct *pick_next_task_fair(struct rq *rq) @@ -1649,21 +1744,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; - if (unlikely(!cfs_rq->nr_running)) + if (!cfs_rq->nr_running) return NULL; do { se = pick_next_entity(cfs_rq); - /* - * If se was a buddy, clear it so that it will have to earn - * the favour again. - * - * If se was not a buddy, clear the buddies because neither - * was elegible to run, let them earn it again. - * - * IOW. unconditionally clear buddies. - */ - __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -1830,6 +1915,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } + +static void rq_online_fair(struct rq *rq) +{ + update_sysctl(); +} + +static void rq_offline_fair(struct rq *rq) +{ + update_sysctl(); +} + #endif /* CONFIG_SMP */ /* @@ -1847,28 +1943,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) } /* - * Share the fairness runtime between parent and child, thus the - * total amount of pressure for CPU stays equal - new tasks - * get a chance to run but frequent forkers are not allowed to - * monopolize the CPU. Note: the parent runqueue is locked, - * the child is not running yet. + * called on fork with the child task as argument from the parent's context + * - child not yet on the tasklist + * - preemption disabled */ -static void task_new_fair(struct rq *rq, struct task_struct *p) +static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct cfs_rq *cfs_rq = task_cfs_rq(current); struct sched_entity *se = &p->se, *curr = cfs_rq->curr; int this_cpu = smp_processor_id(); + struct rq *rq = this_rq(); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); - sched_info_queued(p); + if (unlikely(task_cpu(p) != this_cpu)) + __set_task_cpu(p, this_cpu); update_curr(cfs_rq); + if (curr) se->vruntime = curr->vruntime; place_entity(cfs_rq, se, 1); - /* 'curr' will be NULL if the child belongs to a different group */ - if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && - curr && entity_before(curr, se)) { + if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { /* * Upon rescheduling, sched_class::put_prev_task() will place * 'current' within the tree based on its new key value. @@ -1877,7 +1975,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } - enqueue_task_fair(rq, p, 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); } /* @@ -1939,21 +2037,17 @@ static void moved_group_fair(struct task_struct *p) } #endif -unsigned int get_rr_interval_fair(struct task_struct *task) +unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { struct sched_entity *se = &task->se; - unsigned long flags; - struct rq *rq; unsigned int rr_interval = 0; /* * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise * idle runqueue: */ - rq = task_rq_lock(task, &flags); if (rq->cfs.load.weight) rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); - task_rq_unlock(rq, &flags); return rr_interval; } @@ -1977,11 +2071,13 @@ static const struct sched_class fair_sched_class = { .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, + .rq_online = rq_online_fair, + .rq_offline = rq_offline_fair, #endif .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, - .task_new = task_new_fair, + .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 0d94083582c..d5059fd761d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0) SCHED_FEAT(WAKEUP_OVERLAP, 0) /* - * Wakeup preemption towards tasks that run short - */ -SCHED_FEAT(WAKEUP_RUNNING, 0) - -/* * Use the SYNC wakeup hint, pipes and the likes use this to indicate * the remote end is likely to consume the data we just wrote, and * therefore has cache benefit from being placed on the same cpu, see diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index b133a28fcde..5f93b570d38 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -34,10 +34,10 @@ static struct task_struct *pick_next_task_idle(struct rq *rq) static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { - spin_unlock_irq(&rq->lock); + raw_spin_unlock_irq(&rq->lock); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); - spin_lock_irq(&rq->lock); + raw_spin_lock_irq(&rq->lock); } static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) @@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, check_preempt_curr(rq, p, 0); } -unsigned int get_rr_interval_idle(struct task_struct *task) +unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) { return 0; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a4d790cddb1..d2ea2828164 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -327,7 +327,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) weight = cpumask_weight(rd->span); - spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); rt_period = ktime_to_ns(rt_b->rt_period); for_each_cpu(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); @@ -336,7 +336,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) if (iter == rt_rq) continue; - spin_lock(&iter->rt_runtime_lock); + raw_spin_lock(&iter->rt_runtime_lock); /* * Either all rqs have inf runtime and there's nothing to steal * or __disable_runtime() below sets a specific rq to inf to @@ -358,14 +358,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq) rt_rq->rt_runtime += diff; more = 1; if (rt_rq->rt_runtime == rt_period) { - spin_unlock(&iter->rt_runtime_lock); + raw_spin_unlock(&iter->rt_runtime_lock); break; } } next: - spin_unlock(&iter->rt_runtime_lock); + raw_spin_unlock(&iter->rt_runtime_lock); } - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); return more; } @@ -386,8 +386,8 @@ static void __disable_runtime(struct rq *rq) s64 want; int i; - spin_lock(&rt_b->rt_runtime_lock); - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); /* * Either we're all inf and nobody needs to borrow, or we're * already disabled and thus have nothing to do, or we have @@ -396,7 +396,7 @@ static void __disable_runtime(struct rq *rq) if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); /* * Calculate the difference between what we started out with @@ -418,7 +418,7 @@ static void __disable_runtime(struct rq *rq) if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; - spin_lock(&iter->rt_runtime_lock); + raw_spin_lock(&iter->rt_runtime_lock); if (want > 0) { diff = min_t(s64, iter->rt_runtime, want); iter->rt_runtime -= diff; @@ -427,13 +427,13 @@ static void __disable_runtime(struct rq *rq) iter->rt_runtime -= want; want -= want; } - spin_unlock(&iter->rt_runtime_lock); + raw_spin_unlock(&iter->rt_runtime_lock); if (!want) break; } - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); /* * We cannot be left wanting - that would mean some runtime * leaked out of the system. @@ -445,8 +445,8 @@ balanced: * runtime - in which case borrowing doesn't make sense. */ rt_rq->rt_runtime = RUNTIME_INF; - spin_unlock(&rt_rq->rt_runtime_lock); - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); } } @@ -454,9 +454,9 @@ static void disable_runtime(struct rq *rq) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __disable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static void __enable_runtime(struct rq *rq) @@ -472,13 +472,13 @@ static void __enable_runtime(struct rq *rq) for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); - spin_lock(&rt_b->rt_runtime_lock); - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; - spin_unlock(&rt_rq->rt_runtime_lock); - spin_unlock(&rt_b->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); } } @@ -486,9 +486,9 @@ static void enable_runtime(struct rq *rq) { unsigned long flags; - spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock_irqsave(&rq->lock, flags); __enable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); } static int balance_runtime(struct rt_rq *rt_rq) @@ -496,9 +496,9 @@ static int balance_runtime(struct rt_rq *rt_rq) int more = 0; if (rt_rq->rt_time > rt_rq->rt_runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); more = do_balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); } return more; @@ -524,11 +524,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); - spin_lock(&rq->lock); + raw_spin_lock(&rq->lock); if (rt_rq->rt_time) { u64 runtime; - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); if (rt_rq->rt_throttled) balance_runtime(rt_rq); runtime = rt_rq->rt_runtime; @@ -539,13 +539,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } else if (rt_rq->rt_nr_running) idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); - spin_unlock(&rq->lock); + raw_spin_unlock(&rq->lock); } return idle; @@ -624,11 +624,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { - spin_lock(&rt_rq->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); - spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_rq->rt_runtime_lock); } } } @@ -1153,29 +1153,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); -static inline int pick_optimal_cpu(int this_cpu, - const struct cpumask *mask) -{ - int first; - - /* "this_cpu" is cheaper to preempt than a remote processor */ - if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) - return this_cpu; - - first = cpumask_first(mask); - if (first < nr_cpu_ids) - return first; - - return -1; -} - static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - cpumask_var_t domain_mask; if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ @@ -1198,28 +1181,26 @@ static int find_lowest_rq(struct task_struct *task) * Otherwise, we consult the sched_domains span maps to figure * out which cpu is logically closest to our hot cache data. */ - if (this_cpu == cpu) - this_cpu = -1; /* Skip this_cpu opt if the same */ - - if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - int best_cpu; + if (!cpumask_test_cpu(this_cpu, lowest_mask)) + this_cpu = -1; /* Skip this_cpu opt if not among lowest */ - cpumask_and(domain_mask, - sched_domain_span(sd), - lowest_mask); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; - best_cpu = pick_optimal_cpu(this_cpu, - domain_mask); - - if (best_cpu != -1) { - free_cpumask_var(domain_mask); - return best_cpu; - } - } + /* + * "this_cpu" is cheaper to preempt than a + * remote processor. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) + return this_cpu; + + best_cpu = cpumask_first_and(lowest_mask, + sched_domain_span(sd)); + if (best_cpu < nr_cpu_ids) + return best_cpu; } - free_cpumask_var(domain_mask); } /* @@ -1227,7 +1208,13 @@ static int find_lowest_rq(struct task_struct *task) * just give the caller *something* to work with from the compatible * locations. */ - return pick_optimal_cpu(this_cpu, lowest_mask); + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(lowest_mask); + if (cpu < nr_cpu_ids) + return cpu; + return -1; } /* Will lock the rq it finds */ @@ -1259,7 +1246,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->se.on_rq)) { - spin_unlock(&lowest_rq->lock); + raw_spin_unlock(&lowest_rq->lock); lowest_rq = NULL; break; } @@ -1734,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq) dequeue_pushable_task(rq, p); } -unsigned int get_rr_interval_rt(struct task_struct *task) +unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) { /* * Time slice is 0 for SCHED_FIFO tasks diff --git a/kernel/signal.c b/kernel/signal.c index 6705320784f..1814e68e4de 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,12 +22,14 @@ #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/signalfd.h> +#include <linux/ratelimit.h> #include <linux/tracehook.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> -#include <trace/events/sched.h> +#define CREATE_TRACE_POINTS +#include <trace/events/signal.h> #include <asm/param.h> #include <asm/uaccess.h> @@ -41,6 +43,8 @@ static struct kmem_cache *sigqueue_cachep; +int print_fatal_signals __read_mostly; + static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; @@ -159,7 +163,7 @@ int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; - + s = pending->signal.sig; m = mask->sig; switch (_NSIG_WORDS) { @@ -184,17 +188,31 @@ int next_signal(struct sigpending *pending, sigset_t *mask) sig = ffz(~x) + 1; break; } - + return sig; } +static inline void print_dropped_signal(int sig) +{ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!print_fatal_signals) + return; + + if (!__ratelimit(&ratelimit_state)) + return; + + printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", + current->comm, current->pid, sig); +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appopriate lock must be held to stop the target task from exiting */ -static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, - int override_rlimit) +static struct sigqueue * +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; struct user_struct *user; @@ -207,10 +225,15 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, */ user = get_uid(__task_cred(t)->user); atomic_inc(&user->sigpending); + if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) + t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { q = kmem_cache_alloc(sigqueue_cachep, flags); + } else { + print_dropped_signal(sig); + } + if (unlikely(q == NULL)) { atomic_dec(&user->sigpending); free_uid(user); @@ -400,7 +423,7 @@ still_pending: */ info->si_signo = sig; info->si_errno = 0; - info->si_code = 0; + info->si_code = SI_USER; info->si_pid = 0; info->si_uid = 0; } @@ -584,6 +607,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s) return 1; } +static inline int is_si_special(const struct siginfo *info) +{ + return info <= SEND_SIG_FORCED; +} + +static inline bool si_fromuser(const struct siginfo *info) +{ + return info == SEND_SIG_NOINFO || + (!is_si_special(info) && SI_FROMUSER(info)); +} + /* * Bad permissions for sending the signal * - the caller must hold at least the RCU read lock @@ -598,7 +632,7 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return -EINVAL; - if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) + if (!si_fromuser(info)) return 0; error = audit_signal_info(sig, t); /* Let audit system see the signal */ @@ -834,7 +868,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigqueue *q; int override_rlimit; - trace_sched_signal_send(sig, t); + trace_signal_generate(sig, info, t); assert_spin_locked(&t->sighand->siglock); @@ -869,7 +903,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, + q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); @@ -896,12 +930,21 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, break; } } else if (!is_si_special(info)) { - if (sig >= SIGRTMIN && info->si_code != SI_USER) - /* - * Queue overflow, abort. We may abort if the signal was rt - * and sent by user using something other than kill(). - */ + if (sig >= SIGRTMIN && info->si_code != SI_USER) { + /* + * Queue overflow, abort. We may abort if the + * signal was rt and sent by user using something + * other than kill(). + */ + trace_signal_overflow_fail(sig, group, info); return -EAGAIN; + } else { + /* + * This is a silent loss of information. We still + * send the signal, but the *info bits are lost. + */ + trace_signal_lose_info(sig, group, info); + } } out_set: @@ -917,16 +960,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, int from_ancestor_ns = 0; #ifdef CONFIG_PID_NS - if (!is_si_special(info) && SI_FROMUSER(info) && - task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) - from_ancestor_ns = 1; + from_ancestor_ns = si_fromuser(info) && + !task_pid_nr_ns(current, task_active_pid_ns(t)); #endif return __send_signal(sig, info, t, group, from_ancestor_ns); } -int print_fatal_signals; - static void print_fatal_signal(struct pt_regs *regs, int signr) { printk("%s/%d: potentially unexpected fatal signal %d.\n", @@ -1022,12 +1062,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) return ret; } -void -force_sig_specific(int sig, struct task_struct *t) -{ - force_sig_info(sig, SEND_SIG_FORCED, t); -} - /* * Nuke all other threads in the group. */ @@ -1156,8 +1190,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, goto out_unlock; } pcred = __task_cred(p); - if ((info == SEND_SIG_NOINFO || - (!is_si_special(info) && SI_FROMUSER(info))) && + if (si_fromuser(info) && euid != pcred->suid && euid != pcred->uid && uid != pcred->suid && uid != pcred->uid) { ret = -EPERM; @@ -1293,19 +1326,19 @@ EXPORT_SYMBOL(kill_pid); * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers + * expirations or I/O completions". In the case of Posix Timers * we allocate the sigqueue structure from the timer_create. If this * allocation fails we are able to report the failure to the application * with an EAGAIN error. */ - struct sigqueue *sigqueue_alloc(void) { - struct sigqueue *q; + struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); - if ((q = __sigqueue_alloc(current, GFP_KERNEL, 0))) + if (q) q->flags |= SIGQUEUE_PREALLOC; - return(q); + + return q; } void sigqueue_free(struct sigqueue *q) @@ -1807,11 +1840,6 @@ relock: for (;;) { struct k_sigaction *ka; - - if (unlikely(signal->group_stop_count > 0) && - do_signal_stop(0)) - goto relock; - /* * Tracing can induce an artifical signal and choose sigaction. * The return value in @signr determines the default action, @@ -1823,6 +1851,10 @@ relock: if (unlikely(signr != 0)) ka = return_ka; else { + if (unlikely(signal->group_stop_count > 0) && + do_signal_stop(0)) + goto relock; + signr = dequeue_signal(current, ¤t->blocked, info); @@ -1839,6 +1871,9 @@ relock: ka = &sighand->action[signr-1]; } + /* Trace actually delivered signals. */ + trace_signal_deliver(signr, info, ka); + if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c new file mode 100644 index 00000000000..e45c4364529 --- /dev/null +++ b/kernel/slow-work-debugfs.c @@ -0,0 +1,227 @@ +/* Slow work debugging + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/slow-work.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/seq_file.h> +#include "slow-work.h" + +#define ITERATOR_SHIFT (BITS_PER_LONG - 4) +#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) +#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) + +void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) +{ + seq_puts(m, "Slow-work: New thread"); +} + +/* + * Render the time mark field on a work item into a 5-char time with units plus + * a space + */ +static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) +{ + struct timespec now, diff; + + now = CURRENT_TIME; + diff = timespec_sub(now, work->mark); + + if (diff.tv_sec < 0) + seq_puts(m, " -ve "); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) + seq_printf(m, "%3luns ", diff.tv_nsec); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) + seq_printf(m, "%3luus ", diff.tv_nsec / 1000); + else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) + seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); + else if (diff.tv_sec <= 1) + seq_puts(m, " 1s "); + else if (diff.tv_sec < 60) + seq_printf(m, "%4lus ", diff.tv_sec); + else if (diff.tv_sec < 60 * 60) + seq_printf(m, "%4lum ", diff.tv_sec / 60); + else if (diff.tv_sec < 60 * 60 * 24) + seq_printf(m, "%4luh ", diff.tv_sec / 3600); + else + seq_puts(m, "exces "); +} + +/* + * Describe a slow work item for debugfs + */ +static int slow_work_runqueue_show(struct seq_file *m, void *v) +{ + struct slow_work *work; + struct list_head *p = v; + unsigned long id; + + switch ((unsigned long) v) { + case 1: + seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); + return 0; + case 2: + seq_puts(m, "=== ===== ================ == ===== ==========\n"); + return 0; + + case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: + id = (unsigned long) v - 3; + + read_lock(&slow_work_execs_lock); + work = slow_work_execs[id]; + if (work) { + smp_read_barrier_depends(); + + seq_printf(m, "%3lu %5d %16p %2lx ", + id, slow_work_pids[id], work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + } + read_unlock(&slow_work_execs_lock); + return 0; + + default: + work = list_entry(p, struct slow_work, link); + seq_printf(m, "%3s - %16p %2lx ", + work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", + work, work->flags); + slow_work_print_mark(m, work); + + if (work->ops->desc) + work->ops->desc(work, m); + seq_putc(m, '\n'); + return 0; + } +} + +/* + * map the iterator to a work item + */ +static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) +{ + struct list_head *p; + unsigned long count, id; + + switch (*_pos >> ITERATOR_SHIFT) { + case 0x0: + if (*_pos == 0) + *_pos = 1; + if (*_pos < 3) + return (void *)(unsigned long) *_pos; + if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) + for (id = *_pos - 3; + id < SLOW_WORK_THREAD_LIMIT; + id++, (*_pos)++) + if (slow_work_execs[id]) + return (void *)(unsigned long) *_pos; + *_pos = 0x1UL << ITERATOR_SHIFT; + + case 0x1: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &slow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + + case 0x2: + count = *_pos & ITERATOR_COUNTER; + list_for_each(p, &vslow_work_queue) { + if (count == 0) + return p; + count--; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * set up the iterator to start reading from the first line + */ +static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) +{ + spin_lock_irq(&slow_work_queue_lock); + return slow_work_runqueue_index(m, _pos); +} + +/* + * move to the next line + */ +static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) +{ + struct list_head *p = v; + unsigned long selector = *_pos >> ITERATOR_SHIFT; + + (*_pos)++; + switch (selector) { + case 0x0: + return slow_work_runqueue_index(m, _pos); + + case 0x1: + if (*_pos >> ITERATOR_SHIFT == 0x1) { + p = p->next; + if (p != &slow_work_queue) + return p; + } + *_pos = 0x2UL << ITERATOR_SHIFT; + p = &vslow_work_queue; + + case 0x2: + if (*_pos >> ITERATOR_SHIFT == 0x2) { + p = p->next; + if (p != &vslow_work_queue) + return p; + } + *_pos = 0x3UL << ITERATOR_SHIFT; + + default: + return NULL; + } +} + +/* + * clean up after reading + */ +static void slow_work_runqueue_stop(struct seq_file *m, void *v) +{ + spin_unlock_irq(&slow_work_queue_lock); +} + +static const struct seq_operations slow_work_runqueue_ops = { + .start = slow_work_runqueue_start, + .stop = slow_work_runqueue_stop, + .next = slow_work_runqueue_next, + .show = slow_work_runqueue_show, +}; + +/* + * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents + */ +static int slow_work_runqueue_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slow_work_runqueue_ops); +} + +const struct file_operations slow_work_runqueue_fops = { + .owner = THIS_MODULE, + .open = slow_work_runqueue_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 0d31135efbf..7494bbf5a27 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -16,11 +16,8 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/wait.h> - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ +#include <linux/debugfs.h> +#include "slow-work.h" static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); @@ -46,13 +43,12 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process #ifdef CONFIG_SYSCTL static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = 255; +static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; static const int slow_work_min_vslow = 1; static const int slow_work_max_vslow = 99; ctl_table slow_work_sysctls[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "min-threads", .data = &slow_work_min_threads, .maxlen = sizeof(unsigned), @@ -62,7 +58,6 @@ ctl_table slow_work_sysctls[] = { .extra2 = &slow_work_max_threads, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "max-threads", .data = &slow_work_max_threads, .maxlen = sizeof(unsigned), @@ -72,16 +67,15 @@ ctl_table slow_work_sysctls[] = { .extra2 = (void *) &slow_work_max_max_threads, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "vslow-percentage", .data = &vslow_work_proportion, .maxlen = sizeof(unsigned), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = (void *) &slow_work_min_vslow, .extra2 = (void *) &slow_work_max_vslow, }, - { .ctl_name = 0 } + {} }; #endif @@ -98,6 +92,56 @@ static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); static struct slow_work slow_work_new_thread; /* new thread starter */ /* + * slow work ID allocation (use slow_work_queue_lock) + */ +static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); + +/* + * Unregistration tracking to prevent put_ref() from disappearing during module + * unload + */ +#ifdef CONFIG_MODULES +static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; +static struct module *slow_work_unreg_module; +static struct slow_work *slow_work_unreg_work_item; +static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); +static DEFINE_MUTEX(slow_work_unreg_sync_lock); + +static void slow_work_set_thread_processing(int id, struct slow_work *work) +{ + if (work) + slow_work_thread_processing[id] = work->owner; +} +static void slow_work_done_thread_processing(int id, struct slow_work *work) +{ + struct module *module = slow_work_thread_processing[id]; + + slow_work_thread_processing[id] = NULL; + smp_mb(); + if (slow_work_unreg_work_item == work || + slow_work_unreg_module == module) + wake_up_all(&slow_work_unreg_wq); +} +static void slow_work_clear_thread_processing(int id) +{ + slow_work_thread_processing[id] = NULL; +} +#else +static void slow_work_set_thread_processing(int id, struct slow_work *work) {} +static void slow_work_done_thread_processing(int id, struct slow_work *work) {} +static void slow_work_clear_thread_processing(int id) {} +#endif + +/* + * Data for tracking currently executing items for indication through /proc + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; +pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; +DEFINE_RWLOCK(slow_work_execs_lock); +#endif + +/* * The queues of work items and the lock governing access to them. These are * shared between all the CPUs. It doesn't make sense to have per-CPU queues * as the number of threads bears no relation to the number of CPUs. @@ -105,9 +149,18 @@ static struct slow_work slow_work_new_thread; /* new thread starter */ * There are two queues of work items: one for slow work items, and one for * very slow work items. */ -static LIST_HEAD(slow_work_queue); -static LIST_HEAD(vslow_work_queue); -static DEFINE_SPINLOCK(slow_work_queue_lock); +LIST_HEAD(slow_work_queue); +LIST_HEAD(vslow_work_queue); +DEFINE_SPINLOCK(slow_work_queue_lock); + +/* + * The following are two wait queues that get pinged when a work item is placed + * on an empty queue. These allow work items that are hogging a thread by + * sleeping in a way that could be deferred to yield their thread and enqueue + * themselves. + */ +static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); +static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); /* * The thread controls. A variable used to signal to the threads that they @@ -126,6 +179,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited); static int slow_work_user_count; static DEFINE_MUTEX(slow_work_user_lock); +static inline int slow_work_get_ref(struct slow_work *work) +{ + if (work->ops->get_ref) + return work->ops->get_ref(work); + + return 0; +} + +static inline void slow_work_put_ref(struct slow_work *work) +{ + if (work->ops->put_ref) + work->ops->put_ref(work); +} + /* * Calculate the maximum number of active threads in the pool that are * permitted to process very slow work items. @@ -149,7 +216,7 @@ static unsigned slow_work_calc_vsmax(void) * Attempt to execute stuff queued on a slow thread. Return true if we managed * it, false if there was nothing to do. */ -static bool slow_work_execute(void) +static noinline bool slow_work_execute(int id) { struct slow_work *work = NULL; unsigned vsmax; @@ -186,6 +253,13 @@ static bool slow_work_execute(void) } else { very_slow = false; /* avoid the compiler warning */ } + + slow_work_set_thread_processing(id, work); + if (work) { + slow_work_mark_time(work); + slow_work_begin_exec(id, work); + } + spin_unlock_irq(&slow_work_queue_lock); if (!work) @@ -194,12 +268,19 @@ static bool slow_work_execute(void) if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) BUG(); - work->ops->execute(work); + /* don't execute if the work is in the process of being cancelled */ + if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) + work->ops->execute(work); if (very_slow) atomic_dec(&vslow_work_executing_count); clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); + /* wake up anyone waiting for this work to be complete */ + wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); + + slow_work_end_exec(id, work); + /* if someone tried to enqueue the item whilst we were executing it, * then it'll be left unenqueued to avoid multiple threads trying to * execute it simultaneously @@ -219,7 +300,10 @@ static bool slow_work_execute(void) spin_unlock_irq(&slow_work_queue_lock); } - work->ops->put_ref(work); + /* sort out the race between module unloading and put_ref() */ + slow_work_put_ref(work); + slow_work_done_thread_processing(id, work); + return true; auto_requeue: @@ -227,15 +311,61 @@ auto_requeue: * - we transfer our ref on the item back to the appropriate queue * - don't wake another thread up as we're awake already */ + slow_work_mark_time(work); if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) list_add_tail(&work->link, &vslow_work_queue); else list_add_tail(&work->link, &slow_work_queue); spin_unlock_irq(&slow_work_queue_lock); + slow_work_clear_thread_processing(id); return true; } /** + * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work + * work: The work item under execution that wants to sleep + * _timeout: Scheduler sleep timeout + * + * Allow a requeueable work item to sleep on a slow-work processor thread until + * that thread is needed to do some other work or the sleep is interrupted by + * some other event. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * False is returned if there is nothing on the queue; true is returned if the + * work item should be requeued + */ +bool slow_work_sleep_till_thread_needed(struct slow_work *work, + signed long *_timeout) +{ + wait_queue_head_t *wfo_wq; + struct list_head *queue; + + DEFINE_WAIT(wait); + + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + + if (!list_empty(queue)) + return true; + + add_wait_queue_exclusive(wfo_wq, &wait); + if (list_empty(queue)) + *_timeout = schedule_timeout(*_timeout); + finish_wait(wfo_wq, &wait); + + return !list_empty(queue); +} +EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); + +/** * slow_work_enqueue - Schedule a slow work item for processing * @work: The work item to queue * @@ -260,16 +390,22 @@ auto_requeue: * allowed to pick items to execute. This ensures that very slow items won't * overly block ones that are just ordinarily slow. * - * Returns 0 if successful, -EAGAIN if not. + * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is + * attempted queued) */ int slow_work_enqueue(struct slow_work *work) { + wait_queue_head_t *wfo_wq; + struct list_head *queue; unsigned long flags; + int ret; + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + return -ECANCELED; BUG_ON(slow_work_user_count <= 0); BUG_ON(!work); BUG_ON(!work->ops); - BUG_ON(!work->ops->get_ref); /* when honouring an enqueue request, we only promise that we will run * the work function in the future; we do not promise to run it once @@ -280,8 +416,19 @@ int slow_work_enqueue(struct slow_work *work) * maintaining our promise */ if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + spin_lock_irqsave(&slow_work_queue_lock, flags); + if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) + goto cancelled; + /* we promise that we will not attempt to execute the work * function in more than one thread simultaneously * @@ -299,25 +446,221 @@ int slow_work_enqueue(struct slow_work *work) if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); } else { - if (work->ops->get_ref(work) < 0) - goto cant_get_ref; - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) - list_add_tail(&work->link, &vslow_work_queue); - else - list_add_tail(&work->link, &slow_work_queue); + ret = slow_work_get_ref(work); + if (ret < 0) + goto failed; + slow_work_mark_time(work); + list_add_tail(&work->link, queue); wake_up(&slow_work_thread_wq); + + /* if someone who could be requeued is sleeping on a + * thread, then ask them to yield their thread */ + if (work->link.prev == queue) + wake_up(wfo_wq); } spin_unlock_irqrestore(&slow_work_queue_lock, flags); } return 0; -cant_get_ref: +cancelled: + ret = -ECANCELED; +failed: spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return -EAGAIN; + return ret; } EXPORT_SYMBOL(slow_work_enqueue); +static int slow_work_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * slow_work_cancel - Cancel a slow work item + * @work: The work item to cancel + * + * This function will cancel a previously enqueued work item. If we cannot + * cancel the work item, it is guarenteed to have run when this function + * returns. + */ +void slow_work_cancel(struct slow_work *work) +{ + bool wait = true, put = false; + + set_bit(SLOW_WORK_CANCELLING, &work->flags); + smp_mb(); + + /* if the work item is a delayed work item with an active timer, we + * need to wait for the timer to finish _before_ getting the spinlock, + * lest we deadlock against the timer routine + * + * the timer routine will leave DELAYED set if it notices the + * CANCELLING flag in time + */ + if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { + struct delayed_slow_work *dwork = + container_of(work, struct delayed_slow_work, work); + del_timer_sync(&dwork->timer); + } + + spin_lock_irq(&slow_work_queue_lock); + + if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { + /* the timer routine aborted or never happened, so we are left + * holding the timer's reference on the item and should just + * drop the pending flag and wait for any ongoing execution to + * finish */ + struct delayed_slow_work *dwork = + container_of(work, struct delayed_slow_work, work); + + BUG_ON(timer_pending(&dwork->timer)); + BUG_ON(!list_empty(&work->link)); + + clear_bit(SLOW_WORK_DELAYED, &work->flags); + put = true; + clear_bit(SLOW_WORK_PENDING, &work->flags); + + } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && + !list_empty(&work->link)) { + /* the link in the pending queue holds a reference on the item + * that we will need to release */ + list_del_init(&work->link); + wait = false; + put = true; + clear_bit(SLOW_WORK_PENDING, &work->flags); + + } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { + /* the executor is holding our only reference on the item, so + * we merely need to wait for it to finish executing */ + clear_bit(SLOW_WORK_PENDING, &work->flags); + } + + spin_unlock_irq(&slow_work_queue_lock); + + /* the EXECUTING flag is set by the executor whilst the spinlock is set + * and before the item is dequeued - so assuming the above doesn't + * actually dequeue it, simply waiting for the EXECUTING flag to be + * released here should be sufficient */ + if (wait) + wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, + TASK_UNINTERRUPTIBLE); + + clear_bit(SLOW_WORK_CANCELLING, &work->flags); + if (put) + slow_work_put_ref(work); +} +EXPORT_SYMBOL(slow_work_cancel); + +/* + * Handle expiry of the delay timer, indicating that a delayed slow work item + * should now be queued if not cancelled + */ +static void delayed_slow_work_timer(unsigned long data) +{ + wait_queue_head_t *wfo_wq; + struct list_head *queue; + struct slow_work *work = (struct slow_work *) data; + unsigned long flags; + bool queued = false, put = false, first = false; + + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { + wfo_wq = &vslow_work_queue_waits_for_occupation; + queue = &vslow_work_queue; + } else { + wfo_wq = &slow_work_queue_waits_for_occupation; + queue = &slow_work_queue; + } + + spin_lock_irqsave(&slow_work_queue_lock, flags); + if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { + clear_bit(SLOW_WORK_DELAYED, &work->flags); + + if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { + /* we discard the reference the timer was holding in + * favour of the one the executor holds */ + set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); + put = true; + } else { + slow_work_mark_time(work); + list_add_tail(&work->link, queue); + queued = true; + if (work->link.prev == queue) + first = true; + } + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + if (put) + slow_work_put_ref(work); + if (first) + wake_up(wfo_wq); + if (queued) + wake_up(&slow_work_thread_wq); +} + +/** + * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing + * @dwork: The delayed work item to queue + * @delay: When to start executing the work, in jiffies from now + * + * This is similar to slow_work_enqueue(), but it adds a delay before the work + * is actually queued for processing. + * + * The item can have delayed processing requested on it whilst it is being + * executed. The delay will begin immediately, and if it expires before the + * item finishes executing, the item will be placed back on the queue when it + * has done executing. + */ +int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, + unsigned long delay) +{ + struct slow_work *work = &dwork->work; + unsigned long flags; + int ret; + + if (delay == 0) + return slow_work_enqueue(&dwork->work); + + BUG_ON(slow_work_user_count <= 0); + BUG_ON(!work); + BUG_ON(!work->ops); + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + return -ECANCELED; + + if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irqsave(&slow_work_queue_lock, flags); + + if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) + goto cancelled; + + /* the timer holds a reference whilst it is pending */ + ret = work->ops->get_ref(work); + if (ret < 0) + goto cant_get_ref; + + if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) + BUG(); + dwork->timer.expires = jiffies + delay; + dwork->timer.data = (unsigned long) work; + dwork->timer.function = delayed_slow_work_timer; + add_timer(&dwork->timer); + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + } + + return 0; + +cancelled: + ret = -ECANCELED; +cant_get_ref: + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return ret; +} +EXPORT_SYMBOL(delayed_slow_work_enqueue); + /* * Schedule a cull of the thread pool at some time in the near future */ @@ -368,13 +711,23 @@ static inline bool slow_work_available(int vsmax) */ static int slow_work_thread(void *_data) { - int vsmax; + int vsmax, id; DEFINE_WAIT(wait); set_freezable(); set_user_nice(current, -5); + /* allocate ourselves an ID */ + spin_lock_irq(&slow_work_queue_lock); + id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); + BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); + __set_bit(id, slow_work_ids); + slow_work_set_thread_pid(id, current->pid); + spin_unlock_irq(&slow_work_queue_lock); + + sprintf(current->comm, "kslowd%03u", id); + for (;;) { vsmax = vslow_work_proportion; vsmax *= atomic_read(&slow_work_thread_count); @@ -395,7 +748,7 @@ static int slow_work_thread(void *_data) vsmax *= atomic_read(&slow_work_thread_count); vsmax /= 100; - if (slow_work_available(vsmax) && slow_work_execute()) { + if (slow_work_available(vsmax) && slow_work_execute(id)) { cond_resched(); if (list_empty(&slow_work_queue) && list_empty(&vslow_work_queue) && @@ -412,6 +765,11 @@ static int slow_work_thread(void *_data) break; } + spin_lock_irq(&slow_work_queue_lock); + slow_work_set_thread_pid(id, 0); + __clear_bit(id, slow_work_ids); + spin_unlock_irq(&slow_work_queue_lock); + if (atomic_dec_and_test(&slow_work_thread_count)) complete_and_exit(&slow_work_last_thread_exited, 0); return 0; @@ -427,21 +785,6 @@ static void slow_work_cull_timeout(unsigned long data) } /* - * Get a reference on slow work thread starter - */ -static int slow_work_new_thread_get_ref(struct slow_work *work) -{ - return 0; -} - -/* - * Drop a reference on slow work thread starter - */ -static void slow_work_new_thread_put_ref(struct slow_work *work) -{ -} - -/* * Start a new slow work thread */ static void slow_work_new_thread_execute(struct slow_work *work) @@ -475,9 +818,11 @@ static void slow_work_new_thread_execute(struct slow_work *work) } static const struct slow_work_ops slow_work_new_thread_ops = { - .get_ref = slow_work_new_thread_get_ref, - .put_ref = slow_work_new_thread_put_ref, + .owner = THIS_MODULE, .execute = slow_work_new_thread_execute, +#ifdef CONFIG_SLOW_WORK_DEBUG + .desc = slow_work_new_thread_desc, +#endif }; /* @@ -546,12 +891,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, /** * slow_work_register_user - Register a user of the facility + * @module: The module about to make use of the facility * * Register a user of the facility, starting up the initial threads if there * aren't any other users at this point. This will return 0 if successful, or * an error if not. */ -int slow_work_register_user(void) +int slow_work_register_user(struct module *module) { struct task_struct *p; int loop; @@ -598,14 +944,81 @@ error: } EXPORT_SYMBOL(slow_work_register_user); +/* + * wait for all outstanding items from the calling module to complete + * - note that more items may be queued whilst we're waiting + */ +static void slow_work_wait_for_items(struct module *module) +{ +#ifdef CONFIG_MODULES + DECLARE_WAITQUEUE(myself, current); + struct slow_work *work; + int loop; + + mutex_lock(&slow_work_unreg_sync_lock); + add_wait_queue(&slow_work_unreg_wq, &myself); + + for (;;) { + spin_lock_irq(&slow_work_queue_lock); + + /* first of all, we wait for the last queued item in each list + * to be processed */ + list_for_each_entry_reverse(work, &vslow_work_queue, link) { + if (work->owner == module) { + set_current_state(TASK_UNINTERRUPTIBLE); + slow_work_unreg_work_item = work; + goto do_wait; + } + } + list_for_each_entry_reverse(work, &slow_work_queue, link) { + if (work->owner == module) { + set_current_state(TASK_UNINTERRUPTIBLE); + slow_work_unreg_work_item = work; + goto do_wait; + } + } + + /* then we wait for the items being processed to finish */ + slow_work_unreg_module = module; + smp_mb(); + for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { + if (slow_work_thread_processing[loop] == module) + goto do_wait; + } + spin_unlock_irq(&slow_work_queue_lock); + break; /* okay, we're done */ + + do_wait: + spin_unlock_irq(&slow_work_queue_lock); + schedule(); + slow_work_unreg_work_item = NULL; + slow_work_unreg_module = NULL; + } + + remove_wait_queue(&slow_work_unreg_wq, &myself); + mutex_unlock(&slow_work_unreg_sync_lock); +#endif /* CONFIG_MODULES */ +} + /** * slow_work_unregister_user - Unregister a user of the facility + * @module: The module whose items should be cleared * * Unregister a user of the facility, killing all the threads if this was the * last one. + * + * This waits for all the work items belonging to the nominated module to go + * away before proceeding. */ -void slow_work_unregister_user(void) +void slow_work_unregister_user(struct module *module) { + /* first of all, wait for all outstanding items from the calling module + * to complete */ + if (module) + slow_work_wait_for_items(module); + + /* then we can actually go about shutting down the facility if need + * be */ mutex_lock(&slow_work_user_lock); BUG_ON(slow_work_user_count <= 0); @@ -639,6 +1052,16 @@ static int __init init_slow_work(void) if (slow_work_max_max_threads < nr_cpus * 2) slow_work_max_max_threads = nr_cpus * 2; #endif +#ifdef CONFIG_SLOW_WORK_DEBUG + { + struct dentry *dbdir; + + dbdir = debugfs_create_dir("slow_work", NULL); + if (dbdir && !IS_ERR(dbdir)) + debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, + NULL, &slow_work_runqueue_fops); + } +#endif return 0; } diff --git a/kernel/slow-work.h b/kernel/slow-work.h new file mode 100644 index 00000000000..321f3c59d73 --- /dev/null +++ b/kernel/slow-work.h @@ -0,0 +1,72 @@ +/* Slow work private definitions + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of + * things to do */ +#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after + * OOM */ + +#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ + +/* + * slow-work.c + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +extern struct slow_work *slow_work_execs[]; +extern pid_t slow_work_pids[]; +extern rwlock_t slow_work_execs_lock; +#endif + +extern struct list_head slow_work_queue; +extern struct list_head vslow_work_queue; +extern spinlock_t slow_work_queue_lock; + +/* + * slow-work-debugfs.c + */ +#ifdef CONFIG_SLOW_WORK_DEBUG +extern const struct file_operations slow_work_runqueue_fops; + +extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); +#endif + +/* + * Helper functions + */ +static inline void slow_work_set_thread_pid(int id, pid_t pid) +{ +#ifdef CONFIG_SLOW_WORK_PROC + slow_work_pids[id] = pid; +#endif +} + +static inline void slow_work_mark_time(struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + work->mark = CURRENT_TIME; +#endif +} + +static inline void slow_work_begin_exec(int id, struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + slow_work_execs[id] = work; +#endif +} + +static inline void slow_work_end_exec(int id, struct slow_work *work) +{ +#ifdef CONFIG_SLOW_WORK_PROC + write_lock(&slow_work_execs_lock); + slow_work_execs[id] = NULL; + write_unlock(&slow_work_execs_lock); +#endif +} diff --git a/kernel/smp.c b/kernel/smp.c index c9d1c7835c2..de735a6637d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -16,11 +16,11 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); static struct { struct list_head queue; - spinlock_t lock; + raw_spinlock_t lock; } call_function __cacheline_aligned_in_smp = { .queue = LIST_HEAD_INIT(call_function.queue), - .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), + .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), }; enum { @@ -35,7 +35,7 @@ struct call_function_data { struct call_single_queue { struct list_head list; - spinlock_t lock; + raw_spinlock_t lock; }; static DEFINE_PER_CPU(struct call_function_data, cfd_data); @@ -80,7 +80,7 @@ static int __cpuinit init_call_single_data(void) for_each_possible_cpu(i) { struct call_single_queue *q = &per_cpu(call_single_queue, i); - spin_lock_init(&q->lock); + raw_spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } @@ -141,10 +141,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) unsigned long flags; int ipi; - spin_lock_irqsave(&dst->lock, flags); + raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); list_add_tail(&data->list, &dst->list); - spin_unlock_irqrestore(&dst->lock, flags); + raw_spin_unlock_irqrestore(&dst->lock, flags); /* * The list addition should be visible before sending the IPI @@ -171,7 +171,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) void generic_smp_call_function_interrupt(void) { struct call_function_data *data; - int cpu = get_cpu(); + int cpu = smp_processor_id(); /* * Shouldn't receive this interrupt on a cpu that is not yet online. @@ -201,9 +201,9 @@ void generic_smp_call_function_interrupt(void) refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); if (!refs) { - spin_lock(&call_function.lock); + raw_spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); - spin_unlock(&call_function.lock); + raw_spin_unlock(&call_function.lock); } if (refs) @@ -212,7 +212,6 @@ void generic_smp_call_function_interrupt(void) csd_unlock(&data->csd); } - put_cpu(); } /* @@ -230,9 +229,9 @@ void generic_smp_call_function_single_interrupt(void) */ WARN_ON_ONCE(!cpu_online(smp_processor_id())); - spin_lock(&q->lock); + raw_spin_lock(&q->lock); list_replace_init(&q->list, &list); - spin_unlock(&q->lock); + raw_spin_unlock(&q->lock); while (!list_empty(&list)) { struct call_single_data *data; @@ -265,9 +264,7 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data); * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. Note that @wait - * will be implicitly turned on in case of allocation failures, since - * we fall back to on-stack allocation. + * Returns 0 on success, else a negative status code. */ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) @@ -321,6 +318,51 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +/* + * smp_call_function_any - Run a function on any of the given cpus + * @mask: The mask of cpus it can run on. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait until function has completed. + * + * Returns 0 on success, else a negative status code (if no cpus were online). + * Note that @wait will be implicitly turned on in case of allocation failures, + * since we fall back to on-stack allocation. + * + * Selection preference: + * 1) current cpu if in @mask + * 2) any cpu of current node if in @mask + * 3) any other online cpu in @mask + */ +int smp_call_function_any(const struct cpumask *mask, + void (*func)(void *info), void *info, int wait) +{ + unsigned int cpu; + const struct cpumask *nodemask; + int ret; + + /* Try for same CPU (cheapest) */ + cpu = get_cpu(); + if (cpumask_test_cpu(cpu, mask)) + goto call; + + /* Try for same node. */ + nodemask = cpumask_of_node(cpu); + for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; + cpu = cpumask_next_and(cpu, nodemask, mask)) { + if (cpu_online(cpu)) + goto call; + } + + /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ + cpu = cpumask_any_and(mask, cpu_online_mask); +call: + ret = smp_call_function_single(cpu, func, info, wait); + put_cpu(); + return ret; +} +EXPORT_SYMBOL_GPL(smp_call_function_any); + /** * __smp_call_function_single(): Run a function on another CPU * @cpu: The CPU to run on. @@ -355,9 +397,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * @wait: If true, wait (atomically) until function has completed * on other CPUs. * - * If @wait is true, then returns once @func has returned. Note that @wait - * will be implicitly turned on in case of allocation failures, since - * we fall back to on-stack allocation. + * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption @@ -408,14 +448,14 @@ void smp_call_function_many(const struct cpumask *mask, cpumask_clear_cpu(this_cpu, data->cpumask); atomic_set(&data->refs, cpumask_weight(data->cpumask)); - spin_lock_irqsave(&call_function.lock, flags); + raw_spin_lock_irqsave(&call_function.lock, flags); /* * Place entry at the _HEAD_ of the list, so that any cpu still * observing the entry in generic_smp_call_function_interrupt() * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); - spin_unlock_irqrestore(&call_function.lock, flags); + raw_spin_unlock_irqrestore(&call_function.lock, flags); /* * Make the list addition visible before sending the ipi. @@ -443,8 +483,7 @@ EXPORT_SYMBOL(smp_call_function_many); * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. In case of allocation - * failure, @wait will be implicitly turned on. + * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. @@ -461,20 +500,20 @@ EXPORT_SYMBOL(smp_call_function); void ipi_call_lock(void) { - spin_lock(&call_function.lock); + raw_spin_lock(&call_function.lock); } void ipi_call_unlock(void) { - spin_unlock(&call_function.lock); + raw_spin_unlock(&call_function.lock); } void ipi_call_lock_irq(void) { - spin_lock_irq(&call_function.lock); + raw_spin_lock_irq(&call_function.lock); } void ipi_call_unlock_irq(void) { - spin_unlock_irq(&call_function.lock); + raw_spin_unlock_irq(&call_function.lock); } diff --git a/kernel/softirq.c b/kernel/softirq.c index f8749e5216e..a09502e2ef7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -302,9 +302,9 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); + rcu_irq_exit(); #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ - rcu_irq_exit(); if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) tick_nohz_stop_sched_tick(0); #endif @@ -697,7 +697,7 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int ksoftirqd(void * __bind_cpu) +static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); @@ -810,7 +810,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return NOTIFY_BAD; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 81324d12eb3..d22579087e2 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -22,9 +22,9 @@ static DEFINE_SPINLOCK(print_lock); -static DEFINE_PER_CPU(unsigned long, touch_timestamp); -static DEFINE_PER_CPU(unsigned long, print_timestamp); -static DEFINE_PER_CPU(struct task_struct *, watchdog_task); +static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ +static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -70,12 +70,12 @@ static void __touch_softlockup_watchdog(void) { int this_cpu = raw_smp_processor_id(); - __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); + __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); } void touch_softlockup_watchdog(void) { - __raw_get_cpu_var(touch_timestamp) = 0; + __raw_get_cpu_var(softlockup_touch_ts) = 0; } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -85,7 +85,7 @@ void touch_all_softlockup_watchdogs(void) /* Cause each CPU to re-update its timestamp rather than complain */ for_each_online_cpu(cpu) - per_cpu(touch_timestamp, cpu) = 0; + per_cpu(softlockup_touch_ts, cpu) = 0; } EXPORT_SYMBOL(touch_all_softlockup_watchdogs); @@ -104,28 +104,28 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write, void softlockup_tick(void) { int this_cpu = smp_processor_id(); - unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); - unsigned long print_timestamp; + unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); + unsigned long print_ts; struct pt_regs *regs = get_irq_regs(); unsigned long now; /* Is detection switched off? */ - if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { + if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { /* Be sure we don't false trigger if switched back on */ - if (touch_timestamp) - per_cpu(touch_timestamp, this_cpu) = 0; + if (touch_ts) + per_cpu(softlockup_touch_ts, this_cpu) = 0; return; } - if (touch_timestamp == 0) { + if (touch_ts == 0) { __touch_softlockup_watchdog(); return; } - print_timestamp = per_cpu(print_timestamp, this_cpu); + print_ts = per_cpu(softlockup_print_ts, this_cpu); /* report at most once a second */ - if (print_timestamp == touch_timestamp || did_panic) + if (print_ts == touch_ts || did_panic) return; /* do not print during early bootup: */ @@ -140,18 +140,18 @@ void softlockup_tick(void) * Wake up the high-prio watchdog task twice per * threshold timespan. */ - if (now > touch_timestamp + softlockup_thresh/2) - wake_up_process(per_cpu(watchdog_task, this_cpu)); + if (now > touch_ts + softlockup_thresh/2) + wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); /* Warn about unreasonable delays: */ - if (now <= (touch_timestamp + softlockup_thresh)) + if (now <= (touch_ts + softlockup_thresh)) return; - per_cpu(print_timestamp, this_cpu) = touch_timestamp; + per_cpu(softlockup_print_ts, this_cpu) = touch_ts; spin_lock(&print_lock); printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", - this_cpu, now - touch_timestamp, + this_cpu, now - touch_ts, current->comm, task_pid_nr(current)); print_modules(); print_irqtrace_events(current); @@ -209,32 +209,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - BUG_ON(per_cpu(watchdog_task, hotcpu)); + BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); if (IS_ERR(p)) { printk(KERN_ERR "watchdog for %i failed\n", hotcpu); return NOTIFY_BAD; } - per_cpu(touch_timestamp, hotcpu) = 0; - per_cpu(watchdog_task, hotcpu) = p; + per_cpu(softlockup_touch_ts, hotcpu) = 0; + per_cpu(softlockup_watchdog, hotcpu) = p; kthread_bind(p, hotcpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(watchdog_task, hotcpu)); + wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(watchdog_task, hotcpu)) + if (!per_cpu(softlockup_watchdog, hotcpu)) break; /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(watchdog_task, hotcpu), + kthread_bind(per_cpu(softlockup_watchdog, hotcpu), cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: - p = per_cpu(watchdog_task, hotcpu); - per_cpu(watchdog_task, hotcpu) = NULL; + p = per_cpu(softlockup_watchdog, hotcpu); + per_cpu(softlockup_watchdog, hotcpu) = NULL; kthread_stop(p); break; #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5ddab730cb2..be6517fb9c1 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -21,193 +21,72 @@ #include <linux/debug_locks.h> #include <linux/module.h> -#ifndef _spin_trylock -int __lockfunc _spin_trylock(spinlock_t *lock) -{ - return __spin_trylock(lock); -} -EXPORT_SYMBOL(_spin_trylock); -#endif - -#ifndef _read_trylock -int __lockfunc _read_trylock(rwlock_t *lock) -{ - return __read_trylock(lock); -} -EXPORT_SYMBOL(_read_trylock); -#endif - -#ifndef _write_trylock -int __lockfunc _write_trylock(rwlock_t *lock) -{ - return __write_trylock(lock); -} -EXPORT_SYMBOL(_write_trylock); -#endif - /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) - -#ifndef _read_lock -void __lockfunc _read_lock(rwlock_t *lock) -{ - __read_lock(lock); -} -EXPORT_SYMBOL(_read_lock); -#endif - -#ifndef _spin_lock_irqsave -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) -{ - return __spin_lock_irqsave(lock); -} -EXPORT_SYMBOL(_spin_lock_irqsave); -#endif - -#ifndef _spin_lock_irq -void __lockfunc _spin_lock_irq(spinlock_t *lock) -{ - __spin_lock_irq(lock); -} -EXPORT_SYMBOL(_spin_lock_irq); -#endif - -#ifndef _spin_lock_bh -void __lockfunc _spin_lock_bh(spinlock_t *lock) -{ - __spin_lock_bh(lock); -} -EXPORT_SYMBOL(_spin_lock_bh); -#endif - -#ifndef _read_lock_irqsave -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) -{ - return __read_lock_irqsave(lock); -} -EXPORT_SYMBOL(_read_lock_irqsave); -#endif - -#ifndef _read_lock_irq -void __lockfunc _read_lock_irq(rwlock_t *lock) -{ - __read_lock_irq(lock); -} -EXPORT_SYMBOL(_read_lock_irq); -#endif - -#ifndef _read_lock_bh -void __lockfunc _read_lock_bh(rwlock_t *lock) -{ - __read_lock_bh(lock); -} -EXPORT_SYMBOL(_read_lock_bh); -#endif - -#ifndef _write_lock_irqsave -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) -{ - return __write_lock_irqsave(lock); -} -EXPORT_SYMBOL(_write_lock_irqsave); -#endif - -#ifndef _write_lock_irq -void __lockfunc _write_lock_irq(rwlock_t *lock) -{ - __write_lock_irq(lock); -} -EXPORT_SYMBOL(_write_lock_irq); -#endif - -#ifndef _write_lock_bh -void __lockfunc _write_lock_bh(rwlock_t *lock) -{ - __write_lock_bh(lock); -} -EXPORT_SYMBOL(_write_lock_bh); -#endif - -#ifndef _spin_lock -void __lockfunc _spin_lock(spinlock_t *lock) -{ - __spin_lock(lock); -} -EXPORT_SYMBOL(_spin_lock); -#endif - -#ifndef _write_lock -void __lockfunc _write_lock(rwlock_t *lock) -{ - __write_lock(lock); -} -EXPORT_SYMBOL(_write_lock); -#endif - -#else /* CONFIG_PREEMPT: */ - /* + * The __lock_function inlines are taken from + * include/linux/spinlock_api_smp.h + */ +#else +#define raw_read_can_lock(l) read_can_lock(l) +#define raw_write_can_lock(l) write_can_lock(l) +/* + * We build the __lock_function inlines here. They are too large for + * inlining all over the place, but here is only one user per function + * which embedds them into the calling _lock_function below. + * * This could be a long-held lock. We both prepare to spin for a long * time (making _this_ CPU preemptable if possible), and we also signal * towards that other CPU that it should break the lock ASAP. - * - * (We do this in a function because inlining it would be excessive.) */ - #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc _##op##_lock(locktype##_t *lock) \ +void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ { \ for (;;) { \ preempt_disable(); \ - if (likely(_raw_##op##_trylock(lock))) \ + if (likely(do_raw_##op##_trylock(lock))) \ break; \ preempt_enable(); \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ } \ \ -EXPORT_SYMBOL(_##op##_lock); \ - \ -unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ +unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ { \ unsigned long flags; \ \ for (;;) { \ preempt_disable(); \ local_irq_save(flags); \ - if (likely(_raw_##op##_trylock(lock))) \ + if (likely(do_raw_##op##_trylock(lock))) \ break; \ local_irq_restore(flags); \ preempt_enable(); \ \ if (!(lock)->break_lock) \ (lock)->break_lock = 1; \ - while (!op##_can_lock(lock) && (lock)->break_lock) \ - _raw_##op##_relax(&lock->raw_lock); \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + arch_##op##_relax(&lock->raw_lock); \ } \ (lock)->break_lock = 0; \ return flags; \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irqsave); \ - \ -void __lockfunc _##op##_lock_irq(locktype##_t *lock) \ +void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ { \ - _##op##_lock_irqsave(lock); \ + _raw_##op##_lock_irqsave(lock); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irq); \ - \ -void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ +void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ { \ unsigned long flags; \ \ @@ -216,164 +95,283 @@ void __lockfunc _##op##_lock_bh(locktype##_t *lock) \ /* irq-disabling. We use the generic preemption-aware */ \ /* function: */ \ /**/ \ - flags = _##op##_lock_irqsave(lock); \ + flags = _raw_##op##_lock_irqsave(lock); \ local_bh_disable(); \ local_irq_restore(flags); \ } \ - \ -EXPORT_SYMBOL(_##op##_lock_bh) /* * Build preemption-friendly versions of the following * lock-spinning functions: * - * _[spin|read|write]_lock() - * _[spin|read|write]_lock_irq() - * _[spin|read|write]_lock_irqsave() - * _[spin|read|write]_lock_bh() + * __[spin|read|write]_lock() + * __[spin|read|write]_lock_irq() + * __[spin|read|write]_lock_irqsave() + * __[spin|read|write]_lock_bh() */ -BUILD_LOCK_OPS(spin, spinlock); +BUILD_LOCK_OPS(spin, raw_spinlock); BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); -#endif /* CONFIG_PREEMPT */ +#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#ifndef CONFIG_INLINE_SPIN_TRYLOCK +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +{ + return __raw_spin_trylock(lock); +} +EXPORT_SYMBOL(_raw_spin_trylock); +#endif -void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) +#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH +int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + return __raw_spin_trylock_bh(lock); } -EXPORT_SYMBOL(_spin_lock_nested); +EXPORT_SYMBOL(_raw_spin_trylock_bh); +#endif -unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) +#ifndef CONFIG_INLINE_SPIN_LOCK +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { - unsigned long flags; + __raw_spin_lock(lock); +} +EXPORT_SYMBOL(_raw_spin_lock); +#endif - local_irq_save(flags); - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock, - _raw_spin_lock_flags, &flags); - return flags; +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE +unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +{ + return __raw_spin_lock_irqsave(lock); } -EXPORT_SYMBOL(_spin_lock_irqsave_nested); +EXPORT_SYMBOL(_raw_spin_lock_irqsave); +#endif -void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, - struct lockdep_map *nest_lock) +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ +void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { - preempt_disable(); - spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); - LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); + __raw_spin_lock_irq(lock); } -EXPORT_SYMBOL(_spin_lock_nest_lock); +EXPORT_SYMBOL(_raw_spin_lock_irq); +#endif +#ifndef CONFIG_INLINE_SPIN_LOCK_BH +void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +{ + __raw_spin_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh); #endif -#ifndef _spin_unlock -void __lockfunc _spin_unlock(spinlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_UNLOCK +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { - __spin_unlock(lock); + __raw_spin_unlock(lock); } -EXPORT_SYMBOL(_spin_unlock); +EXPORT_SYMBOL(_raw_spin_unlock); #endif -#ifndef _write_unlock -void __lockfunc _write_unlock(rwlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE +void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { - __write_unlock(lock); + __raw_spin_unlock_irqrestore(lock, flags); } -EXPORT_SYMBOL(_write_unlock); +EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); #endif -#ifndef _read_unlock -void __lockfunc _read_unlock(rwlock_t *lock) +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ +void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { - __read_unlock(lock); + __raw_spin_unlock_irq(lock); } -EXPORT_SYMBOL(_read_unlock); +EXPORT_SYMBOL(_raw_spin_unlock_irq); #endif -#ifndef _spin_unlock_irqrestore -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH +void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { - __spin_unlock_irqrestore(lock, flags); + __raw_spin_unlock_bh(lock); } -EXPORT_SYMBOL(_spin_unlock_irqrestore); +EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif -#ifndef _spin_unlock_irq -void __lockfunc _spin_unlock_irq(spinlock_t *lock) +#ifndef CONFIG_INLINE_READ_TRYLOCK +int __lockfunc _raw_read_trylock(rwlock_t *lock) { - __spin_unlock_irq(lock); + return __raw_read_trylock(lock); } -EXPORT_SYMBOL(_spin_unlock_irq); +EXPORT_SYMBOL(_raw_read_trylock); #endif -#ifndef _spin_unlock_bh -void __lockfunc _spin_unlock_bh(spinlock_t *lock) +#ifndef CONFIG_INLINE_READ_LOCK +void __lockfunc _raw_read_lock(rwlock_t *lock) { - __spin_unlock_bh(lock); + __raw_read_lock(lock); } -EXPORT_SYMBOL(_spin_unlock_bh); +EXPORT_SYMBOL(_raw_read_lock); #endif -#ifndef _read_unlock_irqrestore -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE +unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) { - __read_unlock_irqrestore(lock, flags); + return __raw_read_lock_irqsave(lock); } -EXPORT_SYMBOL(_read_unlock_irqrestore); +EXPORT_SYMBOL(_raw_read_lock_irqsave); #endif -#ifndef _read_unlock_irq -void __lockfunc _read_unlock_irq(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_LOCK_IRQ +void __lockfunc _raw_read_lock_irq(rwlock_t *lock) { - __read_unlock_irq(lock); + __raw_read_lock_irq(lock); } -EXPORT_SYMBOL(_read_unlock_irq); +EXPORT_SYMBOL(_raw_read_lock_irq); #endif -#ifndef _read_unlock_bh -void __lockfunc _read_unlock_bh(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_LOCK_BH +void __lockfunc _raw_read_lock_bh(rwlock_t *lock) { - __read_unlock_bh(lock); + __raw_read_lock_bh(lock); } -EXPORT_SYMBOL(_read_unlock_bh); +EXPORT_SYMBOL(_raw_read_lock_bh); #endif -#ifndef _write_unlock_irqrestore -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +#ifndef CONFIG_INLINE_READ_UNLOCK +void __lockfunc _raw_read_unlock(rwlock_t *lock) { - __write_unlock_irqrestore(lock, flags); + __raw_read_unlock(lock); } -EXPORT_SYMBOL(_write_unlock_irqrestore); +EXPORT_SYMBOL(_raw_read_unlock); #endif -#ifndef _write_unlock_irq -void __lockfunc _write_unlock_irq(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE +void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { - __write_unlock_irq(lock); + __raw_read_unlock_irqrestore(lock, flags); } -EXPORT_SYMBOL(_write_unlock_irq); +EXPORT_SYMBOL(_raw_read_unlock_irqrestore); #endif -#ifndef _write_unlock_bh -void __lockfunc _write_unlock_bh(rwlock_t *lock) +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ +void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) { - __write_unlock_bh(lock); + __raw_read_unlock_irq(lock); } -EXPORT_SYMBOL(_write_unlock_bh); +EXPORT_SYMBOL(_raw_read_unlock_irq); #endif -#ifndef _spin_trylock_bh -int __lockfunc _spin_trylock_bh(spinlock_t *lock) +#ifndef CONFIG_INLINE_READ_UNLOCK_BH +void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) { - return __spin_trylock_bh(lock); + __raw_read_unlock_bh(lock); } -EXPORT_SYMBOL(_spin_trylock_bh); +EXPORT_SYMBOL(_raw_read_unlock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_TRYLOCK +int __lockfunc _raw_write_trylock(rwlock_t *lock) +{ + return __raw_write_trylock(lock); +} +EXPORT_SYMBOL(_raw_write_trylock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK +void __lockfunc _raw_write_lock(rwlock_t *lock) +{ + __raw_write_lock(lock); +} +EXPORT_SYMBOL(_raw_write_lock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE +unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +{ + return __raw_write_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ +void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +{ + __raw_write_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_BH +void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +{ + __raw_write_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_lock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK +void __lockfunc _raw_write_unlock(rwlock_t *lock) +{ + __raw_write_unlock(lock); +} +EXPORT_SYMBOL(_raw_write_unlock); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE +void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + __raw_write_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_write_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ +void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +{ + __raw_write_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH +void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +{ + __raw_write_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_bh); +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) +{ + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nested); + +unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, + int subclass) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, + do_raw_spin_lock_flags, &flags); + return flags; +} +EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); + +void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + preempt_disable(); + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nest_lock); + #endif notrace int in_lock_functions(unsigned long addr) diff --git a/kernel/srcu.c b/kernel/srcu.c index b0aeeaf22ce..818d7d9aa03 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -49,6 +49,7 @@ int init_srcu_struct(struct srcu_struct *sp) sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); return (sp->per_cpu_ref ? 0 : -ENOMEM); } +EXPORT_SYMBOL_GPL(init_srcu_struct); /* * srcu_readers_active_idx -- returns approximate number of readers @@ -97,6 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; } +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /** * srcu_read_lock - register a new reader for an SRCU-protected structure. @@ -118,6 +120,7 @@ int srcu_read_lock(struct srcu_struct *sp) preempt_enable(); return idx; } +EXPORT_SYMBOL_GPL(srcu_read_lock); /** * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. @@ -136,22 +139,12 @@ void srcu_read_unlock(struct srcu_struct *sp, int idx) per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; preempt_enable(); } +EXPORT_SYMBOL_GPL(srcu_read_unlock); -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. - * - * Note that it is illegal to call synchornize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section. +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -void synchronize_srcu(struct srcu_struct *sp) +void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) { int idx; @@ -173,7 +166,7 @@ void synchronize_srcu(struct srcu_struct *sp) return; } - synchronize_sched(); /* Force memory barrier on all CPUs. */ + sync_func(); /* Force memory barrier on all CPUs. */ /* * The preceding synchronize_sched() ensures that any CPU that @@ -190,7 +183,7 @@ void synchronize_srcu(struct srcu_struct *sp) idx = sp->completed & 0x1; sp->completed++; - synchronize_sched(); /* Force memory barrier on all CPUs. */ + sync_func(); /* Force memory barrier on all CPUs. */ /* * At this point, because of the preceding synchronize_sched(), @@ -203,7 +196,7 @@ void synchronize_srcu(struct srcu_struct *sp) while (srcu_readers_active_idx(sp, idx)) schedule_timeout_interruptible(1); - synchronize_sched(); /* Force memory barrier on all CPUs. */ + sync_func(); /* Force memory barrier on all CPUs. */ /* * The preceding synchronize_sched() forces all srcu_read_unlock() @@ -237,6 +230,47 @@ void synchronize_srcu(struct srcu_struct *sp) } /** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, synchronize_sched); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/** + * synchronize_srcu_expedited - like synchronize_srcu, but less patient + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchronize_srcu_expedited() + * from the corresponding SRCU read-side critical section; doing so + * will result in deadlock. However, it is perfectly legal to call + * synchronize_srcu_expedited() on one srcu_struct from some other + * srcu_struct's read-side critical section. + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, synchronize_sched_expedited); +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + +/** * srcu_batches_completed - return batches completed. * @sp: srcu_struct on which to report batch completion. * @@ -248,10 +282,4 @@ long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } - -EXPORT_SYMBOL_GPL(init_srcu_struct); -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -EXPORT_SYMBOL_GPL(srcu_read_lock); -EXPORT_SYMBOL_GPL(srcu_read_unlock); -EXPORT_SYMBOL_GPL(synchronize_srcu); EXPORT_SYMBOL_GPL(srcu_batches_completed); diff --git a/kernel/sys.c b/kernel/sys.c index 255475d163e..20ccfb5da6a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/utsname.h> #include <linux/mman.h> -#include <linux/smp_lock.h> #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> @@ -190,10 +189,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread(g, p) { if (__task_cred(p)->uid == who) error = set_one_prio(p, niceval, error); - while_each_thread(g, p); + } while_each_thread(g, p); if (who != cred->uid) free_uid(user); /* For find_user() */ break; @@ -253,13 +252,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) !(user = find_user(who))) goto out_unlock; /* No processes for this user */ - do_each_thread(g, p) + do_each_thread(g, p) { if (__task_cred(p)->uid == who) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } - while_each_thread(g, p); + } while_each_thread(g, p); if (who != cred->uid) free_uid(user); /* for find_user() */ break; @@ -349,6 +348,9 @@ void kernel_power_off(void) machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); + +static DEFINE_MUTEX(reboot_mutex); + /* * Reboot system call: for obvious reasons only root may call it, * and even root needs to set up some magic numbers in the registers @@ -381,7 +383,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) cmd = LINUX_REBOOT_CMD_HALT; - lock_kernel(); + mutex_lock(&reboot_mutex); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: kernel_restart(NULL); @@ -397,20 +399,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, case LINUX_REBOOT_CMD_HALT: kernel_halt(); - unlock_kernel(); do_exit(0); panic("cannot halt"); case LINUX_REBOOT_CMD_POWER_OFF: kernel_power_off(); - unlock_kernel(); do_exit(0); break; case LINUX_REBOOT_CMD_RESTART2: if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { - unlock_kernel(); - return -EFAULT; + ret = -EFAULT; + break; } buffer[sizeof(buffer) - 1] = '\0'; @@ -433,7 +433,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, ret = -EINVAL; break; } - unlock_kernel(); + mutex_unlock(&reboot_mutex); return ret; } @@ -911,16 +911,15 @@ change_okay: void do_sys_times(struct tms *tms) { - struct task_cputime cputime; - cputime_t cutime, cstime; + cputime_t tgutime, tgstime, cutime, cstime; - thread_group_cputime(current, &cputime); spin_lock_irq(¤t->sighand->siglock); + thread_group_times(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); - tms->tms_utime = cputime_to_clock_t(cputime.utime); - tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_utime = cputime_to_clock_t(tgutime); + tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); tms->tms_cstime = cputime_to_clock_t(cstime); } @@ -1110,6 +1109,8 @@ SYSCALL_DEFINE0(setsid) err = session; out: write_unlock_irq(&tasklist_lock); + if (err > 0) + proc_sid_connector(group_leader); return err; } @@ -1336,16 +1337,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; unsigned long flags; - cputime_t utime, stime; - struct task_cputime cputime; + cputime_t tgutime, tgstime, utime, stime; unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { - utime = task_utime(current); - stime = task_stime(current); + task_times(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = p->signal->maxrss; goto out; @@ -1371,9 +1370,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - thread_group_cputime(p, &cputime); - utime = cputime_add(utime, cputime.utime); - stime = cputime_add(stime, cputime.stime); + thread_group_times(p, &tgutime, &tgstime); + utime = cputime_add(utime, tgutime); + stime = cputime_add(stime, tgstime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@ -1546,24 +1545,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg4 | arg5) return -EINVAL; switch (arg2) { - case 0: + case PR_MCE_KILL_CLEAR: if (arg3 != 0) return -EINVAL; current->flags &= ~PF_MCE_PROCESS; break; - case 1: + case PR_MCE_KILL_SET: current->flags |= PF_MCE_PROCESS; - if (arg3 != 0) + if (arg3 == PR_MCE_KILL_EARLY) current->flags |= PF_MCE_EARLY; - else + else if (arg3 == PR_MCE_KILL_LATE) current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); + else + return -EINVAL; break; default: return -EINVAL; } error = 0; break; - + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? + PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e06d0b8d195..695384f12a7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -48,8 +48,10 @@ cond_syscall(sys_shutdown); cond_syscall(sys_sendmsg); cond_syscall(compat_sys_sendmsg); cond_syscall(sys_recvmsg); +cond_syscall(sys_recvmmsg); cond_syscall(compat_sys_recvmsg); cond_syscall(compat_sys_recvfrom); +cond_syscall(compat_sys_recvmmsg); cond_syscall(sys_socketcall); cond_syscall(sys_futex); cond_syscall(compat_sys_futex); @@ -139,7 +141,6 @@ cond_syscall(sys_pciconfig_read); cond_syscall(sys_pciconfig_write); cond_syscall(sys_pciconfig_iobase); cond_syscall(sys32_ipc); -cond_syscall(sys32_sysctl); cond_syscall(ppc_rtas); cond_syscall(sys_spu_run); cond_syscall(sys_spu_create); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0d949c51741..6665761c006 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -27,7 +27,6 @@ #include <linux/security.h> #include <linux/ctype.h> #include <linux/kmemcheck.h> -#include <linux/smp_lock.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> @@ -36,6 +35,7 @@ #include <linux/sysrq.h> #include <linux/highuid.h> #include <linux/writeback.h> +#include <linux/ratelimit.h> #include <linux/hugetlb.h> #include <linux/initrd.h> #include <linux/key.h> @@ -60,7 +60,6 @@ #include <asm/io.h> #endif -static int deprecated_sysctl_warning(struct __sysctl_args *args); #if defined(CONFIG_SYSCTL) @@ -158,6 +157,8 @@ extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif +extern struct ratelimit_state printk_ratelimit_state; + #ifdef CONFIG_RT_MUTEXES extern int max_lock_depth; #endif @@ -207,31 +208,26 @@ extern int lock_stat; static struct ctl_table root_table[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = kern_table, }, { - .ctl_name = CTL_VM, .procname = "vm", .mode = 0555, .child = vm_table, }, { - .ctl_name = CTL_FS, .procname = "fs", .mode = 0555, .child = fs_table, }, { - .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, .child = debug_table, }, { - .ctl_name = CTL_DEV, .procname = "dev", .mode = 0555, .child = dev_table, @@ -240,7 +236,7 @@ static struct ctl_table root_table[] = { * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; #ifdef CONFIG_SCHED_DEBUG @@ -248,196 +244,178 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; +static int min_sched_shares_ratelimit = 100000; /* 100 usec */ +static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif static struct ctl_table kern_table[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_SCHED_DEBUG { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_min_granularity_ns", .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, - .strategy = &sysctl_intvec, + .proc_handler = sched_proc_update_handler, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_latency_ns", .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, - .strategy = &sysctl_intvec, + .proc_handler = sched_proc_update_handler, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = sched_proc_update_handler, .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_shares_ratelimit", .data = &sysctl_sched_shares_ratelimit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_shares_ratelimit, + .extra2 = &max_sched_shares_ratelimit, }, { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_shares_thresh", - .data = &sysctl_sched_shares_thresh, - .maxlen = sizeof(unsigned int), + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_tunable_scaling, + .extra2 = &max_sched_tunable_scaling, }, { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, + .procname = "sched_shares_thresh", + .data = &sysctl_sched_shares_thresh, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_migration_cost", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_nr_migrate", .data = &sysctl_sched_nr_migrate, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_time_avg", .data = &sysctl_sched_time_avg, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_rt_handler, + .proc_handler = sched_rt_handler, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_rt_runtime_us", .data = &sysctl_sched_rt_runtime, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &sched_rt_handler, + .proc_handler = sched_rt_handler, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_compat_yield", .data = &sysctl_sched_compat_yield, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_PROVE_LOCKING { - .ctl_name = CTL_UNNUMBERED, .procname = "prove_locking", .data = &prove_locking, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_LOCK_STAT { - .ctl_name = CTL_UNNUMBERED, .procname = "lock_stat", .data = &lock_stat, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_PANIC, .procname = "panic", .data = &panic_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", .data = &core_uses_pid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", .data = core_pattern, .maxlen = CORENAME_MAX_SIZE, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "core_pipe_limit", .data = &core_pipe_limit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_taint, + .proc_handler = proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -446,181 +424,160 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_BLK_DEV_INITRD { - .ctl_name = KERN_REALROOTDEV, .procname = "real-root-dev", .data = &real_root_dev, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "print-fatal-signals", .data = &print_fatal_signals, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_SPARC { - .ctl_name = KERN_SPARC_REBOOT, .procname = "reboot-cmd", .data = reboot_command, .maxlen = 256, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, { - .ctl_name = KERN_SPARC_STOP_A, .procname = "stop-a", .data = &stop_a_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_SPARC_SCONS_PWROFF, .procname = "scons-poweroff", .data = &scons_pwroff, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_SPARC64 { - .ctl_name = CTL_UNNUMBERED, .procname = "tsb-ratio", .data = &sysctl_tsb_ratio, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef __hppa__ { - .ctl_name = KERN_HPPA_PWRSW, .procname = "soft-power", .data = &pwrsw_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_HPPA_UNALIGNED, .procname = "unaligned-trap", .data = &unaligned_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_CTLALTDEL, .procname = "ctrl-alt-del", .data = &C_A_D, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #ifdef CONFIG_FUNCTION_TRACER { - .ctl_name = CTL_UNNUMBERED, .procname = "ftrace_enabled", .data = &ftrace_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &ftrace_enable_sysctl, + .proc_handler = ftrace_enable_sysctl, }, #endif #ifdef CONFIG_STACK_TRACER { - .ctl_name = CTL_UNNUMBERED, .procname = "stack_tracer_enabled", .data = &stack_tracer_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &stack_trace_sysctl, + .proc_handler = stack_trace_sysctl, }, #endif #ifdef CONFIG_TRACING { - .ctl_name = CTL_UNNUMBERED, .procname = "ftrace_dump_on_oops", .data = &ftrace_dump_on_oops, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_MODULES { - .ctl_name = KERN_MODPROBE, .procname = "modprobe", .data = &modprobe_path, .maxlen = KMOD_PATH_LEN, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "modules_disabled", .data = &modules_disabled, .maxlen = sizeof(int), .mode = 0644, /* only handle a transition from default "0" to "1" */ - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &one, .extra2 = &one, }, #endif #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { - .ctl_name = KERN_HOTPLUG, .procname = "hotplug", .data = &uevent_helper, .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, #endif #ifdef CONFIG_CHR_DEV_SG { - .ctl_name = KERN_SG_BIG_BUFF, .procname = "sg-big-buff", .data = &sg_big_buff, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_BSD_PROCESS_ACCT { - .ctl_name = KERN_ACCT, .procname = "acct", .data = &acct_parm, .maxlen = 3*sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_MAGIC_SYSRQ { - .ctl_name = KERN_SYSRQ, .procname = "sysrq", .data = &__sysrq_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_PROC_SYSCTL @@ -629,215 +586,188 @@ static struct ctl_table kern_table[] = { .data = NULL, .maxlen = sizeof (int), .mode = 0600, - .proc_handler = &proc_do_cad_pid, + .proc_handler = proc_do_cad_pid, }, #endif { - .ctl_name = KERN_MAX_THREADS, .procname = "threads-max", .data = &max_threads, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_RANDOM, .procname = "random", .mode = 0555, .child = random_table, }, { - .ctl_name = KERN_OVERFLOWUID, .procname = "overflowuid", .data = &overflowuid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, { - .ctl_name = KERN_OVERFLOWGID, .procname = "overflowgid", .data = &overflowgid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, #ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU { - .ctl_name = KERN_IEEE_EMULATION_WARNINGS, .procname = "ieee_emulation_warnings", .data = &sysctl_ieee_emulation_warnings, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_S390_USER_DEBUG_LOGGING, .procname = "userprocess_debug", .data = &sysctl_userprocess_debug, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = KERN_PIDMAX, .procname = "pid_max", .data = &pid_max, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, { - .ctl_name = KERN_PANIC_ON_OOPS, .procname = "panic_on_oops", .data = &panic_on_oops, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #if defined CONFIG_PRINTK { - .ctl_name = KERN_PRINTK, .procname = "printk", .data = &console_loglevel, .maxlen = 4*sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_PRINTK_RATELIMIT, .procname = "printk_ratelimit", .data = &printk_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = KERN_PRINTK_RATELIMIT_BURST, .procname = "printk_ratelimit_burst", .data = &printk_ratelimit_state.burst, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "printk_delay", .data = &printk_delay_msec, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &ten_thousand, }, #endif { - .ctl_name = KERN_NGROUPS_MAX, .procname = "ngroups_max", .data = &ngroups_max, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { - .ctl_name = KERN_UNKNOWN_NMI_PANIC, .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .procname = "nmi_watchdog", .data = &nmi_watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_nmi_enabled, + .proc_handler = proc_nmi_enabled, }, #endif #if defined(CONFIG_X86) { - .ctl_name = KERN_PANIC_ON_NMI, .procname = "panic_on_unrecovered_nmi", .data = &panic_on_unrecovered_nmi, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "panic_on_io_nmi", .data = &panic_on_io_nmi, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = KERN_BOOTLOADER_TYPE, .procname = "bootloader_type", .data = &bootloader_type, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "bootloader_version", .data = &bootloader_version, .maxlen = sizeof (int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "kstack_depth_to_print", .data = &kstack_depth_to_print, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "io_delay_type", .data = &io_delay_type, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_MMU) { - .ctl_name = KERN_RANDOMIZE, .procname = "randomize_va_space", .data = &randomize_va_space, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) { - .ctl_name = KERN_SPIN_RETRY, .procname = "spin_retry", .data = &spin_retry, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) @@ -846,123 +776,104 @@ static struct ctl_table kern_table[] = { .data = &acpi_realmode_flags, .maxlen = sizeof (unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, #endif #ifdef CONFIG_IA64 { - .ctl_name = KERN_IA64_UNALIGNED, .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "unaligned-dump-stack", .data = &unaligned_dump_stack, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_DETECT_SOFTLOCKUP { - .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "softlockup_thresh", .data = &softlockup_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dosoftlockup_thresh, - .strategy = &sysctl_intvec, + .proc_handler = proc_dosoftlockup_thresh, .extra1 = &neg_one, .extra2 = &sixty, }, #endif #ifdef CONFIG_DETECT_HUNG_TASK { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_check_count", .data = &sysctl_hung_task_check_count, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_doulongvec_minmax, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_timeout_secs", .data = &sysctl_hung_task_timeout_secs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_dohung_task_timeout_secs, - .strategy = &sysctl_intvec, + .proc_handler = proc_dohung_task_timeout_secs, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_doulongvec_minmax, }, #endif #ifdef CONFIG_COMPAT { - .ctl_name = KERN_COMPAT_LOG, .procname = "compat-log", .data = &compat_log, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_RT_MUTEXES { - .ctl_name = KERN_MAX_LOCK_DEPTH, .procname = "max_lock_depth", .data = &max_lock_depth, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "poweroff_cmd", .data = &poweroff_cmd, .maxlen = POWEROFF_CMD_PATH_LEN, .mode = 0644, - .proc_handler = &proc_dostring, - .strategy = &sysctl_string, + .proc_handler = proc_dostring, }, #ifdef CONFIG_KEYS { - .ctl_name = CTL_UNNUMBERED, .procname = "keys", .mode = 0555, .child = key_sysctls, @@ -970,17 +881,15 @@ static struct ctl_table kern_table[] = { #endif #ifdef CONFIG_RCU_TORTURE_TEST { - .ctl_name = CTL_UNNUMBERED, .procname = "rcutorture_runnable", .data = &rcutorture_runnable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_SLOW_WORK { - .ctl_name = CTL_UNNUMBERED, .procname = "slow-work", .mode = 0555, .child = slow_work_sysctls, @@ -988,146 +897,127 @@ static struct ctl_table kern_table[] = { #endif #ifdef CONFIG_PERF_EVENTS { - .ctl_name = CTL_UNNUMBERED, .procname = "perf_event_paranoid", .data = &sysctl_perf_event_paranoid, .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "perf_event_mlock_kb", .data = &sysctl_perf_event_mlock, .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "perf_event_max_sample_rate", .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_KMEMCHECK { - .ctl_name = CTL_UNNUMBERED, .procname = "kmemcheck", .data = &kmemcheck_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_BLOCK { - .ctl_name = CTL_UNNUMBERED, .procname = "blk_iopoll", .data = &blk_iopoll_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; static struct ctl_table vm_table[] = { { - .ctl_name = VM_OVERCOMMIT_MEMORY, .procname = "overcommit_memory", .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_PANIC_ON_OOM, .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "oom_kill_allocating_task", .data = &sysctl_oom_kill_allocating_task, .maxlen = sizeof(sysctl_oom_kill_allocating_task), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "oom_dump_tasks", .data = &sysctl_oom_dump_tasks, .maxlen = sizeof(sysctl_oom_dump_tasks), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_PAGE_CLUSTER, .procname = "page-cluster", .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_DIRTY_BACKGROUND, .procname = "dirty_background_ratio", .data = &dirty_background_ratio, .maxlen = sizeof(dirty_background_ratio), .mode = 0644, - .proc_handler = &dirty_background_ratio_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_background_ratio_handler, .extra1 = &zero, .extra2 = &one_hundred, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "dirty_background_bytes", .data = &dirty_background_bytes, .maxlen = sizeof(dirty_background_bytes), .mode = 0644, - .proc_handler = &dirty_background_bytes_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_background_bytes_handler, .extra1 = &one_ul, }, { - .ctl_name = VM_DIRTY_RATIO, .procname = "dirty_ratio", .data = &vm_dirty_ratio, .maxlen = sizeof(vm_dirty_ratio), .mode = 0644, - .proc_handler = &dirty_ratio_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_ratio_handler, .extra1 = &zero, .extra2 = &one_hundred, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "dirty_bytes", .data = &vm_dirty_bytes, .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, - .proc_handler = &dirty_bytes_handler, - .strategy = &sysctl_intvec, + .proc_handler = dirty_bytes_handler, .extra1 = &dirty_bytes_min, }, { @@ -1135,289 +1025,256 @@ static struct ctl_table vm_table[] = { .data = &dirty_writeback_interval, .maxlen = sizeof(dirty_writeback_interval), .mode = 0644, - .proc_handler = &dirty_writeback_centisecs_handler, + .proc_handler = dirty_writeback_centisecs_handler, }, { .procname = "dirty_expire_centisecs", .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_NR_PDFLUSH_THREADS, .procname = "nr_pdflush_threads", .data = &nr_pdflush_threads, .maxlen = sizeof nr_pdflush_threads, .mode = 0444 /* read-only*/, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = VM_SWAPPINESS, .procname = "swappiness", .data = &vm_swappiness, .maxlen = sizeof(vm_swappiness), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one_hundred, }, #ifdef CONFIG_HUGETLB_PAGE - { + { .procname = "nr_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &hugetlb_sysctl_handler, + .proc_handler = hugetlb_sysctl_handler, .extra1 = (void *)&hugetlb_zero, .extra2 = (void *)&hugetlb_infinity, - }, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, +#endif { - .ctl_name = VM_HUGETLB_GROUP, .procname = "hugetlb_shm_group", .data = &sysctl_hugetlb_shm_group, .maxlen = sizeof(gid_t), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "hugepages_treat_as_movable", .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &hugetlb_treat_movable_handler, + .proc_handler = hugetlb_treat_movable_handler, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &hugetlb_overcommit_handler, + .proc_handler = hugetlb_overcommit_handler, .extra1 = (void *)&hugetlb_zero, .extra2 = (void *)&hugetlb_infinity, }, #endif { - .ctl_name = VM_LOWMEM_RESERVE_RATIO, .procname = "lowmem_reserve_ratio", .data = &sysctl_lowmem_reserve_ratio, .maxlen = sizeof(sysctl_lowmem_reserve_ratio), .mode = 0644, - .proc_handler = &lowmem_reserve_ratio_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, }, { - .ctl_name = VM_DROP_PAGECACHE, .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, - .strategy = &sysctl_intvec, }, { - .ctl_name = VM_MIN_FREE_KBYTES, .procname = "min_free_kbytes", .data = &min_free_kbytes, .maxlen = sizeof(min_free_kbytes), .mode = 0644, - .proc_handler = &min_free_kbytes_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = min_free_kbytes_sysctl_handler, .extra1 = &zero, }, { - .ctl_name = VM_PERCPU_PAGELIST_FRACTION, .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, - .proc_handler = &percpu_pagelist_fraction_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = percpu_pagelist_fraction_sysctl_handler, .extra1 = &min_percpu_pagelist_fract, }, #ifdef CONFIG_MMU { - .ctl_name = VM_MAX_MAP_COUNT, .procname = "max_map_count", .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, #else { - .ctl_name = CTL_UNNUMBERED, .procname = "nr_trim_pages", .data = &sysctl_nr_trim_pages, .maxlen = sizeof(sysctl_nr_trim_pages), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, #endif { - .ctl_name = VM_LAPTOP_MODE, .procname = "laptop_mode", .data = &laptop_mode, .maxlen = sizeof(laptop_mode), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = VM_BLOCK_DUMP, .procname = "block_dump", .data = &block_dump, .maxlen = sizeof(block_dump), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, { - .ctl_name = VM_VFS_CACHE_PRESSURE, .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, .maxlen = sizeof(sysctl_vfs_cache_pressure), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT { - .ctl_name = VM_LEGACY_VA_LAYOUT, .procname = "legacy_va_layout", .data = &sysctl_legacy_va_layout, .maxlen = sizeof(sysctl_legacy_va_layout), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, #endif #ifdef CONFIG_NUMA { - .ctl_name = VM_ZONE_RECLAIM_MODE, .procname = "zone_reclaim_mode", .data = &zone_reclaim_mode, .maxlen = sizeof(zone_reclaim_mode), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, { - .ctl_name = VM_MIN_UNMAPPED, .procname = "min_unmapped_ratio", .data = &sysctl_min_unmapped_ratio, .maxlen = sizeof(sysctl_min_unmapped_ratio), .mode = 0644, - .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, .extra1 = &zero, .extra2 = &one_hundred, }, { - .ctl_name = VM_MIN_SLAB, .procname = "min_slab_ratio", .data = &sysctl_min_slab_ratio, .maxlen = sizeof(sysctl_min_slab_ratio), .mode = 0644, - .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, - .strategy = &sysctl_intvec, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, .extra1 = &zero, .extra2 = &one_hundred, }, #endif #ifdef CONFIG_SMP { - .ctl_name = CTL_UNNUMBERED, .procname = "stat_interval", .data = &sysctl_stat_interval, .maxlen = sizeof(sysctl_stat_interval), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", .data = &dac_mmap_min_addr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &mmap_min_addr_handler, + .proc_handler = mmap_min_addr_handler, }, #ifdef CONFIG_NUMA { - .ctl_name = CTL_UNNUMBERED, .procname = "numa_zonelist_order", .data = &numa_zonelist_order, .maxlen = NUMA_ZONELIST_ORDER_LEN, .mode = 0644, - .proc_handler = &numa_zonelist_order_handler, - .strategy = &sysctl_string, + .proc_handler = numa_zonelist_order_handler, }, #endif #if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { - .ctl_name = VM_VDSO_ENABLED, .procname = "vdso_enabled", .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), .mode = 0644, - .proc_handler = &proc_dointvec, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec, .extra1 = &zero, }, #endif #ifdef CONFIG_HIGHMEM { - .ctl_name = CTL_UNNUMBERED, .procname = "highmem_is_dirtyable", .data = &vm_highmem_is_dirtyable, .maxlen = sizeof(vm_highmem_is_dirtyable), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, #endif { - .ctl_name = CTL_UNNUMBERED, .procname = "scan_unevictable_pages", .data = &scan_unevictable_pages, .maxlen = sizeof(scan_unevictable_pages), .mode = 0644, - .proc_handler = &scan_unevictable_handler, + .proc_handler = scan_unevictable_handler, }, #ifdef CONFIG_MEMORY_FAILURE { - .ctl_name = CTL_UNNUMBERED, .procname = "memory_failure_early_kill", .data = &sysctl_memory_failure_early_kill, .maxlen = sizeof(sysctl_memory_failure_early_kill), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "memory_failure_recovery", .data = &sysctl_memory_failure_recovery, .maxlen = sizeof(sysctl_memory_failure_recovery), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, @@ -1427,116 +1284,104 @@ static struct ctl_table vm_table[] = { * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) static struct ctl_table binfmt_misc_table[] = { - { .ctl_name = 0 } + { } }; #endif static struct ctl_table fs_table[] = { { - .ctl_name = FS_NRINODE, .procname = "inode-nr", .data = &inodes_stat, .maxlen = 2*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = FS_STATINODE, .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .procname = "file-nr", .data = &files_stat, .maxlen = 3*sizeof(int), .mode = 0444, - .proc_handler = &proc_nr_files, + .proc_handler = proc_nr_files, }, { - .ctl_name = FS_MAXFILE, .procname = "file-max", .data = &files_stat.max_files, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, .procname = "nr_open", .data = &sysctl_nr_open, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &sysctl_nr_open_min, .extra2 = &sysctl_nr_open_max, }, { - .ctl_name = FS_DENTRY, .procname = "dentry-state", .data = &dentry_stat, .maxlen = 6*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = FS_OVERFLOWUID, .procname = "overflowuid", .data = &fs_overflowuid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, { - .ctl_name = FS_OVERFLOWGID, .procname = "overflowgid", .data = &fs_overflowgid, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &minolduid, .extra2 = &maxolduid, }, #ifdef CONFIG_FILE_LOCKING { - .ctl_name = FS_LEASES, .procname = "leases-enable", .data = &leases_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_DNOTIFY { - .ctl_name = FS_DIR_NOTIFY, .procname = "dir-notify-enable", .data = &dir_notify_enable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_MMU #ifdef CONFIG_FILE_LOCKING { - .ctl_name = FS_LEASE_TIME, .procname = "lease-break-time", .data = &lease_break_time, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_AIO @@ -1545,19 +1390,18 @@ static struct ctl_table fs_table[] = { .data = &aio_nr, .maxlen = sizeof(aio_nr), .mode = 0444, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "aio-max-nr", .data = &aio_max_nr, .maxlen = sizeof(aio_max_nr), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, #endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { - .ctl_name = FS_INOTIFY, .procname = "inotify", .mode = 0555, .child = inotify_table, @@ -1572,19 +1416,16 @@ static struct ctl_table fs_table[] = { #endif #endif { - .ctl_name = KERN_SETUID_DUMPABLE, .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &two, }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) { - .ctl_name = CTL_UNNUMBERED, .procname = "binfmt_misc", .mode = 0555, .child = binfmt_misc_table, @@ -1594,13 +1435,12 @@ static struct ctl_table fs_table[] = { * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt */ - { .ctl_name = 0 } + { } }; static struct ctl_table debug_table[] = { #if defined(CONFIG_X86) || defined(CONFIG_PPC) { - .ctl_name = CTL_UNNUMBERED, .procname = "exception-trace", .data = &show_unhandled_signals, .maxlen = sizeof(int), @@ -1608,11 +1448,11 @@ static struct ctl_table debug_table[] = { .proc_handler = proc_dointvec }, #endif - { .ctl_name = 0 } + { } }; static struct ctl_table dev_table[] = { - { .ctl_name = 0 } + { } }; static DEFINE_SPINLOCK(sysctl_lock); @@ -1766,122 +1606,6 @@ void register_sysctl_root(struct ctl_table_root *root) spin_unlock(&sysctl_lock); } -#ifdef CONFIG_SYSCTL_SYSCALL -/* Perform the actual read/write of a sysctl table entry. */ -static int do_sysctl_strategy(struct ctl_table_root *root, - struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int op = 0, rc; - - if (oldval) - op |= MAY_READ; - if (newval) - op |= MAY_WRITE; - if (sysctl_perm(root, table, op)) - return -EPERM; - - if (table->strategy) { - rc = table->strategy(table, oldval, oldlenp, newval, newlen); - if (rc < 0) - return rc; - if (rc > 0) - return 0; - } - - /* If there is no strategy routine, or if the strategy returns - * zero, proceed with automatic r/w */ - if (table->data && table->maxlen) { - rc = sysctl_data(table, oldval, oldlenp, newval, newlen); - if (rc < 0) - return rc; - } - return 0; -} - -static int parse_table(int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - struct ctl_table_root *root, - struct ctl_table *table) -{ - int n; -repeat: - if (!nlen) - return -ENOTDIR; - if (get_user(n, name)) - return -EFAULT; - for ( ; table->ctl_name || table->procname; table++) { - if (!table->ctl_name) - continue; - if (n == table->ctl_name) { - int error; - if (table->child) { - if (sysctl_perm(root, table, MAY_EXEC)) - return -EPERM; - name++; - nlen--; - table = table->child; - goto repeat; - } - error = do_sysctl_strategy(root, table, - oldval, oldlenp, - newval, newlen); - return error; - } - } - return -ENOTDIR; -} - -int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ctl_table_header *head; - int error = -ENOTDIR; - - if (nlen <= 0 || nlen >= CTL_MAXNAME) - return -ENOTDIR; - if (oldval) { - int old_len; - if (!oldlenp || get_user(old_len, oldlenp)) - return -EFAULT; - } - - for (head = sysctl_head_next(NULL); head; - head = sysctl_head_next(head)) { - error = parse_table(name, nlen, oldval, oldlenp, - newval, newlen, - head->root, head->ctl_table); - if (error != -ENOTDIR) { - sysctl_head_finish(head); - break; - } - } - return error; -} - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - int error; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - error = deprecated_sysctl_warning(&tmp); - if (error) - goto out; - - lock_kernel(); - error = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, tmp.oldlenp, - tmp.newval, tmp.newlen); - unlock_kernel(); -out: - return error; -} -#endif /* CONFIG_SYSCTL_SYSCALL */ - /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. @@ -1917,7 +1641,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) { - for (; table->ctl_name || table->procname; table++) { + for (; table->procname; table++) { table->parent = parent; if (table->child) sysctl_set_parent(table, table->child); @@ -1949,11 +1673,11 @@ static struct ctl_table *is_branch_in(struct ctl_table *branch, return NULL; /* ... and nothing else */ - if (branch[1].procname || branch[1].ctl_name) + if (branch[1].procname) return NULL; /* table should contain subdirectory with the same name */ - for (p = table; p->procname || p->ctl_name; p++) { + for (p = table; p->procname; p++) { if (!p->child) continue; if (p->procname && strcmp(p->procname, s) == 0) @@ -1998,9 +1722,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) * * The members of the &struct ctl_table structure are used as follows: * - * ctl_name - This is the numeric sysctl value used by sysctl(2). The number - * must be unique within that level of sysctl - * * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file * @@ -2015,8 +1736,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) * * proc_handler - the text handler routine (described below) * - * strategy - the strategy routine (described below) - * * de - for internal use by the sysctl routines * * extra1, extra2 - extra pointers usable by the proc handler routines @@ -2029,19 +1748,6 @@ static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) * struct enable minimal validation of the values being written to be * performed, and the mode field allows minimal authentication. * - * More sophisticated management can be enabled by the provision of a - * strategy routine with the table entry. This will be called before - * any automatic read or write of the data is performed. - * - * The strategy routine may return - * - * < 0 - Error occurred (error is passed to user process) - * - * 0 - OK - proceed with automatic read or write. - * - * > 0 - OK - read or write has been done by the strategy routine, so - * return immediately. - * * There must be a proc_handler routine for any terminal nodes * mirrored under /proc/sys (non-terminals are handled by a built-in * directory handler). Several default handlers are available to @@ -2068,13 +1774,13 @@ struct ctl_table_header *__register_sysctl_paths( struct ctl_table_set *set; /* Count the path components */ - for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) + for (npath = 0; path[npath].procname; ++npath) ; /* * For each path component, allocate a 2-element ctl_table array. * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (ctl_name == 0). + * for this, the second will be the sentinel (procname == 0). * * We allocate everything in one go so that we don't have to * worry about freeing additional memory in unregister_sysctl_table. @@ -2091,7 +1797,6 @@ struct ctl_table_header *__register_sysctl_paths( for (n = 0; n < npath; ++n, ++path) { /* Copy the procname */ new->procname = path->procname; - new->ctl_name = path->ctl_name; new->mode = 0555; *prevp = new; @@ -2953,286 +2658,6 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_FS */ - -#ifdef CONFIG_SYSCTL_SYSCALL -/* - * General sysctl support routines - */ - -/* The generic sysctl data routine (used if no strategy routine supplied) */ -int sysctl_data(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - size_t len; - - /* Get out of I don't have a variable */ - if (!table->data || !table->maxlen) - return -ENOTDIR; - - if (oldval && oldlenp) { - if (get_user(len, oldlenp)) - return -EFAULT; - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, table->data, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - if (newval && newlen) { - if (newlen > table->maxlen) - newlen = table->maxlen; - - if (copy_from_user(table->data, newval, newlen)) - return -EFAULT; - } - return 1; -} - -/* The generic string strategy routine: */ -int sysctl_string(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (!table->data || !table->maxlen) - return -ENOTDIR; - - if (oldval && oldlenp) { - size_t bufsize; - if (get_user(bufsize, oldlenp)) - return -EFAULT; - if (bufsize) { - size_t len = strlen(table->data), copied; - - /* This shouldn't trigger for a well-formed sysctl */ - if (len > table->maxlen) - len = table->maxlen; - - /* Copy up to a max of bufsize-1 bytes of the string */ - copied = (len >= bufsize) ? bufsize - 1 : len; - - if (copy_to_user(oldval, table->data, copied) || - put_user(0, (char __user *)(oldval + copied))) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - size_t len = newlen; - if (len > table->maxlen) - len = table->maxlen; - if(copy_from_user(table->data, newval, len)) - return -EFAULT; - if (len == table->maxlen) - len--; - ((char *) table->data)[len] = 0; - } - return 1; -} - -/* - * This function makes sure that all of the integers in the vector - * are between the minimum and maximum values given in the arrays - * table->extra1 and table->extra2, respectively. - */ -int sysctl_intvec(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - - if (newval && newlen) { - int __user *vec = (int __user *) newval; - int *min = (int *) table->extra1; - int *max = (int *) table->extra2; - size_t length; - int i; - - if (newlen % sizeof(int) != 0) - return -EINVAL; - - if (!table->extra1 && !table->extra2) - return 0; - - if (newlen > table->maxlen) - newlen = table->maxlen; - length = newlen / sizeof(int); - - for (i = 0; i < length; i++) { - int value; - if (get_user(value, vec + i)) - return -EFAULT; - if (min && value < min[i]) - return -EINVAL; - if (max && value > max[i]) - return -EINVAL; - } - } - return 0; -} - -/* Strategy function to convert jiffies to seconds */ -int sysctl_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (oldval && oldlenp) { - size_t olen; - - if (get_user(olen, oldlenp)) - return -EFAULT; - if (olen) { - int val; - - if (olen < sizeof(int)) - return -EINVAL; - - val = *(int *)(table->data) / HZ; - if (put_user(val, (int __user *)oldval)) - return -EFAULT; - if (put_user(sizeof(int), oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - int new; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(new, (int __user *)newval)) - return -EFAULT; - *(int *)(table->data) = new*HZ; - } - return 1; -} - -/* Strategy function to convert jiffies to seconds */ -int sysctl_ms_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - if (oldval && oldlenp) { - size_t olen; - - if (get_user(olen, oldlenp)) - return -EFAULT; - if (olen) { - int val; - - if (olen < sizeof(int)) - return -EINVAL; - - val = jiffies_to_msecs(*(int *)(table->data)); - if (put_user(val, (int __user *)oldval)) - return -EFAULT; - if (put_user(sizeof(int), oldlenp)) - return -EFAULT; - } - } - if (newval && newlen) { - int new; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(new, (int __user *)newval)) - return -EFAULT; - *(int *)(table->data) = msecs_to_jiffies(new); - } - return 1; -} - - - -#else /* CONFIG_SYSCTL_SYSCALL */ - - -SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) -{ - struct __sysctl_args tmp; - int error; - - if (copy_from_user(&tmp, args, sizeof(tmp))) - return -EFAULT; - - error = deprecated_sysctl_warning(&tmp); - - /* If no error reading the parameters then just -ENOSYS ... */ - if (!error) - error = -ENOSYS; - - return error; -} - -int sysctl_data(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_string(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_intvec(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -int sysctl_ms_jiffies(struct ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - -#endif /* CONFIG_SYSCTL_SYSCALL */ - -static int deprecated_sysctl_warning(struct __sysctl_args *args) -{ - static int msg_count; - int name[CTL_MAXNAME]; - int i; - - /* Check args->nlen. */ - if (args->nlen < 0 || args->nlen > CTL_MAXNAME) - return -ENOTDIR; - - /* Read in the sysctl name for better debug message logging */ - for (i = 0; i < args->nlen; i++) - if (get_user(name[i], args->name + i)) - return -EFAULT; - - /* Ignore accesses to kernel.version */ - if ((args->nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION)) - return 0; - - if (msg_count < 5) { - msg_count++; - printk(KERN_INFO - "warning: process `%s' used the deprecated sysctl " - "system call with ", current->comm); - for (i = 0; i < args->nlen; i++) - printk("%d.", name[i]); - printk("\n"); - } - return 0; -} - /* * No sense putting this after each symbol definition, twice, * exception granted :-) @@ -3247,9 +2672,4 @@ EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(register_sysctl_table); EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(sysctl_intvec); -EXPORT_SYMBOL(sysctl_jiffies); -EXPORT_SYMBOL(sysctl_ms_jiffies); -EXPORT_SYMBOL(sysctl_string); -EXPORT_SYMBOL(sysctl_data); EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c new file mode 100644 index 00000000000..112533d5fc0 --- /dev/null +++ b/kernel/sysctl_binary.c @@ -0,0 +1,1514 @@ +#include <linux/stat.h> +#include <linux/sysctl.h> +#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include <linux/sunrpc/debug.h> +#include <linux/string.h> +#include <net/ip_vs.h> +#include <linux/syscalls.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/fs.h> +#include <linux/nsproxy.h> +#include <linux/pid_namespace.h> +#include <linux/file.h> +#include <linux/ctype.h> +#include <linux/netdevice.h> + +#ifdef CONFIG_SYSCTL_SYSCALL + +struct bin_table; +typedef ssize_t bin_convert_t(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen); + +static bin_convert_t bin_dir; +static bin_convert_t bin_string; +static bin_convert_t bin_intvec; +static bin_convert_t bin_ulongvec; +static bin_convert_t bin_uuid; +static bin_convert_t bin_dn_node_address; + +#define CTL_DIR bin_dir +#define CTL_STR bin_string +#define CTL_INT bin_intvec +#define CTL_ULONG bin_ulongvec +#define CTL_UUID bin_uuid +#define CTL_DNADR bin_dn_node_address + +#define BUFSZ 256 + +struct bin_table { + bin_convert_t *convert; + int ctl_name; + const char *procname; + const struct bin_table *child; +}; + +static const struct bin_table bin_random_table[] = { + { CTL_INT, RANDOM_POOLSIZE, "poolsize" }, + { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" }, + { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" }, + { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, + { CTL_UUID, RANDOM_BOOT_ID, "boot_id" }, + { CTL_UUID, RANDOM_UUID, "uuid" }, + {} +}; + +static const struct bin_table bin_pty_table[] = { + { CTL_INT, PTY_MAX, "max" }, + { CTL_INT, PTY_NR, "nr" }, + {} +}; + +static const struct bin_table bin_kern_table[] = { + { CTL_STR, KERN_OSTYPE, "ostype" }, + { CTL_STR, KERN_OSRELEASE, "osrelease" }, + /* KERN_OSREV not used */ + { CTL_STR, KERN_VERSION, "version" }, + /* KERN_SECUREMASK not used */ + /* KERN_PROF not used */ + { CTL_STR, KERN_NODENAME, "hostname" }, + { CTL_STR, KERN_DOMAINNAME, "domainname" }, + + { CTL_INT, KERN_PANIC, "panic" }, + { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, + + { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, + { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, + { CTL_INT, KERN_PRINTK, "printk" }, + + /* KERN_NAMETRANS not used */ + /* KERN_PPC_HTABRECLAIM not used */ + /* KERN_PPC_ZEROPAGED not used */ + { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, + + { CTL_STR, KERN_MODPROBE, "modprobe" }, + { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" }, + { CTL_INT, KERN_ACCT, "acct" }, + /* KERN_PPC_L2CR "l2cr" no longer used */ + + /* KERN_RTSIGNR not used */ + /* KERN_RTSIGMAX not used */ + + { CTL_ULONG, KERN_SHMMAX, "shmmax" }, + { CTL_INT, KERN_MSGMAX, "msgmax" }, + { CTL_INT, KERN_MSGMNB, "msgmnb" }, + /* KERN_MSGPOOL not used*/ + { CTL_INT, KERN_SYSRQ, "sysrq" }, + { CTL_INT, KERN_MAX_THREADS, "threads-max" }, + { CTL_DIR, KERN_RANDOM, "random", bin_random_table }, + { CTL_ULONG, KERN_SHMALL, "shmall" }, + { CTL_INT, KERN_MSGMNI, "msgmni" }, + { CTL_INT, KERN_SEM, "sem" }, + { CTL_INT, KERN_SPARC_STOP_A, "stop-a" }, + { CTL_INT, KERN_SHMMNI, "shmmni" }, + + { CTL_INT, KERN_OVERFLOWUID, "overflowuid" }, + { CTL_INT, KERN_OVERFLOWGID, "overflowgid" }, + + { CTL_STR, KERN_HOTPLUG, "hotplug", }, + { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, + + { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, + { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" }, + /* KERN_TAINTED "tainted" no longer used */ + { CTL_INT, KERN_CADPID, "cad_pid" }, + { CTL_INT, KERN_PIDMAX, "pid_max" }, + { CTL_STR, KERN_CORE_PATTERN, "core_pattern" }, + { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" }, + { CTL_INT, KERN_HPPA_PWRSW, "soft-power" }, + { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" }, + + { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, + { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, + + { CTL_DIR, KERN_PTY, "pty", bin_pty_table }, + { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" }, + { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, + /* KERN_HZ_TIMER "hz_timer" no longer used */ + { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, + { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" }, + { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" }, + + { CTL_INT, KERN_SPIN_RETRY, "spin_retry" }, + /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */ + { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, + { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, + { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, + { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" }, + { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, + {} +}; + +static const struct bin_table bin_vm_table[] = { + { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, + { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" }, + { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, + { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, + /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ + /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ + { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, + { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, + /* VM_PAGEBUF unused */ + /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ + { CTL_INT, VM_SWAPPINESS, "swappiness" }, + { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, + { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" }, + { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" }, + { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" }, + { CTL_INT, VM_BLOCK_DUMP, "block_dump" }, + { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" }, + { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, + { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, + /* VM_SWAP_TOKEN_TIMEOUT unused */ + { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" }, + { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, + { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, + { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" }, + { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" }, + { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" }, + { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" }, + + {} +}; + +static const struct bin_table bin_net_core_table[] = { + { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" }, + { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" }, + { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" }, + { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" }, + /* NET_CORE_DESTROY_DELAY unused */ + { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, + /* NET_CORE_FASTROUTE unused */ + { CTL_INT, NET_CORE_MSG_COST, "message_cost" }, + { CTL_INT, NET_CORE_MSG_BURST, "message_burst" }, + { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" }, + /* NET_CORE_HOT_LIST_LENGTH unused */ + /* NET_CORE_DIVERT_VERSION unused */ + /* NET_CORE_NO_CONG_THRESH unused */ + /* NET_CORE_NO_CONG unused */ + /* NET_CORE_LO_CONG unused */ + /* NET_CORE_MOD_CONG unused */ + { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" }, + { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" }, + { CTL_INT, NET_CORE_BUDGET, "netdev_budget" }, + { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, + { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, + { CTL_INT, NET_CORE_WARNINGS, "warnings" }, + {}, +}; + +static const struct bin_table bin_net_unix_table[] = { + /* NET_UNIX_DESTROY_DELAY unused */ + /* NET_UNIX_DELETE_DELAY unused */ + { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, + {} +}; + +static const struct bin_table bin_net_ipv4_route_table[] = { + { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" }, + /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */ + /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */ + { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, + { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, + { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, + { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, + { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, + { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, + { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, + { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, + { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, + { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, + { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, + { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, + { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, + {} +}; + +static const struct bin_table bin_net_ipv4_conf_vars_table[] = { + { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" }, + { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, + + { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, + { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, + { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, + { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, + { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" }, + { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, + { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, + { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, + { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, + { CTL_INT, NET_IPV4_CONF_TAG, "tag" }, + { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" }, + { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, + { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, + { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, + { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, + + { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, + { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" }, + { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, + { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, + {} +}; + +static const struct bin_table bin_net_ipv4_conf_table[] = { + { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table }, + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table }, + { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table }, + {} +}; + +static const struct bin_table bin_net_neigh_vars_table[] = { + { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, + { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, + { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" }, + /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */ + { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, + { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, + { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, + { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" }, + { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, + /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */ + /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */ + /* NET_NEIGH_LOCKTIME "locktime" no longer used */ + { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" }, + { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" }, + { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" }, + { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, + { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, + {} +}; + +static const struct bin_table bin_net_neigh_table[] = { + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table }, + { CTL_DIR, 0, NULL, bin_net_neigh_vars_table }, + {} +}; + +static const struct bin_table bin_net_ipv4_netfilter_table[] = { + { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, + + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */ + + /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */ + /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */ + /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */ + /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */ + + { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */ + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, + + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ + + { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, + {} +}; + +static const struct bin_table bin_net_ipv4_table[] = { + {CTL_INT, NET_IPV4_FORWARD, "ip_forward" }, + + { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table }, + { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table }, + { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table }, + /* NET_IPV4_FIB_HASH unused */ + { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table }, + + { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, + { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, + { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" }, + { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, + { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, + /* NET_IPV4_AUTOCONFIG unused */ + { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, + { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, + { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, + { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, + { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, + { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, + { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, + { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, + { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, + { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, + { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, + { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, + { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, + { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" }, + { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" }, + { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, + { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, + { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, + { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, + { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, + { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, + { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, + { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, + { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, + { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, + { CTL_INT, NET_TCP_FACK, "tcp_fack" }, + { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" }, + { CTL_INT, NET_TCP_ECN, "tcp_ecn" }, + { CTL_INT, NET_TCP_DSACK, "tcp_dsack" }, + { CTL_INT, NET_TCP_MEM, "tcp_mem" }, + { CTL_INT, NET_TCP_WMEM, "tcp_wmem" }, + { CTL_INT, NET_TCP_RMEM, "tcp_rmem" }, + { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" }, + { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, + { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" }, + { CTL_INT, NET_TCP_FRTO, "tcp_frto" }, + { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, + { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" }, + { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, + { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, + { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, + { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, + { CTL_INT, NET_TCP_ABC, "tcp_abc" }, + { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, + { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, + { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, + { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, + { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, + { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, + { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, + { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, + { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, + /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */ + { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, + { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, + + { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, + { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, + { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, + { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, + { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, + { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, + + { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, + { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, + { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, + + { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, + /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ + + { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, + + /* NET_TCP_DEFAULT_WIN_SCALE unused */ + /* NET_TCP_BIC_BETA unused */ + /* NET_IPV4_TCP_MAX_KA_PROBES unused */ + /* NET_IPV4_IP_MASQ_DEBUG unused */ + /* NET_TCP_SYN_TAILDROP unused */ + /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ + /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ + /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ + /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ + /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ + /* NET_IPV4_ALWAYS_DEFRAG unused */ + {} +}; + +static const struct bin_table bin_net_ipx_table[] = { + { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, + /* NET_IPX_FORWARDING unused */ + {} +}; + +static const struct bin_table bin_net_atalk_table[] = { + { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, + { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, + { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, + { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, + {}, +}; + +static const struct bin_table bin_net_netrom_table[] = { + { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, + { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, + { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, + { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, + { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, + { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, + { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, + { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, + { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, + { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" }, + { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, + { CTL_INT, NET_NETROM_RESET, "reset" }, + {} +}; + +static const struct bin_table bin_net_ax25_param_table[] = { + { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, + { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, + { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" }, + { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" }, + { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" }, + { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, + { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" }, + { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" }, + { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" }, + { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, + { CTL_INT, NET_AX25_N2, "maximum_retry_count" }, + { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" }, + { CTL_INT, NET_AX25_PROTOCOL, "protocol" }, + { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, + {} +}; + +static const struct bin_table bin_net_ax25_table[] = { + { CTL_DIR, 0, NULL, bin_net_ax25_param_table }, + {} +}; + +static const struct bin_table bin_net_rose_table[] = { + { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, + { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, + { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, + { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, + { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, + { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" }, + { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, + { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, + { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" }, + { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, + {} +}; + +static const struct bin_table bin_net_ipv6_conf_var_table[] = { + { CTL_INT, NET_IPV6_FORWARDING, "forwarding" }, + { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" }, + { CTL_INT, NET_IPV6_MTU, "mtu" }, + { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" }, + { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, + { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" }, + { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, + { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" }, + { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, + { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, + { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, + { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, + { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, + { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, + { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, + { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" }, + { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, + { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, + { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, + { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + {} +}; + +static const struct bin_table bin_net_ipv6_conf_table[] = { + { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table }, + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table }, + { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table }, + {} +}; + +static const struct bin_table bin_net_ipv6_route_table[] = { + /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */ + { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, + { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, + { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, + { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, + { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, + { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, + { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, + { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, + {} +}; + +static const struct bin_table bin_net_ipv6_icmp_table[] = { + { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, + {} +}; + +static const struct bin_table bin_net_ipv6_table[] = { + { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table }, + { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table }, + { CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table }, + { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table }, + { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" }, + { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, + { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, + { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, + { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, + { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, + { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, + {} +}; + +static const struct bin_table bin_net_x25_table[] = { + { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, + { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, + { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, + { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, + { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, + { CTL_INT, NET_X25_FORWARD, "x25_forward" }, + {} +}; + +static const struct bin_table bin_net_tr_table[] = { + { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" }, + {} +}; + + +static const struct bin_table bin_net_decnet_conf_vars[] = { + { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, + { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" }, + { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" }, + { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" }, + {} +}; + +static const struct bin_table bin_net_decnet_conf[] = { + { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars }, + { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars }, + {} +}; + +static const struct bin_table bin_net_decnet_table[] = { + { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf }, + { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" }, + { CTL_STR, NET_DECNET_NODE_NAME, "node_name" }, + { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" }, + { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" }, + { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" }, + { CTL_INT, NET_DECNET_DI_COUNT, "di_count" }, + { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" }, + { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, + { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, + { CTL_INT, NET_DECNET_MEM, "decnet_mem" }, + { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" }, + { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" }, + { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" }, + {} +}; + +static const struct bin_table bin_net_sctp_table[] = { + { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" }, + { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" }, + { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" }, + { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, + { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, + { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, + { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, + { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, + { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, + { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" }, + { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" }, + { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" }, + { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" }, + { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, + { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, + { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, + { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, + {} +}; + +static const struct bin_table bin_net_llc_llc2_timeout_table[] = { + { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" }, + { CTL_INT, NET_LLC2_P_TIMEOUT, "p" }, + { CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" }, + { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" }, + {} +}; + +static const struct bin_table bin_net_llc_station_table[] = { + { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, + {} +}; + +static const struct bin_table bin_net_llc_llc2_table[] = { + { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table }, + {} +}; + +static const struct bin_table bin_net_llc_table[] = { + { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table }, + { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table }, + {} +}; + +static const struct bin_table bin_net_netfilter_table[] = { + { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, + /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */ + /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */ + /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */ + /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */ + /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, + { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, + /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, + { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, + { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" }, + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, + /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */ + /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, + { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, + { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, + + {} +}; + +static const struct bin_table bin_net_irda_table[] = { + { CTL_INT, NET_IRDA_DISCOVERY, "discovery" }, + { CTL_STR, NET_IRDA_DEVNAME, "devname" }, + { CTL_INT, NET_IRDA_DEBUG, "debug" }, + { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" }, + { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, + { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, + { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, + { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, + { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, + { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, + { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, + { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, + { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, + { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, + {} +}; + +static const struct bin_table bin_net_table[] = { + { CTL_DIR, NET_CORE, "core", bin_net_core_table }, + /* NET_ETHER not used */ + /* NET_802 not used */ + { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table }, + { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table }, + { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table }, + { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table }, + { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table }, + { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table }, + /* NET_BRIDGE "bridge" no longer used */ + { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table }, + { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table }, + { CTL_DIR, NET_X25, "x25", bin_net_x25_table }, + { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table }, + { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table }, + /* NET_ECONET not used */ + { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table }, + { CTL_DIR, NET_LLC, "llc", bin_net_llc_table }, + { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table }, + /* NET_DCCP "dccp" no longer used */ + { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table }, + { CTL_INT, 2089, "nf_conntrack_max" }, + {} +}; + +static const struct bin_table bin_fs_quota_table[] = { + { CTL_INT, FS_DQ_LOOKUPS, "lookups" }, + { CTL_INT, FS_DQ_DROPS, "drops" }, + { CTL_INT, FS_DQ_READS, "reads" }, + { CTL_INT, FS_DQ_WRITES, "writes" }, + { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" }, + { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" }, + { CTL_INT, FS_DQ_FREE, "free_dquots" }, + { CTL_INT, FS_DQ_SYNCS, "syncs" }, + { CTL_INT, FS_DQ_WARNINGS, "warnings" }, + {} +}; + +static const struct bin_table bin_fs_xfs_table[] = { + { CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" }, + { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" }, + { CTL_INT, XFS_PANIC_MASK, "panic_mask" }, + + { CTL_INT, XFS_ERRLEVEL, "error_level" }, + { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" }, + { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" }, + { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" }, + { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" }, + { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" }, + { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" }, + { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, + { CTL_INT, XFS_ROTORSTEP, "rotorstep" }, + { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" }, + { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" }, + { CTL_INT, XFS_STATS_CLEAR, "stats_clear" }, + {} +}; + +static const struct bin_table bin_fs_ocfs2_nm_table[] = { + { CTL_STR, 1, "hb_ctl_path" }, + {} +}; + +static const struct bin_table bin_fs_ocfs2_table[] = { + { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table }, + {} +}; + +static const struct bin_table bin_inotify_table[] = { + { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, + { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, + { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, + {} +}; + +static const struct bin_table bin_fs_table[] = { + { CTL_INT, FS_NRINODE, "inode-nr" }, + { CTL_INT, FS_STATINODE, "inode-state" }, + /* FS_MAXINODE unused */ + /* FS_NRDQUOT unused */ + /* FS_MAXDQUOT unused */ + /* FS_NRFILE "file-nr" no longer used */ + { CTL_INT, FS_MAXFILE, "file-max" }, + { CTL_INT, FS_DENTRY, "dentry-state" }, + /* FS_NRSUPER unused */ + /* FS_MAXUPSER unused */ + { CTL_INT, FS_OVERFLOWUID, "overflowuid" }, + { CTL_INT, FS_OVERFLOWGID, "overflowgid" }, + { CTL_INT, FS_LEASES, "leases-enable" }, + { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" }, + { CTL_INT, FS_LEASE_TIME, "lease-break-time" }, + { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table }, + { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table }, + { CTL_ULONG, FS_AIO_NR, "aio-nr" }, + { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" }, + { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table }, + { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table }, + { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" }, + {} +}; + +static const struct bin_table bin_ipmi_table[] = { + { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, + {} +}; + +static const struct bin_table bin_mac_hid_files[] = { + /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ + /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, + /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ + {} +}; + +static const struct bin_table bin_raid_table[] = { + { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, + { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, + {} +}; + +static const struct bin_table bin_scsi_table[] = { + { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" }, + {} +}; + +static const struct bin_table bin_dev_table[] = { + /* DEV_CDROM "cdrom" no longer used */ + /* DEV_HWMON unused */ + /* DEV_PARPORT "parport" no longer used */ + { CTL_DIR, DEV_RAID, "raid", bin_raid_table }, + { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files }, + { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table }, + { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table }, + {} +}; + +static const struct bin_table bin_bus_isa_table[] = { + { CTL_INT, BUS_ISA_MEM_BASE, "membase" }, + { CTL_INT, BUS_ISA_PORT_BASE, "portbase" }, + { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" }, + {} +}; + +static const struct bin_table bin_bus_table[] = { + { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table }, + {} +}; + + +static const struct bin_table bin_s390dbf_table[] = { + { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, + { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, + {} +}; + +static const struct bin_table bin_sunrpc_table[] = { + /* CTL_RPCDEBUG "rpc_debug" no longer used */ + /* CTL_NFSDEBUG "nfs_debug" no longer used */ + /* CTL_NFSDDEBUG "nfsd_debug" no longer used */ + /* CTL_NLMDEBUG "nlm_debug" no longer used */ + + { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, + { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, + { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" }, + { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" }, + {} +}; + +static const struct bin_table bin_pm_table[] = { + /* frv specific */ + /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */ + { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" }, + { CTL_INT, 3 /* CTL_PM_P0 */, "p0" }, + { CTL_INT, 4 /* CTL_PM_CM */, "cm" }, + {} +}; + +static const struct bin_table bin_root_table[] = { + { CTL_DIR, CTL_KERN, "kernel", bin_kern_table }, + { CTL_DIR, CTL_VM, "vm", bin_vm_table }, + { CTL_DIR, CTL_NET, "net", bin_net_table }, + /* CTL_PROC not used */ + { CTL_DIR, CTL_FS, "fs", bin_fs_table }, + /* CTL_DEBUG "debug" no longer used */ + { CTL_DIR, CTL_DEV, "dev", bin_dev_table }, + { CTL_DIR, CTL_BUS, "bus", bin_bus_table }, + { CTL_DIR, CTL_ABI, "abi" }, + /* CTL_CPU not used */ + /* CTL_ARLAN "arlan" no longer used */ + { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table }, + { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table }, + { CTL_DIR, CTL_PM, "pm", bin_pm_table }, + {} +}; + +static ssize_t bin_dir(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + return -ENOTDIR; +} + + +static ssize_t bin_string(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + ssize_t result, copied = 0; + + if (oldval && oldlen) { + char __user *lastp; + loff_t pos = 0; + int ch; + + result = vfs_read(file, oldval, oldlen, &pos); + if (result < 0) + goto out; + + copied = result; + lastp = oldval + copied - 1; + + result = -EFAULT; + if (get_user(ch, lastp)) + goto out; + + /* Trim off the trailing newline */ + if (ch == '\n') { + result = -EFAULT; + if (put_user('\0', lastp)) + goto out; + copied -= 1; + } + } + + if (newval && newlen) { + loff_t pos = 0; + + result = vfs_write(file, newval, newlen, &pos); + if (result < 0) + goto out; + } + + result = copied; +out: + return result; +} + +static ssize_t bin_intvec(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t copied = 0; + char *buffer; + ssize_t result; + + result = -ENOMEM; + buffer = kmalloc(BUFSZ, GFP_KERNEL); + if (!buffer) + goto out; + + if (oldval && oldlen) { + unsigned __user *vec = oldval; + size_t length = oldlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buffer, BUFSZ - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + + str = buffer; + end = str + result; + *end++ = '\0'; + for (i = 0; i < length; i++) { + unsigned long value; + + value = simple_strtoul(str, &str, 10); + while (isspace(*str)) + str++; + + result = -EFAULT; + if (put_user(value, vec + i)) + goto out_kfree; + + copied += sizeof(*vec); + if (!isdigit(*str)) + break; + } + } + + if (newval && newlen) { + unsigned __user *vec = newval; + size_t length = newlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + str = buffer; + end = str + BUFSZ; + for (i = 0; i < length; i++) { + unsigned long value; + + result = -EFAULT; + if (get_user(value, vec + i)) + goto out_kfree; + + str += snprintf(str, end - str, "%lu\t", value); + } + + set_fs(KERNEL_DS); + result = vfs_write(file, buffer, str - buffer, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + } + result = copied; +out_kfree: + kfree(buffer); +out: + return result; +} + +static ssize_t bin_ulongvec(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t copied = 0; + char *buffer; + ssize_t result; + + result = -ENOMEM; + buffer = kmalloc(BUFSZ, GFP_KERNEL); + if (!buffer) + goto out; + + if (oldval && oldlen) { + unsigned long __user *vec = oldval; + size_t length = oldlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buffer, BUFSZ - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + + str = buffer; + end = str + result; + *end++ = '\0'; + for (i = 0; i < length; i++) { + unsigned long value; + + value = simple_strtoul(str, &str, 10); + while (isspace(*str)) + str++; + + result = -EFAULT; + if (put_user(value, vec + i)) + goto out_kfree; + + copied += sizeof(*vec); + if (!isdigit(*str)) + break; + } + } + + if (newval && newlen) { + unsigned long __user *vec = newval; + size_t length = newlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + str = buffer; + end = str + BUFSZ; + for (i = 0; i < length; i++) { + unsigned long value; + + result = -EFAULT; + if (get_user(value, vec + i)) + goto out_kfree; + + str += snprintf(str, end - str, "%lu\t", value); + } + + set_fs(KERNEL_DS); + result = vfs_write(file, buffer, str - buffer, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + } + result = copied; +out_kfree: + kfree(buffer); +out: + return result; +} + +static unsigned hex_value(int ch) +{ + return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10; +} + +static ssize_t bin_uuid(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t result, copied = 0; + + /* Only supports reads */ + if (oldval && oldlen) { + loff_t pos = 0; + char buf[40], *str = buf; + unsigned char uuid[16]; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buf, sizeof(buf) - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + + buf[result] = '\0'; + + /* Convert the uuid to from a string to binary */ + for (i = 0; i < 16; i++) { + result = -EIO; + if (!isxdigit(str[0]) || !isxdigit(str[1])) + goto out; + + uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); + str += 2; + if (*str == '-') + str++; + } + + if (oldlen > 16) + oldlen = 16; + + result = -EFAULT; + if (copy_to_user(oldval, uuid, oldlen)) + goto out; + + copied = oldlen; + } + result = copied; +out: + return result; +} + +static ssize_t bin_dn_node_address(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t result, copied = 0; + + if (oldval && oldlen) { + loff_t pos = 0; + char buf[15], *nodep; + unsigned long area, node; + __le16 dnaddr; + + set_fs(KERNEL_DS); + result = vfs_read(file, buf, sizeof(buf) - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + + buf[result] = '\0'; + + /* Convert the decnet addresss to binary */ + result = -EIO; + nodep = strchr(buf, '.') + 1; + if (!nodep) + goto out; + + area = simple_strtoul(buf, NULL, 10); + node = simple_strtoul(nodep, NULL, 10); + + result = -EIO; + if ((area > 63)||(node > 1023)) + goto out; + + dnaddr = cpu_to_le16((area << 10) | node); + + result = -EFAULT; + if (put_user(dnaddr, (__le16 __user *)oldval)) + goto out; + + copied = sizeof(dnaddr); + } + + if (newval && newlen) { + loff_t pos = 0; + __le16 dnaddr; + char buf[15]; + int len; + + result = -EINVAL; + if (newlen != sizeof(dnaddr)) + goto out; + + result = -EFAULT; + if (get_user(dnaddr, (__le16 __user *)newval)) + goto out; + + len = snprintf(buf, sizeof(buf), "%hu.%hu", + le16_to_cpu(dnaddr) >> 10, + le16_to_cpu(dnaddr) & 0x3ff); + + set_fs(KERNEL_DS); + result = vfs_write(file, buf, len, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + } + + result = copied; +out: + return result; +} + +static const struct bin_table *get_sysctl(const int *name, int nlen, char *path) +{ + const struct bin_table *table = &bin_root_table[0]; + int ctl_name; + + /* The binary sysctl tables have a small maximum depth so + * there is no danger of overflowing our path as it PATH_MAX + * bytes long. + */ + memcpy(path, "sys/", 4); + path += 4; + +repeat: + if (!nlen) + return ERR_PTR(-ENOTDIR); + ctl_name = *name; + name++; + nlen--; + for ( ; table->convert; table++) { + int len = 0; + + /* + * For a wild card entry map from ifindex to network + * device name. + */ + if (!table->ctl_name) { +#ifdef CONFIG_NET + struct net *net = current->nsproxy->net_ns; + struct net_device *dev; + dev = dev_get_by_index(net, ctl_name); + if (dev) { + len = strlen(dev->name); + memcpy(path, dev->name, len); + dev_put(dev); + } +#endif + /* Use the well known sysctl number to proc name mapping */ + } else if (ctl_name == table->ctl_name) { + len = strlen(table->procname); + memcpy(path, table->procname, len); + } + if (len) { + path += len; + if (table->child) { + *path++ = '/'; + table = table->child; + goto repeat; + } + *path = '\0'; + return table; + } + } + return ERR_PTR(-ENOTDIR); +} + +static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep) +{ + char *tmp, *result; + + result = ERR_PTR(-ENOMEM); + tmp = __getname(); + if (tmp) { + const struct bin_table *table = get_sysctl(name, nlen, tmp); + result = tmp; + *tablep = table; + if (IS_ERR(table)) { + __putname(tmp); + result = ERR_CAST(table); + } + } + return result; +} + +static ssize_t binary_sysctl(const int *name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + const struct bin_table *table = NULL; + struct nameidata nd; + struct vfsmount *mnt; + struct file *file; + ssize_t result; + char *pathname; + int flags; + int acc_mode, fmode; + + pathname = sysctl_getname(name, nlen, &table); + result = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + + /* How should the sysctl be accessed? */ + if (oldval && oldlen && newval && newlen) { + flags = O_RDWR; + acc_mode = MAY_READ | MAY_WRITE; + fmode = FMODE_READ | FMODE_WRITE; + } else if (newval && newlen) { + flags = O_WRONLY; + acc_mode = MAY_WRITE; + fmode = FMODE_WRITE; + } else if (oldval && oldlen) { + flags = O_RDONLY; + acc_mode = MAY_READ; + fmode = FMODE_READ; + } else { + result = 0; + goto out_putname; + } + + mnt = current->nsproxy->pid_ns->proc_mnt; + result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); + if (result) + goto out_putname; + + result = may_open(&nd.path, acc_mode, fmode); + if (result) + goto out_putpath; + + file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); + result = PTR_ERR(file); + if (IS_ERR(file)) + goto out_putname; + + result = table->convert(file, oldval, oldlen, newval, newlen); + + fput(file); +out_putname: + putname(pathname); +out: + return result; + +out_putpath: + path_put(&nd.path); + goto out_putname; +} + + +#else /* CONFIG_SYSCTL_SYSCALL */ + +static ssize_t binary_sysctl(const int *name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + return -ENOSYS; +} + +#endif /* CONFIG_SYSCTL_SYSCALL */ + + +static void deprecated_sysctl_warning(const int *name, int nlen) +{ + int i; + + /* + * CTL_KERN/KERN_VERSION is used by older glibc and cannot + * ever go away. + */ + if (name[0] == CTL_KERN && name[1] == KERN_VERSION) + return; + + if (printk_ratelimit()) { + printk(KERN_INFO + "warning: process `%s' used the deprecated sysctl " + "system call with ", current->comm); + for (i = 0; i < nlen; i++) + printk("%d.", name[i]); + printk("\n"); + } + return; +} + +static ssize_t do_sysctl(int __user *args_name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + int name[CTL_MAXNAME]; + int i; + + /* Check args->nlen. */ + if (nlen < 0 || nlen > CTL_MAXNAME) + return -ENOTDIR; + /* Read in the sysctl name for simplicity */ + for (i = 0; i < nlen; i++) + if (get_user(name[i], args_name + i)) + return -EFAULT; + + deprecated_sysctl_warning(name, nlen); + + return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); +} + +SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) +{ + struct __sysctl_args tmp; + size_t oldlen = 0; + ssize_t result; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + if (tmp.oldval && !tmp.oldlenp) + return -EFAULT; + + if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) + return -EFAULT; + + result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, + tmp.newval, tmp.newlen); + + if (result >= 0) { + oldlen = result; + result = 0; + } + + if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) + return -EFAULT; + + return result; +} + + +#ifdef CONFIG_COMPAT +#include <asm/compat.h> + +struct compat_sysctl_args { + compat_uptr_t name; + int nlen; + compat_uptr_t oldval; + compat_uptr_t oldlenp; + compat_uptr_t newval; + compat_size_t newlen; + compat_ulong_t __unused[4]; +}; + +asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args) +{ + struct compat_sysctl_args tmp; + compat_size_t __user *compat_oldlenp; + size_t oldlen = 0; + ssize_t result; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + if (tmp.oldval && !tmp.oldlenp) + return -EFAULT; + + compat_oldlenp = compat_ptr(tmp.oldlenp); + if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) + return -EFAULT; + + result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, + compat_ptr(tmp.oldval), oldlen, + compat_ptr(tmp.newval), tmp.newlen); + + if (result >= 0) { + oldlen = result; + result = 0; + } + + if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) + return -EFAULT; + + return result; +} + +#endif /* CONFIG_COMPAT */ diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index b38423ca711..04cdcf72c82 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -5,1239 +5,6 @@ #include <linux/string.h> #include <net/ip_vs.h> -struct trans_ctl_table { - int ctl_name; - const char *procname; - const struct trans_ctl_table *child; -}; - -static const struct trans_ctl_table trans_random_table[] = { - { RANDOM_POOLSIZE, "poolsize" }, - { RANDOM_ENTROPY_COUNT, "entropy_avail" }, - { RANDOM_READ_THRESH, "read_wakeup_threshold" }, - { RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, - { RANDOM_BOOT_ID, "boot_id" }, - { RANDOM_UUID, "uuid" }, - {} -}; - -static const struct trans_ctl_table trans_pty_table[] = { - { PTY_MAX, "max" }, - { PTY_NR, "nr" }, - {} -}; - -static const struct trans_ctl_table trans_kern_table[] = { - { KERN_OSTYPE, "ostype" }, - { KERN_OSRELEASE, "osrelease" }, - /* KERN_OSREV not used */ - { KERN_VERSION, "version" }, - /* KERN_SECUREMASK not used */ - /* KERN_PROF not used */ - { KERN_NODENAME, "hostname" }, - { KERN_DOMAINNAME, "domainname" }, - - { KERN_PANIC, "panic" }, - { KERN_REALROOTDEV, "real-root-dev" }, - - { KERN_SPARC_REBOOT, "reboot-cmd" }, - { KERN_CTLALTDEL, "ctrl-alt-del" }, - { KERN_PRINTK, "printk" }, - - /* KERN_NAMETRANS not used */ - /* KERN_PPC_HTABRECLAIM not used */ - /* KERN_PPC_ZEROPAGED not used */ - { KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, - - { KERN_MODPROBE, "modprobe" }, - { KERN_SG_BIG_BUFF, "sg-big-buff" }, - { KERN_ACCT, "acct" }, - { KERN_PPC_L2CR, "l2cr" }, - - /* KERN_RTSIGNR not used */ - /* KERN_RTSIGMAX not used */ - - { KERN_SHMMAX, "shmmax" }, - { KERN_MSGMAX, "msgmax" }, - { KERN_MSGMNB, "msgmnb" }, - /* KERN_MSGPOOL not used*/ - { KERN_SYSRQ, "sysrq" }, - { KERN_MAX_THREADS, "threads-max" }, - { KERN_RANDOM, "random", trans_random_table }, - { KERN_SHMALL, "shmall" }, - { KERN_MSGMNI, "msgmni" }, - { KERN_SEM, "sem" }, - { KERN_SPARC_STOP_A, "stop-a" }, - { KERN_SHMMNI, "shmmni" }, - - { KERN_OVERFLOWUID, "overflowuid" }, - { KERN_OVERFLOWGID, "overflowgid" }, - - { KERN_HOTPLUG, "hotplug", }, - { KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, - - { KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, - { KERN_CORE_USES_PID, "core_uses_pid" }, - { KERN_TAINTED, "tainted" }, - { KERN_CADPID, "cad_pid" }, - { KERN_PIDMAX, "pid_max" }, - { KERN_CORE_PATTERN, "core_pattern" }, - { KERN_PANIC_ON_OOPS, "panic_on_oops" }, - { KERN_HPPA_PWRSW, "soft-power" }, - { KERN_HPPA_UNALIGNED, "unaligned-trap" }, - - { KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, - { KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, - - { KERN_PTY, "pty", trans_pty_table }, - { KERN_NGROUPS_MAX, "ngroups_max" }, - { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, - { KERN_HZ_TIMER, "hz_timer" }, - { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, - { KERN_BOOTLOADER_TYPE, "bootloader_type" }, - { KERN_RANDOMIZE, "randomize_va_space" }, - - { KERN_SPIN_RETRY, "spin_retry" }, - { KERN_ACPI_VIDEO_FLAGS, "acpi_video_flags" }, - { KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, - { KERN_COMPAT_LOG, "compat-log" }, - { KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { KERN_NMI_WATCHDOG, "nmi_watchdog" }, - { KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, - {} -}; - -static const struct trans_ctl_table trans_vm_table[] = { - { VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, - { VM_PAGE_CLUSTER, "page-cluster" }, - { VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, - { VM_DIRTY_RATIO, "dirty_ratio" }, - { VM_DIRTY_WB_CS, "dirty_writeback_centisecs" }, - { VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs" }, - { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, - { VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, - /* VM_PAGEBUF unused */ - { VM_HUGETLB_PAGES, "nr_hugepages" }, - { VM_SWAPPINESS, "swappiness" }, - { VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, - { VM_MIN_FREE_KBYTES, "min_free_kbytes" }, - { VM_MAX_MAP_COUNT, "max_map_count" }, - { VM_LAPTOP_MODE, "laptop_mode" }, - { VM_BLOCK_DUMP, "block_dump" }, - { VM_HUGETLB_GROUP, "hugetlb_shm_group" }, - { VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, - { VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, - /* VM_SWAP_TOKEN_TIMEOUT unused */ - { VM_DROP_PAGECACHE, "drop_caches" }, - { VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, - { VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, - { VM_MIN_UNMAPPED, "min_unmapped_ratio" }, - { VM_PANIC_ON_OOM, "panic_on_oom" }, - { VM_VDSO_ENABLED, "vdso_enabled" }, - { VM_MIN_SLAB, "min_slab_ratio" }, - - {} -}; - -static const struct trans_ctl_table trans_net_core_table[] = { - { NET_CORE_WMEM_MAX, "wmem_max" }, - { NET_CORE_RMEM_MAX, "rmem_max" }, - { NET_CORE_WMEM_DEFAULT, "wmem_default" }, - { NET_CORE_RMEM_DEFAULT, "rmem_default" }, - /* NET_CORE_DESTROY_DELAY unused */ - { NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, - /* NET_CORE_FASTROUTE unused */ - { NET_CORE_MSG_COST, "message_cost" }, - { NET_CORE_MSG_BURST, "message_burst" }, - { NET_CORE_OPTMEM_MAX, "optmem_max" }, - /* NET_CORE_HOT_LIST_LENGTH unused */ - /* NET_CORE_DIVERT_VERSION unused */ - /* NET_CORE_NO_CONG_THRESH unused */ - /* NET_CORE_NO_CONG unused */ - /* NET_CORE_LO_CONG unused */ - /* NET_CORE_MOD_CONG unused */ - { NET_CORE_DEV_WEIGHT, "dev_weight" }, - { NET_CORE_SOMAXCONN, "somaxconn" }, - { NET_CORE_BUDGET, "netdev_budget" }, - { NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, - { NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, - { NET_CORE_WARNINGS, "warnings" }, - {}, -}; - -static const struct trans_ctl_table trans_net_unix_table[] = { - /* NET_UNIX_DESTROY_DELAY unused */ - /* NET_UNIX_DELETE_DELAY unused */ - { NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_route_table[] = { - { NET_IPV4_ROUTE_FLUSH, "flush" }, - { NET_IPV4_ROUTE_MIN_DELAY, "min_delay" }, - { NET_IPV4_ROUTE_MAX_DELAY, "max_delay" }, - { NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, - { NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, - { NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, - { NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, - { NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, - { NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, - { NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, - { NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, - { NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, - { NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" }, - { NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = { - { NET_IPV4_CONF_FORWARDING, "forwarding" }, - { NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, - - { NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, - { NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, - { NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, - { NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, - { NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, - { NET_IPV4_CONF_RP_FILTER, "rp_filter" }, - { NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, - { NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, - { NET_IPV4_CONF_TAG, "tag" }, - { NET_IPV4_CONF_ARPFILTER, "arp_filter" }, - { NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, - { NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, - { NET_IPV4_CONF_NOPOLICY, "disable_policy" }, - { NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, - - { NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, - { NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, - { NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, - { NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, - { NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_conf_table[] = { - { NET_PROTO_CONF_ALL, "all", trans_net_ipv4_conf_vars_table }, - { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv4_conf_vars_table }, - { 0, NULL, trans_net_ipv4_conf_vars_table }, - {} -}; - -static const struct trans_ctl_table trans_net_neigh_vars_table[] = { - { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, - { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, - { NET_NEIGH_APP_SOLICIT, "app_solicit" }, - { NET_NEIGH_RETRANS_TIME, "retrans_time" }, - { NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, - { NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, - { NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, - { NET_NEIGH_UNRES_QLEN, "unres_qlen" }, - { NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, - { NET_NEIGH_ANYCAST_DELAY, "anycast_delay" }, - { NET_NEIGH_PROXY_DELAY, "proxy_delay" }, - { NET_NEIGH_LOCKTIME, "locktime" }, - { NET_NEIGH_GC_INTERVAL, "gc_interval" }, - { NET_NEIGH_GC_THRESH1, "gc_thresh1" }, - { NET_NEIGH_GC_THRESH2, "gc_thresh2" }, - { NET_NEIGH_GC_THRESH3, "gc_thresh3" }, - { NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, - { NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_neigh_table[] = { - { NET_PROTO_CONF_DEFAULT, "default", trans_net_neigh_vars_table }, - { 0, NULL, trans_net_neigh_vars_table }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_netfilter_table[] = { - { NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, - - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "ip_conntrack_tcp_timeout_syn_sent" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "ip_conntrack_tcp_timeout_syn_recv" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "ip_conntrack_tcp_timeout_established" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "ip_conntrack_tcp_timeout_fin_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "ip_conntrack_tcp_timeout_close_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "ip_conntrack_tcp_timeout_last_ack" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "ip_conntrack_tcp_timeout_time_wait" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "ip_conntrack_tcp_timeout_close" }, - - { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, "ip_conntrack_udp_timeout" }, - { NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "ip_conntrack_udp_timeout_stream" }, - { NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, "ip_conntrack_icmp_timeout" }, - { NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, "ip_conntrack_generic_timeout" }, - - { NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, - { NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, - { NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "ip_conntrack_tcp_timeout_max_retrans" }, - { NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, - { NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, - { NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, - - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "ip_conntrack_sctp_timeout_closed" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "ip_conntrack_sctp_timeout_cookie_wait" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "ip_conntrack_sctp_timeout_cookie_echoed" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "ip_conntrack_sctp_timeout_established" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "ip_conntrack_sctp_timeout_shutdown_sent" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "ip_conntrack_sctp_timeout_shutdown_recd" }, - { NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "ip_conntrack_sctp_timeout_shutdown_ack_sent" }, - - { NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, - { NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv4_table[] = { - { NET_IPV4_FORWARD, "ip_forward" }, - { NET_IPV4_DYNADDR, "ip_dynaddr" }, - - { NET_IPV4_CONF, "conf", trans_net_ipv4_conf_table }, - { NET_IPV4_NEIGH, "neigh", trans_net_neigh_table }, - { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table }, - /* NET_IPV4_FIB_HASH unused */ - { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table }, - - { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, - { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, - { NET_IPV4_TCP_SACK, "tcp_sack" }, - { NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, - { NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, - /* NET_IPV4_AUTOCONFIG unused */ - { NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, - { NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, - { NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, - { NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, - { NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, - /* NET_IPV4_TCP_MAX_KA_PROBES unused */ - { NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, - { NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, - { NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, - { NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, - { NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, - /* NET_IPV4_IP_MASQ_DEBUG unused */ - { NET_TCP_SYNCOOKIES, "tcp_syncookies" }, - { NET_TCP_STDURG, "tcp_stdurg" }, - { NET_TCP_RFC1337, "tcp_rfc1337" }, - /* NET_TCP_SYN_TAILDROP unused */ - { NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, - { NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, - { NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, - { NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, - /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ - /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ - /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ - /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ - /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ - { NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, - { NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, - { NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, - /* NET_IPV4_ALWAYS_DEFRAG unused */ - { NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, - { NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, - { NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, - { NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, - { NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, - { NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, - { NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, - { NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, - { NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, - { NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, - { NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, - { NET_TCP_FACK, "tcp_fack" }, - { NET_TCP_REORDERING, "tcp_reordering" }, - { NET_TCP_ECN, "tcp_ecn" }, - { NET_TCP_DSACK, "tcp_dsack" }, - { NET_TCP_MEM, "tcp_mem" }, - { NET_TCP_WMEM, "tcp_wmem" }, - { NET_TCP_RMEM, "tcp_rmem" }, - { NET_TCP_APP_WIN, "tcp_app_win" }, - { NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, - { NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, - { NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, - { NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, - { NET_TCP_TW_REUSE, "tcp_tw_reuse" }, - { NET_TCP_FRTO, "tcp_frto" }, - { NET_TCP_LOW_LATENCY, "tcp_low_latency" }, - { NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, - { NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, - { NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, - /* NET_TCP_DEFAULT_WIN_SCALE unused */ - { NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, - { NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, - /* NET_TCP_BIC_BETA unused */ - { NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, - { NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { NET_TCP_ABC, "tcp_abc" }, - { NET_IPV4_IPFRAG_MAX_DIST, "ipfrag_max_dist" }, - { NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, - { NET_TCP_BASE_MSS, "tcp_base_mss" }, - { NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, - { NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, - { NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, - { NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, - { NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, - { NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, - { NET_TCP_AVAIL_CONG_CONTROL, "tcp_available_congestion_control" }, - { NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, - { NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, - { NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, - { 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipx_table[] = { - { NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, - /* NET_IPX_FORWARDING unused */ - {} -}; - -static const struct trans_ctl_table trans_net_atalk_table[] = { - { NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, - { NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, - { NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, - { NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, - {}, -}; - -static const struct trans_ctl_table trans_net_netrom_table[] = { - { NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, - { NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, - { NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, - { NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, - { NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, - { NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, - { NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, - { NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, - { NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, - { NET_NETROM_ROUTING_CONTROL, "routing_control" }, - { NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, - { NET_NETROM_RESET, "reset" }, - {} -}; - -static const struct trans_ctl_table trans_net_ax25_param_table[] = { - { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, - { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, - { NET_AX25_BACKOFF_TYPE, "backoff_type" }, - { NET_AX25_CONNECT_MODE, "connect_mode" }, - { NET_AX25_STANDARD_WINDOW, "standard_window_size" }, - { NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, - { NET_AX25_T1_TIMEOUT, "t1_timeout" }, - { NET_AX25_T2_TIMEOUT, "t2_timeout" }, - { NET_AX25_T3_TIMEOUT, "t3_timeout" }, - { NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, - { NET_AX25_N2, "maximum_retry_count" }, - { NET_AX25_PACLEN, "maximum_packet_length" }, - { NET_AX25_PROTOCOL, "protocol" }, - { NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_ax25_table[] = { - { 0, NULL, trans_net_ax25_param_table }, - {} -}; - -static const struct trans_ctl_table trans_net_bridge_table[] = { - { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, - { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, - { NET_BRIDGE_NF_CALL_IP6TABLES, "bridge-nf-call-ip6tables" }, - { NET_BRIDGE_NF_FILTER_VLAN_TAGGED, "bridge-nf-filter-vlan-tagged" }, - { NET_BRIDGE_NF_FILTER_PPPOE_TAGGED, "bridge-nf-filter-pppoe-tagged" }, - {} -}; - -static const struct trans_ctl_table trans_net_rose_table[] = { - { NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, - { NET_ROSE_ROUTING_CONTROL, "routing_control" }, - { NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, - { NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, - { NET_ROSE_WINDOW_SIZE, "window_size" }, - { NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_conf_var_table[] = { - { NET_IPV6_FORWARDING, "forwarding" }, - { NET_IPV6_HOP_LIMIT, "hop_limit" }, - { NET_IPV6_MTU, "mtu" }, - { NET_IPV6_ACCEPT_RA, "accept_ra" }, - { NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, - { NET_IPV6_AUTOCONF, "autoconf" }, - { NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, - { NET_IPV6_RTR_SOLICITS, "router_solicitations" }, - { NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, - { NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, - { NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, - { NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, - { NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, - { NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, - { NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, - { NET_IPV6_MAX_ADDRESSES, "max_addresses" }, - { NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, - { NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, - { NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, - { NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, - { NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, - { NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, - { NET_IPV6_PROXY_NDP, "proxy_ndp" }, - { NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_conf_table[] = { - { NET_PROTO_CONF_ALL, "all", trans_net_ipv6_conf_var_table }, - { NET_PROTO_CONF_DEFAULT, "default", trans_net_ipv6_conf_var_table }, - { 0, NULL, trans_net_ipv6_conf_var_table }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_route_table[] = { - { NET_IPV6_ROUTE_FLUSH, "flush" }, - { NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, - { NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, - { NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, - { NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - { NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_icmp_table[] = { - { NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, - {} -}; - -static const struct trans_ctl_table trans_net_ipv6_table[] = { - { NET_IPV6_CONF, "conf", trans_net_ipv6_conf_table }, - { NET_IPV6_NEIGH, "neigh", trans_net_neigh_table }, - { NET_IPV6_ROUTE, "route", trans_net_ipv6_route_table }, - { NET_IPV6_ICMP, "icmp", trans_net_ipv6_icmp_table }, - { NET_IPV6_BINDV6ONLY, "bindv6only" }, - { NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, - { NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, - { NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, - { NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, - { NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, - { 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, - {} -}; - -static const struct trans_ctl_table trans_net_x25_table[] = { - { NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, - { NET_X25_FORWARD, "x25_forward" }, - {} -}; - -static const struct trans_ctl_table trans_net_tr_table[] = { - { NET_TR_RIF_TIMEOUT, "rif_timeout" }, - {} -}; - - -static const struct trans_ctl_table trans_net_decnet_conf_vars[] = { - { NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, - { NET_DECNET_CONF_DEV_PRIORITY, "priority" }, - { NET_DECNET_CONF_DEV_T2, "t2" }, - { NET_DECNET_CONF_DEV_T3, "t3" }, - {} -}; - -static const struct trans_ctl_table trans_net_decnet_conf[] = { - { 0, NULL, trans_net_decnet_conf_vars }, - {} -}; - -static const struct trans_ctl_table trans_net_decnet_table[] = { - { NET_DECNET_CONF, "conf", trans_net_decnet_conf }, - { NET_DECNET_NODE_ADDRESS, "node_address" }, - { NET_DECNET_NODE_NAME, "node_name" }, - { NET_DECNET_DEFAULT_DEVICE, "default_device" }, - { NET_DECNET_TIME_WAIT, "time_wait" }, - { NET_DECNET_DN_COUNT, "dn_count" }, - { NET_DECNET_DI_COUNT, "di_count" }, - { NET_DECNET_DR_COUNT, "dr_count" }, - { NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, - { NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, - { NET_DECNET_MEM, "decnet_mem" }, - { NET_DECNET_RMEM, "decnet_rmem" }, - { NET_DECNET_WMEM, "decnet_wmem" }, - { NET_DECNET_DEBUG_LEVEL, "debug" }, - {} -}; - -static const struct trans_ctl_table trans_net_sctp_table[] = { - { NET_SCTP_RTO_INITIAL, "rto_initial" }, - { NET_SCTP_RTO_MIN, "rto_min" }, - { NET_SCTP_RTO_MAX, "rto_max" }, - { NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, - { NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, - { NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, - { NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, - { NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, - { NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, - { NET_SCTP_HB_INTERVAL, "hb_interval" }, - { NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" }, - { NET_SCTP_MAX_BURST, "max_burst" }, - { NET_SCTP_ADDIP_ENABLE, "addip_enable" }, - { NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, - { NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, - { NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, - { NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_llc2_timeout_table[] = { - { NET_LLC2_ACK_TIMEOUT, "ack" }, - { NET_LLC2_P_TIMEOUT, "p" }, - { NET_LLC2_REJ_TIMEOUT, "rej" }, - { NET_LLC2_BUSY_TIMEOUT, "busy" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_station_table[] = { - { NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_llc2_table[] = { - { NET_LLC2, "timeout", trans_net_llc_llc2_timeout_table }, - {} -}; - -static const struct trans_ctl_table trans_net_llc_table[] = { - { NET_LLC2, "llc2", trans_net_llc_llc2_table }, - { NET_LLC_STATION, "station", trans_net_llc_station_table }, - {} -}; - -static const struct trans_ctl_table trans_net_netfilter_table[] = { - { NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, "nf_conntrack_tcp_timeout_syn_sent" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, "nf_conntrack_tcp_timeout_syn_recv" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, "nf_conntrack_tcp_timeout_established" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, "nf_conntrack_tcp_timeout_fin_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, "nf_conntrack_tcp_timeout_close_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, "nf_conntrack_tcp_timeout_last_ack" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, "nf_conntrack_tcp_timeout_time_wait" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, "nf_conntrack_tcp_timeout_close" }, - { NET_NF_CONNTRACK_UDP_TIMEOUT, "nf_conntrack_udp_timeout" }, - { NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM, "nf_conntrack_udp_timeout_stream" }, - { NET_NF_CONNTRACK_ICMP_TIMEOUT, "nf_conntrack_icmp_timeout" }, - { NET_NF_CONNTRACK_GENERIC_TIMEOUT, "nf_conntrack_generic_timeout" }, - { NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, - { NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, - { NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, "nf_conntrack_tcp_timeout_max_retrans" }, - { NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, - { NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, - { NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, "nf_conntrack_sctp_timeout_closed" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, "nf_conntrack_sctp_timeout_cookie_wait" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, "nf_conntrack_sctp_timeout_cookie_echoed" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, "nf_conntrack_sctp_timeout_established" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, "nf_conntrack_sctp_timeout_shutdown_sent" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, "nf_conntrack_sctp_timeout_shutdown_recd" }, - { NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, "nf_conntrack_sctp_timeout_shutdown_ack_sent" }, - { NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, - { NET_NF_CONNTRACK_ICMPV6_TIMEOUT, "nf_conntrack_icmpv6_timeout" }, - { NET_NF_CONNTRACK_FRAG6_TIMEOUT, "nf_conntrack_frag6_timeout" }, - { NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, - { NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, - { NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, - - {} -}; - -static const struct trans_ctl_table trans_net_dccp_table[] = { - { NET_DCCP_DEFAULT, "default" }, - {} -}; - -static const struct trans_ctl_table trans_net_irda_table[] = { - { NET_IRDA_DISCOVERY, "discovery" }, - { NET_IRDA_DEVNAME, "devname" }, - { NET_IRDA_DEBUG, "debug" }, - { NET_IRDA_FAST_POLL, "fast_poll_increase" }, - { NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, - { NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, - { NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, - { NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, - { NET_IRDA_MIN_TX_TURN_TIME, "min_tx_turn_time" }, - { NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, - { NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, - { NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, - { NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, - { NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, - {} -}; - -static const struct trans_ctl_table trans_net_table[] = { - { NET_CORE, "core", trans_net_core_table }, - /* NET_ETHER not used */ - /* NET_802 not used */ - { NET_UNIX, "unix", trans_net_unix_table }, - { NET_IPV4, "ipv4", trans_net_ipv4_table }, - { NET_IPX, "ipx", trans_net_ipx_table }, - { NET_ATALK, "appletalk", trans_net_atalk_table }, - { NET_NETROM, "netrom", trans_net_netrom_table }, - { NET_AX25, "ax25", trans_net_ax25_table }, - { NET_BRIDGE, "bridge", trans_net_bridge_table }, - { NET_ROSE, "rose", trans_net_rose_table }, - { NET_IPV6, "ipv6", trans_net_ipv6_table }, - { NET_X25, "x25", trans_net_x25_table }, - { NET_TR, "token-ring", trans_net_tr_table }, - { NET_DECNET, "decnet", trans_net_decnet_table }, - /* NET_ECONET not used */ - { NET_SCTP, "sctp", trans_net_sctp_table }, - { NET_LLC, "llc", trans_net_llc_table }, - { NET_NETFILTER, "netfilter", trans_net_netfilter_table }, - { NET_DCCP, "dccp", trans_net_dccp_table }, - { NET_IRDA, "irda", trans_net_irda_table }, - { 2089, "nf_conntrack_max" }, - {} -}; - -static const struct trans_ctl_table trans_fs_quota_table[] = { - { FS_DQ_LOOKUPS, "lookups" }, - { FS_DQ_DROPS, "drops" }, - { FS_DQ_READS, "reads" }, - { FS_DQ_WRITES, "writes" }, - { FS_DQ_CACHE_HITS, "cache_hits" }, - { FS_DQ_ALLOCATED, "allocated_dquots" }, - { FS_DQ_FREE, "free_dquots" }, - { FS_DQ_SYNCS, "syncs" }, - { FS_DQ_WARNINGS, "warnings" }, - {} -}; - -static const struct trans_ctl_table trans_fs_xfs_table[] = { - { XFS_SGID_INHERIT, "irix_sgid_inherit" }, - { XFS_SYMLINK_MODE, "irix_symlink_mode" }, - { XFS_PANIC_MASK, "panic_mask" }, - - { XFS_ERRLEVEL, "error_level" }, - { XFS_SYNCD_TIMER, "xfssyncd_centisecs" }, - { XFS_INHERIT_SYNC, "inherit_sync" }, - { XFS_INHERIT_NODUMP, "inherit_nodump" }, - { XFS_INHERIT_NOATIME, "inherit_noatime" }, - { XFS_BUF_TIMER, "xfsbufd_centisecs" }, - { XFS_BUF_AGE, "age_buffer_centisecs" }, - { XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, - { XFS_ROTORSTEP, "rotorstep" }, - { XFS_INHERIT_NODFRG, "inherit_nodefrag" }, - { XFS_FILESTREAM_TIMER, "filestream_centisecs" }, - { XFS_STATS_CLEAR, "stats_clear" }, - {} -}; - -static const struct trans_ctl_table trans_fs_ocfs2_nm_table[] = { - { 1, "hb_ctl_path" }, - {} -}; - -static const struct trans_ctl_table trans_fs_ocfs2_table[] = { - { 1, "nm", trans_fs_ocfs2_nm_table }, - {} -}; - -static const struct trans_ctl_table trans_inotify_table[] = { - { INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, - { INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, - { INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, - {} -}; - -static const struct trans_ctl_table trans_fs_table[] = { - { FS_NRINODE, "inode-nr" }, - { FS_STATINODE, "inode-state" }, - /* FS_MAXINODE unused */ - /* FS_NRDQUOT unused */ - /* FS_MAXDQUOT unused */ - { FS_NRFILE, "file-nr" }, - { FS_MAXFILE, "file-max" }, - { FS_DENTRY, "dentry-state" }, - /* FS_NRSUPER unused */ - /* FS_MAXUPSER unused */ - { FS_OVERFLOWUID, "overflowuid" }, - { FS_OVERFLOWGID, "overflowgid" }, - { FS_LEASES, "leases-enable" }, - { FS_DIR_NOTIFY, "dir-notify-enable" }, - { FS_LEASE_TIME, "lease-break-time" }, - { FS_DQSTATS, "quota", trans_fs_quota_table }, - { FS_XFS, "xfs", trans_fs_xfs_table }, - { FS_AIO_NR, "aio-nr" }, - { FS_AIO_MAX_NR, "aio-max-nr" }, - { FS_INOTIFY, "inotify", trans_inotify_table }, - { FS_OCFS2, "ocfs2", trans_fs_ocfs2_table }, - { KERN_SETUID_DUMPABLE, "suid_dumpable" }, - {} -}; - -static const struct trans_ctl_table trans_debug_table[] = { - {} -}; - -static const struct trans_ctl_table trans_cdrom_table[] = { - { DEV_CDROM_INFO, "info" }, - { DEV_CDROM_AUTOCLOSE, "autoclose" }, - { DEV_CDROM_AUTOEJECT, "autoeject" }, - { DEV_CDROM_DEBUG, "debug" }, - { DEV_CDROM_LOCK, "lock" }, - { DEV_CDROM_CHECK_MEDIA, "check_media" }, - {} -}; - -static const struct trans_ctl_table trans_ipmi_table[] = { - { DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, - {} -}; - -static const struct trans_ctl_table trans_mac_hid_files[] = { - /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ - /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ - { DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, - { DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, - { DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, - /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ - {} -}; - -static const struct trans_ctl_table trans_raid_table[] = { - { DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, - { DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, - {} -}; - -static const struct trans_ctl_table trans_scsi_table[] = { - { DEV_SCSI_LOGGING_LEVEL, "logging_level" }, - {} -}; - -static const struct trans_ctl_table trans_parport_default_table[] = { - { DEV_PARPORT_DEFAULT_TIMESLICE, "timeslice" }, - { DEV_PARPORT_DEFAULT_SPINTIME, "spintime" }, - {} -}; - -static const struct trans_ctl_table trans_parport_device_table[] = { - { DEV_PARPORT_DEVICE_TIMESLICE, "timeslice" }, - {} -}; - -static const struct trans_ctl_table trans_parport_devices_table[] = { - { DEV_PARPORT_DEVICES_ACTIVE, "active" }, - { 0, NULL, trans_parport_device_table }, - {} -}; - -static const struct trans_ctl_table trans_parport_parport_table[] = { - { DEV_PARPORT_SPINTIME, "spintime" }, - { DEV_PARPORT_BASE_ADDR, "base-addr" }, - { DEV_PARPORT_IRQ, "irq" }, - { DEV_PARPORT_DMA, "dma" }, - { DEV_PARPORT_MODES, "modes" }, - { DEV_PARPORT_DEVICES, "devices", trans_parport_devices_table }, - { DEV_PARPORT_AUTOPROBE, "autoprobe" }, - { DEV_PARPORT_AUTOPROBE + 1, "autoprobe0" }, - { DEV_PARPORT_AUTOPROBE + 2, "autoprobe1" }, - { DEV_PARPORT_AUTOPROBE + 3, "autoprobe2" }, - { DEV_PARPORT_AUTOPROBE + 4, "autoprobe3" }, - {} -}; -static const struct trans_ctl_table trans_parport_table[] = { - { DEV_PARPORT_DEFAULT, "default", trans_parport_default_table }, - { 0, NULL, trans_parport_parport_table }, - {} -}; - -static const struct trans_ctl_table trans_dev_table[] = { - { DEV_CDROM, "cdrom", trans_cdrom_table }, - /* DEV_HWMON unused */ - { DEV_PARPORT, "parport", trans_parport_table }, - { DEV_RAID, "raid", trans_raid_table }, - { DEV_MAC_HID, "mac_hid", trans_mac_hid_files }, - { DEV_SCSI, "scsi", trans_scsi_table }, - { DEV_IPMI, "ipmi", trans_ipmi_table }, - {} -}; - -static const struct trans_ctl_table trans_bus_isa_table[] = { - { BUS_ISA_MEM_BASE, "membase" }, - { BUS_ISA_PORT_BASE, "portbase" }, - { BUS_ISA_PORT_SHIFT, "portshift" }, - {} -}; - -static const struct trans_ctl_table trans_bus_table[] = { - { CTL_BUS_ISA, "isa", trans_bus_isa_table }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table0[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan0-txRing" }, - { 151, "arlan0-rxRing" }, - { 152, "arlan0-18" }, - { 153, "arlan0-ring" }, - { 154, "arlan0-shm-cpy" }, - { 155, "config0" }, - { 156, "reset0" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table1[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan1-txRing" }, - { 151, "arlan1-rxRing" }, - { 152, "arlan1-18" }, - { 153, "arlan1-ring" }, - { 154, "arlan1-shm-cpy" }, - { 155, "config1" }, - { 156, "reset1" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table2[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan2-txRing" }, - { 151, "arlan2-rxRing" }, - { 152, "arlan2-18" }, - { 153, "arlan2-ring" }, - { 154, "arlan2-shm-cpy" }, - { 155, "config2" }, - { 156, "reset2" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_conf_table3[] = { - { 1, "spreadingCode" }, - { 2, "channelNumber" }, - { 3, "scramblingDisable" }, - { 4, "txAttenuation" }, - { 5, "systemId" }, - { 6, "maxDatagramSize" }, - { 7, "maxFrameSize" }, - { 8, "maxRetries" }, - { 9, "receiveMode" }, - { 10, "priority" }, - { 11, "rootOrRepeater" }, - { 12, "SID" }, - { 13, "registrationMode" }, - { 14, "registrationFill" }, - { 15, "localTalkAddress" }, - { 16, "codeFormat" }, - { 17, "numChannels" }, - { 18, "channel1" }, - { 19, "channel2" }, - { 20, "channel3" }, - { 21, "channel4" }, - { 22, "txClear" }, - { 23, "txRetries" }, - { 24, "txRouting" }, - { 25, "txScrambled" }, - { 26, "rxParameter" }, - { 27, "txTimeoutMs" }, - { 28, "waitCardTimeout" }, - { 29, "channelSet" }, - { 30, "name" }, - { 31, "waitTime" }, - { 32, "lParameter" }, - { 33, "_15" }, - { 34, "headerSize" }, - { 36, "tx_delay_ms" }, - { 37, "retries" }, - { 38, "ReTransmitPacketMaxSize" }, - { 39, "waitReTransmitPacketMaxSize" }, - { 40, "fastReTransCount" }, - { 41, "driverRetransmissions" }, - { 42, "txAckTimeoutMs" }, - { 43, "registrationInterrupts" }, - { 44, "hardwareType" }, - { 45, "radioType" }, - { 46, "writeEEPROM" }, - { 47, "writeRadioType" }, - { 48, "entry_exit_debug" }, - { 49, "debug" }, - { 50, "in_speed" }, - { 51, "out_speed" }, - { 52, "in_speed10" }, - { 53, "out_speed10" }, - { 54, "in_speed_max" }, - { 55, "out_speed_max" }, - { 56, "measure_rate" }, - { 57, "pre_Command_Wait" }, - { 58, "rx_tweak1" }, - { 59, "rx_tweak2" }, - { 60, "tx_queue_len" }, - - { 150, "arlan3-txRing" }, - { 151, "arlan3-rxRing" }, - { 152, "arlan3-18" }, - { 153, "arlan3-ring" }, - { 154, "arlan3-shm-cpy" }, - { 155, "config3" }, - { 156, "reset3" }, - {} -}; - -static const struct trans_ctl_table trans_arlan_table[] = { - { 1, "arlan0", trans_arlan_conf_table0 }, - { 2, "arlan1", trans_arlan_conf_table1 }, - { 3, "arlan2", trans_arlan_conf_table2 }, - { 4, "arlan3", trans_arlan_conf_table3 }, - {} -}; - -static const struct trans_ctl_table trans_s390dbf_table[] = { - { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, - { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, - {} -}; - -static const struct trans_ctl_table trans_sunrpc_table[] = { - { CTL_RPCDEBUG, "rpc_debug" }, - { CTL_NFSDEBUG, "nfs_debug" }, - { CTL_NFSDDEBUG, "nfsd_debug" }, - { CTL_NLMDEBUG, "nlm_debug" }, - { CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, - { CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, - { CTL_MIN_RESVPORT, "min_resvport" }, - { CTL_MAX_RESVPORT, "max_resvport" }, - {} -}; - -static const struct trans_ctl_table trans_pm_table[] = { - { 1 /* CTL_PM_SUSPEND */, "suspend" }, - { 2 /* CTL_PM_CMODE */, "cmode" }, - { 3 /* CTL_PM_P0 */, "p0" }, - { 4 /* CTL_PM_CM */, "cm" }, - {} -}; - -static const struct trans_ctl_table trans_frv_table[] = { - { 1, "cache-mode" }, - { 2, "pin-cxnr" }, - {} -}; - -static const struct trans_ctl_table trans_root_table[] = { - { CTL_KERN, "kernel", trans_kern_table }, - { CTL_VM, "vm", trans_vm_table }, - { CTL_NET, "net", trans_net_table }, - /* CTL_PROC not used */ - { CTL_FS, "fs", trans_fs_table }, - { CTL_DEBUG, "debug", trans_debug_table }, - { CTL_DEV, "dev", trans_dev_table }, - { CTL_BUS, "bus", trans_bus_table }, - { CTL_ABI, "abi" }, - /* CTL_CPU not used */ - { CTL_ARLAN, "arlan", trans_arlan_table }, - { CTL_S390DBF, "s390dbf", trans_s390dbf_table }, - { CTL_SUNRPC, "sunrpc", trans_sunrpc_table }, - { CTL_PM, "pm", trans_pm_table }, - { CTL_FRV, "frv", trans_frv_table }, - {} -}; - - - static int sysctl_depth(struct ctl_table *table) { @@ -1261,47 +28,6 @@ static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) return table; } -static const struct trans_ctl_table *sysctl_binary_lookup(struct ctl_table *table) -{ - struct ctl_table *test; - const struct trans_ctl_table *ref; - int cur_depth; - - cur_depth = sysctl_depth(table); - - ref = trans_root_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->ctl_name || ref->procname || ref->child; ref++) { - int match = 0; - - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (test->ctl_name && ref->ctl_name && - (test->ctl_name == ref->ctl_name)) - match++; - - if (!ref->ctl_name && !ref->procname) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - ref = NULL; -out: - return ref; -} static void sysctl_print_path(struct ctl_table *table) { @@ -1315,26 +41,6 @@ static void sysctl_print_path(struct ctl_table *table) } } printk(" "); - if (table->ctl_name) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk(".%d", tmp->ctl_name); - } - } -} - -static void sysctl_repair_table(struct ctl_table *table) -{ - /* Don't complain about the classic default - * sysctl strategy routine. Maybe later we - * can get the tables fixed and complain about - * this. - */ - if (table->ctl_name && table->procname && - (table->proc_handler == proc_dointvec) && - (!table->strategy)) { - table->strategy = sysctl_data; - } } static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, @@ -1352,7 +58,7 @@ static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, ref = head->ctl_table; repeat: test = sysctl_parent(table, cur_depth); - for (; ref->ctl_name || ref->procname; ref++) { + for (; ref->procname; ref++) { int match = 0; if (cur_depth && !ref->child) continue; @@ -1361,10 +67,6 @@ repeat: (strcmp(test->procname, ref->procname) == 0)) match++; - if (test->ctl_name && ref->ctl_name && - (test->ctl_name == ref->ctl_name)) - match++; - if (match) { if (cur_depth != 0) { cur_depth--; @@ -1392,38 +94,6 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str *fail = str; } -static int sysctl_check_dir(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table *ref; - int error; - - error = 0; - ref = sysctl_check_lookup(namespaces, table); - if (ref) { - int match = 0; - if ((!table->procname && !ref->procname) || - (table->procname && ref->procname && - (strcmp(table->procname, ref->procname) == 0))) - match++; - - if ((!table->ctl_name && !ref->ctl_name) || - (table->ctl_name && ref->ctl_name && - (table->ctl_name == ref->ctl_name))) - match++; - - if (match != 2) { - printk(KERN_ERR "%s: failed: ", __func__); - sysctl_print_path(table); - printk(" ref: "); - sysctl_print_path(ref); - printk("\n"); - error = -EINVAL; - } - } - return error; -} - static void sysctl_check_leaf(struct nsproxy *namespaces, struct ctl_table *table, const char **fail) { @@ -1434,37 +104,15 @@ static void sysctl_check_leaf(struct nsproxy *namespaces, set_fail(fail, table, "Sysctl already exists"); } -static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) -{ - const struct trans_ctl_table *ref; - - ref = sysctl_binary_lookup(table); - if (table->ctl_name && !ref) - set_fail(fail, table, "Unknown sysctl binary path"); - if (ref) { - if (ref->procname && - (!table->procname || - (strcmp(table->procname, ref->procname) != 0))) - set_fail(fail, table, "procname does not match binary path procname"); - - if (ref->ctl_name && table->ctl_name && - (table->ctl_name != ref->ctl_name)) - set_fail(fail, table, "ctl_name does not match binary path ctl_name"); - } -} - int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) { int error = 0; - for (; table->ctl_name || table->procname; table++) { + for (; table->procname; table++) { const char *fail = NULL; - sysctl_repair_table(table); if (table->parent) { if (table->procname && !table->parent->procname) set_fail(&fail, table, "Parent without procname"); - if (table->ctl_name && !table->parent->ctl_name) - set_fail(&fail, table, "Parent without ctl_name"); } if (!table->procname) set_fail(&fail, table, "No procname"); @@ -1477,21 +125,12 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "Writable sysctl directory"); if (table->proc_handler) set_fail(&fail, table, "Directory with proc_handler"); - if (table->strategy) - set_fail(&fail, table, "Directory with strategy"); if (table->extra1) set_fail(&fail, table, "Directory with extra1"); if (table->extra2) set_fail(&fail, table, "Directory with extra2"); - if (sysctl_check_dir(namespaces, table)) - set_fail(&fail, table, "Inconsistent directory names"); } else { - if ((table->strategy == sysctl_data) || - (table->strategy == sysctl_string) || - (table->strategy == sysctl_intvec) || - (table->strategy == sysctl_jiffies) || - (table->strategy == sysctl_ms_jiffies) || - (table->proc_handler == proc_dostring) || + if ((table->proc_handler == proc_dostring) || (table->proc_handler == proc_dointvec) || (table->proc_handler == proc_dointvec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || @@ -1513,15 +152,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "No max"); } } -#ifdef CONFIG_SYSCTL_SYSCALL - if (table->ctl_name && !table->strategy) - set_fail(&fail, table, "Missing strategy"); -#endif -#if 0 - if (!table->ctl_name && table->strategy) - set_fail(&fail, table, "Strategy without ctl_name"); -#endif -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif @@ -1531,7 +162,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) #endif sysctl_check_leaf(namespaces, table, &fail); } - sysctl_check_bin_path(table, &fail); if (table->mode > 0777) set_fail(&fail, table, "bogus .mode"); if (fail) { diff --git a/kernel/time.c b/kernel/time.c index 2e2e469a7fe..c6324d96009 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -136,7 +136,6 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; - update_xtime_cache(0); write_sequnlock_irq(&xtime_lock); clock_was_set(); } @@ -662,6 +661,36 @@ u64 nsec_to_clock_t(u64 x) #endif } +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ +#if (NSEC_PER_SEC % HZ) == 0 + /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ + return div_u64(n, NSEC_PER_SEC / HZ); +#elif (HZ % 512) == 0 + /* overflow after 292 years if HZ = 1024 */ + return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * Generic case - optimized for cases where HZ is a multiple of 3. + * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. + */ + return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); +#endif +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 620b58abdc3..3d5fc0fd1cc 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -20,6 +20,8 @@ #include <linux/sysdev.h> #include <linux/tick.h> +#include "tick-internal.h" + /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); @@ -28,7 +30,7 @@ static LIST_HEAD(clockevents_released); static RAW_NOTIFIER_HEAD(clockevents_chain); /* Protection for the above */ -static DEFINE_SPINLOCK(clockevents_lock); +static DEFINE_RAW_SPINLOCK(clockevents_lock); /** * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock); * * Math helper, returns latch value converted to nanoseconds (bound checked) */ -unsigned long clockevent_delta2ns(unsigned long latch, - struct clock_event_device *evt) +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) { - u64 clc = ((u64) latch << evt->shift); + u64 clc = (u64) latch << evt->shift; if (unlikely(!evt->mult)) { evt->mult = 1; @@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch, do_div(clc, evt->mult); if (clc < 1000) clc = 1000; - if (clc > LONG_MAX) - clc = LONG_MAX; + if (clc > KTIME_MAX) + clc = KTIME_MAX; - return (unsigned long) clc; + return clc; } EXPORT_SYMBOL_GPL(clockevent_delta2ns); @@ -140,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock_irqsave(&clockevents_lock, flags); ret = raw_notifier_chain_register(&clockevents_chain, nb); - spin_unlock_irqrestore(&clockevents_lock, flags); + raw_spin_unlock_irqrestore(&clockevents_lock, flags); return ret; } @@ -184,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev) BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); BUG_ON(!dev->cpumask); - spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); clockevents_notify_released(); - spin_unlock_irqrestore(&clockevents_lock, flags); + raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); @@ -240,7 +241,7 @@ void clockevents_notify(unsigned long reason, void *arg) struct list_head *node, *tmp; unsigned long flags; - spin_lock_irqsave(&clockevents_lock, flags); + raw_spin_lock_irqsave(&clockevents_lock, flags); clockevents_do_notify(reason, arg); switch (reason) { @@ -255,7 +256,7 @@ void clockevents_notify(unsigned long reason, void *arg) default: break; } - spin_unlock_irqrestore(&clockevents_lock, flags); + raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_notify); #endif diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 5e18c6ab2c6..e85c23404d3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -39,7 +39,7 @@ void timecounter_init(struct timecounter *tc, tc->cycle_last = cc->read(cc); tc->nsec = start_tstamp; } -EXPORT_SYMBOL(timecounter_init); +EXPORT_SYMBOL_GPL(timecounter_init); /** * timecounter_read_delta - get nanoseconds since last call of this function @@ -83,7 +83,7 @@ u64 timecounter_read(struct timecounter *tc) return nsec; } -EXPORT_SYMBOL(timecounter_read); +EXPORT_SYMBOL_GPL(timecounter_read); u64 timecounter_cyc2time(struct timecounter *tc, cycle_t cycle_tstamp) @@ -105,7 +105,60 @@ u64 timecounter_cyc2time(struct timecounter *tc, return nsec; } -EXPORT_SYMBOL(timecounter_cyc2time); +EXPORT_SYMBOL_GPL(timecounter_cyc2time); + +/** + * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks + * @mult: pointer to mult variable + * @shift: pointer to shift variable + * @from: frequency to convert from + * @to: frequency to convert to + * @minsec: guaranteed runtime conversion range in seconds + * + * The function evaluates the shift/mult pair for the scaled math + * operations of clocksources and clockevents. + * + * @to and @from are frequency values in HZ. For clock sources @to is + * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock + * event @to is the counter frequency and @from is NSEC_PER_SEC. + * + * The @minsec conversion range argument controls the time frame in + * seconds which must be covered by the runtime conversion with the + * calculated mult and shift factors. This guarantees that no 64bit + * overflow happens when the input value of the conversion is + * multiplied with the calculated mult factor. Larger ranges may + * reduce the conversion accuracy by chosing smaller mult and shift + * factors. + */ +void +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) +{ + u64 tmp; + u32 sft, sftacc= 32; + + /* + * Calculate the shift factor which is limiting the conversion + * range: + */ + tmp = ((u64)minsec * from) >> 32; + while (tmp) { + tmp >>=1; + sftacc--; + } + + /* + * Find the conversion shift/mult pair which has the best + * accuracy and fits the maxsec conversion range: + */ + for (sft = 32; sft > 0; sft--) { + tmp = (u64) to << sft; + do_div(tmp, from); + if ((tmp >> sftacc) == 0) + break; + } + *mult = tmp; + *shift = sft; +} /*[Clocksource internal variables]--------- * curr_clocksource: @@ -413,6 +466,47 @@ void clocksource_touch_watchdog(void) clocksource_resume_watchdog(); } +/** + * clocksource_max_deferment - Returns max time the clocksource can be deferred + * @cs: Pointer to clocksource + * + */ +static u64 clocksource_max_deferment(struct clocksource *cs) +{ + u64 max_nsecs, max_cycles; + + /* + * Calculate the maximum number of cycles that we can pass to the + * cyc2ns function without overflowing a 64-bit signed result. The + * maximum number of cycles is equal to ULLONG_MAX/cs->mult which + * is equivalent to the below. + * max_cycles < (2^63)/cs->mult + * max_cycles < 2^(log2((2^63)/cs->mult)) + * max_cycles < 2^(log2(2^63) - log2(cs->mult)) + * max_cycles < 2^(63 - log2(cs->mult)) + * max_cycles < 1 << (63 - log2(cs->mult)) + * Please note that we add 1 to the result of the log2 to account for + * any rounding errors, ensure the above inequality is satisfied and + * no overflow will occur. + */ + max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); + + /* + * The actual maximum number of cycles we can defer the clocksource is + * determined by the minimum of max_cycles and cs->mask. + */ + max_cycles = min_t(u64, max_cycles, (u64) cs->mask); + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); + + /* + * To ensure that the clocksource does not wrap whilst we are idle, + * limit the time the clocksource can be deferred by 12.5%. Please + * note a margin of 12.5% is used because this can be computed with + * a shift, versus say 10% which would require division. + */ + return max_nsecs - (max_nsecs >> 5); +} + #ifdef CONFIG_GENERIC_TIME /** @@ -511,6 +605,9 @@ static void clocksource_enqueue(struct clocksource *cs) */ int clocksource_register(struct clocksource *cs) { + /* calculate max idle time permitted for this clocksource */ + cs->max_idle_ns = clocksource_max_deferment(cs); + mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_select(); @@ -580,7 +677,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, * @count: length of buffer * * Takes input from sysfs interface for manually overriding the default - * clocksource selction. + * clocksource selection. */ static ssize_t sysfs_override_clocksource(struct sys_device *dev, struct sysdev_attribute *attr, diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c2ec25087a3..b3bafd5fc66 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device; /* FIXME: Use cpumask_var_t. */ static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); static DECLARE_BITMAP(tmpmask, NR_CPUS); -static DEFINE_SPINLOCK(tick_broadcast_lock); +static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static int tick_broadcast_force; #ifdef CONFIG_TICK_ONESHOT @@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) unsigned long flags; int ret = 0; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Devices might be registered with both periodic and oneshot @@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) tick_broadcast_clear_oneshot(cpu); } } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } @@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask) */ static void tick_do_periodic_broadcast(void) { - spin_lock(&tick_broadcast_lock); + raw_spin_lock(&tick_broadcast_lock); cpumask_and(to_cpumask(tmpmask), cpu_online_mask, tick_get_broadcast_mask()); tick_do_broadcast(to_cpumask(tmpmask)); - spin_unlock(&tick_broadcast_lock); + raw_spin_unlock(&tick_broadcast_lock); } /* @@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) unsigned long flags; int cpu, bc_stopped; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); cpu = smp_processor_id(); td = &per_cpu(tick_cpu_device, cpu); @@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason) tick_broadcast_setup_oneshot(bc); } out: - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* @@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) unsigned long flags; unsigned int cpu = *cpup; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); @@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) clockevents_shutdown(bc); } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } void tick_suspend_broadcast(void) @@ -317,13 +317,13 @@ void tick_suspend_broadcast(void) struct clock_event_device *bc; unsigned long flags; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; if (bc) clockevents_shutdown(bc); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } int tick_resume_broadcast(void) @@ -332,7 +332,7 @@ int tick_resume_broadcast(void) unsigned long flags; int broadcast = 0; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; @@ -351,7 +351,7 @@ int tick_resume_broadcast(void) break; } } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); return broadcast; } @@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) ktime_t now, next_event; int cpu; - spin_lock(&tick_broadcast_lock); + raw_spin_lock(&tick_broadcast_lock); again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; @@ -443,7 +443,7 @@ again: if (tick_broadcast_set_event(next_event, 0)) goto again; } - spin_unlock(&tick_broadcast_lock); + raw_spin_unlock(&tick_broadcast_lock); } /* @@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) unsigned long flags; int cpu; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Periodic mode does not care about the enter/exit of power @@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) } out: - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* @@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void) struct clock_event_device *bc; unsigned long flags; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) unsigned long flags; unsigned int cpu = *cpup; - spin_lock_irqsave(&tick_broadcast_lock, flags); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* * Clear the broadcast mask flag for the dead cpu, but do not @@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) */ cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); - spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 83c4417b6a3..b6b898d2eee 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); ktime_t tick_next_period; ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -DEFINE_SPINLOCK(tick_device_lock); +static DEFINE_RAW_SPINLOCK(tick_device_lock); /* * Debugging: see timer_list.c @@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) int cpu, ret = NOTIFY_OK; unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); return NOTIFY_STOP; out_bc: @@ -278,7 +278,7 @@ out_bc: if (tick_check_broadcast_device(newdev)) ret = NOTIFY_STOP; - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); return ret; } @@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup) struct clock_event_device *dev = td->evtdev; unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); td->mode = TICKDEV_MODE_PERIODIC; if (dev) { /* @@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_suspend(void) @@ -330,9 +330,9 @@ static void tick_suspend(void) struct tick_device *td = &__get_cpu_var(tick_cpu_device); unsigned long flags; - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_shutdown(td->evtdev); - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); } static void tick_resume(void) @@ -341,7 +341,7 @@ static void tick_resume(void) unsigned long flags; int broadcast = tick_resume_broadcast(); - spin_lock_irqsave(&tick_device_lock, flags); + raw_spin_lock_irqsave(&tick_device_lock, flags); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); if (!broadcast) { @@ -350,7 +350,7 @@ static void tick_resume(void) else tick_resume_oneshot(); } - spin_unlock_irqrestore(&tick_device_lock, flags); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index b1c05bf75ee..290eefbc1f6 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,7 +6,6 @@ #define TICK_DO_TIMER_BOOT -2 DECLARE_PER_CPU(struct tick_device, tick_cpu_device); -extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; extern ktime_t tick_period; extern int tick_do_timer_cpu __read_mostly; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index a96c0e2b89c..0a8a213016f 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, dev->min_delta_ns += dev->min_delta_ns >> 1; printk(KERN_WARNING - "CE: %s increasing min_delta_ns to %lu nsec\n", + "CE: %s increasing min_delta_ns to %llu nsec\n", dev->name ? dev->name : "?", - dev->min_delta_ns << 1); + (unsigned long long) dev->min_delta_ns << 1); i = 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 89aed5933ed..f992762d7f5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz); * value. We do this unconditionally on any cpu, as we don't know whether the * cpu, which has the update task assigned is in a long sleep. */ -static void tick_nohz_update_jiffies(void) +static void tick_nohz_update_jiffies(ktime_t now) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long flags; - ktime_t now; - - if (!ts->tick_stopped) - return; cpumask_clear_cpu(cpu, nohz_cpu_mask); - now = ktime_get(); ts->idle_waketime = now; local_irq_save(flags); @@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void) touch_softlockup_watchdog(); } -static void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t delta; - if (ts->idle_active) { - ktime_t now, delta; - now = ktime_get(); - delta = ktime_sub(now, ts->idle_entrytime); - ts->idle_lastupdate = now; - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - ts->idle_active = 0; + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_lastupdate = now; + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + ts->idle_active = 0; - sched_clock_idle_wakeup_event(0); - } + sched_clock_idle_wakeup_event(0); } static ktime_t tick_nohz_start_idle(struct tick_sched *ts) @@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle) struct tick_sched *ts; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + u64 time_delta; int cpu; local_irq_save(flags); @@ -263,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle) if (ratelimit < 10) { printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", - local_softirq_pending()); + (unsigned int) local_softirq_pending()); ratelimit++; } goto end; @@ -275,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle) seq = read_seqbegin(&xtime_lock); last_update = last_jiffies_update; last_jiffies = jiffies; + time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - /* Get the next timer wheel timer */ - next_jiffies = get_next_timer_interrupt(last_jiffies); - delta_jiffies = next_jiffies - last_jiffies; - - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + arch_needs_cpu(cpu)) { + next_jiffies = last_jiffies + 1; delta_jiffies = 1; + } else { + /* Get the next timer wheel timer */ + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + } /* * Do not stop the tick, if we are only one off * or if the cpu is required for rcu @@ -294,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle) if ((long)delta_jiffies >= 1) { /* - * calculate the expiry time for the next timer wheel - * timer - */ - expires = ktime_add_ns(last_update, tick_period.tv64 * - delta_jiffies); - - /* * If this cpu is the one which updates jiffies, then * give up the assignment and let it be taken by the * cpu which runs the tick timer next, which might be * this cpu as well. If we don't drop this here the * jiffies might be stale and do_timer() never - * invoked. + * invoked. Keep track of the fact that it was the one + * which had the do_timer() duty last. If this cpu is + * the one which had the do_timer() duty last, we + * limit the sleep time to the timekeeping + * max_deferement value which we retrieved + * above. Otherwise we can sleep as long as we want. */ - if (cpu == tick_do_timer_cpu) + if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + time_delta = KTIME_MAX; + ts->do_timer_last = 0; + } else if (!ts->do_timer_last) { + time_delta = KTIME_MAX; + } + + /* + * calculate the expiry time for the next timer wheel + * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals + * that there is no timer pending or at least extremely + * far into the future (12 days for HZ=1000). In this + * case we set the expiry to the end of time. + */ + if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { + /* + * Calculate the time delta for the next timer event. + * If the time delta exceeds the maximum time delta + * permitted by the current clocksource then adjust + * the time delta accordingly to ensure the + * clocksource does not wrap. + */ + time_delta = min_t(u64, time_delta, + tick_period.tv64 * delta_jiffies); + } + + if (time_delta < KTIME_MAX) + expires = ktime_add_ns(last_update, time_delta); + else + expires.tv64 = KTIME_MAX; if (delta_jiffies > 1) cpumask_set_cpu(cpu, nohz_cpu_mask); @@ -342,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle) ts->idle_sleeps++; + /* Mark expires */ + ts->idle_expires = expires; + /* - * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that - * there is no timer pending or at least extremly far - * into the future (12 days for HZ=1000). In this case - * we simply stop the tick timer: + * If the expiration time == KTIME_MAX, then + * in this case we simply stop the tick timer. */ - if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { - ts->idle_expires.tv64 = KTIME_MAX; + if (unlikely(expires.tv64 == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); goto out; } - /* Mark expiries */ - ts->idle_expires = expires; - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED); @@ -436,7 +459,11 @@ void tick_nohz_restart_sched_tick(void) ktime_t now; local_irq_disable(); - tick_nohz_stop_idle(cpu); + if (ts->idle_active || (ts->inidle && ts->tick_stopped)) + now = ktime_get(); + + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); if (!ts->inidle || !ts->tick_stopped) { ts->inidle = 0; @@ -450,7 +477,6 @@ void tick_nohz_restart_sched_tick(void) /* Update jiffies first */ select_nohz_load_balancer(0); - now = ktime_get(); tick_do_update_jiffies64(now); cpumask_clear_cpu(cpu, nohz_cpu_mask); @@ -584,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void) * timer and do not touch the other magic bits which need to be done * when idle is left. */ -static void tick_nohz_kick_tick(int cpu) +static void tick_nohz_kick_tick(int cpu, ktime_t now) { #if 0 /* Switch back to 2.6.27 behaviour */ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t delta, now; - - if (!ts->tick_stopped) - return; + ktime_t delta; /* * Do not touch the tick device, when the next expiry is either * already reached or less/equal than the tick period. */ - now = ktime_get(); delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); if (delta.tv64 <= tick_period.tv64) return; @@ -608,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu) #endif } +static inline void tick_check_nohz(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now; + + if (!ts->idle_active && !ts->tick_stopped) + return; + now = ktime_get(); + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); + if (ts->tick_stopped) { + tick_nohz_update_jiffies(now); + tick_nohz_kick_tick(cpu, now); + } +} + #else static inline void tick_nohz_switch_to_nohz(void) { } +static inline void tick_check_nohz(int cpu) { } #endif /* NO_HZ */ @@ -620,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { } void tick_check_idle(int cpu) { tick_check_oneshot_broadcast(cpu); -#ifdef CONFIG_NO_HZ - tick_nohz_stop_idle(cpu); - tick_nohz_update_jiffies(); - tick_nohz_kick_tick(cpu); -#endif + tick_check_nohz(cpu); } /* diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index 71e7f1a1915..12f5c55090b 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -40,7 +40,7 @@ ktime_t timecompare_transform(struct timecompare *sync, return ns_to_ktime(nsec); } -EXPORT_SYMBOL(timecompare_transform); +EXPORT_SYMBOL_GPL(timecompare_transform); int timecompare_offset(struct timecompare *sync, s64 *offset, @@ -89,7 +89,7 @@ int timecompare_offset(struct timecompare *sync, * source time */ sample.offset = - ktime_to_ns(ktime_add(end, start)) / 2 - + (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - ts; /* simple insertion sort based on duration */ @@ -131,7 +131,7 @@ int timecompare_offset(struct timecompare *sync, return used; } -EXPORT_SYMBOL(timecompare_offset); +EXPORT_SYMBOL_GPL(timecompare_offset); void __timecompare_update(struct timecompare *sync, u64 source_tstamp) @@ -188,4 +188,4 @@ void __timecompare_update(struct timecompare *sync, } } } -EXPORT_SYMBOL(__timecompare_update); +EXPORT_SYMBOL_GPL(__timecompare_update); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c3a4e2907ea..af4135f0582 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -165,19 +165,12 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -static struct timespec xtime_cache __attribute__ ((aligned (16))); -void update_xtime_cache(u64 nsec) -{ - xtime_cache = xtime; - timespec_add_ns(&xtime_cache, nsec); -} - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, timekeeper.clock); + update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); } #ifdef CONFIG_GENERIC_TIME @@ -332,12 +325,10 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; - update_xtime_cache(0); - timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, timekeeper.clock); + update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -488,6 +479,17 @@ int timekeeping_valid_for_hres(void) } /** + * timekeeping_max_deferment - Returns max time the clocksource can be deferred + * + * Caller must observe xtime_lock via read_seqbegin/read_seqretry to + * ensure that the clocksource does not change! + */ +u64 timekeeping_max_deferment(void) +{ + return timekeeper.clock->max_idle_ns; +} + +/** * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. @@ -548,7 +550,6 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -582,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); total_sleep_time = timespec_add_safe(total_sleep_time, ts); } - update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -723,6 +723,49 @@ static void timekeeping_adjust(s64 offset) } /** + * logarithmic_accumulation - shifted accumulation of cycles + * + * This functions accumulates a shifted interval of cycles into + * into a shifted interval nanoseconds. Allows for O(log) accumulation + * loop. + * + * Returns the unconsumed cycles. + */ +static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +{ + u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + + /* If the offset is smaller then a shifted interval, do nothing */ + if (offset < timekeeper.cycle_interval<<shift) + return offset; + + /* Accumulate one shifted interval */ + offset -= timekeeper.cycle_interval << shift; + timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; + + timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; + while (timekeeper.xtime_nsec >= nsecps) { + timekeeper.xtime_nsec -= nsecps; + xtime.tv_sec++; + second_overflow(); + } + + /* Accumulate into raw time */ + raw_time.tv_nsec += timekeeper.raw_interval << shift;; + while (raw_time.tv_nsec >= NSEC_PER_SEC) { + raw_time.tv_nsec -= NSEC_PER_SEC; + raw_time.tv_sec++; + } + + /* Accumulate error between NTP and clock interval */ + timekeeper.ntp_error += tick_length << shift; + timekeeper.ntp_error -= timekeeper.xtime_interval << + (timekeeper.ntp_error_shift + shift); + + return offset; +} + +/** * update_wall_time - Uses the current clocksource to increment the wall time * * Called from the timer interrupt, must hold a write on xtime_lock. @@ -731,7 +774,7 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; - u64 nsecs; + int shift = 0, maxshift; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -745,33 +788,22 @@ void update_wall_time(void) #endif timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; - /* normally this loop will run just once, however in the - * case of lost or late ticks, it will accumulate correctly. + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller then the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. */ + shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); + shift = max(0, shift); + /* Bound shift to one less then what overflows tick_length */ + maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; + shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { - u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; - - /* accumulate one interval */ - offset -= timekeeper.cycle_interval; - clock->cycle_last += timekeeper.cycle_interval; - - timekeeper.xtime_nsec += timekeeper.xtime_interval; - if (timekeeper.xtime_nsec >= nsecps) { - timekeeper.xtime_nsec -= nsecps; - xtime.tv_sec++; - second_overflow(); - } - - raw_time.tv_nsec += timekeeper.raw_interval; - if (raw_time.tv_nsec >= NSEC_PER_SEC) { - raw_time.tv_nsec -= NSEC_PER_SEC; - raw_time.tv_sec++; - } - - /* accumulate error between NTP and clock interval */ - timekeeper.ntp_error += tick_length; - timekeeper.ntp_error -= timekeeper.xtime_interval << - timekeeper.ntp_error_shift; + offset = logarithmic_accumulation(offset, shift); + shift--; } /* correct the clock when NTP error is too big */ @@ -807,11 +839,8 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; - nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); - update_xtime_cache(nsecs); - /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, timekeeper.clock); + update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); } /** @@ -846,13 +875,13 @@ void monotonic_to_bootbased(struct timespec *ts) unsigned long get_seconds(void) { - return xtime_cache.tv_sec; + return xtime.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime_cache; + return xtime; } struct timespec current_kernel_time(void) @@ -862,8 +891,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - - now = xtime_cache; + now = xtime; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -877,8 +905,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - - now = xtime_cache; + now = xtime; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1b5b7aa2fdf..28265636b6c 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, next_one: i = 0; - spin_lock_irqsave(&base->cpu_base->lock, flags); + raw_spin_lock_irqsave(&base->cpu_base->lock, flags); curr = base->first; /* @@ -100,13 +100,13 @@ next_one: timer = rb_entry(curr, struct hrtimer, node); tmp = *timer; - spin_unlock_irqrestore(&base->cpu_base->lock, flags); + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); print_timer(m, timer, &tmp, i, now); next++; goto next_one; } - spin_unlock_irqrestore(&base->cpu_base->lock, flags); + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); } static void @@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(expires_next); P(hres_active); P(nr_events); + P(nr_retries); + P(nr_hangs); + P_ns(max_hang_time); #endif #undef P #undef P_ns @@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) return; } SEQ_printf(m, "%s\n", dev->name); - SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); - SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); - SEQ_printf(m, " mult: %lu\n", dev->mult); - SEQ_printf(m, " shift: %d\n", dev->shift); + SEQ_printf(m, " max_delta_ns: %llu\n", + (unsigned long long) dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %llu\n", + (unsigned long long) dev->min_delta_ns); + SEQ_printf(m, " mult: %u\n", dev->mult); + SEQ_printf(m, " shift: %u\n", dev->shift); SEQ_printf(m, " mode: %d\n", dev->mode); SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); @@ -252,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.4\n"); + SEQ_printf(m, "Timer List Version: v0.5\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index ee5681f8d7e..2f3b585b8d7 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: */ -static DEFINE_PER_CPU(spinlock_t, lookup_lock); +static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock); /* * Mutex to serialize state changes with show-stats activities: @@ -238,14 +238,14 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, /* * It doesnt matter which lock we take: */ - spinlock_t *lock; + raw_spinlock_t *lock; struct entry *entry, input; unsigned long flags; if (likely(!timer_stats_active)) return; - lock = &per_cpu(lookup_lock, raw_smp_processor_id()); + lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id()); input.timer = timer; input.start_func = startf; @@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.pid = pid; input.timer_flag = timer_flag; - spin_lock_irqsave(lock, flags); + raw_spin_lock_irqsave(lock, flags); if (!timer_stats_active) goto out_unlock; @@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, atomic_inc(&overflow_count); out_unlock: - spin_unlock_irqrestore(lock, flags); + raw_spin_unlock_irqrestore(lock, flags); } static void print_name_offset(struct seq_file *m, unsigned long addr) @@ -348,9 +348,11 @@ static void sync_access(void) int cpu; for_each_online_cpu(cpu) { - spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); + raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); + + raw_spin_lock_irqsave(lock, flags); /* nothing */ - spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); + raw_spin_unlock_irqrestore(lock, flags); } } @@ -408,7 +410,7 @@ void __init init_timer_stats(void) int cpu; for_each_possible_cpu(cpu) - spin_lock_init(&per_cpu(lookup_lock, cpu)); + raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); } static int __init init_tstats_procfs(void) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b416512ad17..d006554888d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -339,6 +339,27 @@ config POWER_TRACER power management decisions, specifically the C-state and P-state behavior. +config KSYM_TRACER + bool "Trace read and write access on kernel memory locations" + depends on HAVE_HW_BREAKPOINT + select TRACING + help + This tracer helps find read and write operations on any given kernel + symbol i.e. /proc/kallsyms. + +config PROFILE_KSYM_TRACER + bool "Profile all kernel memory accesses on 'watched' variables" + depends on KSYM_TRACER + help + This tracer profiles kernel accesses on variables watched through the + ksym tracer ftrace plugin. Depending upon the hardware, all read + and write operations on kernel variables can be monitored for + accesses. + + The results will be displayed in: + /debugfs/tracing/profile_ksym + + Say N if unsure. config STACK_TRACER bool "Trace max stack" @@ -428,6 +449,23 @@ config BLK_DEV_IO_TRACE If unsure, say N. +config KPROBE_EVENT + depends on KPROBES + depends on X86 + bool "Enable kprobes-based dynamic events" + select TRACING + default y + help + This allows the user to add tracing events (similar to tracepoints) on the fly + via the ftrace interface. See Documentation/trace/kprobetrace.txt + for more details. + + Those events can be inserted wherever kprobes can probe, and record + various register and memory values. + + This option is also required by perf-probe subcommand of perf tools. If + you want to use perf tools, this option is strongly recommended. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 26f03ac07c2..cd9ecd89ec7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,8 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 37ba67e3326..7968762c816 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -60,6 +60,13 @@ static int last_ftrace_enabled; /* Quick disabling of function tracer. */ int function_trace_stop; +/* List for set_ftrace_pid's pids. */ +LIST_HEAD(ftrace_pids); +struct ftrace_pid { + struct list_head list; + struct pid *pid; +}; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +#endif + static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) else func = ftrace_list_func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list->next == &ftrace_list_end) { ftrace_func_t func = ftrace_list->func; - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } @@ -231,7 +242,7 @@ static void ftrace_update_pid_func(void) func = __ftrace_trace_function; #endif - if (ftrace_pid_trace) { + if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); func = ftrace_pid_func; } else { @@ -740,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&ftrace_profile_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -821,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) } #endif /* CONFIG_FUNCTION_PROFILER */ -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; #ifdef CONFIG_DYNAMIC_FTRACE @@ -1261,12 +1270,34 @@ static int ftrace_update_code(struct module *mod) ftrace_new_addrs = p->newlist; p->flags = 0L; - /* convert record (i.e, patch mcount-call with NOP) */ - if (ftrace_code_disable(mod, p)) { - p->flags |= FTRACE_FL_CONVERTED; - ftrace_update_cnt++; - } else + /* + * Do the initial record convertion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) { ftrace_free_rec(p); + continue; + } + + p->flags |= FTRACE_FL_CONVERTED; + ftrace_update_cnt++; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up) { + int failed = __ftrace_replace_code(p, 1); + if (failed) { + ftrace_bug(failed, p->ip); + ftrace_free_rec(p); + } + } } stop = ftrace_now(raw_smp_processor_id()); @@ -1656,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin) return ret; } -enum { - MATCH_FULL, - MATCH_FRONT_ONLY, - MATCH_MIDDLE_ONLY, - MATCH_END_ONLY, -}; - -/* - * (static function - no need for kernel doc) - * - * Pass in a buffer containing a glob and this function will - * set search to point to the search part of the buffer and - * return the type of search it is (see enum above). - * This does modify buff. - * - * Returns enum type. - * search returns the pointer to use for comparison. - * not returns 1 if buff started with a '!' - * 0 otherwise. - */ -static int -ftrace_setup_glob(char *buff, int len, char **search, int *not) -{ - int type = MATCH_FULL; - int i; - - if (buff[0] == '!') { - *not = 1; - buff++; - len--; - } else - *not = 0; - - *search = buff; - - for (i = 0; i < len; i++) { - if (buff[i] == '*') { - if (!i) { - *search = buff + 1; - type = MATCH_END_ONLY; - } else { - if (type == MATCH_END_ONLY) - type = MATCH_MIDDLE_ONLY; - else - type = MATCH_FRONT_ONLY; - buff[i] = 0; - break; - } - } - } - - return type; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; @@ -1747,7 +1724,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) return ftrace_match(str, regex, len, type); } -static void ftrace_match_records(char *buff, int len, int enable) +static int ftrace_match_records(char *buff, int len, int enable) { unsigned int search_len; struct ftrace_page *pg; @@ -1756,9 +1733,10 @@ static void ftrace_match_records(char *buff, int len, int enable) char *search; int type; int not; + int found = 0; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - type = ftrace_setup_glob(buff, len, &search, ¬); + type = filter_parse_regex(buff, len, &search, ¬); search_len = strlen(search); @@ -1773,6 +1751,7 @@ static void ftrace_match_records(char *buff, int len, int enable) rec->flags &= ~flag; else rec->flags |= flag; + found = 1; } /* * Only enable filtering if we have a function that @@ -1782,6 +1761,8 @@ static void ftrace_match_records(char *buff, int len, int enable) ftrace_filtered = 1; } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); + + return found; } static int @@ -1803,7 +1784,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, return 1; } -static void ftrace_match_module_records(char *buff, char *mod, int enable) +static int ftrace_match_module_records(char *buff, char *mod, int enable) { unsigned search_len = 0; struct ftrace_page *pg; @@ -1812,6 +1793,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) char *search = buff; unsigned long flag; int not = 0; + int found = 0; flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; @@ -1826,7 +1808,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) } if (strlen(buff)) { - type = ftrace_setup_glob(buff, strlen(buff), &search, ¬); + type = filter_parse_regex(buff, strlen(buff), &search, ¬); search_len = strlen(search); } @@ -1842,12 +1824,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) rec->flags &= ~flag; else rec->flags |= flag; + found = 1; } if (enable && (rec->flags & FTRACE_FL_FILTER)) ftrace_filtered = 1; } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); + + return found; } /* @@ -1876,8 +1861,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) if (!strlen(mod)) return -EINVAL; - ftrace_match_module_records(func, mod, enable); - return 0; + if (ftrace_match_module_records(func, mod, enable)) + return 0; + return -EINVAL; } static struct ftrace_func_command ftrace_mod_cmd = { @@ -1991,7 +1977,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int count = 0; char *search; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2068,7 +2054,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, else if (glob) { int not; - type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); /* we do not support '!' for function probes */ @@ -2174,8 +2160,9 @@ static int ftrace_process_regex(char *buff, int len, int enable) func = strsep(&next, ":"); if (!next) { - ftrace_match_records(func, len, enable); - return 0; + if (ftrace_match_records(func, len, enable)) + return 0; + return ret; } /* command found */ @@ -2221,16 +2208,15 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, !trace_parser_cont(parser)) { ret = ftrace_process_regex(parser->buffer, parser->idx, enable); - if (ret) - goto out; - trace_parser_clear(parser); + if (ret) + goto out_unlock; } ret = read; - +out_unlock: mutex_unlock(&ftrace_regex_lock); -out: + return ret; } @@ -2312,6 +2298,32 @@ static int __init set_ftrace_filter(char *str) } __setup("ftrace_filter=", set_ftrace_filter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static int __init set_graph_function(char *str) +{ + strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static void __init set_ftrace_early_graph(char *buf) +{ + int ret; + char *func; + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static void __init set_ftrace_early_filter(char *buf, int enable) { char *func; @@ -2328,6 +2340,10 @@ static void __init set_ftrace_early_filters(void) set_ftrace_early_filter(ftrace_filter_buf, 1); if (ftrace_notrace_buf[0]) set_ftrace_early_filter(ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ } static int @@ -2513,7 +2529,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) return -ENODEV; /* decode regex */ - type = ftrace_setup_glob(buffer, strlen(buffer), &search, ¬); + type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); if (not) return -EINVAL; @@ -2536,10 +2552,9 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) exists = true; break; } - if (!exists) { + if (!exists) array[(*idx)++] = rec->ip; - found = 1; - } + found = 1; } } while_for_each_ftrace_rec(); @@ -2624,7 +2639,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } -static int ftrace_convert_nops(struct module *mod, +static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { @@ -2684,7 +2699,7 @@ static void ftrace_init_module(struct module *mod, { if (ftrace_disabled || start == end) return; - ftrace_convert_nops(mod, start, end); + ftrace_process_locs(mod, start, end); } static int ftrace_module_notify(struct notifier_block *self, @@ -2745,7 +2760,7 @@ void __init ftrace_init(void) last_ftrace_enabled = ftrace_enabled = 1; - ret = ftrace_convert_nops(NULL, + ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); @@ -2778,23 +2793,6 @@ static inline void ftrace_startup_enable(int command) { } # define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ -static ssize_t -ftrace_pid_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - if (ftrace_pid_trace == ftrace_swapper_pid) - r = sprintf(buf, "swapper tasks\n"); - else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); - else - r = sprintf(buf, "no pid\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - static void clear_ftrace_swapper(void) { struct task_struct *p; @@ -2845,14 +2843,12 @@ static void set_ftrace_pid(struct pid *pid) rcu_read_unlock(); } -static void clear_ftrace_pid_task(struct pid **pid) +static void clear_ftrace_pid_task(struct pid *pid) { - if (*pid == ftrace_swapper_pid) + if (pid == ftrace_swapper_pid) clear_ftrace_swapper(); else - clear_ftrace_pid(*pid); - - *pid = NULL; + clear_ftrace_pid(pid); } static void set_ftrace_pid_task(struct pid *pid) @@ -2863,74 +2859,184 @@ static void set_ftrace_pid_task(struct pid *pid) set_ftrace_pid(pid); } -static ssize_t -ftrace_pid_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int ftrace_pid_add(int p) { struct pid *pid; - char buf[64]; - long val; - int ret; + struct ftrace_pid *fpid; + int ret = -EINVAL; - if (cnt >= sizeof(buf)) - return -EINVAL; + mutex_lock(&ftrace_lock); - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; + if (!p) + pid = ftrace_swapper_pid; + else + pid = find_get_pid(p); - buf[cnt] = 0; + if (!pid) + goto out; - ret = strict_strtol(buf, 10, &val); - if (ret < 0) - return ret; + ret = 0; - mutex_lock(&ftrace_lock); - if (val < 0) { - /* disable pid tracing */ - if (!ftrace_pid_trace) - goto out; + list_for_each_entry(fpid, &ftrace_pids, list) + if (fpid->pid == pid) + goto out_put; - clear_ftrace_pid_task(&ftrace_pid_trace); + ret = -ENOMEM; - } else { - /* swapper task is special */ - if (!val) { - pid = ftrace_swapper_pid; - if (pid == ftrace_pid_trace) - goto out; - } else { - pid = find_get_pid(val); + fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); + if (!fpid) + goto out_put; - if (pid == ftrace_pid_trace) { - put_pid(pid); - goto out; - } - } + list_add(&fpid->list, &ftrace_pids); + fpid->pid = pid; - if (ftrace_pid_trace) - clear_ftrace_pid_task(&ftrace_pid_trace); + set_ftrace_pid_task(pid); - if (!pid) - goto out; + ftrace_update_pid_func(); + ftrace_startup_enable(0); - ftrace_pid_trace = pid; + mutex_unlock(&ftrace_lock); + return 0; - set_ftrace_pid_task(ftrace_pid_trace); +out_put: + if (pid != ftrace_swapper_pid) + put_pid(pid); + +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +static void ftrace_pid_reset(void) +{ + struct ftrace_pid *fpid, *safe; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { + struct pid *pid = fpid->pid; + + clear_ftrace_pid_task(pid); + + list_del(&fpid->list); + kfree(fpid); } - /* update the function call */ ftrace_update_pid_func(); ftrace_startup_enable(0); - out: mutex_unlock(&ftrace_lock); +} - return cnt; +static void *fpid_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ftrace_lock); + + if (list_empty(&ftrace_pids) && (!*pos)) + return (void *) 1; + + return seq_list_start(&ftrace_pids, *pos); +} + +static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (v == (void *)1) + return NULL; + + return seq_list_next(v, &ftrace_pids, pos); +} + +static void fpid_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +static int fpid_show(struct seq_file *m, void *v) +{ + const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); + + if (v == (void *)1) { + seq_printf(m, "no pid\n"); + return 0; + } + + if (fpid->pid == ftrace_swapper_pid) + seq_printf(m, "swapper tasks\n"); + else + seq_printf(m, "%u\n", pid_vnr(fpid->pid)); + + return 0; +} + +static const struct seq_operations ftrace_pid_sops = { + .start = fpid_start, + .next = fpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_pid_reset(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &ftrace_pid_sops); + + return ret; +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64], *tmp; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* + * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" + * to clean the filter quietly. + */ + tmp = strstrip(buf); + if (strlen(tmp) == 0) + return 1; + + ret = strict_strtol(tmp, 10, &val); + if (ret < 0) + return ret; + + ret = ftrace_pid_add(val); + + return ret ? ret : cnt; +} + +static int +ftrace_pid_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; } static const struct file_operations ftrace_pid_fops = { - .read = ftrace_pid_read, - .write = ftrace_pid_write, + .open = ftrace_pid_open, + .write = ftrace_pid_write, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_pid_release, }; static __init int ftrace_init_debugfs(void) @@ -3293,4 +3399,3 @@ void ftrace_graph_stop(void) ftrace_stop(); } #endif - diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index e06c6e3d56a..9f4f565b01e 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -14,7 +14,5 @@ #define CREATE_TRACE_POINTS #include <trace/events/power.h> -EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); -EXPORT_TRACEPOINT_SYMBOL_GPL(power_end); EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4ff0197054..2326b04c95c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -397,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s) int ret; ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\n", - (unsigned int)sizeof(field.time_stamp)); + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit)); + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\n", + "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE); + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); return ret; } @@ -420,7 +423,7 @@ struct ring_buffer_per_cpu { int cpu; struct ring_buffer *buffer; spinlock_t reader_lock; /* serialize readers */ - raw_spinlock_t lock; + arch_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; struct buffer_page *head_page; /* read from head */ @@ -483,7 +486,7 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +static inline u64 rb_time_stamp(struct ring_buffer *buffer) { /* shift to debug/test normalization and TIME_EXTENTS */ return buffer->clock() << DEBUG_SHIFT; @@ -494,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) u64 time; preempt_disable_notrace(); - time = rb_time_stamp(buffer, cpu); + time = rb_time_stamp(buffer); preempt_enable_no_resched_notrace(); return time; @@ -599,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list) } /* - * rb_is_head_page - test if the give page is the head page + * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to @@ -995,7 +998,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); - cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1190,9 +1193,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) struct list_head *p; unsigned i; - atomic_inc(&cpu_buffer->record_disabled); - synchronize_sched(); - + spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1207,11 +1208,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) return; rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); - atomic_dec(&cpu_buffer->record_disabled); - + spin_unlock_irq(&cpu_buffer->reader_lock); } static void @@ -1222,9 +1221,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *p; unsigned i; - atomic_inc(&cpu_buffer->record_disabled); - synchronize_sched(); - spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); @@ -1237,11 +1233,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add_tail(&bpage->list, cpu_buffer->pages); } rb_reset_cpu(cpu_buffer); - spin_unlock_irq(&cpu_buffer->reader_lock); - rb_check_pages(cpu_buffer); - atomic_dec(&cpu_buffer->record_disabled); + spin_unlock_irq(&cpu_buffer->reader_lock); } /** @@ -1249,11 +1243,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, * @buffer: the buffer to resize. * @size: the new size. * - * The tracer is responsible for making sure that the buffer is - * not being used while changing the size. - * Note: We may be able to change the above requirement by using - * RCU synchronizations. - * * Minimum size is 2 * BUF_PAGE_SIZE. * * Returns -1 on failure. @@ -1285,6 +1274,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) if (size == buffer_size) return size; + atomic_inc(&buffer->record_disabled); + + /* Make sure all writers are done with this buffer. */ + synchronize_sched(); + mutex_lock(&buffer->mutex); get_online_cpus(); @@ -1347,6 +1341,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); + return size; free_pages: @@ -1356,6 +1352,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); return -ENOMEM; /* @@ -1365,6 +1362,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) out_fail: put_online_cpus(); mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); return -1; } EXPORT_SYMBOL_GPL(ring_buffer_resize); @@ -1785,9 +1783,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, - struct buffer_page *commit_page, struct buffer_page *tail_page, u64 *ts) { + struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; @@ -1868,7 +1866,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer); next_page->page->time_stamp = *ts; } @@ -1890,13 +1888,10 @@ static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { - struct buffer_page *tail_page, *commit_page; + struct buffer_page *tail_page; struct ring_buffer_event *event; unsigned long tail, write; - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); @@ -1907,7 +1902,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* See if we shot pass the end of this buffer page */ if (write > BUF_PAGE_SIZE) return rb_move_tail(cpu_buffer, length, tail, - commit_page, tail_page, ts); + tail_page, ts); /* We reserved something on the buffer */ @@ -2111,7 +2106,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer); /* * Only the first commit can update the timestamp. @@ -2681,7 +2676,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_entries); /** - * ring_buffer_overrun_cpu - get the number of overruns in buffer + * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer @@ -2832,7 +2827,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) int ret; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); again: /* @@ -2921,7 +2916,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return reader; @@ -3284,9 +3279,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) synchronize_sched(); spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return iter; @@ -3406,11 +3401,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; - __raw_spin_lock(&cpu_buffer->lock); + arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); - __raw_spin_unlock(&cpu_buffer->lock); + arch_spin_unlock(&cpu_buffer->lock); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 573d3cc762c..b2477caf09c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -35,6 +35,28 @@ static int disable_reader; module_param(disable_reader, uint, 0644); MODULE_PARM_DESC(disable_reader, "only run producer"); +static int write_iteration = 50; +module_param(write_iteration, uint, 0644); +MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); + +static int producer_nice = 19; +static int consumer_nice = 19; + +static int producer_fifo = -1; +static int consumer_fifo = -1; + +module_param(producer_nice, uint, 0644); +MODULE_PARM_DESC(producer_nice, "nice prio for producer"); + +module_param(consumer_nice, uint, 0644); +MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); + +module_param(producer_fifo, uint, 0644); +MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); + +module_param(consumer_fifo, uint, 0644); +MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); + static int read_events; static int kill_test; @@ -208,15 +230,18 @@ static void ring_buffer_producer(void) do { struct ring_buffer_event *event; int *entry; - - event = ring_buffer_lock_reserve(buffer, 10); - if (!event) { - missed++; - } else { - hit++; - entry = ring_buffer_event_data(event); - *entry = smp_processor_id(); - ring_buffer_unlock_commit(buffer, event); + int i; + + for (i = 0; i < write_iteration; i++) { + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } } do_gettimeofday(&end_tv); @@ -263,6 +288,27 @@ static void ring_buffer_producer(void) if (kill_test) trace_printk("ERROR!\n"); + + if (!disable_reader) { + if (consumer_fifo < 0) + trace_printk("Running Consumer at nice: %d\n", + consumer_nice); + else + trace_printk("Running Consumer at SCHED_FIFO %d\n", + consumer_fifo); + } + if (producer_fifo < 0) + trace_printk("Running Producer at nice: %d\n", + producer_nice); + else + trace_printk("Running Producer at SCHED_FIFO %d\n", + producer_fifo); + + /* Let the user know that the test is running at low priority */ + if (producer_fifo < 0 && consumer_fifo < 0 && + producer_nice == 19 && consumer_nice == 19) + trace_printk("WARNING!!! This test is running at lowest priority.\n"); + trace_printk("Time: %lld (usecs)\n", time); trace_printk("Overruns: %lld\n", overruns); if (disable_reader) @@ -392,6 +438,27 @@ static int __init ring_buffer_benchmark_init(void) if (IS_ERR(producer)) goto out_kill; + /* + * Run them as low-prio background tasks by default: + */ + if (!disable_reader) { + if (consumer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(consumer, SCHED_FIFO, ¶m); + } else + set_user_nice(consumer, consumer_nice); + } + + if (producer_fifo >= 0) { + struct sched_param param = { + .sched_priority = consumer_fifo + }; + sched_setscheduler(producer, SCHED_FIFO, ¶m); + } else + set_user_nice(producer, producer_nice); + return 0; out_kill: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c820b0310a1..8b9f20ab8ee 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -12,7 +12,7 @@ * Copyright (C) 2004 William Lee Irwin III */ #include <linux/ring_buffer.h> -#include <linux/utsrelease.h> +#include <generated/utsrelease.h> #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> @@ -86,17 +86,17 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static int tracing_disabled = 1; -DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); +DEFINE_PER_CPU(int, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { preempt_disable(); - local_inc(&__get_cpu_var(ftrace_cpu_disabled)); + __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); } static inline void ftrace_enable_cpu(void) { - local_dec(&__get_cpu_var(ftrace_cpu_disabled)); + __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); preempt_enable(); } @@ -129,7 +129,7 @@ static int tracing_set_tracer(const char *buf); static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static int __init set_ftrace(char *str) +static int __init set_cmdline_ftrace(char *str) { strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; @@ -137,7 +137,7 @@ static int __init set_ftrace(char *str) ring_buffer_expanded = 1; return 1; } -__setup("ftrace=", set_ftrace); +__setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { @@ -203,7 +203,7 @@ cycle_t ftrace_now(int cpu) */ static struct trace_array max_tr; -static DEFINE_PER_CPU(struct trace_array_cpu, max_data); +static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); /* tracer_enabled is used to toggle activation of a tracer */ static int tracer_enabled = 1; @@ -313,7 +313,6 @@ static const char *trace_options[] = { "bin", "block", "stacktrace", - "sched-tree", "trace_printk", "ftrace_preempt", "branch", @@ -493,15 +492,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) * protected by per_cpu spinlocks. But the action of the swap * needs its own lock. * - * This is defined as a raw_spinlock_t in order to help + * This is defined as a arch_spinlock_t in order to help * with performance when lockdep debugging is enabled. * * It is also used in other places outside the update_max_tr * so it needs to be defined outside of the * CONFIG_TRACER_MAX_TRACE. */ -static raw_spinlock_t ftrace_max_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t ftrace_max_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_TRACER_MAX_TRACE unsigned long __read_mostly tracing_max_latency; @@ -555,13 +554,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; max_tr.buffer = buf; __update_max_tr(tr, tsk, cpu); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); } /** @@ -581,7 +580,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); ftrace_disable_cpu(); @@ -603,7 +602,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); } #endif /* CONFIG_TRACER_MAX_TRACE */ @@ -802,7 +801,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; -static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; @@ -915,7 +914,7 @@ static void trace_save_cmdline(struct task_struct *tsk) * nor do we want to disable interrupts, * so if we miss here, then better luck next time. */ - if (!__raw_spin_trylock(&trace_cmdline_lock)) + if (!arch_spin_trylock(&trace_cmdline_lock)) return; idx = map_pid_to_cmdline[tsk->pid]; @@ -940,7 +939,7 @@ static void trace_save_cmdline(struct task_struct *tsk) memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); - __raw_spin_unlock(&trace_cmdline_lock); + arch_spin_unlock(&trace_cmdline_lock); } void trace_find_cmdline(int pid, char comm[]) @@ -958,14 +957,14 @@ void trace_find_cmdline(int pid, char comm[]) } preempt_disable(); - __raw_spin_lock(&trace_cmdline_lock); + arch_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) strcpy(comm, saved_cmdlines[map]); else strcpy(comm, "<...>"); - __raw_spin_unlock(&trace_cmdline_lock); + arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); } @@ -1085,7 +1084,7 @@ trace_function(struct trace_array *tr, struct ftrace_entry *entry; /* If we are reading the ring buffer, don't trace */ - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) return; event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), @@ -1151,6 +1150,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, __ftrace_trace_stack(tr->buffer, flags, skip, pc); } +/** + * trace_dump_stack - record a stack back trace in the trace buffer + */ +void trace_dump_stack(void) +{ + unsigned long flags; + + if (tracing_disabled || tracing_selftest_running) + return; + + local_save_flags(flags); + + /* skipping 3 traces, seems to get us at the caller of this function */ + __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); +} + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { @@ -1251,8 +1266,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - static raw_spinlock_t trace_buf_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t trace_buf_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_bprint; @@ -1283,7 +1298,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) /* Lockdep uses trace_printk for lock tracing */ local_irq_save(flags); - __raw_spin_lock(&trace_buf_lock); + arch_spin_lock(&trace_buf_lock); len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); if (len > TRACE_BUF_SIZE || len < 0) @@ -1304,7 +1319,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) ring_buffer_unlock_commit(buffer, event); out_unlock: - __raw_spin_unlock(&trace_buf_lock); + arch_spin_unlock(&trace_buf_lock); local_irq_restore(flags); out: @@ -1334,7 +1349,7 @@ int trace_array_printk(struct trace_array *tr, int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { - static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ftrace_event_call *call = &event_print; @@ -1360,12 +1375,9 @@ int trace_array_vprintk(struct trace_array *tr, pause_graph_tracing(); raw_local_irq_save(irq_flags); - __raw_spin_lock(&trace_buf_lock); + arch_spin_lock(&trace_buf_lock); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); - len = min(len, TRACE_BUF_SIZE-1); - trace_buf[len] = 0; - size = sizeof(*entry) + len + 1; buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, @@ -1373,15 +1385,15 @@ int trace_array_vprintk(struct trace_array *tr, if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - entry->ip = ip; + entry->ip = ip; memcpy(&entry->buf, trace_buf, len); - entry->buf[len] = 0; + entry->buf[len] = '\0'; if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); out_unlock: - __raw_spin_unlock(&trace_buf_lock); + arch_spin_unlock(&trace_buf_lock); raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: @@ -1515,6 +1527,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) int i = (int)*pos; void *ent; + WARN_ON_ONCE(iter->leftover); + (*pos)++; /* can't go backwards */ @@ -1613,8 +1627,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) ; } else { - l = *pos - 1; - p = s_next(m, p, &l); + /* + * If we overflowed the seq_file before, then we want + * to just reuse the trace_seq buffer again. + */ + if (iter->leftover) + p = iter; + else { + l = *pos - 1; + p = s_next(m, p, &l); + } } trace_event_read_lock(); @@ -1922,6 +1944,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; + int ret; if (iter->ent == NULL) { if (iter->tr) { @@ -1941,9 +1964,27 @@ static int s_show(struct seq_file *m, void *v) if (!(trace_flags & TRACE_ITER_VERBOSE)) print_func_help_header(m); } + } else if (iter->leftover) { + /* + * If we filled the seq_file buffer earlier, we + * want to just show it now. + */ + ret = trace_print_seq(m, &iter->seq); + + /* ret should this time be zero, but you never know */ + iter->leftover = ret; + } else { print_trace_line(iter); - trace_print_seq(m, &iter->seq); + ret = trace_print_seq(m, &iter->seq); + /* + * If we overflow the seq_file buffer, then it will + * ask us for this data again at start up. + * Use that instead. + * ret is 0 if seq_file write succeeded. + * -1 otherwise. + */ + iter->leftover = ret; } return 0; @@ -2253,7 +2294,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, mutex_lock(&tracing_cpumask_update_lock); local_irq_disable(); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); for_each_tracing_cpu(cpu) { /* * Increase/decrease the disabled counter if we are @@ -2268,7 +2309,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, atomic_dec(&global_trace.data[cpu]->disabled); } } - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); local_irq_enable(); cpumask_copy(tracing_cpumask, tracing_cpumask_new); @@ -2290,67 +2331,49 @@ static const struct file_operations tracing_cpumask_fops = { .write = tracing_cpumask_write, }; -static ssize_t -tracing_trace_options_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; u32 tracer_flags; - int len = 0; - char *buf; - int r = 0; int i; - - /* calculate max size */ - for (i = 0; trace_options[i]; i++) { - len += strlen(trace_options[i]); - len += 3; /* "no" and newline */ - } - mutex_lock(&trace_types_lock); tracer_flags = current_trace->flags->val; trace_opts = current_trace->flags->opts; - /* - * Increase the size with names of options specific - * of the current tracer. - */ - for (i = 0; trace_opts[i].name; i++) { - len += strlen(trace_opts[i].name); - len += 3; /* "no" and newline */ - } - - /* +1 for \0 */ - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) { - mutex_unlock(&trace_types_lock); - return -ENOMEM; - } - for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) - r += sprintf(buf + r, "%s\n", trace_options[i]); + seq_printf(m, "%s\n", trace_options[i]); else - r += sprintf(buf + r, "no%s\n", trace_options[i]); + seq_printf(m, "no%s\n", trace_options[i]); } for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) - r += sprintf(buf + r, "%s\n", - trace_opts[i].name); + seq_printf(m, "%s\n", trace_opts[i].name); else - r += sprintf(buf + r, "no%s\n", - trace_opts[i].name); + seq_printf(m, "no%s\n", trace_opts[i].name); } mutex_unlock(&trace_types_lock); - WARN_ON(r >= len + 1); + return 0; +} - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +static int __set_tracer_option(struct tracer *trace, + struct tracer_flags *tracer_flags, + struct tracer_opt *opts, int neg) +{ + int ret; - kfree(buf); - return r; + ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + if (ret) + return ret; + + if (neg) + tracer_flags->val &= ~opts->bit; + else + tracer_flags->val |= opts->bit; + return 0; } /* Try to assign a tracer specific option */ @@ -2358,33 +2381,17 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) { struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; - int ret = 0, i = 0; - int len; + int i; for (i = 0; tracer_flags->opts[i].name; i++) { opts = &tracer_flags->opts[i]; - len = strlen(opts->name); - if (strncmp(cmp, opts->name, len) == 0) { - ret = trace->set_flag(tracer_flags->val, - opts->bit, !neg); - break; - } + if (strcmp(cmp, opts->name) == 0) + return __set_tracer_option(trace, trace->flags, + opts, neg); } - /* Not found */ - if (!tracer_flags->opts[i].name) - return -EINVAL; - - /* Refused to handle */ - if (ret) - return ret; - - if (neg) - tracer_flags->val &= ~opts->bit; - else - tracer_flags->val |= opts->bit; - return 0; + return -EINVAL; } static void set_tracer_flags(unsigned int mask, int enabled) @@ -2404,7 +2411,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int ret; int i; @@ -2416,16 +2423,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); - if (strncmp(buf, "no", 2) == 0) { + if (strncmp(cmp, "no", 2) == 0) { neg = 1; cmp += 2; } for (i = 0; trace_options[i]; i++) { - int len = strlen(trace_options[i]); - - if (strncmp(cmp, trace_options[i], len) == 0) { + if (strcmp(cmp, trace_options[i]) == 0) { set_tracer_flags(1 << i, !neg); break; } @@ -2440,14 +2446,23 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return ret; } - filp->f_pos += cnt; + *ppos += cnt; return cnt; } +static int tracing_trace_options_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + return single_open(file, tracing_trace_options_show, NULL); +} + static const struct file_operations tracing_iter_fops = { - .open = tracing_open_generic, - .read = tracing_trace_options_read, + .open = tracing_trace_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, .write = tracing_trace_options_write, }; @@ -2582,7 +2597,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, } mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2764,7 +2779,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (err) return err; - filp->f_pos += ret; + *ppos += ret; return ret; } @@ -2897,6 +2912,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) else cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); + + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); @@ -3103,7 +3122,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, __free_page(spd->pages[idx]); } -static struct pipe_buf_operations tracing_pipe_buf_ops = { +static const struct pipe_buf_operations tracing_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -3299,7 +3318,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } } - filp->f_pos += cnt; + *ppos += cnt; /* If check pages failed, return ENOMEM */ if (tracing_disabled) @@ -3334,7 +3353,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { char *buf; - char *end; if (tracing_disabled) return -EINVAL; @@ -3342,7 +3360,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - buf = kmalloc(cnt + 1, GFP_KERNEL); + buf = kmalloc(cnt + 2, GFP_KERNEL); if (buf == NULL) return -ENOMEM; @@ -3350,35 +3368,31 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, kfree(buf); return -EFAULT; } + if (buf[cnt-1] != '\n') { + buf[cnt] = '\n'; + buf[cnt+1] = '\0'; + } else + buf[cnt] = '\0'; - /* Cut from the first nil or newline. */ - buf[cnt] = '\0'; - end = strchr(buf, '\n'); - if (end) - *end = '\0'; - - cnt = mark_printk("%s\n", buf); + cnt = mark_printk("%s", buf); kfree(buf); *fpos += cnt; return cnt; } -static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_clock_show(struct seq_file *m, void *v) { - char buf[64]; - int bufiter = 0; int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) - bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, + seq_printf(m, "%s%s%s%s", i ? " " : "", i == trace_clock_id ? "[" : "", trace_clocks[i].name, i == trace_clock_id ? "]" : ""); - bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); + seq_putc(m, '\n'); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); + return 0; } static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, @@ -3420,6 +3434,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, return cnt; } +static int tracing_clock_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + return single_open(file, tracing_clock_show, NULL); +} + static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -3458,8 +3479,10 @@ static const struct file_operations tracing_mark_fops = { }; static const struct file_operations trace_clock_fops = { - .open = tracing_open_generic, - .read = tracing_clock_read, + .open = tracing_clock_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, .write = tracing_clock_write, }; @@ -3589,7 +3612,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, } /* Pipe buffer operations for a buffer. */ -static struct pipe_buf_operations buffer_pipe_buf_ops = { +static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, @@ -3730,7 +3753,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) - return ENOMEM; + return -ENOMEM; trace_seq_init(s); @@ -3920,39 +3943,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - ret = 0; - switch (val) { - case 0: - /* do nothing if already cleared */ - if (!(topt->flags->val & topt->opt->bit)) - break; - - mutex_lock(&trace_types_lock); - if (current_trace->set_flag) - ret = current_trace->set_flag(topt->flags->val, - topt->opt->bit, 0); - mutex_unlock(&trace_types_lock); - if (ret) - return ret; - topt->flags->val &= ~topt->opt->bit; - break; - case 1: - /* do nothing if already set */ - if (topt->flags->val & topt->opt->bit) - break; + if (val != 0 && val != 1) + return -EINVAL; + if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - if (current_trace->set_flag) - ret = current_trace->set_flag(topt->flags->val, - topt->opt->bit, 1); + ret = __set_tracer_option(current_trace, topt->flags, + topt->opt, val); mutex_unlock(&trace_types_lock); if (ret) return ret; - topt->flags->val |= topt->opt->bit; - break; - - default: - return -EINVAL; } *ppos += cnt; @@ -4279,8 +4279,8 @@ trace_printk_seq(struct trace_seq *s) static void __ftrace_dump(bool disable_tracing) { - static raw_spinlock_t ftrace_dump_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static arch_spinlock_t ftrace_dump_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; @@ -4290,7 +4290,7 @@ static void __ftrace_dump(bool disable_tracing) /* only one dump */ local_irq_save(flags); - __raw_spin_lock(&ftrace_dump_lock); + arch_spin_lock(&ftrace_dump_lock); if (dump_ran) goto out; @@ -4365,7 +4365,7 @@ static void __ftrace_dump(bool disable_tracing) } out: - __raw_spin_unlock(&ftrace_dump_lock); + arch_spin_unlock(&ftrace_dump_lock); local_irq_restore(flags); } @@ -4426,7 +4426,7 @@ __init static int tracer_alloc_buffers(void) /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_data, i); + max_tr.data[i] = &per_cpu(max_tr_data, i); } trace_init_cmdlines(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 405cb850b75..4df6a77eb19 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include <linux/ftrace.h> #include <trace/boot.h> #include <linux/kmemtrace.h> +#include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -37,6 +38,7 @@ enum trace_type { TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_BLK, + TRACE_KSYM, __TRACE_LAST_TYPE, }; @@ -98,9 +100,32 @@ struct syscall_trace_enter { struct syscall_trace_exit { struct trace_entry ent; int nr; - unsigned long ret; + long ret; }; +struct kprobe_trace_entry { + struct trace_entry ent; + unsigned long ip; + int nargs; + unsigned long args[]; +}; + +#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ + (offsetof(struct kprobe_trace_entry, args) + \ + (sizeof(unsigned long) * (n))) + +struct kretprobe_trace_entry { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; + int nargs; + unsigned long args[]; +}; + +#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ + (offsetof(struct kretprobe_trace_entry, args) + \ + (sizeof(unsigned long) * (n))) + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -209,6 +234,7 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ + IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) @@ -246,6 +272,7 @@ struct tracer_flags { * @pipe_open: called when the trace_pipe file is opened * @wait_pipe: override how the user waits for traces on trace_pipe * @close: called when the trace file is released + * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe * @splice_read: override the default splice_read callback on trace_pipe * @selftest: selftest to run on boot (see trace_selftest.c) @@ -264,6 +291,7 @@ struct tracer { void (*pipe_open)(struct trace_iterator *iter); void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); + void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); @@ -364,6 +392,8 @@ int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); +extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); + extern unsigned long nsecs_to_usecs(unsigned long nsecs); #ifdef CONFIG_TRACER_MAX_TRACE @@ -413,7 +443,7 @@ extern int DYN_FTRACE_TEST_NAME(void); extern int ring_buffer_expanded; extern bool tracing_selftest_disabled; -DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); +DECLARE_PER_CPU(int, ftrace_cpu_disabled); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, @@ -438,6 +468,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_ksym(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -483,10 +515,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } #else -static inline int ftrace_trace_addr(unsigned long addr) -{ - return 1; -} static inline int ftrace_graph_addr(unsigned long addr) { return 1; @@ -500,12 +528,12 @@ print_graph_function(struct trace_iterator *iter) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -extern struct pid *ftrace_pid_trace; +extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER static inline int ftrace_trace_task(struct task_struct *task) { - if (!ftrace_pid_trace) + if (list_empty(&ftrace_pids)) return 1; return test_tsk_trace_trace(task); @@ -569,18 +597,17 @@ enum trace_iterator_flags { TRACE_ITER_BIN = 0x40, TRACE_ITER_BLOCK = 0x80, TRACE_ITER_STACKTRACE = 0x100, - TRACE_ITER_SCHED_TREE = 0x200, - TRACE_ITER_PRINTK = 0x400, - TRACE_ITER_PREEMPTONLY = 0x800, - TRACE_ITER_BRANCH = 0x1000, - TRACE_ITER_ANNOTATE = 0x2000, - TRACE_ITER_USERSTACKTRACE = 0x4000, - TRACE_ITER_SYM_USEROBJ = 0x8000, - TRACE_ITER_PRINTK_MSGONLY = 0x10000, - TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ - TRACE_ITER_LATENCY_FMT = 0x40000, - TRACE_ITER_SLEEP_TIME = 0x80000, - TRACE_ITER_GRAPH_TIME = 0x100000, + TRACE_ITER_PRINTK = 0x200, + TRACE_ITER_PREEMPTONLY = 0x400, + TRACE_ITER_BRANCH = 0x800, + TRACE_ITER_ANNOTATE = 0x1000, + TRACE_ITER_USERSTACKTRACE = 0x2000, + TRACE_ITER_SYM_USEROBJ = 0x4000, + TRACE_ITER_PRINTK_MSGONLY = 0x8000, + TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */ + TRACE_ITER_LATENCY_FMT = 0x20000, + TRACE_ITER_SLEEP_TIME = 0x40000, + TRACE_ITER_GRAPH_TIME = 0x80000, }; /* @@ -687,7 +714,6 @@ struct event_filter { int n_preds; struct filter_pred **preds; char *filter_string; - bool no_reset; }; struct event_subsystem { @@ -699,22 +725,40 @@ struct event_subsystem { }; struct filter_pred; +struct regex; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, int val1, int val2); +typedef int (*regex_match_func)(char *str, struct regex *r, int len); + +enum regex_type { + MATCH_FULL = 0, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +struct regex { + char pattern[MAX_FILTER_STR_VAL]; + int len; + int field_len; + regex_match_func match; +}; + struct filter_pred { - filter_pred_fn_t fn; - u64 val; - char str_val[MAX_FILTER_STR_VAL]; - int str_len; - char *field_name; - int offset; - int not; - int op; - int pop_n; + filter_pred_fn_t fn; + u64 val; + struct regex regex; + char *field_name; + int offset; + int not; + int op; + int pop_n; }; +extern enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, @@ -730,7 +774,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && + !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 20c5f92e28a..84a3a7ba072 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -20,6 +20,8 @@ #include <linux/ktime.h> #include <linux/trace_clock.h> +#include "trace.h" + /* * trace_clock_local(): the simplest and least coherent tracing clock. * @@ -28,17 +30,17 @@ */ u64 notrace trace_clock_local(void) { - unsigned long flags; u64 clock; + int resched; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - raw_local_irq_save(flags); + resched = ftrace_preempt_disable(); clock = sched_clock(); - raw_local_irq_restore(flags); + ftrace_preempt_enable(resched); return clock; } @@ -69,10 +71,10 @@ u64 notrace trace_clock(void) /* keep prev_time and lock in the same cacheline. */ static struct { u64 prev_time; - raw_spinlock_t lock; + arch_spinlock_t lock; } trace_clock_struct ____cacheline_aligned_in_smp = { - .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, + .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, }; u64 notrace trace_clock_global(void) @@ -92,7 +94,7 @@ u64 notrace trace_clock_global(void) if (unlikely(in_nmi())) goto out; - __raw_spin_lock(&trace_clock_struct.lock); + arch_spin_lock(&trace_clock_struct.lock); /* * TODO: if this happens often then maybe we should reset @@ -104,7 +106,7 @@ u64 notrace trace_clock_global(void) trace_clock_struct.prev_time = now; - __raw_spin_unlock(&trace_clock_struct.lock); + arch_spin_unlock(&trace_clock_struct.lock); out: raw_local_irq_restore(flags); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ead3d724599..c16a08f399d 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -364,3 +364,19 @@ FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, F_printk("type:%u call_site:%lx ptr:%p", __entry->type_id, __entry->call_site, __entry->ptr) ); + +FTRACE_ENTRY(ksym_trace, ksym_trace_entry, + + TRACE_KSYM, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned char, type ) + __array( char , cmd, TASK_COMM_LEN ) + __field( unsigned long, addr ) + ), + + F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", + (void *)__entry->ip, (unsigned int)__entry->type, + (void *)__entry->addr, __entry->cmd) +); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 8d5c171cc99..9e25573242c 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -8,17 +8,14 @@ #include <linux/module.h> #include "trace.h" -/* - * We can't use a size but a type in alloc_percpu() - * So let's create a dummy type that matches the desired size - */ -typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; -char *trace_profile_buf; -EXPORT_SYMBOL_GPL(trace_profile_buf); +char *perf_trace_buf; +EXPORT_SYMBOL_GPL(perf_trace_buf); + +char *perf_trace_buf_nmi; +EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); -char *trace_profile_buf_nmi; -EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); +typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; /* Count the events in use (per event id, not per instance) */ static int total_profile_count; @@ -28,24 +25,24 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) char *buf; int ret = -ENOMEM; - if (atomic_inc_return(&event->profile_count)) + if (event->profile_count++ > 0) return 0; if (!total_profile_count) { - buf = (char *)alloc_percpu(profile_buf_t); + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf; - rcu_assign_pointer(trace_profile_buf, buf); + rcu_assign_pointer(perf_trace_buf, buf); - buf = (char *)alloc_percpu(profile_buf_t); + buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, buf); + rcu_assign_pointer(perf_trace_buf_nmi, buf); } - ret = event->profile_enable(); + ret = event->profile_enable(event); if (!ret) { total_profile_count++; return 0; @@ -53,13 +50,13 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) fail_buf_nmi: if (!total_profile_count) { - free_percpu(trace_profile_buf_nmi); - free_percpu(trace_profile_buf); - trace_profile_buf_nmi = NULL; - trace_profile_buf = NULL; + free_percpu(perf_trace_buf_nmi); + free_percpu(perf_trace_buf); + perf_trace_buf_nmi = NULL; + perf_trace_buf = NULL; } fail_buf: - atomic_dec(&event->profile_count); + event->profile_count--; return ret; } @@ -86,17 +83,17 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event) { char *buf, *nmi_buf; - if (!atomic_add_negative(-1, &event->profile_count)) + if (--event->profile_count > 0) return; - event->profile_disable(); + event->profile_disable(event); if (!--total_profile_count) { - buf = trace_profile_buf; - rcu_assign_pointer(trace_profile_buf, NULL); + buf = perf_trace_buf; + rcu_assign_pointer(perf_trace_buf, NULL); - nmi_buf = trace_profile_buf_nmi; - rcu_assign_pointer(trace_profile_buf_nmi, NULL); + nmi_buf = perf_trace_buf_nmi; + rcu_assign_pointer(perf_trace_buf_nmi, NULL); /* * Ensure every events in profiling have finished before diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d128f65778e..189b09baf4f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -78,7 +78,7 @@ EXPORT_SYMBOL_GPL(trace_define_field); if (ret) \ return ret; -int trace_define_common_fields(struct ftrace_event_call *call) +static int trace_define_common_fields(struct ftrace_event_call *call) { int ret; struct trace_entry ent; @@ -91,11 +91,8 @@ int trace_define_common_fields(struct ftrace_event_call *call) return ret; } -EXPORT_SYMBOL_GPL(trace_define_common_fields); -#ifdef CONFIG_MODULES - -static void trace_destroy_fields(struct ftrace_event_call *call) +void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; @@ -107,27 +104,49 @@ static void trace_destroy_fields(struct ftrace_event_call *call) } } -#endif /* CONFIG_MODULES */ +int trace_event_raw_init(struct ftrace_event_call *call) +{ + int id; + + id = register_ftrace_event(call->event); + if (!id) + return -ENODEV; + call->id = id; + INIT_LIST_HEAD(&call->fields); + + return 0; +} +EXPORT_SYMBOL_GPL(trace_event_raw_init); -static void ftrace_event_enable_disable(struct ftrace_event_call *call, +static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { + int ret = 0; + switch (enable) { case 0: if (call->enabled) { call->enabled = 0; tracing_stop_cmdline_record(); - call->unregfunc(call->data); + call->unregfunc(call); } break; case 1: if (!call->enabled) { - call->enabled = 1; tracing_start_cmdline_record(); - call->regfunc(call->data); + ret = call->regfunc(call); + if (ret) { + tracing_stop_cmdline_record(); + pr_info("event trace: Could not enable event " + "%s\n", call->name); + break; + } + call->enabled = 1; } break; } + + return ret; } static void ftrace_clear_events(void) @@ -406,7 +425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, case 0: case 1: mutex_lock(&event_mutex); - ftrace_event_enable_disable(call, val); + ret = ftrace_event_enable_disable(call, val); mutex_unlock(&event_mutex); break; @@ -416,7 +435,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; - return cnt; + return ret ? ret : cnt; } static ssize_t @@ -507,7 +526,7 @@ extern char *__bad_type_size(void); #define FIELD(type, name) \ sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name) + sizeof(field.name), is_signed_type(type) static int trace_write_header(struct trace_seq *s) { @@ -515,17 +534,17 @@ static int trace_write_header(struct trace_seq *s) /* struct trace_entry */ return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, lock_depth)); + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" + "\n", + FIELD(unsigned short, type), + FIELD(unsigned char, flags), + FIELD(unsigned char, preempt_count), + FIELD(int, pid), + FIELD(int, lock_depth)); } static ssize_t @@ -878,9 +897,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - entry = trace_create_file("enable", 0644, system->entry, - (void *)system->name, - &ftrace_system_enable_fops); + trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); return system->entry; } @@ -892,7 +911,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *filter, const struct file_operations *format) { - struct dentry *entry; int ret; /* @@ -910,55 +928,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, } if (call->regfunc) - entry = trace_create_file("enable", 0644, call->dir, call, - enable); + trace_create_file("enable", 0644, call->dir, call, + enable); if (call->id && call->profile_enable) - entry = trace_create_file("id", 0444, call->dir, call, - id); + trace_create_file("id", 0444, call->dir, call, + id); if (call->define_fields) { - ret = call->define_fields(call); + ret = trace_define_common_fields(call); + if (!ret) + ret = call->define_fields(call); if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); return ret; } - entry = trace_create_file("filter", 0644, call->dir, call, - filter); + trace_create_file("filter", 0644, call->dir, call, + filter); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = trace_create_file("format", 0444, call->dir, call, - format); + trace_create_file("format", 0444, call->dir, call, + format); return 0; } -#define for_each_event(event, start, end) \ - for (event = start; \ - (unsigned long)event < (unsigned long)end; \ - event++) +static int __trace_add_event_call(struct ftrace_event_call *call) +{ + struct dentry *d_events; + int ret; -#ifdef CONFIG_MODULES + if (!call->name) + return -EINVAL; -static LIST_HEAD(ftrace_module_file_list); + if (call->raw_init) { + ret = call->raw_init(call); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace " + "events/%s\n", call->name); + return ret; + } + } -/* - * Modules must own their file_operations to keep up with - * reference counting. - */ -struct ftrace_module_file_ops { - struct list_head list; - struct module *mod; - struct file_operations id; - struct file_operations enable; - struct file_operations format; - struct file_operations filter; -}; + d_events = event_trace_events_dir(); + if (!d_events) + return -ENOENT; + + ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (!ret) + list_add(&call->list, &ftrace_events); + + return ret; +} + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call) +{ + int ret; + mutex_lock(&event_mutex); + ret = __trace_add_event_call(call); + mutex_unlock(&event_mutex); + return ret; +} static void remove_subsystem_dir(const char *name) { @@ -986,6 +1025,53 @@ static void remove_subsystem_dir(const char *name) } } +/* + * Must be called under locking both of event_mutex and trace_event_mutex. + */ +static void __trace_remove_event_call(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event) + __unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); + remove_subsystem_dir(call->system); +} + +/* Remove an event_call */ +void trace_remove_event_call(struct ftrace_event_call *call) +{ + mutex_lock(&event_mutex); + down_write(&trace_event_mutex); + __trace_remove_event_call(call); + up_write(&trace_event_mutex); + mutex_unlock(&event_mutex); +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +#ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { @@ -1043,7 +1129,7 @@ static void trace_module_add_events(struct module *mod) if (!call->name) continue; if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1061,10 +1147,11 @@ static void trace_module_add_events(struct module *mod) return; } call->mod = mod; - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + ret = event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + if (!ret) + list_add(&call->list, &ftrace_events); } } @@ -1078,14 +1165,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - ftrace_event_enable_disable(call, 0); - if (call->event) - __unregister_ftrace_event(call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); - trace_destroy_fields(call); - destroy_preds(call); - remove_subsystem_dir(call->system); + __trace_remove_event_call(call); } } @@ -1203,7 +1283,7 @@ static __init int event_trace_init(void) if (!call->name) continue; if (call->raw_init) { - ret = call->raw_init(); + ret = call->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1211,10 +1291,12 @@ static __init int event_trace_init(void) continue; } } - list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (!ret) + list_add(&call->list, &ftrace_events); } while (true) { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 98a6cc5c64e..50504cb228d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -18,11 +18,10 @@ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ -#include <linux/debugfs.h> -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> +#include <linux/perf_event.h> #include "trace.h" #include "trace_output.h" @@ -31,6 +30,7 @@ enum filter_op_ids { OP_OR, OP_AND, + OP_GLOB, OP_NE, OP_EQ, OP_LT, @@ -48,16 +48,17 @@ struct filter_op { }; static struct filter_op filter_ops[] = { - { OP_OR, "||", 1 }, - { OP_AND, "&&", 2 }, - { OP_NE, "!=", 4 }, - { OP_EQ, "==", 4 }, - { OP_LT, "<", 5 }, - { OP_LE, "<=", 5 }, - { OP_GT, ">", 5 }, - { OP_GE, ">=", 5 }, - { OP_NONE, "OP_NONE", 0 }, - { OP_OPEN_PAREN, "(", 0 }, + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_GLOB, "~", 4 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, }; enum { @@ -197,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event, char *addr = (char *)(event + pred->offset); int cmp, match; - cmp = strncmp(addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -211,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, char **addr = (char **)(event + pred->offset); int cmp, match; - cmp = strncmp(*addr, pred->str_val, pred->str_len); + cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -237,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, char *addr = (char *)(event + str_loc); int cmp, match; - cmp = strncmp(addr, pred->str_val, str_len); + cmp = pred->regex.match(addr, &pred->regex, str_len); - match = (!cmp) ^ pred->not; + match = cmp ^ pred->not; return match; } @@ -250,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event, return 0; } +/* Basic regex callbacks */ +static int regex_match_full(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_front(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_middle(char *str, struct regex *r, int len) +{ + if (strstr(str, r->pattern)) + return 1; + return 0; +} + +static int regex_match_end(char *str, struct regex *r, int len) +{ + char *ptr = strstr(str, r->pattern); + + if (ptr && (ptr[r->len] == 0)) + return 1; + return 0; +} + +/** + * filter_parse_regex - parse a basic regex + * @buff: the raw regex + * @len: length of the regex + * @search: will point to the beginning of the string to compare + * @not: tell whether the match will have to be inverted + * + * This passes in a buffer containing a regex and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) +{ + int type = MATCH_FULL; + int i; + + if (buff[0] == '!') { + *not = 1; + buff++; + len--; + } else + *not = 0; + + *search = buff; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + *search = buff + 1; + type = MATCH_END_ONLY; + } else { + if (type == MATCH_END_ONLY) + type = MATCH_MIDDLE_ONLY; + else + type = MATCH_FRONT_ONLY; + buff[i] = 0; + break; + } + } + } + + return type; +} + +static void filter_build_regex(struct filter_pred *pred) +{ + struct regex *r = &pred->regex; + char *search; + enum regex_type type = MATCH_FULL; + int not = 0; + + if (pred->op == OP_GLOB) { + type = filter_parse_regex(r->pattern, r->len, &search, ¬); + r->len = strlen(search); + memmove(r->pattern, search, r->len+1); + } + + switch (type) { + case MATCH_FULL: + r->match = regex_match_full; + break; + case MATCH_FRONT_ONLY: + r->match = regex_match_front; + break; + case MATCH_MIDDLE_ONLY: + r->match = regex_match_middle; + break; + case MATCH_END_ONLY: + r->match = regex_match_end; + break; + } + + pred->not ^= not; +} + /* return 1 if event matches, 0 otherwise (discard) */ -int filter_match_preds(struct ftrace_event_call *call, void *rec) +int filter_match_preds(struct event_filter *filter, void *rec) { - struct event_filter *filter = call->filter; int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; @@ -396,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred) { kfree(pred->field_name); pred->field_name = NULL; - pred->str_len = 0; + pred->regex.len = 0; } static int filter_set_pred(struct filter_pred *dest, @@ -426,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void destroy_preds(struct ftrace_event_call *call) +static void __free_preds(struct event_filter *filter) { - struct event_filter *filter = call->filter; int i; if (!filter) @@ -441,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call) kfree(filter->preds); kfree(filter->filter_string); kfree(filter); +} + +void destroy_preds(struct ftrace_event_call *call) +{ + __free_preds(call->filter); call->filter = NULL; + call->filter_active = 0; } -static int init_preds(struct ftrace_event_call *call) +static struct event_filter *__alloc_preds(void) { struct event_filter *filter; struct filter_pred *pred; int i; - if (call->filter) - return 0; - - filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!call->filter) - return -ENOMEM; + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); filter->n_preds = 0; @@ -471,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call) filter->preds[i] = pred; } - return 0; + return filter; oom: - destroy_preds(call); + __free_preds(filter); + return ERR_PTR(-ENOMEM); +} - return -ENOMEM; +static int init_preds(struct ftrace_event_call *call) +{ + if (call->filter) + return 0; + + call->filter_active = 0; + call->filter = __alloc_preds(); + if (IS_ERR(call->filter)) + return PTR_ERR(call->filter); + + return 0; } static int init_subsystem_preds(struct event_subsystem *system) @@ -499,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system) return 0; } -enum { - FILTER_DISABLE_ALL, - FILTER_INIT_NO_RESET, - FILTER_SKIP_NO_RESET, -}; - -static void filter_free_subsystem_preds(struct event_subsystem *system, - int flag) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; @@ -517,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, if (strcmp(call->system, system->name) != 0) continue; - if (flag == FILTER_INIT_NO_RESET) { - call->filter->no_reset = false; - continue; - } - - if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset) - continue; - filter_disable_preds(call); remove_filter_string(call->filter); } @@ -532,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system, static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, filter_pred_fn_t fn) { - struct event_filter *filter = call->filter; int idx, err; if (filter->n_preds == MAX_FILTER_PRED) { @@ -550,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return err; filter->n_preds++; - call->filter_active = 1; return 0; } @@ -575,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field) static int is_legal_op(struct ftrace_event_field *field, int op) { - if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) + if (is_string_field(field) && + (op != OP_EQ && op != OP_NE && op != OP_GLOB)) + return 0; + if (!is_string_field(field) && op == OP_GLOB) return 0; return 1; @@ -626,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, + struct event_filter *filter, struct filter_pred *pred, bool dry_run) { @@ -660,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps, } if (is_string_field(field)) { - pred->str_len = field->size; + filter_build_regex(pred); - if (field->filter_type == FILTER_STATIC_STRING) + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; - else if (field->filter_type == FILTER_DYN_STRING) + pred->regex.field_len = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) fn = filter_pred_strloc; else { fn = filter_pred_pchar; - pred->str_len = strlen(pred->str_val); + pred->regex.field_len = strlen(pred->regex.pattern); } } else { if (field->is_signed) - ret = strict_strtoll(pred->str_val, 0, &val); + ret = strict_strtoll(pred->regex.pattern, 0, &val); else - ret = strict_strtoull(pred->str_val, 0, &val); + ret = strict_strtoull(pred->regex.pattern, 0, &val); if (ret) { parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; @@ -694,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, pred, fn); - return 0; -} - -static int filter_add_subsystem_pred(struct filter_parse_state *ps, - struct event_subsystem *system, - struct filter_pred *pred, - char *filter_string, - bool dry_run) -{ - struct ftrace_event_call *call; - int err = 0; - bool fail = true; - - list_for_each_entry(call, &ftrace_events, list) { - - if (!call->define_fields) - continue; - - if (strcmp(call->system, system->name)) - continue; - - if (call->filter->no_reset) - continue; - - err = filter_add_pred(ps, call, pred, dry_run); - if (err) - call->filter->no_reset = true; - else - fail = false; - - if (!dry_run) - replace_filter_string(call->filter, filter_string); - } - - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return err; - } + return filter_add_pred_fn(ps, call, filter, pred, fn); return 0; } @@ -1045,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2) return NULL; } - strcpy(pred->str_val, operand2); - pred->str_len = strlen(operand2); + strcpy(pred->regex.pattern, operand2); + pred->regex.len = strlen(pred->regex.pattern); pred->op = op; @@ -1090,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps) return 0; } -static int replace_preds(struct event_subsystem *system, - struct ftrace_event_call *call, +static int replace_preds(struct ftrace_event_call *call, + struct event_filter *filter, struct filter_parse_state *ps, char *filter_string, bool dry_run) @@ -1138,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system, add_pred: if (!pred) return -ENOMEM; - if (call) - err = filter_add_pred(ps, call, pred, false); - else - err = filter_add_subsystem_pred(ps, system, pred, - filter_string, dry_run); + err = filter_add_pred(ps, call, filter, pred, dry_run); filter_free_pred(pred); if (err) return err; @@ -1153,10 +1226,50 @@ add_pred: return 0; } -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +static int replace_system_preds(struct event_subsystem *system, + struct filter_parse_state *ps, + char *filter_string) { + struct ftrace_event_call *call; + bool fail = true; int err; + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (!call->define_fields) + continue; + + if (strcmp(call->system, system->name) != 0) + continue; + + /* try to see if the filter can be applied */ + err = replace_preds(call, filter, ps, filter_string, true); + if (err) + continue; + + /* really apply the filter */ + filter_disable_preds(call); + err = replace_preds(call, filter, ps, filter_string, false); + if (err) + filter_disable_preds(call); + else { + call->filter_active = 1; + replace_filter_string(filter, filter_string); + } + fail = false; + } + + if (fail) { + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + } + return 0; +} + +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1168,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); remove_filter_string(call->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1187,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } - err = replace_preds(NULL, call, ps, filter_string, false); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); - + else + call->filter_active = 1; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1205,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { int err; - struct filter_parse_state *ps; mutex_lock(&event_mutex); @@ -1215,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out_unlock; if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); + filter_free_subsystem_preds(system); remove_filter_string(system->filter); - mutex_unlock(&event_mutex); - return 0; + goto out_unlock; } err = -ENOMEM; @@ -1235,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system, goto out; } - filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); - - /* try to see the filter can be applied to which events */ - err = replace_preds(system, NULL, ps, filter_string, true); - if (err) { + err = replace_system_preds(system, ps, filter_string); + if (err) append_filter_err(ps, system->filter); - goto out; + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#ifdef CONFIG_EVENT_PROFILE + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_preds(filter); +} + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter; + struct filter_parse_state *ps; + struct ftrace_event_call *call = NULL; + + mutex_lock(&event_mutex); + + list_for_each_entry(call, &ftrace_events, list) { + if (call->id == event_id) + break; } - filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); + err = -EINVAL; + if (!call) + goto out_unlock; - /* really apply the filter to the events */ - err = replace_preds(system, NULL, ps, filter_string, false); - if (err) { - append_filter_err(ps, system->filter); - filter_free_subsystem_preds(system, 2); + err = -EEXIST; + if (event->filter) + goto out_unlock; + + filter = __alloc_preds(); + if (IS_ERR(filter)) { + err = PTR_ERR(filter); + goto out_unlock; } -out: + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_preds; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + event->filter = filter; + +free_ps: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); + +free_preds: + if (err) + __free_preds(filter); + out_unlock: mutex_unlock(&event_mutex); return err; } +#endif /* CONFIG_EVENT_PROFILE */ + diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 9753fcc61bc..458e5bfe26d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -48,11 +48,11 @@ struct ____ftrace_##name { \ tstruct \ }; \ -static void __used ____ftrace_check_##name(void) \ +static void __always_unused ____ftrace_check_##name(void) \ { \ struct ____ftrace_##name *__entry = NULL; \ \ - /* force cmpile-time check on F_printk() */ \ + /* force compile-time check on F_printk() */ \ printk(print); \ } @@ -66,44 +66,47 @@ static void __used ____ftrace_check_##name(void) \ #undef __field #define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; #undef __field_desc #define __field_desc(type, container, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), container.item), \ - sizeof(field.container.item)); \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; #undef __array #define __array(type, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed_type(type)); \ if (!ret) \ return 0; #undef __array_desc #define __array_desc(type, container, item, len) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\n", \ + "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ offsetof(typeof(field), container.item), \ - sizeof(field.container.item)); \ + sizeof(field.container.item), \ + is_signed_type(type)); \ if (!ret) \ return 0; #undef __dynamic_array #define __dynamic_array(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\n", \ - offsetof(typeof(field), item)); \ + "offset:%zu;\tsize:0;\tsigned:%u;\n", \ + offsetof(typeof(field), item), \ + is_signed_type(type)); \ if (!ret) \ return 0; @@ -131,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ #include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -182,10 +184,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ struct struct_name field; \ int ret; \ \ - ret = trace_define_common_fields(event_call); \ - if (ret) \ - return ret; \ - \ tstruct; \ \ return ret; \ @@ -193,6 +191,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #include "trace_entries.h" +static int ftrace_raw_init_event(struct ftrace_event_call *call) +{ + INIT_LIST_HEAD(&call->fields); + return 0; +} #undef __field #define __field(type, item) @@ -211,7 +214,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ -static int ftrace_raw_init_event_##call(void); \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ @@ -219,14 +221,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ + .raw_init = ftrace_raw_init_event, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - INIT_LIST_HEAD(&event_##call.fields); \ - return 0; \ -} \ #include "trace_entries.h" diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 45e6c01b2e4..b1342c5d37c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,9 +14,20 @@ #include "trace.h" #include "trace_output.h" -struct fgraph_data { +struct fgraph_cpu_data { pid_t last_pid; int depth; + int ignore; +}; + +struct fgraph_data { + struct fgraph_cpu_data *cpu_data; + + /* Place to preserve last processed entry. */ + struct ftrace_graph_ent_entry ent; + struct ftrace_graph_ret_entry ret; + int failed; + int cpu; }; #define TRACE_GRAPH_INDENT 2 @@ -176,7 +187,7 @@ static int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) return 0; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, @@ -240,7 +251,7 @@ static void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) return; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, @@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) if (!data) return TRACE_TYPE_HANDLED; - last_pid = &(per_cpu_ptr(data, cpu)->last_pid); + last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); if (*last_pid == pid) return TRACE_TYPE_HANDLED; @@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry * get_return_for_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *curr) { - struct ring_buffer_iter *ring_iter; + struct fgraph_data *data = iter->private; + struct ring_buffer_iter *ring_iter = NULL; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *next; - ring_iter = iter->buffer_iter[iter->cpu]; + /* + * If the previous output failed to write to the seq buffer, + * then we just reuse the data from before. + */ + if (data && data->failed) { + curr = &data->ent; + next = &data->ret; + } else { - /* First peek to compare current entry and the next one */ - if (ring_iter) - event = ring_buffer_iter_peek(ring_iter, NULL); - else { - /* We need to consume the current entry to see the next one */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, - NULL); - } + ring_iter = iter->buffer_iter[iter->cpu]; + + /* First peek to compare current entry and the next one */ + if (ring_iter) + event = ring_buffer_iter_peek(ring_iter, NULL); + else { + /* + * We need to consume the current entry to see + * the next one. + */ + ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + NULL); + } - if (!event) - return NULL; + if (!event) + return NULL; + + next = ring_buffer_event_data(event); - next = ring_buffer_event_data(event); + if (data) { + /* + * Save current and next entries for later reference + * if the output fails. + */ + data->ent = *curr; + data->ret = *next; + } + } if (next->ent.type != TRACE_GRAPH_RET) return NULL; @@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); /* * Comments display at + 1 to depth. Since @@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); *depth = call->depth; } @@ -782,19 +816,34 @@ static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter) { - int cpu = iter->cpu; + struct fgraph_data *data = iter->private; struct ftrace_graph_ent *call = &field->graph_ent; struct ftrace_graph_ret_entry *leaf_ret; + static enum print_line_t ret; + int cpu = iter->cpu; if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) return TRACE_TYPE_PARTIAL_LINE; leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) - return print_graph_entry_leaf(iter, field, leaf_ret, s); + ret = print_graph_entry_leaf(iter, field, leaf_ret, s); else - return print_graph_entry_nested(iter, field, s, cpu); + ret = print_graph_entry_nested(iter, field, s, cpu); + if (data) { + /* + * If we failed to write our output, then we need to make + * note of it. Because we already consumed our entry. + */ + if (s->full) { + data->failed = 1; + data->cpu = cpu; + } else + data->failed = 0; + } + + return ret; } static enum print_line_t @@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (data) { int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data, cpu)->depth); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); /* * Comments display at + 1 to depth. This is the @@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, int i; if (data) - depth = per_cpu_ptr(data, iter->cpu)->depth; + depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; @@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t print_graph_function(struct trace_iterator *iter) { + struct ftrace_graph_ent_entry *field; + struct fgraph_data *data = iter->private; struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; + int cpu = iter->cpu; + int ret; + + if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { + per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; + return TRACE_TYPE_HANDLED; + } + + /* + * If the last output failed, there's a possibility we need + * to print out the missing entry which would never go out. + */ + if (data && data->failed) { + field = &data->ent; + iter->cpu = data->cpu; + ret = print_graph_entry(field, s, iter); + if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { + per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; + ret = TRACE_TYPE_NO_CONSUME; + } + iter->cpu = cpu; + return ret; + } switch (entry->type) { case TRACE_GRAPH_ENT: { @@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter) * sizeof(struct ftrace_graph_ent_entry) is very small, * it can be safely saved at the stack. */ - struct ftrace_graph_ent_entry *field, saved; + struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); saved = *field; return print_graph_entry(&saved, s, iter); @@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s) static void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ - struct fgraph_data *data = alloc_percpu(struct fgraph_data); + struct fgraph_data *data; int cpu; + iter->private = NULL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) - pr_warning("function graph tracer: not enough memory\n"); - else - for_each_possible_cpu(cpu) { - pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); - int *depth = &(per_cpu_ptr(data, cpu)->depth); - *pid = -1; - *depth = 0; - } + goto out_err; + + data->cpu_data = alloc_percpu(struct fgraph_cpu_data); + if (!data->cpu_data) + goto out_err_free; + + for_each_possible_cpu(cpu) { + pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + *pid = -1; + *depth = 0; + *ignore = 0; + } iter->private = data; + + return; + + out_err_free: + kfree(data); + out_err: + pr_warning("function graph tracer: not enough memory\n"); } static void graph_trace_close(struct trace_iterator *iter) { - free_percpu(iter->private); + struct fgraph_data *data = iter->private; + + if (data) { + free_percpu(data->cpu_data); + kfree(data); + } } static struct tracer graph_trace __read_mostly = { .name = "function_graph", .open = graph_trace_open, + .pipe_open = graph_trace_open, .close = graph_trace_close, + .pipe_close = graph_trace_close, .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 69543a905cd..7b97000745f 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -20,10 +20,10 @@ #define BTS_BUFFER_SIZE (1 << 13) -static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); +static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer); +static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer); -#define this_tracer per_cpu(tracer, smp_processor_id()) +#define this_tracer per_cpu(hwb_tracer, smp_processor_id()) static int trace_hw_branches_enabled __read_mostly; static int trace_hw_branches_suspended __read_mostly; @@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly; static void bts_trace_init_cpu(int cpu) { - per_cpu(tracer, cpu) = - ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); + per_cpu(hwb_tracer, cpu) = + ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu), + BTS_BUFFER_SIZE, NULL, (size_t)-1, + BTS_KERNEL); - if (IS_ERR(per_cpu(tracer, cpu))) - per_cpu(tracer, cpu) = NULL; + if (IS_ERR(per_cpu(hwb_tracer, cpu))) + per_cpu(hwb_tracer, cpu) = NULL; } static int bts_trace_init(struct trace_array *tr) @@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr) for_each_online_cpu(cpu) { bts_trace_init_cpu(cpu); - if (likely(per_cpu(tracer, cpu))) + if (likely(per_cpu(hwb_tracer, cpu))) trace_hw_branches_enabled = 1; } trace_hw_branches_suspended = 0; @@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr) get_online_cpus(); for_each_online_cpu(cpu) { - if (likely(per_cpu(tracer, cpu))) { - ds_release_bts(per_cpu(tracer, cpu)); - per_cpu(tracer, cpu) = NULL; + if (likely(per_cpu(hwb_tracer, cpu))) { + ds_release_bts(per_cpu(hwb_tracer, cpu)); + per_cpu(hwb_tracer, cpu) = NULL; } } trace_hw_branches_enabled = 0; @@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr) get_online_cpus(); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_resume_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_resume_bts(per_cpu(hwb_tracer, cpu)); trace_hw_branches_suspended = 0; put_online_cpus(); } @@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr) get_online_cpus(); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_suspend_bts(per_cpu(hwb_tracer, cpu)); trace_hw_branches_suspended = 1; put_online_cpus(); } @@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, bts_trace_init_cpu(cpu); if (trace_hw_branches_suspended && - likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); + likely(per_cpu(hwb_tracer, cpu))) + ds_suspend_bts(per_cpu(hwb_tracer, cpu)); } break; case CPU_DOWN_PREPARE: /* The notification is sent with interrupts enabled. */ - if (likely(per_cpu(tracer, cpu))) { - ds_release_bts(per_cpu(tracer, cpu)); - per_cpu(tracer, cpu) = NULL; + if (likely(per_cpu(hwb_tracer, cpu))) { + ds_release_bts(per_cpu(hwb_tracer, cpu)); + per_cpu(hwb_tracer, cpu) = NULL; } } @@ -258,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter) get_online_cpus(); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_suspend_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_suspend_bts(per_cpu(hwb_tracer, cpu)); /* * We need to collect the trace on the respective cpu since ftrace * implicitly adds the record for the current cpu. @@ -268,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter) on_each_cpu(trace_bts_cpu, iter->tr, 1); for_each_online_cpu(cpu) - if (likely(per_cpu(tracer, cpu))) - ds_resume_bts(per_cpu(tracer, cpu)); + if (likely(per_cpu(hwb_tracer, cpu))) + ds_resume_bts(per_cpu(hwb_tracer, cpu)); put_online_cpus(); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 3aa7eaa2114..2974bc7538c 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -151,6 +151,8 @@ check_critical_timing(struct trace_array *tr, goto out_unlock; trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); + /* Skip 5 functions to get to the irq/preempt enable function */ + __trace_stack(tr, flags, 5, pc); if (data->critical_sequence != max_sequence) goto out_unlock; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 00000000000..7ecab06547a --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,1533 @@ +/* + * Kprobes-based tracing events + * + * Created by Masami Hiramatsu <mhiramat@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/kprobes.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/debugfs.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/ptrace.h> +#include <linux/perf_event.h> + +#include "trace.h" +#include "trace_output.h" + +#define MAX_TRACE_ARGS 128 +#define MAX_ARGSTR_LEN 63 +#define MAX_EVENT_NAME_LEN 64 +#define KPROBE_EVENT_SYSTEM "kprobes" + +/* Reserved field names */ +#define FIELD_STRING_IP "__probe_ip" +#define FIELD_STRING_NARGS "__probe_nargs" +#define FIELD_STRING_RETIP "__probe_ret_ip" +#define FIELD_STRING_FUNC "__probe_func" + +const char *reserved_field_names[] = { + "common_type", + "common_flags", + "common_preempt_count", + "common_pid", + "common_tgid", + "common_lock_depth", + FIELD_STRING_IP, + FIELD_STRING_NARGS, + FIELD_STRING_RETIP, + FIELD_STRING_FUNC, +}; + +struct fetch_func { + unsigned long (*func)(struct pt_regs *, void *); + void *data; +}; + +static __kprobes unsigned long call_fetch(struct fetch_func *f, + struct pt_regs *regs) +{ + return f->func(regs, f->data); +} + +/* fetch handlers */ +static __kprobes unsigned long fetch_register(struct pt_regs *regs, + void *offset) +{ + return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +} + +static __kprobes unsigned long fetch_stack(struct pt_regs *regs, + void *num) +{ + return regs_get_kernel_stack_nth(regs, + (unsigned int)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) +{ + unsigned long retval; + + if (probe_kernel_address(addr, retval)) + return 0; + return retval; +} + +static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) +{ + return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); +} + +static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, + void *dummy) +{ + return regs_return_value(regs); +} + +static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, + void *dummy) +{ + return kernel_stack_pointer(regs); +} + +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +static unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + if (sc->addr) + sc->addr += sc->offset; + return sc->addr; +} + +static void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + + update_symbol_cache(sc); + return sc; +} + +static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) +{ + struct symbol_cache *sc = data; + + if (sc->addr) + return fetch_memory(regs, (void *)sc->addr); + else + return 0; +} + +/* Special indirect memory access interface */ +struct indirect_fetch_data { + struct fetch_func orig; + long offset; +}; + +static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) +{ + struct indirect_fetch_data *ind = data; + unsigned long addr; + + addr = call_fetch(&ind->orig, regs); + if (addr) { + addr += ind->offset; + return fetch_memory(regs, (void *)addr); + } else + return 0; +} + +static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +{ + if (data->orig.func == fetch_indirect) + free_indirect_fetch_data(data->orig.data); + else if (data->orig.func == fetch_symbol) + free_symbol_cache(data->orig.data); + kfree(data); +} + +/** + * Kprobe event core functions + */ + +struct probe_arg { + struct fetch_func fetch; + const char *name; +}; + +/* Flags for trace_probe */ +#define TP_FLAG_TRACE 1 +#define TP_FLAG_PROFILE 2 + +struct trace_probe { + struct list_head list; + struct kretprobe rp; /* Use rp.kp for kprobe use */ + unsigned long nhit; + unsigned int flags; /* For TP_FLAG_* */ + const char *symbol; /* symbol name */ + struct ftrace_event_call call; + struct trace_event event; + unsigned int nr_args; + struct probe_arg args[]; +}; + +#define SIZEOF_TRACE_PROBE(n) \ + (offsetof(struct trace_probe, args) + \ + (sizeof(struct probe_arg) * (n))) + +static __kprobes int probe_is_return(struct trace_probe *tp) +{ + return tp->rp.handler != NULL; +} + +static __kprobes const char *probe_symbol(struct trace_probe *tp) +{ + return tp->symbol ? tp->symbol : "unknown"; +} + +static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) +{ + int ret = -EINVAL; + + if (ff->func == fetch_argument) + ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data); + else if (ff->func == fetch_register) { + const char *name; + name = regs_query_register_name((unsigned int)((long)ff->data)); + ret = snprintf(buf, n, "%%%s", name); + } else if (ff->func == fetch_stack) + ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); + else if (ff->func == fetch_memory) + ret = snprintf(buf, n, "@0x%p", ff->data); + else if (ff->func == fetch_symbol) { + struct symbol_cache *sc = ff->data; + if (sc->offset) + ret = snprintf(buf, n, "@%s%+ld", sc->symbol, + sc->offset); + else + ret = snprintf(buf, n, "@%s", sc->symbol); + } else if (ff->func == fetch_retvalue) + ret = snprintf(buf, n, "$retval"); + else if (ff->func == fetch_stack_address) + ret = snprintf(buf, n, "$stack"); + else if (ff->func == fetch_indirect) { + struct indirect_fetch_data *id = ff->data; + size_t l = 0; + ret = snprintf(buf, n, "%+ld(", id->offset); + if (ret >= n) + goto end; + l += ret; + ret = probe_arg_string(buf + l, n - l, &id->orig); + if (ret < 0) + goto end; + l += ret; + ret = snprintf(buf + l, n - l, ")"); + ret += l; + } +end: + if (ret >= n) + return -ENOSPC; + return ret; +} + +static int register_probe_event(struct trace_probe *tp); +static void unregister_probe_event(struct trace_probe *tp); + +static DEFINE_MUTEX(probe_lock); +static LIST_HEAD(probe_list); + +static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); +static int kretprobe_dispatcher(struct kretprobe_instance *ri, + struct pt_regs *regs); + +/* + * Allocate new trace_probe and initialize it (including kprobes). + */ +static struct trace_probe *alloc_trace_probe(const char *group, + const char *event, + void *addr, + const char *symbol, + unsigned long offs, + int nargs, int is_return) +{ + struct trace_probe *tp; + + tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); + if (!tp) + return ERR_PTR(-ENOMEM); + + if (symbol) { + tp->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tp->symbol) + goto error; + tp->rp.kp.symbol_name = tp->symbol; + tp->rp.kp.offset = offs; + } else + tp->rp.kp.addr = addr; + + if (is_return) + tp->rp.handler = kretprobe_dispatcher; + else + tp->rp.kp.pre_handler = kprobe_dispatcher; + + if (!event) + goto error; + tp->call.name = kstrdup(event, GFP_KERNEL); + if (!tp->call.name) + goto error; + + if (!group) + goto error; + tp->call.system = kstrdup(group, GFP_KERNEL); + if (!tp->call.system) + goto error; + + INIT_LIST_HEAD(&tp->list); + return tp; +error: + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); + return ERR_PTR(-ENOMEM); +} + +static void free_probe_arg(struct probe_arg *arg) +{ + if (arg->fetch.func == fetch_symbol) + free_symbol_cache(arg->fetch.data); + else if (arg->fetch.func == fetch_indirect) + free_indirect_fetch_data(arg->fetch.data); + kfree(arg->name); +} + +static void free_trace_probe(struct trace_probe *tp) +{ + int i; + + for (i = 0; i < tp->nr_args; i++) + free_probe_arg(&tp->args[i]); + + kfree(tp->call.system); + kfree(tp->call.name); + kfree(tp->symbol); + kfree(tp); +} + +static struct trace_probe *find_probe_event(const char *event, + const char *group) +{ + struct trace_probe *tp; + + list_for_each_entry(tp, &probe_list, list) + if (strcmp(tp->call.name, event) == 0 && + strcmp(tp->call.system, group) == 0) + return tp; + return NULL; +} + +/* Unregister a trace_probe and probe_event: call with locking probe_lock */ +static void unregister_trace_probe(struct trace_probe *tp) +{ + if (probe_is_return(tp)) + unregister_kretprobe(&tp->rp); + else + unregister_kprobe(&tp->rp.kp); + list_del(&tp->list); + unregister_probe_event(tp); +} + +/* Register a trace_probe and probe_event */ +static int register_trace_probe(struct trace_probe *tp) +{ + struct trace_probe *old_tp; + int ret; + + mutex_lock(&probe_lock); + + /* register as an event */ + old_tp = find_probe_event(tp->call.name, tp->call.system); + if (old_tp) { + /* delete old event */ + unregister_trace_probe(old_tp); + free_trace_probe(old_tp); + } + ret = register_probe_event(tp); + if (ret) { + pr_warning("Faild to register probe event(%d)\n", ret); + goto end; + } + + tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + if (probe_is_return(tp)) + ret = register_kretprobe(&tp->rp); + else + ret = register_kprobe(&tp->rp.kp); + + if (ret) { + pr_warning("Could not insert probe(%d)\n", ret); + if (ret == -EILSEQ) { + pr_warning("Probing address(0x%p) is not an " + "instruction boundary.\n", + tp->rp.kp.addr); + ret = -EINVAL; + } + unregister_probe_event(tp); + } else + list_add_tail(&tp->list, &probe_list); +end: + mutex_unlock(&probe_lock); + return ret; +} + +/* Split symbol and offset. */ +static int split_symbol_offset(char *symbol, unsigned long *offset) +{ + char *tmp; + int ret; + + if (!offset) + return -EINVAL; + + tmp = strchr(symbol, '+'); + if (tmp) { + /* skip sign because strict_strtol doesn't accept '+' */ + ret = strict_strtoul(tmp + 1, 0, offset); + if (ret) + return ret; + *tmp = '\0'; + } else + *offset = 0; + return 0; +} + +#define PARAM_MAX_ARGS 16 +#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) + +static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + + if (strcmp(arg, "retval") == 0) { + if (is_return) { + ff->func = fetch_retvalue; + ff->data = NULL; + } else + ret = -EINVAL; + } else if (strncmp(arg, "stack", 5) == 0) { + if (arg[5] == '\0') { + ff->func = fetch_stack_address; + ff->data = NULL; + } else if (isdigit(arg[5])) { + ret = strict_strtoul(arg + 5, 10, ¶m); + if (ret || param > PARAM_MAX_STACK) + ret = -EINVAL; + else { + ff->func = fetch_stack; + ff->data = (void *)param; + } + } else + ret = -EINVAL; + } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) { + ret = strict_strtoul(arg + 3, 10, ¶m); + if (ret || param > PARAM_MAX_ARGS) + ret = -EINVAL; + else { + ff->func = fetch_argument; + ff->data = (void *)param; + } + } else + ret = -EINVAL; + return ret; +} + +/* Recursive argument parser */ +static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +{ + int ret = 0; + unsigned long param; + long offset; + char *tmp; + + switch (arg[0]) { + case '$': + ret = parse_probe_vars(arg + 1, ff, is_return); + break; + case '%': /* named register */ + ret = regs_query_register_offset(arg + 1); + if (ret >= 0) { + ff->func = fetch_register; + ff->data = (void *)(unsigned long)ret; + ret = 0; + } + break; + case '@': /* memory or symbol */ + if (isdigit(arg[1])) { + ret = strict_strtoul(arg + 1, 0, ¶m); + if (ret) + break; + ff->func = fetch_memory; + ff->data = (void *)param; + } else { + ret = split_symbol_offset(arg + 1, &offset); + if (ret) + break; + ff->data = alloc_symbol_cache(arg + 1, offset); + if (ff->data) + ff->func = fetch_symbol; + else + ret = -EINVAL; + } + break; + case '+': /* indirect memory */ + case '-': + tmp = strchr(arg, '('); + if (!tmp) { + ret = -EINVAL; + break; + } + *tmp = '\0'; + ret = strict_strtol(arg + 1, 0, &offset); + if (ret) + break; + if (arg[0] == '-') + offset = -offset; + arg = tmp + 1; + tmp = strrchr(arg, ')'); + if (tmp) { + struct indirect_fetch_data *id; + *tmp = '\0'; + id = kzalloc(sizeof(struct indirect_fetch_data), + GFP_KERNEL); + if (!id) + return -ENOMEM; + id->offset = offset; + ret = __parse_probe_arg(arg, &id->orig, is_return); + if (ret) + kfree(id); + else { + ff->func = fetch_indirect; + ff->data = (void *)id; + } + } else + ret = -EINVAL; + break; + default: + /* TODO: support custom handler */ + ret = -EINVAL; + } + return ret; +} + +/* String length checking wrapper */ +static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +{ + if (strlen(arg) > MAX_ARGSTR_LEN) { + pr_info("Argument is too long.: %s\n", arg); + return -ENOSPC; + } + return __parse_probe_arg(arg, ff, is_return); +} + +/* Return 1 if name is reserved or already used by another argument */ +static int conflict_field_name(const char *name, + struct probe_arg *args, int narg) +{ + int i; + for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) + if (strcmp(reserved_field_names[i], name) == 0) + return 1; + for (i = 0; i < narg; i++) + if (strcmp(args[i].name, name) == 0) + return 1; + return 0; +} + +static int create_trace_probe(int argc, char **argv) +{ + /* + * Argument syntax: + * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] + * Fetch args: + * $argN : fetch Nth of function argument. (N:0-) + * $retval : fetch return value + * $stack : fetch stack address + * $stackN : fetch Nth of stack (N:0-) + * @ADDR : fetch memory at ADDR (ADDR should be in kernel) + * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) + * %REG : fetch register REG + * Indirect memory fetch: + * +|-offs(ARG) : fetch memory at ARG +|- offs address. + * Alias name of args: + * NAME=FETCHARG : set NAME as alias of FETCHARG. + */ + struct trace_probe *tp; + int i, ret = 0; + int is_return = 0, is_delete = 0; + char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + unsigned long offset = 0; + void *addr = NULL; + char buf[MAX_EVENT_NAME_LEN]; + + /* argc must be >= 1 */ + if (argv[0][0] == 'p') + is_return = 0; + else if (argv[0][0] == 'r') + is_return = 1; + else if (argv[0][0] == '-') + is_delete = 1; + else { + pr_info("Probe definition must be started with 'p', 'r' or" + " '-'.\n"); + return -EINVAL; + } + + if (argv[0][1] == ':') { + event = &argv[0][2]; + if (strchr(event, '/')) { + group = event; + event = strchr(group, '/') + 1; + event[-1] = '\0'; + if (strlen(group) == 0) { + pr_info("Group name is not specifiled\n"); + return -EINVAL; + } + } + if (strlen(event) == 0) { + pr_info("Event name is not specifiled\n"); + return -EINVAL; + } + } + if (!group) + group = KPROBE_EVENT_SYSTEM; + + if (is_delete) { + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + tp = find_probe_event(event, group); + if (!tp) { + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + unregister_trace_probe(tp); + free_trace_probe(tp); + return 0; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } + if (isdigit(argv[1][0])) { + if (is_return) { + pr_info("Return probe point must be a symbol.\n"); + return -EINVAL; + } + /* an address specified */ + ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + if (ret) { + pr_info("Failed to parse address.\n"); + return ret; + } + } else { + /* a symbol specified */ + symbol = argv[1]; + /* TODO: support .init module functions */ + ret = split_symbol_offset(symbol, &offset); + if (ret) { + pr_info("Failed to parse symbol.\n"); + return ret; + } + if (offset && is_return) { + pr_info("Return probe must be used without offset.\n"); + return -EINVAL; + } + } + argc -= 2; argv += 2; + + /* setup a probe */ + if (!event) { + /* Make a new event name */ + if (symbol) + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", + is_return ? 'r' : 'p', symbol, offset); + else + snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", + is_return ? 'r' : 'p', addr); + event = buf; + } + tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + is_return); + if (IS_ERR(tp)) { + pr_info("Failed to allocate trace_probe.(%d)\n", + (int)PTR_ERR(tp)); + return PTR_ERR(tp); + } + + /* parse arguments */ + ret = 0; + for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Parse argument name */ + arg = strchr(argv[i], '='); + if (arg) + *arg++ = '\0'; + else + arg = argv[i]; + + if (conflict_field_name(argv[i], tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } + + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + if (!tp->args[i].name) { + pr_info("Failed to allocate argument%d name '%s'.\n", + i, argv[i]); + ret = -ENOMEM; + goto error; + } + + /* Parse fetch argument */ + ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + if (ret) { + pr_info("Parse error at argument%d. (%d)\n", i, ret); + kfree(tp->args[i].name); + goto error; + } + + tp->nr_args++; + } + + ret = register_trace_probe(tp); + if (ret) + goto error; + return 0; + +error: + free_trace_probe(tp); + return ret; +} + +static void cleanup_all_probes(void) +{ + struct trace_probe *tp; + + mutex_lock(&probe_lock); + /* TODO: Use batch unregistration */ + while (!list_empty(&probe_list)) { + tp = list_entry(probe_list.next, struct trace_probe, list); + unregister_trace_probe(tp); + free_trace_probe(tp); + } + mutex_unlock(&probe_lock); +} + + +/* Probes listing interfaces */ +static void *probes_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&probe_lock); + return seq_list_start(&probe_list, *pos); +} + +static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &probe_list, pos); +} + +static void probes_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&probe_lock); +} + +static int probes_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + int i, ret; + char buf[MAX_ARGSTR_LEN + 1]; + + seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); + seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); + + if (!tp->symbol) + seq_printf(m, " 0x%p", tp->rp.kp.addr); + else if (tp->rp.kp.offset) + seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); + else + seq_printf(m, " %s", probe_symbol(tp)); + + for (i = 0; i < tp->nr_args; i++) { + ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); + if (ret < 0) { + pr_warning("Argument%d decoding error(%d).\n", i, ret); + return ret; + } + seq_printf(m, " %s=%s", tp->args[i].name, buf); + } + seq_printf(m, "\n"); + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + cleanup_all_probes(); + + return seq_open(file, &probes_seq_op); +} + +static int command_trace_probe(const char *buf) +{ + char **argv; + int argc = 0, ret = 0; + + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = create_trace_probe(argc, argv); + + argv_free(argv); + return ret; +} + +#define WRITE_BUFSIZE 128 + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *kbuf, *tmp; + int ret; + size_t done; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = done = 0; + while (done < count) { + size = count - done; + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - kbuf + 1; + } else if (done + size < count) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + if (tmp) + *tmp = '\0'; + + ret = command_trace_probe(kbuf); + if (ret) + goto out; + } + ret = done; +out: + kfree(kbuf); + return ret; +} + +static const struct file_operations kprobe_events_ops = { + .owner = THIS_MODULE, + .open = probes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = probes_write, +}; + +/* Probes profiling interfaces */ +static int probes_profile_seq_show(struct seq_file *m, void *v) +{ + struct trace_probe *tp = v; + + seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, + tp->rp.kp.nmissed); + + return 0; +} + +static const struct seq_operations profile_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_profile_seq_show +}; + +static int profile_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &profile_seq_op); +} + +static const struct file_operations kprobe_profile_ops = { + .owner = THIS_MODULE, + .open = profile_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* Kprobe handler */ +static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct kprobe_trace_entry *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + tp->nhit++; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + + event = trace_current_buffer_lock_reserve(&buffer, call->id, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + return 0; +} + +/* Kretprobe handler */ +static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct kretprobe_trace_entry *entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + int size, i, pc; + unsigned long irq_flags; + struct ftrace_event_call *call = &tp->call; + + local_save_flags(irq_flags); + pc = preempt_count(); + + size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + + event = trace_current_buffer_lock_reserve(&buffer, call->id, size, + irq_flags, pc); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + + if (!filter_current_check_discard(buffer, call, entry, event)) + trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); + + return 0; +} + +/* Event entry printers */ +enum print_line_t +print_kprobe_event(struct trace_iterator *iter, int flags) +{ + struct kprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; + int i; + + field = (struct kprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +enum print_line_t +print_kretprobe_event(struct trace_iterator *iter, int flags) +{ + struct kretprobe_trace_entry *field; + struct trace_seq *s = &iter->seq; + struct trace_event *event; + struct trace_probe *tp; + int i; + + field = (struct kretprobe_trace_entry *)iter->ent; + event = ftrace_find_event(field->ent.type); + tp = container_of(event, struct trace_probe, event); + + if (!trace_seq_printf(s, "%s: (", tp->call.name)) + goto partial; + + if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, " <- ")) + goto partial; + + if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) + goto partial; + + if (!trace_seq_puts(s, ")")) + goto partial; + + for (i = 0; i < field->nargs; i++) + if (!trace_seq_printf(s, " %s=%lx", + tp->args[i].name, field->args[i])) + goto partial; + + if (!trace_seq_puts(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int probe_event_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_TRACE; + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_event_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_TRACE; + if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} + +static int probe_event_raw_init(struct ftrace_event_call *event_call) +{ + INIT_LIST_HEAD(&event_call->fields); + + return 0; +} + +#undef DEFINE_FIELD +#define DEFINE_FIELD(type, item, name, is_signed) \ + do { \ + ret = trace_define_field(event_call, #type, name, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed, \ + FILTER_OTHER); \ + if (ret) \ + return ret; \ + } while (0) + +static int kprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kprobe_trace_entry field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + return 0; +} + +static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) +{ + int ret, i; + struct kretprobe_trace_entry field; + struct trace_probe *tp = (struct trace_probe *)event_call->data; + + DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); + DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); + /* Set argument names as fields */ + for (i = 0; i < tp->nr_args; i++) + DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + return 0; +} + +static int __probe_event_show_format(struct trace_seq *s, + struct trace_probe *tp, const char *fmt, + const char *arg) +{ + int i; + + /* Show format */ + if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) + return 0; + + if (!trace_seq_printf(s, "\", %s", arg)) + return 0; + + for (i = 0; i < tp->nr_args; i++) + if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) + return 0; + + return trace_seq_puts(s, "\n"); +} + +#undef SHOW_FIELD +#define SHOW_FIELD(type, item, name) \ + do { \ + ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ + "offset:%u;\tsize:%u;\n", name, \ + (unsigned int)offsetof(typeof(field), item),\ + (unsigned int)sizeof(type)); \ + if (!ret) \ + return 0; \ + } while (0) + +static int kprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kprobe_trace_entry field __attribute__((unused)); + int ret, i; + struct trace_probe *tp = (struct trace_probe *)call->data; + + SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); + SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); + trace_seq_puts(s, "\n"); + + return __probe_event_show_format(s, tp, "(%lx)", + "REC->" FIELD_STRING_IP); +} + +static int kretprobe_event_show_format(struct ftrace_event_call *call, + struct trace_seq *s) +{ + struct kretprobe_trace_entry field __attribute__((unused)); + int ret, i; + struct trace_probe *tp = (struct trace_probe *)call->data; + + SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); + SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); + SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + + /* Show fields */ + for (i = 0; i < tp->nr_args; i++) + SHOW_FIELD(unsigned long, args[i], tp->args[i].name); + trace_seq_puts(s, "\n"); + + return __probe_event_show_format(s, tp, "(%lx <- %lx)", + "REC->" FIELD_STRING_FUNC + ", REC->" FIELD_STRING_RETIP); +} + +#ifdef CONFIG_EVENT_PROFILE + +/* Kprobe profile handler */ +static __kprobes int kprobe_profile_func(struct kprobe *kp, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct ftrace_event_call *call = &tp->call; + struct kprobe_trace_entry *entry; + struct trace_entry *ent; + int size, __size, i, pc, __cpu; + unsigned long irq_flags; + char *trace_buf; + char *raw_data; + int rctx; + + pc = preempt_count(); + __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return 0; + + /* + * Protect the non nmi buffer + * This also protects the rcu read side + */ + local_irq_save(irq_flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + + __cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto end; + + raw_data = per_cpu_ptr(trace_buf, __cpu); + + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->ip = (unsigned long)kp->addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + perf_tp_event(call->id, entry->ip, 1, entry, size); + +end: + perf_swevent_put_recursion_context(rctx); +end_recursion: + local_irq_restore(irq_flags); + + return 0; +} + +/* Kretprobe profile handler */ +static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct ftrace_event_call *call = &tp->call; + struct kretprobe_trace_entry *entry; + struct trace_entry *ent; + int size, __size, i, pc, __cpu; + unsigned long irq_flags; + char *trace_buf; + char *raw_data; + int rctx; + + pc = preempt_count(); + __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + "profile buffer not large enough")) + return 0; + + /* + * Protect the non nmi buffer + * This also protects the rcu read side + */ + local_irq_save(irq_flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + + __cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto end; + + raw_data = per_cpu_ptr(trace_buf, __cpu); + + /* Zero dead bytes from alignment to avoid buffer leak to userspace */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + entry = (struct kretprobe_trace_entry *)raw_data; + ent = &entry->ent; + + tracing_generic_entry_update(ent, irq_flags, pc); + ent->type = call->id; + entry->nargs = tp->nr_args; + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + for (i = 0; i < tp->nr_args; i++) + entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + perf_tp_event(call->id, entry->ret_ip, 1, entry, size); + +end: + perf_swevent_put_recursion_context(rctx); +end_recursion: + local_irq_restore(irq_flags); + + return 0; +} + +static int probe_profile_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_PROFILE; + + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_profile_disable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_PROFILE; + + if (!(tp->flags & TP_FLAG_TRACE)) { + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} +#endif /* CONFIG_EVENT_PROFILE */ + + +static __kprobes +int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + + if (tp->flags & TP_FLAG_TRACE) + kprobe_trace_func(kp, regs); +#ifdef CONFIG_EVENT_PROFILE + if (tp->flags & TP_FLAG_PROFILE) + kprobe_profile_func(kp, regs); +#endif /* CONFIG_EVENT_PROFILE */ + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static __kprobes +int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + + if (tp->flags & TP_FLAG_TRACE) + kretprobe_trace_func(ri, regs); +#ifdef CONFIG_EVENT_PROFILE + if (tp->flags & TP_FLAG_PROFILE) + kretprobe_profile_func(ri, regs); +#endif /* CONFIG_EVENT_PROFILE */ + return 0; /* We don't tweek kernel, so just return 0 */ +} + +static int register_probe_event(struct trace_probe *tp) +{ + struct ftrace_event_call *call = &tp->call; + int ret; + + /* Initialize ftrace_event_call */ + if (probe_is_return(tp)) { + tp->event.trace = print_kretprobe_event; + call->raw_init = probe_event_raw_init; + call->show_format = kretprobe_event_show_format; + call->define_fields = kretprobe_event_define_fields; + } else { + tp->event.trace = print_kprobe_event; + call->raw_init = probe_event_raw_init; + call->show_format = kprobe_event_show_format; + call->define_fields = kprobe_event_define_fields; + } + call->event = &tp->event; + call->id = register_ftrace_event(&tp->event); + if (!call->id) + return -ENODEV; + call->enabled = 0; + call->regfunc = probe_event_enable; + call->unregfunc = probe_event_disable; + +#ifdef CONFIG_EVENT_PROFILE + call->profile_enable = probe_profile_enable; + call->profile_disable = probe_profile_disable; +#endif + call->data = tp; + ret = trace_add_event_call(call); + if (ret) { + pr_info("Failed to register kprobe event: %s\n", call->name); + unregister_ftrace_event(&tp->event); + } + return ret; +} + +static void unregister_probe_event(struct trace_probe *tp) +{ + /* tp->event is unregistered in trace_remove_event_call() */ + trace_remove_event_call(&tp->call); +} + +/* Make a debugfs interface for controling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + NULL, &kprobe_events_ops); + + /* Event list interface */ + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_events' entry\n"); + + /* Profile interface */ + entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + NULL, &kprobe_profile_ops); + + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_profile' entry\n"); + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} + +static __init int kprobe_trace_self_tests_init(void) +{ + int ret; + int (*target)(int, int, int, int, int, int); + + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " + "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function entry\n"); + + ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " + "$retval"); + if (WARN_ON_ONCE(ret)) + pr_warning("error enabling function return\n"); + + ret = target(1, 2, 3, 4, 5, 6); + + cleanup_all_probes(); + + pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c new file mode 100644 index 00000000000..faf37fa4408 --- /dev/null +++ b/kernel/trace/trace_ksym.c @@ -0,0 +1,545 @@ +/* + * trace_ksym.c - Kernel Symbol Tracer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include <linux/kallsyms.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/fs.h> + +#include "trace_output.h" +#include "trace_stat.h" +#include "trace.h" + +#include <linux/hw_breakpoint.h> +#include <asm/hw_breakpoint.h> + +/* + * For now, let us restrict the no. of symbols traced simultaneously to number + * of available hardware breakpoint registers. + */ +#define KSYM_TRACER_MAX HBP_NUM + +#define KSYM_TRACER_OP_LEN 3 /* rw- */ + +struct trace_ksym { + struct perf_event **ksym_hbp; + struct perf_event_attr attr; +#ifdef CONFIG_PROFILE_KSYM_TRACER + unsigned long counter; +#endif + struct hlist_node ksym_hlist; +}; + +static struct trace_array *ksym_trace_array; + +static unsigned int ksym_filter_entry_count; +static unsigned int ksym_tracing_enabled; + +static HLIST_HEAD(ksym_filter_head); + +static DEFINE_MUTEX(ksym_tracer_mutex); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + +#define MAX_UL_INT 0xffffffff + +void ksym_collect_stats(unsigned long hbp_hit_addr) +{ + struct hlist_node *node; + struct trace_ksym *entry; + + rcu_read_lock(); + hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { + if ((entry->attr.bp_addr == hbp_hit_addr) && + (entry->counter <= MAX_UL_INT)) { + entry->counter++; + break; + } + } + rcu_read_unlock(); +} +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + +void ksym_hbp_handler(struct perf_event *hbp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct ring_buffer_event *event; + struct ksym_trace_entry *entry; + struct ring_buffer *buffer; + int pc; + + if (!ksym_tracing_enabled) + return; + + buffer = ksym_trace_array->buffer; + + pc = preempt_count(); + + event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, + sizeof(*entry), 0, pc); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->ip = instruction_pointer(regs); + entry->type = hw_breakpoint_type(hbp); + entry->addr = hw_breakpoint_addr(hbp); + strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); + +#ifdef CONFIG_PROFILE_KSYM_TRACER + ksym_collect_stats(hw_breakpoint_addr(hbp)); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ + + trace_buffer_unlock_commit(buffer, event, 0, pc); +} + +/* Valid access types are represented as + * + * rw- : Set Read/Write Access Breakpoint + * -w- : Set Write Access Breakpoint + * --- : Clear Breakpoints + * --x : Set Execution Break points (Not available yet) + * + */ +static int ksym_trace_get_access_type(char *str) +{ + int access = 0; + + if (str[0] == 'r') + access |= HW_BREAKPOINT_R; + + if (str[1] == 'w') + access |= HW_BREAKPOINT_W; + + if (str[2] == 'x') + access |= HW_BREAKPOINT_X; + + switch (access) { + case HW_BREAKPOINT_R: + case HW_BREAKPOINT_W: + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + return access; + default: + return -EINVAL; + } +} + +/* + * There can be several possible malformed requests and we attempt to capture + * all of them. We enumerate some of the rules + * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. + * i.e. multiple ':' symbols disallowed. Possible uses are of the form + * <module>:<ksym_name>:<op>. + * 2. No delimiter symbol ':' in the input string + * 3. Spurious operator symbols or symbols not in their respective positions + * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file + * 5. Kernel symbol not a part of /proc/kallsyms + * 6. Duplicate requests + */ +static int parse_ksym_trace_str(char *input_string, char **ksymname, + unsigned long *addr) +{ + int ret; + + *ksymname = strsep(&input_string, ":"); + *addr = kallsyms_lookup_name(*ksymname); + + /* Check for malformed request: (2), (1) and (5) */ + if ((!input_string) || + (strlen(input_string) != KSYM_TRACER_OP_LEN) || + (*addr == 0)) + return -EINVAL;; + + ret = ksym_trace_get_access_type(input_string); + + return ret; +} + +int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) +{ + struct trace_ksym *entry; + int ret = -ENOMEM; + + if (ksym_filter_entry_count >= KSYM_TRACER_MAX) { + printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No" + " new requests for tracing can be accepted now.\n", + KSYM_TRACER_MAX); + return -ENOSPC; + } + + entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + hw_breakpoint_init(&entry->attr); + + entry->attr.bp_type = op; + entry->attr.bp_addr = addr; + entry->attr.bp_len = HW_BREAKPOINT_LEN_4; + + ret = -EAGAIN; + entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, + ksym_hbp_handler); + + if (IS_ERR(entry->ksym_hbp)) { + ret = PTR_ERR(entry->ksym_hbp); + printk(KERN_INFO "ksym_tracer request failed. Try again" + " later!!\n"); + goto err; + } + + hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); + ksym_filter_entry_count++; + + return 0; + +err: + kfree(entry); + + return ret; +} + +static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + struct trace_seq *s; + ssize_t cnt = 0; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + trace_seq_init(s); + + mutex_lock(&ksym_tracer_mutex); + + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + ret = trace_seq_printf(s, "%pS:", + (void *)(unsigned long)entry->attr.bp_addr); + if (entry->attr.bp_type == HW_BREAKPOINT_R) + ret = trace_seq_puts(s, "r--\n"); + else if (entry->attr.bp_type == HW_BREAKPOINT_W) + ret = trace_seq_puts(s, "-w-\n"); + else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) + ret = trace_seq_puts(s, "rw-\n"); + WARN_ON_ONCE(!ret); + } + + cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + mutex_unlock(&ksym_tracer_mutex); + + kfree(s); + + return cnt; +} + +static void __ksym_trace_reset(void) +{ + struct trace_ksym *entry; + struct hlist_node *node, *node1; + + mutex_lock(&ksym_tracer_mutex); + hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, + ksym_hlist) { + unregister_wide_hw_breakpoint(entry->ksym_hbp); + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry); + } + mutex_unlock(&ksym_tracer_mutex); +} + +static ssize_t ksym_trace_filter_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct trace_ksym *entry; + struct hlist_node *node; + char *buf, *input_string, *ksymname = NULL; + unsigned long ksym_addr = 0; + int ret, op, changed = 0; + + buf = kzalloc(count + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(buf, buffer, count)) + goto out; + + buf[count] = '\0'; + input_string = strstrip(buf); + + /* + * Clear all breakpoints if: + * 1: echo > ksym_trace_filter + * 2: echo 0 > ksym_trace_filter + * 3: echo "*:---" > ksym_trace_filter + */ + if (!buf[0] || !strcmp(buf, "0") || + !strcmp(buf, "*:---")) { + __ksym_trace_reset(); + ret = 0; + goto out; + } + + ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); + if (ret < 0) + goto out; + + mutex_lock(&ksym_tracer_mutex); + + ret = -EINVAL; + hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { + if (entry->attr.bp_addr == ksym_addr) { + /* Check for malformed request: (6) */ + if (entry->attr.bp_type != op) + changed = 1; + else + goto out_unlock; + break; + } + } + if (changed) { + unregister_wide_hw_breakpoint(entry->ksym_hbp); + entry->attr.bp_type = op; + ret = 0; + if (op > 0) { + entry->ksym_hbp = + register_wide_hw_breakpoint(&entry->attr, + ksym_hbp_handler); + if (IS_ERR(entry->ksym_hbp)) + ret = PTR_ERR(entry->ksym_hbp); + else + goto out_unlock; + } + /* Error or "symbol:---" case: drop it */ + ksym_filter_entry_count--; + hlist_del_rcu(&(entry->ksym_hlist)); + synchronize_rcu(); + kfree(entry); + goto out_unlock; + } else { + /* Check for malformed request: (4) */ + if (op) + ret = process_new_ksym_entry(ksymname, op, ksym_addr); + } +out_unlock: + mutex_unlock(&ksym_tracer_mutex); +out: + kfree(buf); + return !ret ? count : ret; +} + +static const struct file_operations ksym_tracing_fops = { + .open = tracing_open_generic, + .read = ksym_trace_filter_read, + .write = ksym_trace_filter_write, +}; + +static void ksym_trace_reset(struct trace_array *tr) +{ + ksym_tracing_enabled = 0; + __ksym_trace_reset(); +} + +static int ksym_trace_init(struct trace_array *tr) +{ + int cpu, ret = 0; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + ksym_tracing_enabled = 1; + ksym_trace_array = tr; + + return ret; +} + +static void ksym_trace_print_header(struct seq_file *m) +{ + seq_puts(m, + "# TASK-PID CPU# Symbol " + "Type Function\n"); + seq_puts(m, + "# | | | " + " | |\n"); +} + +static enum print_line_t ksym_trace_output(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct ksym_trace_entry *field; + char str[KSYM_SYMBOL_LEN]; + int ret; + + if (entry->type != TRACE_KSYM) + return TRACE_TYPE_UNHANDLED; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, + entry->pid, iter->cpu, (char *)field->addr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + switch (field->type) { + case HW_BREAKPOINT_R: + ret = trace_seq_printf(s, " R "); + break; + case HW_BREAKPOINT_W: + ret = trace_seq_printf(s, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + ret = trace_seq_printf(s, " RW "); + break; + default: + return TRACE_TYPE_PARTIAL_LINE; + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + sprint_symbol(str, field->ip); + ret = trace_seq_printf(s, "%s\n", str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +struct tracer ksym_tracer __read_mostly = +{ + .name = "ksym_tracer", + .init = ksym_trace_init, + .reset = ksym_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_ksym, +#endif + .print_header = ksym_trace_print_header, + .print_line = ksym_trace_output +}; + +__init static int init_ksym_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + ksym_filter_entry_count = 0; + + entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer, + NULL, &ksym_tracing_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ksym_trace_filter' file\n"); + + return register_tracer(&ksym_tracer); +} +device_initcall(init_ksym_trace); + + +#ifdef CONFIG_PROFILE_KSYM_TRACER +static int ksym_tracer_stat_headers(struct seq_file *m) +{ + seq_puts(m, " Access Type "); + seq_puts(m, " Symbol Counter\n"); + seq_puts(m, " ----------- "); + seq_puts(m, " ------ -------\n"); + return 0; +} + +static int ksym_tracer_stat_show(struct seq_file *m, void *v) +{ + struct hlist_node *stat = v; + struct trace_ksym *entry; + int access_type = 0; + char fn_name[KSYM_NAME_LEN]; + + entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); + + access_type = entry->attr.bp_type; + + switch (access_type) { + case HW_BREAKPOINT_R: + seq_puts(m, " R "); + break; + case HW_BREAKPOINT_W: + seq_puts(m, " W "); + break; + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: + seq_puts(m, " RW "); + break; + default: + seq_puts(m, " NA "); + } + + if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) + seq_printf(m, " %-36s", fn_name); + else + seq_printf(m, " %-36s", "<NA>"); + seq_printf(m, " %15lu\n", entry->counter); + + return 0; +} + +static void *ksym_tracer_stat_start(struct tracer_stat *trace) +{ + return ksym_filter_head.first; +} + +static void * +ksym_tracer_stat_next(void *v, int idx) +{ + struct hlist_node *stat = v; + + return stat->next; +} + +static struct tracer_stat ksym_tracer_stats = { + .name = "ksym_tracer", + .stat_start = ksym_tracer_stat_start, + .stat_next = ksym_tracer_stat_next, + .stat_headers = ksym_tracer_stat_headers, + .stat_show = ksym_tracer_stat_show +}; + +__init static int ksym_tracer_stat_init(void) +{ + int ret; + + ret = register_stat_tracer(&ksym_tracer_stats); + if (ret) { + printk(KERN_WARNING "Warning: could not register " + "ksym tracer stats\n"); + return 1; + } + + return 0; +} +fs_initcall(ksym_tracer_stat_init); +#endif /* CONFIG_PROFILE_KSYM_TRACER */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed17565826b..8e46b3323cd 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -void trace_print_seq(struct seq_file *m, struct trace_seq *s) +int trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + int ret; + + ret = seq_write(m, s->buffer, len); - seq_write(m, s->buffer, len); + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. + */ + if (!ret) + trace_seq_init(s); - trace_seq_init(s); + return ret; } enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) @@ -69,6 +77,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) * @s: trace sequence descriptor * @fmt: printf format string * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formating of a trace * trace_seq_printf is used to store strings into a special @@ -82,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_list ap; int ret; - if (!len) + if (s->full || !len) return 0; va_start(ap, fmt); @@ -90,12 +101,14 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) va_end(ap); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; - return len; + return 1; } EXPORT_SYMBOL_GPL(trace_seq_printf); @@ -116,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) int len = (PAGE_SIZE - 1) - s->len; int ret; - if (!len) + if (s->full || !len) return 0; ret = vsnprintf(s->buffer + s->len, len, fmt, args); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -136,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) int len = (PAGE_SIZE - 1) - s->len; int ret; - if (!len) + if (s->full || !len) return 0; ret = bstr_printf(s->buffer + s->len, len, fmt, binary); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) + if (ret >= len) { + s->full = 1; return 0; + } s->len += ret; @@ -164,9 +181,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str) { int len = strlen(str); - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) return 0; + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return 0; + } + memcpy(s->buffer + s->len, str, len); s->len += len; @@ -175,8 +197,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str) int trace_seq_putc(struct trace_seq *s, unsigned char c) { - if (s->len >= (PAGE_SIZE - 1)) + if (s->full) + return 0; + + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; return 0; + } s->buffer[s->len++] = c; @@ -185,8 +212,13 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c) int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) { - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) + return 0; + + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; return 0; + } memcpy(s->buffer + s->len, mem, len); s->len += len; @@ -200,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) const unsigned char *data = mem; int i, j; + if (s->full) + return 0; + #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < len; i++) { #else @@ -217,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) { void *ret; - if (len > ((PAGE_SIZE - 1) - s->len)) + if (s->full) + return 0; + + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; return NULL; + } ret = s->buffer + s->len; s->len += len; @@ -230,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path) { unsigned char *p; - if (s->len >= (PAGE_SIZE - 1)) + if (s->full) + return 0; + + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; return 0; + } + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); if (!IS_ERR(p)) { p = mangle_path(s->buffer + s->len, p, "\n"); @@ -244,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path) return 1; } + s->full = 1; return 0; } @@ -370,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, unsigned long vmstart = 0; int ret = 1; + if (s->full) + return 0; + if (mm) { const struct vm_area_struct *vma; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 26185d72767..0271742abb8 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -28,8 +28,8 @@ static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; -static raw_spinlock_t wakeup_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t wakeup_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static void __wakeup_reset(struct trace_array *tr); @@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, goto out; local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); /* We could race with grabbing wakeup_lock */ if (unlikely(!tracer_enabled || next != wakeup_task)) @@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, out_unlock: __wakeup_reset(wakeup_trace); - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: atomic_dec(&wakeup_trace->data[cpu]->disabled); @@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr) tracing_reset_online_cpus(tr); local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); __wakeup_reset(tr); - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); } @@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) goto out; /* interrupts should be off from try_to_wake_up */ - __raw_spin_lock(&wakeup_lock); + arch_spin_lock(&wakeup_lock); /* check for races. */ if (!tracer_enabled || p->prio >= wakeup_prio) @@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: - __raw_spin_unlock(&wakeup_lock); + arch_spin_unlock(&wakeup_lock); out: atomic_dec(&wakeup_trace->data[cpu]->disabled); } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index d2cdbabb4ea..280fea470d6 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -17,6 +17,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: case TRACE_HW_BRANCHES: + case TRACE_KSYM: return 1; } return 0; @@ -66,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) /* Don't allow flipping of max traces now */ local_irq_save(flags); - __raw_spin_lock(&ftrace_max_lock); + arch_spin_lock(&ftrace_max_lock); cnt = ring_buffer_entries(tr->buffer); @@ -84,7 +85,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) break; } tracing_on(); - __raw_spin_unlock(&ftrace_max_lock); + arch_spin_unlock(&ftrace_max_lock); local_irq_restore(flags); if (count) @@ -808,3 +809,57 @@ trace_selftest_startup_hw_branches(struct tracer *trace, return ret; } #endif /* CONFIG_HW_BRANCH_TRACER */ + +#ifdef CONFIG_KSYM_TRACER +static int ksym_selftest_dummy; + +int +trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + ksym_selftest_dummy = 0; + /* Register the read-write tracing request */ + + ret = process_new_ksym_entry("ksym_selftest_dummy", + HW_BREAKPOINT_R | HW_BREAKPOINT_W, + (unsigned long)(&ksym_selftest_dummy)); + + if (ret < 0) { + printk(KERN_CONT "ksym_trace read-write startup test failed\n"); + goto ret_path; + } + /* Perform a read and a write operation over the dummy variable to + * trigger the tracer + */ + if (ksym_selftest_dummy == 0) + ksym_selftest_dummy++; + + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + /* read & write operations - one each is performed on the dummy variable + * triggering two entries in the trace buffer + */ + if (!ret && count != 2) { + printk(KERN_CONT "Ksym tracer startup test failed"); + ret = -1; + } + +ret_path: + return ret; +} +#endif /* CONFIG_KSYM_TRACER */ + diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8504ac71e4e..678a5120ee3 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = { }; static unsigned long max_stack_size; -static raw_spinlock_t max_stack_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t max_stack_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; static int stack_trace_disabled __read_mostly; static DEFINE_PER_CPU(int, trace_active); @@ -54,7 +54,7 @@ static inline void check_stack(void) return; local_irq_save(flags); - __raw_spin_lock(&max_stack_lock); + arch_spin_lock(&max_stack_lock); /* a race could have already updated it */ if (this_size <= max_stack_size) @@ -103,7 +103,7 @@ static inline void check_stack(void) } out: - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); } @@ -171,9 +171,9 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); - __raw_spin_lock(&max_stack_lock); + arch_spin_lock(&max_stack_lock); *ptr = val; - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); return count; @@ -207,7 +207,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { local_irq_disable(); - __raw_spin_lock(&max_stack_lock); + arch_spin_lock(&max_stack_lock); if (*pos == 0) return SEQ_START_TOKEN; @@ -217,7 +217,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { - __raw_spin_unlock(&max_stack_lock); + arch_spin_unlock(&max_stack_lock); local_irq_enable(); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 527e17eae57..75289f372dd 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -14,6 +14,43 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +extern unsigned long __start_syscalls_metadata[]; +extern unsigned long __stop_syscalls_metadata[]; + +static struct syscall_metadata **syscalls_metadata; + +static struct syscall_metadata *find_syscall_meta(unsigned long syscall) +{ + struct syscall_metadata *start; + struct syscall_metadata *stop; + char str[KSYM_SYMBOL_LEN]; + + + start = (struct syscall_metadata *)__start_syscalls_metadata; + stop = (struct syscall_metadata *)__stop_syscalls_metadata; + kallsyms_lookup(syscall, NULL, NULL, NULL, str); + + for ( ; start < stop; start++) { + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + if (start->name && !strcmp(start->name + 3, str + 3)) + return start; + } + return NULL; +} + +static struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) { @@ -30,7 +67,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags) if (!entry) goto end; - if (entry->enter_id != ent->type) { + if (entry->enter_event->id != ent->type) { WARN_ON_ONCE(1); goto end; } @@ -85,7 +122,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } - if (entry->exit_id != ent->type) { + if (entry->exit_event->id != ent->type) { WARN_ON_ONCE(1); return TRACE_TYPE_UNHANDLED; } @@ -103,24 +140,19 @@ extern char *__bad_type_size(void); #define SYSCALL_FIELD(type, name) \ sizeof(type) != sizeof(trace.name) ? \ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) + #type, #name, offsetof(typeof(trace), name), \ + sizeof(trace.name), is_signed_type(type) int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) { int i; - int nr; int ret; - struct syscall_metadata *entry; + struct syscall_metadata *entry = call->data; struct syscall_trace_enter trace; int offset = offsetof(struct syscall_trace_enter, args); - nr = syscall_name_to_nr(call->data); - entry = syscall_nr_to_meta(nr); - - if (!entry) - return 0; - - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr)); if (!ret) return 0; @@ -130,8 +162,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) entry->args[i]); if (!ret) return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, - sizeof(unsigned long)); + ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" + "\tsigned:%u;\n", offset, + sizeof(unsigned long), + is_signed_type(unsigned long)); if (!ret) return 0; offset += sizeof(unsigned long); @@ -163,8 +197,10 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) struct syscall_trace_exit trace; ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" + "\tsigned:%u;\n", SYSCALL_FIELD(int, nr), SYSCALL_FIELD(long, ret)); if (!ret) @@ -176,19 +212,12 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) int syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; - struct syscall_metadata *meta; + struct syscall_metadata *meta = call->data; int ret; - int nr; int i; int offset = offsetof(typeof(trace), args); - nr = syscall_name_to_nr(call->data); - meta = syscall_nr_to_meta(nr); - - if (!meta) - return 0; - - ret = trace_define_common_fields(call); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); if (ret) return ret; @@ -208,11 +237,11 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) struct syscall_trace_exit trace; int ret; - ret = trace_define_common_fields(call); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), FILTER_OTHER); return ret; @@ -239,8 +268,8 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id, - size, 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, + sys_data->enter_event->id, size, 0, 0); if (!event) return; @@ -271,8 +300,8 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) if (!sys_data) return; - event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id, - sizeof(*entry), 0, 0); + event = trace_current_buffer_lock_reserve(&buffer, + sys_data->exit_event->id, sizeof(*entry), 0, 0); if (!event) return; @@ -285,23 +314,18 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -int reg_event_syscall_enter(void *ptr) +int reg_event_syscall_enter(struct ftrace_event_call *call) { int ret = 0; int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) ret = register_trace_sys_enter(ftrace_syscall_enter); - if (ret) { - pr_info("event trace: Could not activate" - "syscall entry trace point"); - } else { + if (!ret) { set_bit(num, enabled_enter_syscalls); sys_refcount_enter++; } @@ -309,13 +333,11 @@ int reg_event_syscall_enter(void *ptr) return ret; } -void unreg_event_syscall_enter(void *ptr) +void unreg_event_syscall_enter(struct ftrace_event_call *call) { int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -326,23 +348,18 @@ void unreg_event_syscall_enter(void *ptr) mutex_unlock(&syscall_trace_lock); } -int reg_event_syscall_exit(void *ptr) +int reg_event_syscall_exit(struct ftrace_event_call *call) { int ret = 0; int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) ret = register_trace_sys_exit(ftrace_syscall_exit); - if (ret) { - pr_info("event trace: Could not activate" - "syscall exit trace point"); - } else { + if (!ret) { set_bit(num, enabled_exit_syscalls); sys_refcount_exit++; } @@ -350,13 +367,11 @@ int reg_event_syscall_exit(void *ptr) return ret; } -void unreg_event_syscall_exit(void *ptr) +void unreg_event_syscall_exit(struct ftrace_event_call *call) { int num; - char *name; - name = (char *)ptr; - num = syscall_name_to_nr(name); + num = ((struct syscall_metadata *)call->data)->syscall_nr; if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -367,13 +382,44 @@ void unreg_event_syscall_exit(void *ptr) mutex_unlock(&syscall_trace_lock); } -struct trace_event event_syscall_enter = { - .trace = print_syscall_enter, -}; +int init_syscall_trace(struct ftrace_event_call *call) +{ + int id; + + id = register_ftrace_event(call->event); + if (!id) + return -ENODEV; + call->id = id; + INIT_LIST_HEAD(&call->fields); + return 0; +} + +int __init init_ftrace_syscalls(void) +{ + struct syscall_metadata *meta; + unsigned long addr; + int i; + + syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * + NR_syscalls, GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return -ENOMEM; + } + + for (i = 0; i < NR_syscalls; i++) { + addr = arch_syscall_addr(i); + meta = find_syscall_meta(addr); + if (!meta) + continue; + + meta->syscall_nr = i; + syscalls_metadata[i] = meta; + } -struct trace_event event_syscall_exit = { - .trace = print_syscall_exit, -}; + return 0; +} +core_initcall(init_ftrace_syscalls); #ifdef CONFIG_EVENT_PROFILE @@ -387,8 +433,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; unsigned long flags; + char *trace_buf; char *raw_data; int syscall_nr; + int rctx; int size; int cpu; @@ -412,41 +460,42 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + cpu = smp_processor_id(); - if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); - else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; rec = (struct syscall_trace_enter *) raw_data; tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_id; + rec->ent.type = sys_data->enter_event->id; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tp_event(sys_data->enter_id, 0, 1, rec, size); + perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(flags); } -int reg_prof_syscall_enter(char *name) +int prof_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return -ENOSYS; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_enter) @@ -462,13 +511,11 @@ int reg_prof_syscall_enter(char *name) return ret; } -void unreg_prof_syscall_enter(char *name) +void prof_sysenter_disable(struct ftrace_event_call *call) { int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); sys_prof_refcount_enter--; @@ -484,7 +531,9 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) struct syscall_trace_exit *rec; unsigned long flags; int syscall_nr; + char *trace_buf; char *raw_data; + int rctx; int size; int cpu; @@ -510,17 +559,19 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) /* Protect the per cpu buffer, begin the rcu read side */ local_irq_save(flags); + + rctx = perf_swevent_get_recursion_context(); + if (rctx < 0) + goto end_recursion; + cpu = smp_processor_id(); - if (in_nmi()) - raw_data = rcu_dereference(trace_profile_buf_nmi); - else - raw_data = rcu_dereference(trace_profile_buf); + trace_buf = rcu_dereference(perf_trace_buf); - if (!raw_data) + if (!trace_buf) goto end; - raw_data = per_cpu_ptr(raw_data, cpu); + raw_data = per_cpu_ptr(trace_buf, cpu); /* zero the dead bytes from align to not leak stack to user */ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; @@ -528,24 +579,24 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec = (struct syscall_trace_exit *)raw_data; tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->exit_id; + rec->ent.type = sys_data->exit_event->id; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_tp_event(sys_data->exit_id, 0, 1, rec, size); + perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); end: + perf_swevent_put_recursion_context(rctx); +end_recursion: local_irq_restore(flags); } -int reg_prof_syscall_exit(char *name) +int prof_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return -ENOSYS; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_exit) @@ -561,13 +612,11 @@ int reg_prof_syscall_exit(char *name) return ret; } -void unreg_prof_syscall_exit(char *name) +void prof_sysexit_disable(struct ftrace_event_call *call) { int num; - num = syscall_name_to_nr(name); - if (num < 0 || num >= NR_syscalls) - return; + num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); sys_prof_refcount_exit--; diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c new file mode 100644 index 00000000000..eb27fd3430a --- /dev/null +++ b/kernel/user-return-notifier.c @@ -0,0 +1,44 @@ + +#include <linux/user-return-notifier.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/module.h> + +static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); + +/* + * Request a notification when the current cpu returns to userspace. Must be + * called in atomic context. The notifier will also be called in atomic + * context. + */ +void user_return_notifier_register(struct user_return_notifier *urn) +{ + set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); + hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); +} +EXPORT_SYMBOL_GPL(user_return_notifier_register); + +/* + * Removes a registered user return notifier. Must be called from atomic + * context, and from the same cpu registration occured in. + */ +void user_return_notifier_unregister(struct user_return_notifier *urn) +{ + hlist_del(&urn->link); + if (hlist_empty(&__get_cpu_var(return_notifier_list))) + clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); +} +EXPORT_SYMBOL_GPL(user_return_notifier_unregister); + +/* Calls registered user return notifiers */ +void fire_user_return_notifiers(void) +{ + struct user_return_notifier *urn; + struct hlist_node *tmp1, *tmp2; + struct hlist_head *head; + + head = &get_cpu_var(return_notifier_list); + hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) + urn->on_user_return(urn); + put_cpu_var(return_notifier_list); +} diff --git a/kernel/user.c b/kernel/user.c index 2c000e7132a..46d0165ca70 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -330,9 +330,9 @@ done: */ static void free_user(struct user_struct *up, unsigned long flags) { - spin_unlock_irqrestore(&uidhash_lock, flags); INIT_DELAYED_WORK(&up->work, cleanup_user_struct); schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); + spin_unlock_irqrestore(&uidhash_lock, flags); } #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 69eae358a72..a2cd77e70d4 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -57,78 +57,47 @@ static int proc_do_uts_string(ctl_table *table, int write, #define proc_do_uts_string NULL #endif - -#ifdef CONFIG_SYSCTL_SYSCALL -/* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ctl_table uts_table; - int r, write; - write = newval && newlen; - memcpy(&uts_table, table, sizeof(uts_table)); - uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen); - put_uts(table, write, uts_table.data); - return r; -} -#else -#define sysctl_uts_string NULL -#endif - static struct ctl_table uts_kern_table[] = { { - .ctl_name = KERN_OSTYPE, .procname = "ostype", .data = init_uts_ns.name.sysname, .maxlen = sizeof(init_uts_ns.name.sysname), .mode = 0444, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_OSRELEASE, .procname = "osrelease", .data = init_uts_ns.name.release, .maxlen = sizeof(init_uts_ns.name.release), .mode = 0444, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_VERSION, .procname = "version", .data = init_uts_ns.name.version, .maxlen = sizeof(init_uts_ns.name.version), .mode = 0444, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_NODENAME, .procname = "hostname", .data = init_uts_ns.name.nodename, .maxlen = sizeof(init_uts_ns.name.nodename), .mode = 0644, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, { - .ctl_name = KERN_DOMAINNAME, .procname = "domainname", .data = init_uts_ns.name.domainname, .maxlen = sizeof(init_uts_ns.name.domainname), .mode = 0644, .proc_handler = proc_do_uts_string, - .strategy = sysctl_uts_string, }, {} }; static struct ctl_table uts_root_table[] = { { - .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, .child = uts_kern_table, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 47cdd7e76f2..dee48658805 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,116 @@ struct workqueue_struct { #endif }; +#ifdef CONFIG_DEBUG_OBJECTS_WORK + +static struct debug_obj_descr work_debug_descr; + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int work_fixup_init(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_init(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int work_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The work struct was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { + debug_object_init(work, &work_debug_descr); + debug_object_activate(work, &work_debug_descr); + return 0; + } + WARN_ON_ONCE(1); + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int work_fixup_free(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_free(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr work_debug_descr = { + .name = "work_struct", + .fixup_init = work_fixup_init, + .fixup_activate = work_fixup_activate, + .fixup_free = work_fixup_free, +}; + +static inline void debug_work_activate(struct work_struct *work) +{ + debug_object_activate(work, &work_debug_descr); +} + +static inline void debug_work_deactivate(struct work_struct *work) +{ + debug_object_deactivate(work, &work_debug_descr); +} + +void __init_work(struct work_struct *work, int onstack) +{ + if (onstack) + debug_object_init_on_stack(work, &work_debug_descr); + else + debug_object_init(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(__init_work); + +void destroy_work_on_stack(struct work_struct *work) +{ + debug_object_free(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_work_on_stack); + +#else +static inline void debug_work_activate(struct work_struct *work) { } +static inline void debug_work_deactivate(struct work_struct *work) { } +#endif + /* Serializes the accesses to the list of workqueues. */ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); @@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, { unsigned long flags; + debug_work_activate(work); spin_lock_irqsave(&cwq->lock, flags); insert_work(cwq, work, &cwq->worklist); spin_unlock_irqrestore(&cwq->lock, flags); @@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) struct lockdep_map lockdep_map = work->lockdep_map; #endif trace_workqueue_execution(cwq->thread, work); + debug_work_deactivate(work); cwq->current_work = work; list_del_init(cwq->worklist.next); spin_unlock_irq(&cwq->lock); @@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work) static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, struct wq_barrier *barr, struct list_head *head) { - INIT_WORK(&barr->work, wq_barrier_func); + /* + * debugobject calls are safe here even with cwq->lock locked + * as we know for sure that this will not trigger any of the + * checks and call back into the fixup functions where we + * might deadlock. + */ + INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); init_completion(&barr->done); + debug_work_activate(&barr->work); insert_work(cwq, &barr->work, head); } @@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } spin_unlock_irq(&cwq->lock); - if (active) + if (active) { wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } return active; } @@ -451,6 +572,7 @@ out: return 0; wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); return 1; } EXPORT_SYMBOL_GPL(flush_work); @@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work) */ smp_rmb(); if (cwq == get_wq_data(work)) { + debug_work_deactivate(work); list_del_init(&work->entry); ret = 1; } @@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, } spin_unlock_irq(&cwq->lock); - if (unlikely(running)) + if (unlikely(running)) { wait_for_completion(&barr.done); + destroy_work_on_stack(&barr.work); + } } static void wait_on_work(struct work_struct *work) @@ -685,6 +810,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; + int orig = -1; struct work_struct *works; works = alloc_percpu(struct work_struct); @@ -692,14 +818,28 @@ int schedule_on_each_cpu(work_func_t func) return -ENOMEM; get_online_cpus(); + + /* + * When running in keventd don't schedule a work item on + * itself. Can just call directly because the work queue is + * already bound. This also is faster. + */ + if (current_is_keventd()) + orig = raw_smp_processor_id(); + for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - schedule_work_on(cpu, work); + if (cpu != orig) + schedule_work_on(cpu, work); } + if (orig >= 0) + func(per_cpu_ptr(works, orig)); + for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); + put_online_cpus(); free_percpu(works); return 0; |