diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 3 | ||||
-rw-r--r-- | kernel/audit_tree.c | 100 | ||||
-rw-r--r-- | kernel/auditsc.c | 7 | ||||
-rw-r--r-- | kernel/cpu.c | 2 | ||||
-rw-r--r-- | kernel/elfcore.c | 28 | ||||
-rw-r--r-- | kernel/exit.c | 5 | ||||
-rw-r--r-- | kernel/fork.c | 19 | ||||
-rw-r--r-- | kernel/kprobes.c | 647 | ||||
-rw-r--r-- | kernel/padata.c | 8 | ||||
-rw-r--r-- | kernel/panic.c | 46 | ||||
-rw-r--r-- | kernel/params.c | 1 | ||||
-rw-r--r-- | kernel/perf_event.c | 2 | ||||
-rw-r--r-- | kernel/pid.c | 2 | ||||
-rw-r--r-- | kernel/posix-cpu-timers.c | 36 | ||||
-rw-r--r-- | kernel/power/hibernate.c | 9 | ||||
-rw-r--r-- | kernel/power/suspend.c | 3 | ||||
-rw-r--r-- | kernel/printk.c | 3 | ||||
-rw-r--r-- | kernel/relay.c | 5 | ||||
-rw-r--r-- | kernel/sched.c | 4 | ||||
-rw-r--r-- | kernel/sched_cpupri.c | 2 | ||||
-rw-r--r-- | kernel/sched_rt.c | 5 | ||||
-rw-r--r-- | kernel/signal.c | 45 | ||||
-rw-r--r-- | kernel/sys.c | 3 | ||||
-rw-r--r-- | kernel/sysctl.c | 12 | ||||
-rw-r--r-- | kernel/sysctl_binary.c | 7 | ||||
-rw-r--r-- | kernel/tsacct.c | 1 |
26 files changed, 762 insertions, 243 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 7b974699f8c..a987aa1676b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 4b05bd9479d..028e85663f2 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } +static int compare_root(struct vfsmount *mnt, void *arg) +{ + return mnt->mnt_root->d_inode == arg; +} + void audit_trim_trees(void) { struct list_head cursor; @@ -559,7 +564,6 @@ void audit_trim_trees(void) struct path path; struct vfsmount *root_mnt; struct node *node; - struct list_head list; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -577,24 +581,16 @@ void audit_trim_trees(void) if (!root_mnt) goto skip_it; - list_add_tail(&list, &root_mnt->mnt_list); spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { - struct audit_chunk *chunk = find_chunk(node); - struct inode *inode = chunk->watch.inode; - struct vfsmount *mnt; + struct inode *inode = find_chunk(node)->watch.inode; node->index |= 1U<<31; - list_for_each_entry(mnt, &list, mnt_list) { - if (mnt->mnt_root->d_inode == inode) { - node->index &= ~(1U<<31); - break; - } - } + if (iterate_mounts(compare_root, inode, root_mnt)) + node->index &= ~(1U<<31); } spin_unlock(&hash_lock); trim_marked(tree); put_tree(tree); - list_del_init(&list); drop_collected_mounts(root_mnt); skip_it: mutex_lock(&audit_filter_mutex); @@ -603,22 +599,6 @@ skip_it: mutex_unlock(&audit_filter_mutex); } -static int is_under(struct vfsmount *mnt, struct dentry *dentry, - struct path *path) -{ - if (mnt != path->mnt) { - for (;;) { - if (mnt->mnt_parent == mnt) - return 0; - if (mnt->mnt_parent == path->mnt) - break; - mnt = mnt->mnt_parent; - } - dentry = mnt->mnt_mountpoint; - } - return is_subdir(dentry, path->dentry); -} - int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) { @@ -638,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } +static int tag_mount(struct vfsmount *mnt, void *arg) +{ + return tag_chunk(mnt->mnt_root->d_inode, arg); +} + /* called with audit_filter_mutex */ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt, *p; - struct list_head list; + struct vfsmount *mnt; int err; list_for_each_entry(tree, &tree_list, list) { @@ -670,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule) err = -ENOMEM; goto Err; } - list_add_tail(&list, &mnt->mnt_list); get_tree(tree); - list_for_each_entry(p, &list, mnt_list) { - err = tag_chunk(p->mnt_root->d_inode, tree); - if (err) - break; - } - - list_del(&list); + err = iterate_mounts(tag_mount, tree, mnt); drop_collected_mounts(mnt); if (!err) { @@ -714,31 +691,23 @@ int audit_tag_tree(char *old, char *new) { struct list_head cursor, barrier; int failed = 0; - struct path path; + struct path path1, path2; struct vfsmount *tagged; - struct list_head list; - struct vfsmount *mnt; - struct dentry *dentry; int err; - err = kern_path(new, 0, &path); + err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path); - path_put(&path); + tagged = collect_mounts(&path2); + path_put(&path2); if (!tagged) return -ENOMEM; - err = kern_path(old, 0, &path); + err = kern_path(old, 0, &path1); if (err) { drop_collected_mounts(tagged); return err; } - mnt = mntget(path.mnt); - dentry = dget(path.dentry); - path_put(&path); - - list_add_tail(&list, &tagged->mnt_list); mutex_lock(&audit_filter_mutex); list_add(&barrier, &tree_list); @@ -746,7 +715,7 @@ int audit_tag_tree(char *old, char *new) while (cursor.next != &tree_list) { struct audit_tree *tree; - struct vfsmount *p; + int good_one = 0; tree = container_of(cursor.next, struct audit_tree, list); get_tree(tree); @@ -754,30 +723,19 @@ int audit_tag_tree(char *old, char *new) list_add(&cursor, &tree->list); mutex_unlock(&audit_filter_mutex); - err = kern_path(tree->pathname, 0, &path); - if (err) { - put_tree(tree); - mutex_lock(&audit_filter_mutex); - continue; + err = kern_path(tree->pathname, 0, &path2); + if (!err) { + good_one = path_is_under(&path1, &path2); + path_put(&path2); } - spin_lock(&vfsmount_lock); - if (!is_under(mnt, dentry, &path)) { - spin_unlock(&vfsmount_lock); - path_put(&path); + if (!good_one) { put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } - spin_unlock(&vfsmount_lock); - path_put(&path); - - list_for_each_entry(p, &list, mnt_list) { - failed = tag_chunk(p->mnt_root->d_inode, tree); - if (failed) - break; - } + failed = iterate_mounts(tag_mount, tree, tagged); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -818,10 +776,8 @@ int audit_tag_tree(char *old, char *new) } list_del(&barrier); list_del(&cursor); - list_del(&list); mutex_unlock(&audit_filter_mutex); - dput(dentry); - mntput(mnt); + path_put(&path1); drop_collected_mounts(tagged); return failed; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fc0f928167e..f3a461c0970 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry) /** * audit_inode_child - collect inode info for created/removed objects - * @dname: inode's dentry name * @dentry: dentry being audited * @parent: inode of dentry parent * @@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry) * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ -void __audit_inode_child(const char *dname, const struct dentry *dentry, +void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; + const char *dname = dentry->d_name.name; int dirlen = 0; if (!context->in_syscall) @@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry, if (inode) handle_one(inode); - /* determine matching parent */ - if (!dname) - goto add_names; /* parent is more likely, look for it first */ for (idx = 0; idx < context->name_count; idx++) { diff --git a/kernel/cpu.c b/kernel/cpu.c index 677f25376a3..f8cced2692b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -338,7 +338,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#if defined(CONFIG_IA64) printk(KERN_ERR "please check additional_cpus= boot " "parameter\n"); #endif diff --git a/kernel/elfcore.c b/kernel/elfcore.c new file mode 100644 index 00000000000..ff915efef66 --- /dev/null +++ b/kernel/elfcore.c @@ -0,0 +1,28 @@ +#include <linux/elf.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/elf.h> + + +Elf_Half __weak elf_core_extra_phdrs(void) +{ + return 0; +} + +int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + return 1; +} + +int __weak elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + return 1; +} + +size_t __weak elf_core_extra_data_size(void) +{ + return 0; +} diff --git a/kernel/exit.c b/kernel/exit.c index 45ed043b8bf..ce1e48c2d93 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - + /* sync mm's RSS info before statistics gathering */ + sync_mm_rss(tsk, tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -1188,7 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (unlikely(wo->wo_flags & WNOWAIT)) { int exit_code = p->exit_code; - int why, status; + int why; get_task_struct(p); read_unlock(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356..b0ec34abc0b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -329,15 +329,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (!tmp) goto fail_nomem; *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); + if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; tmp->vm_next = NULL; - anon_vma_link(tmp); file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; @@ -392,6 +394,8 @@ out: flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; +fail_nomem_anon_vma_fork: + mpol_put(pol); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: @@ -455,8 +459,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; - set_mm_counter(mm, file_rss, 0); - set_mm_counter(mm, anon_rss, 0); + memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; @@ -825,6 +828,8 @@ void __cleanup_sighand(struct sighand_struct *sighand) */ static void posix_cpu_timers_init_group(struct signal_struct *sig) { + unsigned long cpu_limit; + /* Thread group counters. */ thread_group_cputime_init(sig); @@ -839,9 +844,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) sig->cputime_expires.virt_exp = cputime_zero; sig->cputime_expires.sched_exp = 0; - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - sig->cputime_expires.prof_exp = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + if (cpu_limit != RLIM_INFINITY) { + sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); sig->cputimer.running = 1; } @@ -1034,7 +1039,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= - p->signal->rlim[RLIMIT_NPROC].rlim_cur) { + task_rlimit(p, RLIMIT_NPROC)) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER) goto bad_fork_free; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ccec774c716..fa034d29cf7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -42,9 +42,11 @@ #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/sysctl.h> #include <linux/kdebug.h> #include <linux/memory.h> #include <linux/ftrace.h> +#include <linux/cpu.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> @@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { * stepping on the instruction on a vmalloced/kmalloced/data page * is a recipe for disaster */ -#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) - struct kprobe_insn_page { struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ - char slot_used[INSNS_PER_PAGE]; int nused; int ngarbage; + char slot_used[]; +}; + +#define KPROBE_INSN_PAGE_SIZE(slots) \ + (offsetof(struct kprobe_insn_page, slot_used) + \ + (sizeof(char) * (slots))) + +struct kprobe_insn_cache { + struct list_head pages; /* list of kprobe_insn_page */ + size_t insn_size; /* size of instruction slot */ + int nr_garbage; }; +static int slots_per_page(struct kprobe_insn_cache *c) +{ + return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); +} + enum kprobe_slot_state { SLOT_CLEAN = 0, SLOT_DIRTY = 1, SLOT_USED = 2, }; -static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ -static LIST_HEAD(kprobe_insn_pages); -static int kprobe_garbage_slots; -static int collect_garbage_slots(void); +static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ +static struct kprobe_insn_cache kprobe_insn_slots = { + .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), + .insn_size = MAX_INSN_SIZE, + .nr_garbage = 0, +}; +static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); /** * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -static kprobe_opcode_t __kprobes *__get_insn_slot(void) +static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; retry: - list_for_each_entry(kip, &kprobe_insn_pages, list) { - if (kip->nused < INSNS_PER_PAGE) { + list_for_each_entry(kip, &c->pages, list) { + if (kip->nused < slots_per_page(c)) { int i; - for (i = 0; i < INSNS_PER_PAGE; i++) { + for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; - return kip->insns + (i * MAX_INSN_SIZE); + return kip->insns + (i * c->insn_size); } } - /* Surprise! No unused slots. Fix kip->nused. */ - kip->nused = INSNS_PER_PAGE; + /* kip->nused is broken. Fix it. */ + kip->nused = slots_per_page(c); + WARN_ON(1); } } /* If there are any garbage slots, collect it and try again. */ - if (kprobe_garbage_slots && collect_garbage_slots() == 0) { + if (c->nr_garbage && collect_garbage_slots(c) == 0) goto retry; - } - /* All out of space. Need to allocate a new page. Use slot 0. */ - kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); + + /* All out of space. Need to allocate a new page. */ + kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); if (!kip) return NULL; @@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) return NULL; } INIT_LIST_HEAD(&kip->list); - list_add(&kip->list, &kprobe_insn_pages); - memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); + memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; + list_add(&kip->list, &c->pages); return kip->insns; } + kprobe_opcode_t __kprobes *get_insn_slot(void) { - kprobe_opcode_t *ret; + kprobe_opcode_t *ret = NULL; + mutex_lock(&kprobe_insn_mutex); - ret = __get_insn_slot(); + ret = __get_insn_slot(&kprobe_insn_slots); mutex_unlock(&kprobe_insn_mutex); + return ret; } @@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) * so as not to have to set it up again the * next time somebody inserts a probe. */ - if (!list_is_singular(&kprobe_insn_pages)) { + if (!list_is_singular(&kip->list)) { list_del(&kip->list); module_free(NULL, kip->insns); kfree(kip); @@ -209,51 +231,84 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) return 0; } -static int __kprobes collect_garbage_slots(void) +static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip, *next; /* Ensure no-one is interrupted on the garbages */ synchronize_sched(); - list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { + list_for_each_entry_safe(kip, next, &c->pages, list) { int i; if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ - for (i = 0; i < INSNS_PER_PAGE; i++) { + for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i)) break; } } - kprobe_garbage_slots = 0; + c->nr_garbage = 0; return 0; } -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) +static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; - mutex_lock(&kprobe_insn_mutex); - list_for_each_entry(kip, &kprobe_insn_pages, list) { - if (kip->insns <= slot && - slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { - int i = (slot - kip->insns) / MAX_INSN_SIZE; + list_for_each_entry(kip, &c->pages, list) { + long idx = ((long)slot - (long)kip->insns) / c->insn_size; + if (idx >= 0 && idx < slots_per_page(c)) { + WARN_ON(kip->slot_used[idx] != SLOT_USED); if (dirty) { - kip->slot_used[i] = SLOT_DIRTY; + kip->slot_used[idx] = SLOT_DIRTY; kip->ngarbage++; + if (++c->nr_garbage > slots_per_page(c)) + collect_garbage_slots(c); } else - collect_one_slot(kip, i); - break; + collect_one_slot(kip, idx); + return; } } + /* Could not free this slot. */ + WARN_ON(1); +} - if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) - collect_garbage_slots(); - +void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) +{ + mutex_lock(&kprobe_insn_mutex); + __free_insn_slot(&kprobe_insn_slots, slot, dirty); mutex_unlock(&kprobe_insn_mutex); } +#ifdef CONFIG_OPTPROBES +/* For optimized_kprobe buffer */ +static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ +static struct kprobe_insn_cache kprobe_optinsn_slots = { + .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), + /* .insn_size is initialized later */ + .nr_garbage = 0, +}; +/* Get a slot for optimized_kprobe buffer */ +kprobe_opcode_t __kprobes *get_optinsn_slot(void) +{ + kprobe_opcode_t *ret = NULL; + + mutex_lock(&kprobe_optinsn_mutex); + ret = __get_insn_slot(&kprobe_optinsn_slots); + mutex_unlock(&kprobe_optinsn_mutex); + + return ret; +} + +void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) +{ + mutex_lock(&kprobe_optinsn_mutex); + __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); + mutex_unlock(&kprobe_optinsn_mutex); +} +#endif #endif /* We have preemption disabled.. so it is safe to use __ versions */ @@ -284,23 +339,401 @@ struct kprobe __kprobes *get_kprobe(void *addr) if (p->addr == addr) return p; } + + return NULL; +} + +static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); + +/* Return true if the kprobe is an aggregator */ +static inline int kprobe_aggrprobe(struct kprobe *p) +{ + return p->pre_handler == aggr_pre_handler; +} + +/* + * Keep all fields in the kprobe consistent + */ +static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); + memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); +} + +#ifdef CONFIG_OPTPROBES +/* NOTE: change this value only with kprobe_mutex held */ +static bool kprobes_allow_optimization; + +/* + * Call all pre_handler on the list, but ignores its return value. + * This must be called from arch-dep optimized caller. + */ +void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe *kp; + + list_for_each_entry_rcu(kp, &p->list, list) { + if (kp->pre_handler && likely(!kprobe_disabled(kp))) { + set_kprobe_instance(kp); + kp->pre_handler(kp, regs); + } + reset_kprobe_instance(); + } +} + +/* Return true(!0) if the kprobe is ready for optimization. */ +static inline int kprobe_optready(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if (kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + return arch_prepared_optinsn(&op->optinsn); + } + + return 0; +} + +/* + * Return an optimized kprobe whose optimizing code replaces + * instructions including addr (exclude breakpoint). + */ +struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +{ + int i; + struct kprobe *p = NULL; + struct optimized_kprobe *op; + + /* Don't check i == 0, since that is a breakpoint case. */ + for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) + p = get_kprobe((void *)(addr - i)); + + if (p && kprobe_optready(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (arch_within_optimized_kprobe(op, addr)) + return p; + } + return NULL; } +/* Optimization staging list, protected by kprobe_mutex */ +static LIST_HEAD(optimizing_list); + +static void kprobe_optimizer(struct work_struct *work); +static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); +#define OPTIMIZE_DELAY 5 + +/* Kprobe jump optimizer */ +static __kprobes void kprobe_optimizer(struct work_struct *work) +{ + struct optimized_kprobe *op, *tmp; + + /* Lock modules while optimizing kprobes */ + mutex_lock(&module_mutex); + mutex_lock(&kprobe_mutex); + if (kprobes_all_disarmed || !kprobes_allow_optimization) + goto end; + + /* + * Wait for quiesence period to ensure all running interrupts + * are done. Because optprobe may modify multiple instructions + * there is a chance that Nth instruction is interrupted. In that + * case, running interrupt can return to 2nd-Nth byte of jump + * instruction. This wait is for avoiding it. + */ + synchronize_sched(); + + /* + * The optimization/unoptimization refers online_cpus via + * stop_machine() and cpu-hotplug modifies online_cpus. + * And same time, text_mutex will be held in cpu-hotplug and here. + * This combination can cause a deadlock (cpu-hotplug try to lock + * text_mutex but stop_machine can not be done because online_cpus + * has been changed) + * To avoid this deadlock, we need to call get_online_cpus() + * for preventing cpu-hotplug outside of text_mutex locking. + */ + get_online_cpus(); + mutex_lock(&text_mutex); + list_for_each_entry_safe(op, tmp, &optimizing_list, list) { + WARN_ON(kprobe_disabled(&op->kp)); + if (arch_optimize_kprobe(op) < 0) + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + list_del_init(&op->list); + } + mutex_unlock(&text_mutex); + put_online_cpus(); +end: + mutex_unlock(&kprobe_mutex); + mutex_unlock(&module_mutex); +} + +/* Optimize kprobe if p is ready to be optimized */ +static __kprobes void optimize_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + /* Check if the kprobe is disabled or not ready for optimization. */ + if (!kprobe_optready(p) || !kprobes_allow_optimization || + (kprobe_disabled(p) || kprobes_all_disarmed)) + return; + + /* Both of break_handler and post_handler are not supported. */ + if (p->break_handler || p->post_handler) + return; + + op = container_of(p, struct optimized_kprobe, kp); + + /* Check there is no other kprobes at the optimized instructions */ + if (arch_check_optimized_kprobe(op) < 0) + return; + + /* Check if it is already optimized. */ + if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) + return; + + op->kp.flags |= KPROBE_FLAG_OPTIMIZED; + list_add(&op->list, &optimizing_list); + if (!delayed_work_pending(&optimizing_work)) + schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); +} + +/* Unoptimize a kprobe if p is optimized */ +static __kprobes void unoptimize_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + else + /* Replace jump with break */ + arch_unoptimize_kprobe(op); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + } +} + +/* Remove optimized instructions */ +static void __kprobes kill_optimized_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) { + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + } + /* Don't unoptimize, because the target code will be freed. */ + arch_remove_optimized_kprobe(op); +} + +/* Try to prepare optimized instructions */ +static __kprobes void prepare_optimized_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_prepare_optimized_kprobe(op); +} + +/* Free optimized instructions and optimized_kprobe */ +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_remove_optimized_kprobe(op); + kfree(op); +} + +/* Allocate new optimized_kprobe and try to prepare optimized instructions */ +static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); + if (!op) + return NULL; + + INIT_LIST_HEAD(&op->list); + op->kp.addr = p->addr; + arch_prepare_optimized_kprobe(op); + + return &op->kp; +} + +static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); + +/* + * Prepare an optimized_kprobe and optimize it + * NOTE: p must be a normal registered kprobe + */ +static __kprobes void try_to_optimize_kprobe(struct kprobe *p) +{ + struct kprobe *ap; + struct optimized_kprobe *op; + + ap = alloc_aggr_kprobe(p); + if (!ap) + return; + + op = container_of(ap, struct optimized_kprobe, kp); + if (!arch_prepared_optinsn(&op->optinsn)) { + /* If failed to setup optimizing, fallback to kprobe */ + free_aggr_kprobe(ap); + return; + } + + init_aggr_kprobe(ap, p); + optimize_kprobe(ap); +} + +#ifdef CONFIG_SYSCTL +static void __kprobes optimize_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + /* If optimization is already allowed, just return */ + if (kprobes_allow_optimization) + return; + + kprobes_allow_optimization = true; + mutex_lock(&text_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (!kprobe_disabled(p)) + optimize_kprobe(p); + } + mutex_unlock(&text_mutex); + printk(KERN_INFO "Kprobes globally optimized\n"); +} + +static void __kprobes unoptimize_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + /* If optimization is already prohibited, just return */ + if (!kprobes_allow_optimization) + return; + + kprobes_allow_optimization = false; + printk(KERN_INFO "Kprobes globally unoptimized\n"); + get_online_cpus(); /* For avoiding text_mutex deadlock */ + mutex_lock(&text_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) { + if (!kprobe_disabled(p)) + unoptimize_kprobe(p); + } + } + + mutex_unlock(&text_mutex); + put_online_cpus(); + /* Allow all currently running kprobes to complete */ + synchronize_sched(); +} + +int sysctl_kprobes_optimization; +int proc_kprobes_optimization_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + + mutex_lock(&kprobe_mutex); + sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + + if (sysctl_kprobes_optimization) + optimize_all_kprobes(); + else + unoptimize_all_kprobes(); + mutex_unlock(&kprobe_mutex); + + return ret; +} +#endif /* CONFIG_SYSCTL */ + +static void __kprobes __arm_kprobe(struct kprobe *p) +{ + struct kprobe *old_p; + + /* Check collision with other optimized kprobes */ + old_p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(old_p)) + unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ + + arch_arm_kprobe(p); + optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ +} + +static void __kprobes __disarm_kprobe(struct kprobe *p) +{ + struct kprobe *old_p; + + unoptimize_kprobe(p); /* Try to unoptimize */ + arch_disarm_kprobe(p); + + /* If another kprobe was blocked, optimize it. */ + old_p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(old_p)) + optimize_kprobe(old_p); +} + +#else /* !CONFIG_OPTPROBES */ + +#define optimize_kprobe(p) do {} while (0) +#define unoptimize_kprobe(p) do {} while (0) +#define kill_optimized_kprobe(p) do {} while (0) +#define prepare_optimized_kprobe(p) do {} while (0) +#define try_to_optimize_kprobe(p) do {} while (0) +#define __arm_kprobe(p) arch_arm_kprobe(p) +#define __disarm_kprobe(p) arch_disarm_kprobe(p) + +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + kfree(p); +} + +static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +{ + return kzalloc(sizeof(struct kprobe), GFP_KERNEL); +} +#endif /* CONFIG_OPTPROBES */ + /* Arm a kprobe with text_mutex */ static void __kprobes arm_kprobe(struct kprobe *kp) { + /* + * Here, since __arm_kprobe() doesn't use stop_machine(), + * this doesn't cause deadlock on text_mutex. So, we don't + * need get_online_cpus(). + */ mutex_lock(&text_mutex); - arch_arm_kprobe(kp); + __arm_kprobe(kp); mutex_unlock(&text_mutex); } /* Disarm a kprobe with text_mutex */ static void __kprobes disarm_kprobe(struct kprobe *kp) { + get_online_cpus(); /* For avoiding text_mutex deadlock */ mutex_lock(&text_mutex); - arch_disarm_kprobe(kp); + __disarm_kprobe(kp); mutex_unlock(&text_mutex); + put_online_cpus(); } /* @@ -369,7 +802,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) { struct kprobe *kp; - if (p->pre_handler != aggr_pre_handler) { + if (!kprobe_aggrprobe(p)) { p->nmissed++; } else { list_for_each_entry_rcu(kp, &p->list, list) @@ -493,21 +926,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) } /* - * Keep all fields in the kprobe consistent - */ -static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) -{ - memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); - memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); -} - -/* * Add the new probe to ap->list. Fail if this is the * second jprobe at the address - two jprobes can't coexist */ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) { BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); + + if (p->break_handler || p->post_handler) + unoptimize_kprobe(ap); /* Fall back to normal kprobe */ + if (p->break_handler) { if (ap->break_handler) return -EEXIST; @@ -522,7 +950,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) ap->flags &= ~KPROBE_FLAG_DISABLED; if (!kprobes_all_disarmed) /* Arm the breakpoint again. */ - arm_kprobe(ap); + __arm_kprobe(ap); } return 0; } @@ -531,12 +959,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) * Fill in the required fields of the "manager kprobe". Replace the * earlier kprobe in the hlist with the manager kprobe */ -static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) +static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { + /* Copy p's insn slot to ap */ copy_kprobe(p, ap); flush_insn_slot(ap); ap->addr = p->addr; - ap->flags = p->flags; + ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; /* We don't care the kprobe which has gone. */ @@ -546,8 +975,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); - list_add_rcu(&p->list, &ap->list); + INIT_HLIST_NODE(&ap->hlist); + list_add_rcu(&p->list, &ap->list); hlist_replace_rcu(&p->hlist, &ap->hlist); } @@ -561,12 +991,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, int ret = 0; struct kprobe *ap = old_p; - if (old_p->pre_handler != aggr_pre_handler) { - /* If old_p is not an aggr_probe, create new aggr_kprobe. */ - ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); + if (!kprobe_aggrprobe(old_p)) { + /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ + ap = alloc_aggr_kprobe(old_p); if (!ap) return -ENOMEM; - add_aggr_kprobe(ap, old_p); + init_aggr_kprobe(ap, old_p); } if (kprobe_gone(ap)) { @@ -585,6 +1015,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, */ return ret; + /* Prepare optimized instructions if possible. */ + prepare_optimized_kprobe(ap); + /* * Clear gone flag to prevent allocating new slot again, and * set disabled flag because it is not armed yet. @@ -593,6 +1026,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | KPROBE_FLAG_DISABLED; } + /* Copy ap's insn slot to p */ copy_kprobe(ap, p); return add_new_kprobe(ap, p); } @@ -743,27 +1177,34 @@ int __kprobes register_kprobe(struct kprobe *p) p->nmissed = 0; INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); + + get_online_cpus(); /* For avoiding text_mutex deadlock. */ + mutex_lock(&text_mutex); + old_p = get_kprobe(p->addr); if (old_p) { + /* Since this may unoptimize old_p, locking text_mutex. */ ret = register_aggr_kprobe(old_p, p); goto out; } - mutex_lock(&text_mutex); ret = arch_prepare_kprobe(p); if (ret) - goto out_unlock_text; + goto out; INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); if (!kprobes_all_disarmed && !kprobe_disabled(p)) - arch_arm_kprobe(p); + __arm_kprobe(p); + + /* Try to optimize kprobe */ + try_to_optimize_kprobe(p); -out_unlock_text: - mutex_unlock(&text_mutex); out: + mutex_unlock(&text_mutex); + put_online_cpus(); mutex_unlock(&kprobe_mutex); if (probed_mod) @@ -785,7 +1226,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) return -EINVAL; if (old_p == p || - (old_p->pre_handler == aggr_pre_handler && + (kprobe_aggrprobe(old_p) && list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are @@ -793,7 +1234,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) * already have been removed. We save on flushing icache. */ if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) - disarm_kprobe(p); + disarm_kprobe(old_p); hlist_del_rcu(&old_p->hlist); } else { if (p->break_handler && !kprobe_gone(p)) @@ -809,8 +1250,13 @@ noclean: list_del_rcu(&p->list); if (!kprobe_disabled(old_p)) { try_to_disable_aggr_kprobe(old_p); - if (!kprobes_all_disarmed && kprobe_disabled(old_p)) - disarm_kprobe(old_p); + if (!kprobes_all_disarmed) { + if (kprobe_disabled(old_p)) + disarm_kprobe(old_p); + else + /* Try to optimize this probe again */ + optimize_kprobe(old_p); + } } } return 0; @@ -827,7 +1273,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) old_p = list_entry(p->list.next, struct kprobe, list); list_del(&p->list); arch_remove_kprobe(old_p); - kfree(old_p); + free_aggr_kprobe(old_p); } } @@ -1123,7 +1569,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) struct kprobe *kp; p->flags |= KPROBE_FLAG_GONE; - if (p->pre_handler == aggr_pre_handler) { + if (kprobe_aggrprobe(p)) { /* * If this is an aggr_kprobe, we have to list all the * chained probes and mark them GONE. @@ -1132,6 +1578,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) kp->flags |= KPROBE_FLAG_GONE; p->post_handler = NULL; p->break_handler = NULL; + kill_optimized_kprobe(p); } /* * Here, we can remove insn_slot safely, because no thread calls @@ -1241,6 +1688,15 @@ static int __init init_kprobes(void) } } +#if defined(CONFIG_OPTPROBES) +#if defined(__ARCH_WANT_KPROBES_INSN_SLOT) + /* Init kprobe_optinsn_slots */ + kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; +#endif + /* By default, kprobes can be optimized */ + kprobes_allow_optimization = true; +#endif + /* By default, kprobes are armed */ kprobes_all_disarmed = false; @@ -1259,7 +1715,7 @@ static int __init init_kprobes(void) #ifdef CONFIG_DEBUG_FS static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, - const char *sym, int offset,char *modname) + const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; @@ -1269,19 +1725,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, kprobe_type = "j"; else kprobe_type = "k"; + if (sym) - seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", + seq_printf(pi, "%p %s %s+0x%x %s ", p->addr, kprobe_type, sym, offset, - (modname ? modname : " "), - (kprobe_gone(p) ? "[GONE]" : ""), - ((kprobe_disabled(p) && !kprobe_gone(p)) ? - "[DISABLED]" : "")); + (modname ? modname : " ")); else - seq_printf(pi, "%p %s %p %s%s\n", - p->addr, kprobe_type, p->addr, - (kprobe_gone(p) ? "[GONE]" : ""), - ((kprobe_disabled(p) && !kprobe_gone(p)) ? - "[DISABLED]" : "")); + seq_printf(pi, "%p %s %p ", + p->addr, kprobe_type, p->addr); + + if (!pp) + pp = p; + seq_printf(pi, "%s%s%s\n", + (kprobe_gone(p) ? "[GONE]" : ""), + ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), + (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1317,11 +1775,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) hlist_for_each_entry_rcu(p, node, head, hlist) { sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); - if (p->pre_handler == aggr_pre_handler) { + if (kprobe_aggrprobe(p)) { list_for_each_entry_rcu(kp, &p->list, list) - report_probe(pi, kp, sym, offset, modname); + report_probe(pi, kp, sym, offset, modname, p); } else - report_probe(pi, p, sym, offset, modname); + report_probe(pi, p, sym, offset, modname, NULL); } preempt_enable(); return 0; @@ -1399,12 +1857,13 @@ int __kprobes enable_kprobe(struct kprobe *kp) goto out; } - if (!kprobes_all_disarmed && kprobe_disabled(p)) - arm_kprobe(p); - - p->flags &= ~KPROBE_FLAG_DISABLED; if (p != kp) kp->flags &= ~KPROBE_FLAG_DISABLED; + + if (!kprobes_all_disarmed && kprobe_disabled(p)) { + p->flags &= ~KPROBE_FLAG_DISABLED; + arm_kprobe(p); + } out: mutex_unlock(&kprobe_mutex); return ret; @@ -1424,12 +1883,13 @@ static void __kprobes arm_all_kprobes(void) if (!kprobes_all_disarmed) goto already_enabled; + /* Arming kprobes doesn't optimize kprobe itself */ mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) - arch_arm_kprobe(p); + __arm_kprobe(p); } mutex_unlock(&text_mutex); @@ -1456,16 +1916,23 @@ static void __kprobes disarm_all_kprobes(void) kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); + + /* + * Here we call get_online_cpus() for avoiding text_mutex deadlock, + * because disarming may also unoptimize kprobes. + */ + get_online_cpus(); mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - arch_disarm_kprobe(p); + __disarm_kprobe(p); } } mutex_unlock(&text_mutex); + put_online_cpus(); mutex_unlock(&kprobe_mutex); /* Allow all currently running kprobes to complete */ synchronize_sched(); diff --git a/kernel/padata.c b/kernel/padata.c index 6f9bcb8313d..93caf65ff57 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -642,6 +642,9 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, if (!pd) goto err_free_inst; + if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) + goto err_free_pd; + rcu_assign_pointer(pinst->pd, pd); pinst->wq = wq; @@ -654,12 +657,14 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, pinst->cpu_notifier.priority = 0; err = register_hotcpu_notifier(&pinst->cpu_notifier); if (err) - goto err_free_pd; + goto err_free_cpumask; mutex_init(&pinst->lock); return pinst; +err_free_cpumask: + free_cpumask_var(pinst->cpumask); err_free_pd: padata_free_pd(pd); err_free_inst: @@ -685,6 +690,7 @@ void padata_free(struct padata_instance *pinst) unregister_hotcpu_notifier(&pinst->cpu_notifier); padata_free_pd(pinst->pd); + free_cpumask_var(pinst->cpumask); kfree(pinst); } EXPORT_SYMBOL(padata_free); diff --git a/kernel/panic.c b/kernel/panic.c index c787333282b..13d966b4c14 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -static long no_blink(long time) -{ - return 0; -} - /* Returns how long it waited in ms */ long (*panic_blink)(long time); EXPORT_SYMBOL(panic_blink); +static void panic_blink_one_second(void) +{ + static long i = 0, end; + + if (panic_blink) { + end = i + MSEC_PER_SEC; + + while (i < end) { + i += panic_blink(i); + mdelay(1); + i++; + } + } else { + /* + * When running under a hypervisor a small mdelay may get + * rounded up to the hypervisor timeslice. For example, with + * a 1ms in 10ms hypervisor timeslice we might inflate a + * mdelay(1) loop by 10x. + * + * If we have nothing to blink, spin on 1 second calls to + * mdelay to avoid this. + */ + mdelay(MSEC_PER_SEC); + } +} + /** * panic - halt the system * @fmt: The text string to print @@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...) bust_spinlocks(0); - if (!panic_blink) - panic_blink = no_blink; - if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. @@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...) */ printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); - for (i = 0; i < panic_timeout*1000; ) { + for (i = 0; i < panic_timeout; i++) { touch_nmi_watchdog(); - i += panic_blink(i); - mdelay(1); - i++; + panic_blink_one_second(); } /* * This will not be a clean reboot, with everything @@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...) } #endif local_irq_enable(); - for (i = 0; ; ) { + while (1) { touch_softlockup_watchdog(); - i += panic_blink(i); - mdelay(1); - i++; + panic_blink_one_second(); } } diff --git a/kernel/params.c b/kernel/params.c index cf1b6918312..8d95f5451b2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,7 +24,6 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/ctype.h> -#include <linux/string.h> #if 0 #define DEBUGP printk diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a661e799186..8e352c756ba 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2610,7 +2610,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (user_locked > user_lock_limit) extra = user_locked - user_lock_limit; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = vma->vm_mm->locked_vm + extra; diff --git a/kernel/pid.c b/kernel/pid.c index b08e697cd83..86b296943e5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -376,7 +376,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) EXPORT_SYMBOL(pid_task); /* - * Must be called under rcu_read_lock() or with tasklist_lock read-held. + * Must be called under rcu_read_lock(). */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 438ff452351..1a22dfd42df 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk, int maxfire; struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; + unsigned long soft; maxfire = 20; tsk->cputime_expires.prof_exp = cputime_zero; @@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk, /* * Check for the special case thread timers. */ - if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { - unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; - unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; + soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + if (soft != RLIM_INFINITY) { + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); if (hard != RLIM_INFINITY && tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { @@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { + if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { /* * At the soft limit, send a SIGXCPU every second. */ - if (sig->rlim[RLIMIT_RTTIME].rlim_cur - < sig->rlim[RLIMIT_RTTIME].rlim_max) { - sig->rlim[RLIMIT_RTTIME].rlim_cur += - USEC_PER_SEC; + if (soft < hard) { + soft += USEC_PER_SEC; + sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; } printk(KERN_INFO "RT Watchdog Timeout: %s[%d]\n", @@ -1121,6 +1122,7 @@ static void check_process_timers(struct task_struct *tsk, unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; + unsigned long soft; /* * Don't sample the current process CPU clocks if there are no timers. @@ -1193,11 +1195,13 @@ static void check_process_timers(struct task_struct *tsk, SIGPROF); check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, SIGVTALRM); - - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { + soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + if (soft != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); cputime_t x; - if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { + if (psecs >= hard) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -1205,17 +1209,17 @@ static void check_process_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { + if (psecs >= soft) { /* * At the soft limit, send a SIGXCPU every second. */ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - if (sig->rlim[RLIMIT_CPU].rlim_cur - < sig->rlim[RLIMIT_CPU].rlim_max) { - sig->rlim[RLIMIT_CPU].rlim_cur++; + if (soft < hard) { + soft++; + sig->rlim[RLIMIT_CPU].rlim_cur = soft; } } - x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + x = secs_to_cputime(soft); if (cputime_eq(prof_expires, cputime_zero) || cputime_lt(x, prof_expires)) { prof_expires = x; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index bbfe472d752..da5288ec239 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -323,6 +323,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { int error; + gfp_t saved_mask; error = platform_begin(platform_mode); if (error) @@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode) dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: platform_end(platform_mode); @@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode) int hibernation_restore(int platform_mode) { int error; + gfp_t saved_mask; pm_prepare_console(); suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); dpm_resume_end(PMSG_RECOVER); } + set_gfp_allowed_mask(saved_mask); resume_console(); pm_restore_console(); return error; @@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { int error; + gfp_t saved_mask; if (!hibernation_ops) return -ENOSYS; @@ -481,6 +488,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -518,6 +526,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6f10dfc2d3e..44cce10b582 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; + gfp_t saved_mask; if (!suspend_ops) return -ENOSYS; @@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/printk.c b/kernel/printk.c index 40674122ecf..75077ad0b53 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -70,8 +70,6 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; -static int saved_console_loglevel = -1; - /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -146,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN]; static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ +static int saved_console_loglevel = -1; #ifdef CONFIG_KEXEC /* diff --git a/kernel/relay.c b/kernel/relay.c index c705a41b4ba..3d97f282161 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) /* * subbuf_splice_actor - splice up to one subbuf's worth of data */ -static int subbuf_splice_actor(struct file *in, +static ssize_t subbuf_splice_actor(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags, int *nonpad_ret) { - unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; + unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; struct rchan_buf *rbuf = in->private_data; unsigned int subbuf_size = rbuf->chan->subbuf_size; uint64_t pos = (uint64_t) *ppos; @@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in, .ops = &relay_pipe_buf_ops, .spd_release = relay_page_release, }; + ssize_t ret; if (rbuf->subbufs_produced == rbuf->subbufs_consumed) return 0; diff --git a/kernel/sched.c b/kernel/sched.c index abb36b16b93..b47ceeec1a9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4353,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice) /* convert nice value [19,-20] to rlimit style value [1,40] */ int nice_rlim = 20 - nice; - return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); } @@ -4530,7 +4530,7 @@ recheck: if (!lock_task_sighand(p, &flags)) return -ESRCH; - rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); unlock_task_sighand(p, &flags); /* can't set/change the rt policy */ diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index eeb3506c483..82095bf2099 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -47,7 +47,7 @@ static int convert_prio(int prio) } #define for_each_cpupri_active(array, idx) \ - for_each_bit(idx, array, CPUPRI_NR_PRIORITIES) + for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) /** * cpupri_find - find the best (lowest-pri) CPU in the system diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bf3e38fdbe6..5a6ed1f0990 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1662,8 +1662,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) if (!p->signal) return; - soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; - hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; + /* max may change after cur was read, this will be fixed next tick */ + soft = task_rlimit(p, RLIMIT_RTTIME); + hard = task_rlimit_max(p, RLIMIT_RTTIME); if (soft != RLIM_INFINITY) { unsigned long next; diff --git a/kernel/signal.c b/kernel/signal.c index 934ae5e687b..dbd7fe073c5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -159,6 +159,10 @@ void recalc_sigpending(void) /* Given the mask, find the first available signal that should be serviced. */ +#define SYNCHRONOUS_MASK \ + (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ + sigmask(SIGTRAP) | sigmask(SIGFPE)) + int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; @@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask) s = pending->signal.sig; m = mask->sig; + + /* + * Handle the first word specially: it contains the + * synchronous signals that need to be dequeued first. + */ + x = *s &~ *m; + if (x) { + if (x & SYNCHRONOUS_MASK) + x &= SYNCHRONOUS_MASK; + sig = ffz(~x) + 1; + return sig; + } + switch (_NSIG_WORDS) { default: - for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) - if ((x = *s &~ *m) != 0) { - sig = ffz(~x) + i*_NSIG_BPW + 1; - break; - } + for (i = 1; i < _NSIG_WORDS; ++i) { + x = *++s &~ *++m; + if (!x) + continue; + sig = ffz(~x) + i*_NSIG_BPW + 1; + break; + } break; - case 2: if ((x = s[0] &~ m[0]) != 0) - sig = 1; - else if ((x = s[1] &~ m[1]) != 0) - sig = _NSIG_BPW + 1; - else + case 2: + x = s[1] &~ m[1]; + if (!x) break; - sig += ffz(~x); + sig = ffz(~x) + _NSIG_BPW + 1; break; - case 1: if ((x = *s &~ *m) != 0) - sig = ffz(~x) + 1; + case 1: + /* Nothing to do */ break; } @@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { + task_rlimit(t, RLIMIT_SIGPENDING)) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); diff --git a/kernel/sys.c b/kernel/sys.c index 877fe4f8e05..9814e43fb23 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -571,8 +571,7 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; - if (atomic_read(&new_user->processes) >= - current->signal->rlim[RLIMIT_NPROC].rlim_cur && + if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && new_user != INIT_USER) { free_uid(new_user); return -EAGAIN; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 33e7a38b6eb..0ef19c614f6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,6 +50,7 @@ #include <linux/ftrace.h> #include <linux/slow-work.h> #include <linux/perf_event.h> +#include <linux/kprobes.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -1450,6 +1451,17 @@ static struct ctl_table debug_table[] = { .proc_handler = proc_dointvec }, #endif +#if defined(CONFIG_OPTPROBES) + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8f5d16e0707..8cd50d8f9bd 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, ssize_t result; char *pathname; int flags; - int acc_mode, fmode; + int acc_mode; pathname = sysctl_getname(name, nlen, &table); result = PTR_ERR(pathname); @@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen, if (oldval && oldlen && newval && newlen) { flags = O_RDWR; acc_mode = MAY_READ | MAY_WRITE; - fmode = FMODE_READ | FMODE_WRITE; } else if (newval && newlen) { flags = O_WRONLY; acc_mode = MAY_WRITE; - fmode = FMODE_WRITE; } else if (oldval && oldlen) { flags = O_RDONLY; acc_mode = MAY_READ; - fmode = FMODE_READ; } else { result = 0; goto out_putname; @@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, if (result) goto out_putname; - result = may_open(&nd.path, acc_mode, fmode); + result = may_open(&nd.path, acc_mode, flags); if (result) goto out_putpath; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 00d59d048ed..0a67e041edf 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -21,6 +21,7 @@ #include <linux/tsacct_kern.h> #include <linux/acct.h> #include <linux/jiffies.h> +#include <linux/mm.h> /* * fill in basic accounting fields |