diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 4 | ||||
-rw-r--r-- | kernel/auditfilter.c | 26 | ||||
-rw-r--r-- | kernel/auditsc.c | 117 | ||||
-rw-r--r-- | kernel/cpuset.c | 35 | ||||
-rw-r--r-- | kernel/delayacct.c | 24 | ||||
-rw-r--r-- | kernel/exit.c | 3 | ||||
-rw-r--r-- | kernel/fork.c | 10 | ||||
-rw-r--r-- | kernel/futex.c | 140 | ||||
-rw-r--r-- | kernel/futex_compat.c | 34 | ||||
-rw-r--r-- | kernel/hrtimer.c | 6 | ||||
-rw-r--r-- | kernel/irq/handle.c | 5 | ||||
-rw-r--r-- | kernel/irq/manage.c | 28 | ||||
-rw-r--r-- | kernel/kprobes.c | 1 | ||||
-rw-r--r-- | kernel/panic.c | 1 | ||||
-rw-r--r-- | kernel/power/process.c | 26 | ||||
-rw-r--r-- | kernel/printk.c | 4 | ||||
-rw-r--r-- | kernel/rcupdate.c | 4 | ||||
-rw-r--r-- | kernel/resource.c | 9 | ||||
-rw-r--r-- | kernel/rtmutex.c | 2 | ||||
-rw-r--r-- | kernel/sched.c | 26 | ||||
-rw-r--r-- | kernel/signal.c | 25 | ||||
-rw-r--r-- | kernel/softirq.c | 22 | ||||
-rw-r--r-- | kernel/softlockup.c | 4 | ||||
-rw-r--r-- | kernel/stop_machine.c | 1 | ||||
-rw-r--r-- | kernel/taskstats.c | 32 | ||||
-rw-r--r-- | kernel/timer.c | 49 | ||||
-rw-r--r-- | kernel/workqueue.c | 91 |
27 files changed, 494 insertions, 235 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index d417ca1db79..0a36091ed71 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -690,9 +690,7 @@ static const struct inotify_operations audit_inotify_ops = { /* Initialize audit support at boot time. */ static int __init audit_init(void) { -#ifdef CONFIG_AUDITSYSCALL int i; -#endif printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); @@ -717,10 +715,10 @@ static int __init audit_init(void) audit_ih = inotify_init(&audit_inotify_ops); if (IS_ERR(audit_ih)) audit_panic("cannot initialize inotify handle"); +#endif for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); -#endif return 0; } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 5b4e16276ca..6a9a5c5a4e7 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -442,6 +442,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_EQUAL: break; default: + err = -EINVAL; goto exit_free; } } @@ -579,6 +580,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_EQUAL: break; default: + err = -EINVAL; goto exit_free; } } @@ -1134,6 +1136,14 @@ static inline int audit_add_rule(struct audit_entry *entry, struct audit_watch *watch = entry->rule.watch; struct nameidata *ndp, *ndw; int h, err, putnd_needed = 0; +#ifdef CONFIG_AUDITSYSCALL + int dont_count = 0; + + /* If either of these, don't count towards total */ + if (entry->rule.listnr == AUDIT_FILTER_USER || + entry->rule.listnr == AUDIT_FILTER_TYPE) + dont_count = 1; +#endif if (inode_f) { h = audit_hash_ino(inode_f->val); @@ -1174,6 +1184,10 @@ static inline int audit_add_rule(struct audit_entry *entry, } else { list_add_tail_rcu(&entry->list, list); } +#ifdef CONFIG_AUDITSYSCALL + if (!dont_count) + audit_n_rules++; +#endif mutex_unlock(&audit_filter_mutex); if (putnd_needed) @@ -1198,6 +1212,14 @@ static inline int audit_del_rule(struct audit_entry *entry, struct audit_watch *watch, *tmp_watch = entry->rule.watch; LIST_HEAD(inotify_list); int h, ret = 0; +#ifdef CONFIG_AUDITSYSCALL + int dont_count = 0; + + /* If either of these, don't count towards total */ + if (entry->rule.listnr == AUDIT_FILTER_USER || + entry->rule.listnr == AUDIT_FILTER_TYPE) + dont_count = 1; +#endif if (inode_f) { h = audit_hash_ino(inode_f->val); @@ -1235,6 +1257,10 @@ static inline int audit_del_rule(struct audit_entry *entry, list_del_rcu(&e->list); call_rcu(&e->rcu, audit_free_rule_rcu); +#ifdef CONFIG_AUDITSYSCALL + if (!dont_count) + audit_n_rules--; +#endif mutex_unlock(&audit_filter_mutex); if (!list_empty(&inotify_list)) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ae40ac8c39e..efc1b74bebf 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -85,6 +85,9 @@ extern int audit_enabled; /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 +/* number of audit rules */ +int audit_n_rules; + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -174,6 +177,7 @@ struct audit_aux_data_path { /* The per-task audit context. */ struct audit_context { + int dummy; /* must be the first element */ int in_syscall; /* 1 if task is in a syscall */ enum audit_state state; unsigned int serial; /* serial number for record */ @@ -514,7 +518,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, context->return_valid = return_valid; context->return_code = return_code; - if (context->in_syscall && !context->auditable) { + if (context->in_syscall && !context->dummy && !context->auditable) { enum audit_state state; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); @@ -530,17 +534,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, } get_context: - context->pid = tsk->pid; - context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ - context->uid = tsk->uid; - context->gid = tsk->gid; - context->euid = tsk->euid; - context->suid = tsk->suid; - context->fsuid = tsk->fsuid; - context->egid = tsk->egid; - context->sgid = tsk->sgid; - context->fsgid = tsk->fsgid; - context->personality = tsk->personality; + tsk->audit_context = NULL; return context; } @@ -749,6 +743,17 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts const char *tty; /* tsk == current */ + context->pid = tsk->pid; + context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ + context->uid = tsk->uid; + context->gid = tsk->gid; + context->euid = tsk->euid; + context->suid = tsk->suid; + context->fsuid = tsk->fsuid; + context->egid = tsk->egid; + context->sgid = tsk->sgid; + context->fsgid = tsk->fsgid; + context->personality = tsk->personality; ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); if (!ab) @@ -1066,7 +1071,8 @@ void audit_syscall_entry(int arch, int major, context->argv[3] = a4; state = context->state; - if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) + context->dummy = !audit_n_rules; + if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); if (likely(state == AUDIT_DISABLED)) return; @@ -1199,14 +1205,18 @@ void audit_putname(const char *name) #endif } -static void audit_inode_context(int idx, const struct inode *inode) +/* Copy inode data into an audit_names. */ +static void audit_copy_inode(struct audit_names *name, const struct inode *inode) { - struct audit_context *context = current->audit_context; - - selinux_get_inode_sid(inode, &context->names[idx].osid); + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + selinux_get_inode_sid(inode, &name->osid); } - /** * audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1240,20 +1250,14 @@ void __audit_inode(const char *name, const struct inode *inode) ++context->ino_count; #endif } - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; - audit_inode_context(idx, inode); + audit_copy_inode(&context->names[idx], inode); } /** * audit_inode_child - collect inode info for created/removed objects * @dname: inode's dentry name * @inode: inode being audited - * @pino: inode number of dentry parent + * @parent: inode of dentry parent * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. @@ -1264,7 +1268,7 @@ void __audit_inode(const char *name, const struct inode *inode) * unsuccessful attempts. */ void __audit_inode_child(const char *dname, const struct inode *inode, - unsigned long pino) + const struct inode *parent) { int idx; struct audit_context *context = current->audit_context; @@ -1278,7 +1282,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode, if (!dname) goto update_context; for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].ino == pino) { + if (context->names[idx].ino == parent->i_ino) { const char *name = context->names[idx].name; if (!name) @@ -1302,16 +1306,47 @@ update_context: context->names[idx].name_len = AUDIT_NAME_FULL; context->names[idx].name_put = 0; /* don't call __putname() */ - if (inode) { - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; - audit_inode_context(idx, inode); - } else - context->names[idx].ino = (unsigned long)-1; + if (!inode) + context->names[idx].ino = (unsigned long)-1; + else + audit_copy_inode(&context->names[idx], inode); + + /* A parent was not found in audit_names, so copy the inode data for the + * provided parent. */ + if (!found_name) { + idx = context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + audit_copy_inode(&context->names[idx], parent); + } +} + +/** + * audit_inode_update - update inode info for last collected name + * @inode: inode being audited + * + * When open() is called on an existing object with the O_CREAT flag, the inode + * data audit initially collects is incorrect. This additional hook ensures + * audit has the inode data for the actual object to be opened. + */ +void __audit_inode_update(const struct inode *inode) +{ + struct audit_context *context = current->audit_context; + int idx; + + if (!context->in_syscall || !inode) + return; + + if (context->name_count == 0) { + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + } + idx = context->name_count - 1; + + audit_copy_inode(&context->names[idx], inode); } /** @@ -1642,7 +1677,7 @@ int audit_bprm(struct linux_binprm *bprm) unsigned long p, next; void *to; - if (likely(!audit_enabled || !context)) + if (likely(!audit_enabled || !context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, @@ -1680,7 +1715,7 @@ int audit_socketcall(int nargs, unsigned long *args) struct audit_aux_data_socketcall *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) + if (likely(!context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); @@ -1708,7 +1743,7 @@ int audit_sockaddr(int len, void *a) struct audit_aux_data_sockaddr *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) + if (likely(!context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1a649f2bb9b..4ea6f0dc2fc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -816,6 +816,10 @@ static int update_cpumask(struct cpuset *cs, char *buf) struct cpuset trialcs; int retval, cpus_unchanged; + /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ + if (cs == &top_cpuset) + return -EACCES; + trialcs = *cs; retval = cpulist_parse(buf, trialcs.cpus_allowed); if (retval < 0) @@ -2033,6 +2037,33 @@ out: return err; } +/* + * The top_cpuset tracks what CPUs and Memory Nodes are online, + * period. This is necessary in order to make cpusets transparent + * (of no affect) on systems that are actively using CPU hotplug + * but making no active use of cpusets. + * + * This handles CPU hotplug (cpuhp) events. If someday Memory + * Nodes can be hotplugged (dynamically changing node_online_map) + * then we should handle that too, perhaps in a similar way. + */ + +#ifdef CONFIG_HOTPLUG_CPU +static int cpuset_handle_cpuhp(struct notifier_block *nb, + unsigned long phase, void *cpu) +{ + mutex_lock(&manage_mutex); + mutex_lock(&callback_mutex); + + top_cpuset.cpus_allowed = cpu_online_map; + + mutex_unlock(&callback_mutex); + mutex_unlock(&manage_mutex); + + return 0; +} +#endif + /** * cpuset_init_smp - initialize cpus_allowed * @@ -2043,6 +2074,8 @@ void __init cpuset_init_smp(void) { top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_online_map; + + hotcpu_notifier(cpuset_handle_cpuhp, 0); } /** @@ -2387,7 +2420,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); int cpuset_excl_nodes_overlap(const struct task_struct *p) { const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ - int overlap = 0; /* do cpusets overlap? */ + int overlap = 1; /* do cpusets overlap? */ task_lock(current); if (current->flags & PF_EXITING) { diff --git a/kernel/delayacct.c b/kernel/delayacct.c index f05392d6426..36752f124c6 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -19,15 +19,15 @@ #include <linux/sysctl.h> #include <linux/delayacct.h> -int delayacct_on __read_mostly; /* Delay accounting turned on/off */ +int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ kmem_cache_t *delayacct_cache; -static int __init delayacct_setup_enable(char *str) +static int __init delayacct_setup_disable(char *str) { - delayacct_on = 1; + delayacct_on = 0; return 1; } -__setup("delayacct", delayacct_setup_enable); +__setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { @@ -41,24 +41,11 @@ void delayacct_init(void) void __delayacct_tsk_init(struct task_struct *tsk) { - spin_lock_init(&tsk->delays_lock); - /* No need to acquire tsk->delays_lock for allocation here unless - __delayacct_tsk_init called after tsk is attached to tasklist - */ tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); if (tsk->delays) spin_lock_init(&tsk->delays->lock); } -void __delayacct_tsk_exit(struct task_struct *tsk) -{ - struct task_delay_info *delays = tsk->delays; - spin_lock(&tsk->delays_lock); - tsk->delays = NULL; - spin_unlock(&tsk->delays_lock); - kmem_cache_free(delayacct_cache, delays); -} - /* * Start accounting for a delay statistic using * its starting timestamp (@start) @@ -118,8 +105,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) struct timespec ts; unsigned long t1,t2,t3; - spin_lock(&tsk->delays_lock); - /* Though tsk->delays accessed later, early exit avoids * unnecessary returning of other data */ @@ -161,7 +146,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) spin_unlock(&tsk->delays->lock); done: - spin_unlock(&tsk->delays_lock); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index dba194a8d41..d891883420f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -908,7 +908,6 @@ fastcall NORET_TYPE void do_exit(long code) audit_free(tsk); taskstats_exit_send(tsk, tidstats, group_dead, mycpu); taskstats_exit_free(tidstats); - delayacct_tsk_exit(tsk); exit_mm(tsk); @@ -1054,7 +1053,7 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p) * Do not consider thread group leaders that are * in a non-empty thread group: */ - if (current->tgid != p->tgid && delay_group_leader(p)) + if (delay_group_leader(p)) return 2; if (security_task_wait(p)) diff --git a/kernel/fork.c b/kernel/fork.c index 1b0f7b1e088..f9b014e3e70 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -117,6 +117,7 @@ void __put_task_struct(struct task_struct *tsk) security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); + delayacct_tsk_free(tsk); if (!profile_handoff_task(tsk)) free_task(tsk); @@ -1011,7 +1012,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = -EFAULT; if (clone_flags & CLONE_PARENT_SETTID) if (put_user(p->pid, parent_tidptr)) - goto bad_fork_cleanup; + goto bad_fork_cleanup_delays_binfmt; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); @@ -1277,7 +1278,8 @@ bad_fork_cleanup_policy: bad_fork_cleanup_cpuset: #endif cpuset_exit(p); -bad_fork_cleanup: +bad_fork_cleanup_delays_binfmt: + delayacct_tsk_free(p); if (p->binfmt) module_put(p->binfmt->module); bad_fork_cleanup_put_domain: @@ -1387,8 +1389,10 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { wait_for_completion(&vfork); - if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) + if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { + current->ptrace_message = nr; ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); + } } } else { free_pid(pid); diff --git a/kernel/futex.c b/kernel/futex.c index cf0c8e21d1a..b9b8aea5389 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -297,7 +297,7 @@ static int futex_handle_fault(unsigned long address, int attempt) struct vm_area_struct * vma; struct mm_struct *mm = current->mm; - if (attempt >= 2 || !(vma = find_vma(mm, address)) || + if (attempt > 2 || !(vma = find_vma(mm, address)) || vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) return -EFAULT; @@ -397,7 +397,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) p = NULL; goto out_unlock; } - if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { + if (p->exit_state != 0) { p = NULL; goto out_unlock; } @@ -415,15 +415,15 @@ out_unlock: */ void exit_pi_state_list(struct task_struct *curr) { - struct futex_hash_bucket *hb; struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; union futex_key key; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselfs + * versus waiters unqueueing themselves: */ spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { @@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; + hb = hash_futex(&key); spin_unlock_irq(&curr->pi_lock); - hb = hash_futex(&key); spin_lock(&hb->lock); spin_lock_irq(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ if (head->next != next) { spin_unlock(&hb->lock); continue; } - list_del_init(&pi_state->list); - WARN_ON(pi_state->owner != curr); - + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); pi_state->owner = NULL; spin_unlock_irq(&curr->pi_lock); @@ -470,7 +473,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &me->key)) { + if (match_futex(&this->key, &me->key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: @@ -482,6 +485,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) if (unlikely(!pi_state)) return -EINVAL; + WARN_ON(!atomic_read(&pi_state->refcount)); + atomic_inc(&pi_state->refcount); me->pi_state = pi_state; @@ -490,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) } /* - * We are the first waiter - try to look up the real owner and - * attach the new pi_state to it: + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when the owner died bit is set + * and TID = 0: */ pid = uval & FUTEX_TID_MASK; + if (!pid && (uval & FUTEX_OWNER_DIED)) + return -ESRCH; p = futex_find_get_task(pid); if (!p) return -ESRCH; @@ -510,6 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) pi_state->key = me->key; spin_lock_irq(&p->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; spin_unlock_irq(&p->pi_lock); @@ -573,20 +582,29 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) * kept enabled while there is PI state around. We must also * preserve the owner died bit.) */ - newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; + if (!(uval & FUTEX_OWNER_DIED)) { + newval = FUTEX_WAITERS | new_owner->pid; - inc_preempt_count(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + } - if (curval == -EFAULT) - return -EFAULT; - if (curval != uval) - return -EINVAL; + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->owner->pi_state_list); + spin_lock_irq(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; + spin_unlock_irq(&new_owner->pi_lock); + rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -729,8 +747,10 @@ retry: */ if (attempt++) { if (futex_handle_fault((unsigned long)uaddr2, - attempt)) + attempt)) { + ret = -EFAULT; goto out; + } goto retry; } @@ -930,6 +950,7 @@ static int unqueue_me(struct futex_q *q) /* In the common case we don't take the spinlock, which is nice. */ retry: lock_ptr = q->lock_ptr; + barrier(); if (lock_ptr != 0) { spin_lock(lock_ptr); /* @@ -1236,6 +1257,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, /* Owner died? */ if (q.pi_state->owner != NULL) { spin_lock_irq(&q.pi_state->owner->pi_lock); + WARN_ON(list_empty(&q.pi_state->list)); list_del_init(&q.pi_state->list); spin_unlock_irq(&q.pi_state->owner->pi_lock); } else @@ -1244,6 +1266,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, q.pi_state->owner = current; spin_lock_irq(¤t->pi_lock); + WARN_ON(!list_empty(&q.pi_state->list)); list_add(&q.pi_state->list, ¤t->pi_state_list); spin_unlock_irq(¤t->pi_lock); @@ -1301,9 +1324,10 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) + if (futex_handle_fault((unsigned long)uaddr, attempt)) { + ret = -EFAULT; goto out_unlock_release_sem; - + } goto retry_locked; } @@ -1427,9 +1451,11 @@ retry_locked: * again. If it succeeds then we can return without waking * anyone else up: */ - inc_preempt_count(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + } if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1462,9 +1488,11 @@ retry_locked: /* * No waiters - kernel unlocks the futex: */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; + if (!(uval & FUTEX_OWNER_DIED)) { + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + } out_unlock: spin_unlock(&hb->lock); @@ -1481,9 +1509,10 @@ pi_faulted: * still holding the mmap_sem. */ if (attempt++) { - if (futex_handle_fault((unsigned long)uaddr, attempt)) + if (futex_handle_fault((unsigned long)uaddr, attempt)) { + ret = -EFAULT; goto out_unlock; - + } goto retry_locked; } @@ -1683,9 +1712,9 @@ err_unlock: * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval; + u32 uval, nval, mval; retry: if (get_user(uval, uaddr)) @@ -1702,21 +1731,45 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. */ - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED); + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); + if (nval == -EFAULT) return -1; if (nval != uval) goto retry; - if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi) { + if (uval & FUTEX_WAITERS) + futex_wake(uaddr, 1); + } } return 0; } /* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user **head, int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long *)head)) + return -EFAULT; + + *entry = (void *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + +/* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * @@ -1726,14 +1779,14 @@ void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned long futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(entry, &head->list.next)) + if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: @@ -1744,10 +1797,11 @@ void exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(pending, &head->list_op_pending)) + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; + if (pending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pip); while (entry != &head->list) { /* @@ -1756,12 +1810,12 @@ void exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(entry, &entry->next)) + if (fetch_robust_entry(&entry, &entry->next, &pi)) return; /* * Avoid excessively long or circular lists: diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d1d92b441fb..c5cca3f65cb 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -12,6 +12,23 @@ #include <asm/uaccess.h> + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t *head, int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. @@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; compat_uptr_t uentry, upending; - unsigned int limit = ROBUST_LIST_LIMIT; compat_long_t futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(uentry, &head->list.next)) + if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; - entry = compat_ptr(uentry); /* * Fetch the relative futex offset: */ @@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(upending, &head->list_op_pending)) + if (fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pip)) return; - pending = compat_ptr(upending); if (upending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pip); while (compat_ptr(uentry) != &head->list) { /* @@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(uentry, (compat_uptr_t *)&entry->next)) + if (fetch_robust_entry(&uentry, &entry, + (compat_uptr_t *)&entry->next, &pi)) return; - entry = compat_ptr(uentry); /* * Avoid excessively long or circular lists: */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d17766d40da..21c38a7e666 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -187,7 +187,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) { struct hrtimer_base *new_base; - new_base = &__get_cpu_var(hrtimer_bases[base->index]); + new_base = &__get_cpu_var(hrtimer_bases)[base->index]; if (base != new_base) { /* @@ -835,7 +835,7 @@ static void migrate_hrtimers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit hrtimer_cpu_notify(struct notifier_block *self, +static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -859,7 +859,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata hrtimers_nb = { +static struct notifier_block __cpuinitdata hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index fc4e906aedb..48a53f68af9 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -20,6 +20,11 @@ /** * handle_bad_irq - handle spurious and unhandled irqs + * @irq: the interrupt number + * @desc: description of the interrupt + * @regs: pointer to a register structure + * + * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ void fastcall handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4e461438e48..92be519eff2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -137,16 +137,40 @@ EXPORT_SYMBOL(enable_irq); * @irq: interrupt to control * @on: enable/disable power management wakeup * - * Enable/disable power management wakeup mode + * Enable/disable power management wakeup mode, which is + * disabled by default. Enables and disables must match, + * just as they match for non-wakeup mode support. + * + * Wakeup mode lets this IRQ wake the system from sleep + * states like "suspend to RAM". */ int set_irq_wake(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_desc + irq; unsigned long flags; int ret = -ENXIO; + int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake; + /* wakeup-capable irqs can be shared between drivers that + * don't need to have the same sleep mode behaviors. + */ spin_lock_irqsave(&desc->lock, flags); - if (desc->chip->set_wake) + if (on) { + if (desc->wake_depth++ == 0) + desc->status |= IRQ_WAKEUP; + else + set_wake = NULL; + } else { + if (desc->wake_depth == 0) { + printk(KERN_WARNING "Unbalanced IRQ %d " + "wake disable\n", irq); + WARN_ON(1); + } else if (--desc->wake_depth == 0) + desc->status &= ~IRQ_WAKEUP; + else + set_wake = NULL; + } + if (set_wake) ret = desc->chip->set_wake(irq, on); spin_unlock_irqrestore(&desc->lock, flags); return ret; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 64aab081153..3f57dfdc8f9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -393,6 +393,7 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { copy_kprobe(p, ap); + flush_insn_slot(ap); ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; diff --git a/kernel/panic.c b/kernel/panic.c index d8a0bca2123..9b8dcfd1ca9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/nmi.h> #include <linux/kexec.h> +#include <linux/debug_locks.h> int panic_on_oops; int tainted; diff --git a/kernel/power/process.c b/kernel/power/process.c index b2a5f671d6c..72e72d2c61e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -66,13 +66,25 @@ static inline void freeze_process(struct task_struct *p) } } +static void cancel_freezing(struct task_struct *p) +{ + unsigned long flags; + + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + do_not_freeze(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_tsk(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +} + /* 0 = success, else # of processes that we failed to stop */ int freeze_processes(void) { int todo, nr_user, user_frozen; unsigned long start_time; struct task_struct *g, *p; - unsigned long flags; printk( "Stopping tasks: " ); start_time = jiffies; @@ -85,6 +97,10 @@ int freeze_processes(void) continue; if (frozen(p)) continue; + if (p->state == TASK_TRACED && frozen(p->parent)) { + cancel_freezing(p); + continue; + } if (p->mm && !(p->flags & PF_BORROWED_MM)) { /* The task is a user-space one. * Freeze it unless there's a vfork completion @@ -126,13 +142,7 @@ int freeze_processes(void) do_each_thread(g, p) { if (freezeable(p) && !frozen(p)) printk(KERN_ERR " %s\n", p->comm); - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - p->flags &= ~PF_FREEZE; - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_tsk(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } + cancel_freezing(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); return todo; diff --git a/kernel/printk.c b/kernel/printk.c index 65ca0688f86..1149365e989 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -799,6 +799,9 @@ void release_console_sem(void) up(&secondary_console_sem); return; } + + console_may_schedule = 0; + for ( ; ; ) { spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; @@ -812,7 +815,6 @@ void release_console_sem(void) local_irq_restore(flags); } console_locked = 0; - console_may_schedule = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) { diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 759805c9859..436ab35f6fa 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -548,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } -static int __devinit rcu_cpu_notify(struct notifier_block *self, +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -565,7 +565,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata rcu_nb = { +static struct notifier_block __cpuinitdata rcu_nb = { .notifier_call = rcu_cpu_notify, }; diff --git a/kernel/resource.c b/kernel/resource.c index 0dd3a857579..46286434af8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -244,6 +244,7 @@ int find_next_system_ram(struct resource *res) start = res->start; end = res->end; + BUG_ON(start >= end); read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { @@ -254,15 +255,17 @@ int find_next_system_ram(struct resource *res) p = NULL; break; } - if (p->start >= start) + if ((p->end >= start) && (p->start < end)) break; } read_unlock(&resource_lock); if (!p) return -1; /* copy data */ - res->start = p->start; - res->end = p->end; + if (res->start < p->start) + res->start = p->start; + if (res->end > p->end) + res->end = p->end; return 0; } #endif diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index d2ef13b485e..3e13a1e5856 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -7,6 +7,8 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * + * See Documentation/rt-mutex-design.txt for details. */ #include <linux/spinlock.h> #include <linux/module.h> diff --git a/kernel/sched.c b/kernel/sched.c index b44b9a43b0f..a234fbee123 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4162,10 +4162,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) read_unlock_irq(&tasklist_lock); return -ESRCH; } - get_task_struct(p); - read_unlock_irq(&tasklist_lock); retval = sched_setscheduler(p, policy, &lparam); - put_task_struct(p); + read_unlock_irq(&tasklist_lock); return retval; } @@ -4456,9 +4454,9 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline int __resched_legal(void) +static inline int __resched_legal(int expected_preempt_count) { - if (unlikely(preempt_count())) + if (unlikely(preempt_count() != expected_preempt_count)) return 0; if (unlikely(system_state != SYSTEM_RUNNING)) return 0; @@ -4484,7 +4482,7 @@ static void __cond_resched(void) int __sched cond_resched(void) { - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(0)) { __cond_resched(); return 1; } @@ -4510,7 +4508,7 @@ int cond_resched_lock(spinlock_t *lock) ret = 1; spin_lock(lock); } - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(1)) { spin_release(&lock->dep_map, 1, _THIS_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); @@ -4526,7 +4524,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(0)) { raw_local_irq_disable(); _local_bh_enable(); raw_local_irq_enable(); @@ -6494,7 +6492,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) for (i = 0; i < MAX_NUMNODES; i++) init_numa_sched_groups_power(sched_group_nodes[i]); - init_numa_sched_groups_power(sched_group_allnodes); + if (sched_group_allnodes) { + int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); + struct sched_group *sg = &sched_group_allnodes[group]; + + init_numa_sched_groups_power(sg); + } #endif /* Attach the domains */ @@ -6761,6 +6764,11 @@ void __init sched_init(void) } set_load_weight(&init_task); + +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); +#endif + /* * The boot idle thread does lazy MMU switching as well: */ diff --git a/kernel/signal.c b/kernel/signal.c index 7fe874d12fa..bfdb5686fa3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -791,22 +791,31 @@ out: /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. + * + * Note: If we unblock the signal, we always reset it to SIG_DFL, + * since we do not want to have a signal handler that was blocked + * be invoked when user space had explicitly blocked it. + * + * We don't want to have recursive SIGSEGV's etc, for example. */ - int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) { unsigned long int flags; - int ret; + int ret, blocked, ignored; + struct k_sigaction *action; spin_lock_irqsave(&t->sighand->siglock, flags); - if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { - t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; - } - if (sigismember(&t->blocked, sig)) { - sigdelset(&t->blocked, sig); + action = &t->sighand->action[sig-1]; + ignored = action->sa.sa_handler == SIG_IGN; + blocked = sigismember(&t->blocked, sig); + if (blocked || ignored) { + action->sa.sa_handler = SIG_DFL; + if (blocked) { + sigdelset(&t->blocked, sig); + recalc_sigpending_tsk(t); + } } - recalc_sigpending_tsk(t); ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); diff --git a/kernel/softirq.c b/kernel/softirq.c index 0f08a84ae30..3789ca98197 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -65,6 +65,7 @@ static inline void wakeup_softirqd(void) * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ +#ifdef CONFIG_TRACE_IRQFLAGS static void __local_bh_disable(unsigned long ip) { unsigned long flags; @@ -80,6 +81,13 @@ static void __local_bh_disable(unsigned long ip) trace_softirqs_off(ip); raw_local_irq_restore(flags); } +#else /* !CONFIG_TRACE_IRQFLAGS */ +static inline void __local_bh_disable(unsigned long ip) +{ + add_preempt_count(SOFTIRQ_OFFSET); + barrier(); +} +#endif /* CONFIG_TRACE_IRQFLAGS */ void local_bh_disable(void) { @@ -121,12 +129,16 @@ EXPORT_SYMBOL(_local_bh_enable); void local_bh_enable(void) { +#ifdef CONFIG_TRACE_IRQFLAGS unsigned long flags; WARN_ON_ONCE(in_irq()); +#endif WARN_ON_ONCE(irqs_disabled()); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_save(flags); +#endif /* * Are softirqs going to be turned on now: */ @@ -142,18 +154,22 @@ void local_bh_enable(void) do_softirq(); dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_restore(flags); +#endif preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable); void local_bh_enable_ip(unsigned long ip) { +#ifdef CONFIG_TRACE_IRQFLAGS unsigned long flags; WARN_ON_ONCE(in_irq()); local_irq_save(flags); +#endif /* * Are softirqs going to be turned on now: */ @@ -169,7 +185,9 @@ void local_bh_enable_ip(unsigned long ip) do_softirq(); dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_restore(flags); +#endif preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable_ip); @@ -547,7 +565,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit cpu_callback(struct notifier_block *nfb, +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -587,7 +605,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 6b76caa2298..03e6a2b0b78 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) /* * Create/destroy watchdog threads as CPUs come and go: */ -static int __devinit +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index dcfb5d73146..51cacd111db 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -111,7 +111,6 @@ static int stop_machine(void) /* If some failed, kill them all. */ if (ret < 0) { stopmachine_set_state(STOPMACHINE_EXIT); - up(&stopmachine_mutex); return ret; } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f45179ce028..e7818765733 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -121,46 +121,45 @@ static int send_reply(struct sk_buff *skb, pid_t pid) /* * Send taskstats data in @skb to listeners registered for @cpu's exit data */ -static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); struct listener_list *listeners; struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); - int rc, ret, delcount = 0; + int rc, delcount = 0; rc = genlmsg_end(skb, reply); if (rc < 0) { nlmsg_free(skb); - return rc; + return; } rc = 0; listeners = &per_cpu(listener_array, cpu); down_read(&listeners->sem); - list_for_each_entry_safe(s, tmp, &listeners->list, list) { + list_for_each_entry(s, &listeners->list, list) { skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { skb_next = skb_clone(skb_cur, GFP_KERNEL); - if (!skb_next) { - nlmsg_free(skb_cur); - rc = -ENOMEM; + if (!skb_next) break; - } } - ret = genlmsg_unicast(skb_cur, s->pid); - if (ret == -ECONNREFUSED) { + rc = genlmsg_unicast(skb_cur, s->pid); + if (rc == -ECONNREFUSED) { s->valid = 0; delcount++; - rc = ret; } skb_cur = skb_next; } up_read(&listeners->sem); + if (skb_cur) + nlmsg_free(skb_cur); + if (!delcount) - return rc; + return; /* Delete invalidated entries */ down_write(&listeners->sem); @@ -171,13 +170,12 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) } } up_write(&listeners->sem); - return rc; } static int fill_pid(pid_t pid, struct task_struct *pidtsk, struct taskstats *stats) { - int rc; + int rc = 0; struct task_struct *tsk = pidtsk; if (!pidtsk) { @@ -196,12 +194,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, * Each accounting subsystem adds calls to its functions to * fill in relevant parts of struct taskstsats as follows * - * rc = per-task-foo(stats, tsk); - * if (rc) - * goto err; + * per-task-foo(stats, tsk); */ - rc = delayacct_add_tsk(stats, tsk); + delayacct_add_tsk(stats, tsk); stats->version = TASKSTATS_VERSION; /* Define err: label here if needed */ diff --git a/kernel/timer.c b/kernel/timer.c index 05809c2e2fd..1d7dd6267c2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t; tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; +static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) @@ -408,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * This function cascades all vectors and executes all expired timer * vectors. */ -#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) static inline void __run_timers(tvec_base_t *base) { @@ -1324,46 +1324,19 @@ asmlinkage long sys_getpid(void) } /* - * Accessing ->group_leader->real_parent is not SMP-safe, it could - * change from under us. However, rather than getting any lock - * we can use an optimistic algorithm: get the parent - * pid, and go back and check that the parent is still - * the same. If it has changed (which is extremely unlikely - * indeed), we just try again.. - * - * NOTE! This depends on the fact that even if we _do_ - * get an old value of "parent", we can happily dereference - * the pointer (it was and remains a dereferencable kernel pointer - * no matter what): we just can't necessarily trust the result - * until we know that the parent pointer is valid. - * - * NOTE2: ->group_leader never changes from under us. + * Accessing ->real_parent is not SMP-safe, it could + * change from under us. However, we can use a stale + * value of ->real_parent under rcu_read_lock(), see + * release_task()->call_rcu(delayed_put_task_struct). */ asmlinkage long sys_getppid(void) { int pid; - struct task_struct *me = current; - struct task_struct *parent; - parent = me->group_leader->real_parent; - for (;;) { - pid = parent->tgid; -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) -{ - struct task_struct *old = parent; + rcu_read_lock(); + pid = rcu_dereference(current->real_parent)->tgid; + rcu_read_unlock(); - /* - * Make sure we read the pid before re-reading the - * parent pointer: - */ - smp_rmb(); - parent = me->group_leader->real_parent; - if (old != parent) - continue; -} -#endif - break; - } return pid; } @@ -1688,7 +1661,7 @@ static void __devinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit timer_cpu_notify(struct notifier_block *self, +static int __cpuinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1708,7 +1681,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata timers_nb = { +static struct notifier_block __cpuinitdata timers_nb = { .notifier_call = timer_cpu_notify, }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eebb1d83923..835fe28b87a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,7 +68,7 @@ struct workqueue_struct { /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove threads to each one as cpus come/go. */ -static DEFINE_SPINLOCK(workqueue_lock); +static DEFINE_MUTEX(workqueue_mutex); static LIST_HEAD(workqueues); static int singlethread_cpu; @@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, spin_unlock_irqrestore(&cwq->lock, flags); } -/* - * Queue work on a workqueue. Return non-zero if it was successfully - * added. +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns non-zero if it was successfully added. * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. @@ -128,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data) __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @work: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns non-zero if it was successfully added. + */ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) { @@ -150,6 +161,15 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work); +/** + * queue_delayed_work_on - queue work on specific CPU after delay + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns non-zero if it was successfully added. + */ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) { @@ -275,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } } -/* +/** * flush_workqueue - ensure that any scheduled work has run to completion. + * @wq: workqueue to flush * * Forces execution of the workqueue and blocks until its completion. * This is typically used in driver shutdown handlers. @@ -299,10 +320,10 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) } else { int cpu; - lock_cpu_hotplug(); + mutex_lock(&workqueue_mutex); for_each_online_cpu(cpu) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); - unlock_cpu_hotplug(); + mutex_unlock(&workqueue_mutex); } } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -350,8 +371,7 @@ struct workqueue_struct *__create_workqueue(const char *name, } wq->name = name; - /* We don't need the distraction of CPUs appearing and vanishing. */ - lock_cpu_hotplug(); + mutex_lock(&workqueue_mutex); if (singlethread) { INIT_LIST_HEAD(&wq->list); p = create_workqueue_thread(wq, singlethread_cpu); @@ -360,9 +380,7 @@ struct workqueue_struct *__create_workqueue(const char *name, else wake_up_process(p); } else { - spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); for_each_online_cpu(cpu) { p = create_workqueue_thread(wq, cpu); if (p) { @@ -372,7 +390,7 @@ struct workqueue_struct *__create_workqueue(const char *name, destroy = 1; } } - unlock_cpu_hotplug(); + mutex_unlock(&workqueue_mutex); /* * Was there any error during startup? If yes then clean up: @@ -400,6 +418,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) kthread_stop(p); } +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. + */ void destroy_workqueue(struct workqueue_struct *wq) { int cpu; @@ -407,17 +431,15 @@ void destroy_workqueue(struct workqueue_struct *wq) flush_workqueue(wq); /* We don't need the distraction of CPUs appearing and vanishing. */ - lock_cpu_hotplug(); + mutex_lock(&workqueue_mutex); if (is_single_threaded(wq)) cleanup_workqueue_thread(wq, singlethread_cpu); else { for_each_online_cpu(cpu) cleanup_workqueue_thread(wq, cpu); - spin_lock(&workqueue_lock); list_del(&wq->list); - spin_unlock(&workqueue_lock); } - unlock_cpu_hotplug(); + mutex_unlock(&workqueue_mutex); free_percpu(wq->cpu_wq); kfree(wq); } @@ -425,18 +447,41 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); static struct workqueue_struct *keventd_wq; +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * This puts a job in the kernel-global workqueue. + */ int fastcall schedule_work(struct work_struct *work) { return queue_work(keventd_wq, work); } EXPORT_SYMBOL(schedule_work); +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @work: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) { return queue_delayed_work(keventd_wq, work, delay); } EXPORT_SYMBOL(schedule_delayed_work); +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @work: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. + */ int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay) { @@ -465,11 +510,13 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info) if (!works) return -ENOMEM; + mutex_lock(&workqueue_mutex); for_each_online_cpu(cpu) { INIT_WORK(per_cpu_ptr(works, cpu), func, info); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), per_cpu_ptr(works, cpu)); } + mutex_unlock(&workqueue_mutex); flush_workqueue(keventd_wq); free_percpu(works); return 0; @@ -585,6 +632,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: + mutex_lock(&workqueue_mutex); /* Create a new workqueue thread for it. */ list_for_each_entry(wq, &workqueues, list) { if (!create_workqueue_thread(wq, hotcpu)) { @@ -603,6 +651,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, kthread_bind(cwq->thread, hotcpu); wake_up_process(cwq->thread); } + mutex_unlock(&workqueue_mutex); break; case CPU_UP_CANCELED: @@ -614,6 +663,15 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, any_online_cpu(cpu_online_map)); cleanup_workqueue_thread(wq, hotcpu); } + mutex_unlock(&workqueue_mutex); + break; + + case CPU_DOWN_PREPARE: + mutex_lock(&workqueue_mutex); + break; + + case CPU_DOWN_FAILED: + mutex_unlock(&workqueue_mutex); break; case CPU_DEAD: @@ -621,6 +679,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, cleanup_workqueue_thread(wq, hotcpu); list_for_each_entry(wq, &workqueues, list) take_over_work(wq, hotcpu); + mutex_unlock(&workqueue_mutex); break; } |