diff options
Diffstat (limited to 'kernel')
50 files changed, 1867 insertions, 1108 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 2a202a84675..382dd5a8b2d 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS + def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS diff --git a/kernel/Makefile b/kernel/Makefile index 15ab63ffe64..54f69837d35 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ +obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ @@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -mno-spe -pg endif +obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/acct.c b/kernel/acct.c index 91e1cfd734d..dd68b905941 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -75,37 +75,39 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct pid_namespace *ns, struct file *); +static void do_acct_process(struct bsd_acct_struct *acct, + struct pid_namespace *ns, struct file *); /* * This structure is used so that all the data protected by lock * can be placed in the same cache line as the lock. This primes * the cache line to have the data after getting the lock. */ -struct acct_glbs { - spinlock_t lock; +struct bsd_acct_struct { volatile int active; volatile int needcheck; struct file *file; struct pid_namespace *ns; struct timer_list timer; + struct list_head list; }; -static struct acct_glbs acct_globals __cacheline_aligned = - {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; +static DEFINE_SPINLOCK(acct_lock); +static LIST_HEAD(acct_list); /* * Called whenever the timer says to check the free space. */ -static void acct_timeout(unsigned long unused) +static void acct_timeout(unsigned long x) { - acct_globals.needcheck = 1; + struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x; + acct->needcheck = 1; } /* * Check the amount of free space and suspend/resume accordingly. */ -static int check_free_space(struct file *file) +static int check_free_space(struct bsd_acct_struct *acct, struct file *file) { struct kstatfs sbuf; int res; @@ -113,11 +115,11 @@ static int check_free_space(struct file *file) sector_t resume; sector_t suspend; - spin_lock(&acct_globals.lock); - res = acct_globals.active; - if (!file || !acct_globals.needcheck) + spin_lock(&acct_lock); + res = acct->active; + if (!file || !acct->needcheck) goto out; - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); /* May block */ if (vfs_statfs(file->f_path.dentry, &sbuf)) @@ -136,35 +138,35 @@ static int check_free_space(struct file *file) act = 0; /* - * If some joker switched acct_globals.file under us we'ld better be + * If some joker switched acct->file under us we'ld better be * silent and _not_ touch anything. */ - spin_lock(&acct_globals.lock); - if (file != acct_globals.file) { + spin_lock(&acct_lock); + if (file != acct->file) { if (act) res = act>0; goto out; } - if (acct_globals.active) { + if (acct->active) { if (act < 0) { - acct_globals.active = 0; + acct->active = 0; printk(KERN_INFO "Process accounting paused\n"); } } else { if (act > 0) { - acct_globals.active = 1; + acct->active = 1; printk(KERN_INFO "Process accounting resumed\n"); } } - del_timer(&acct_globals.timer); - acct_globals.needcheck = 0; - acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct_globals.timer); - res = acct_globals.active; + del_timer(&acct->timer); + acct->needcheck = 0; + acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; + add_timer(&acct->timer); + res = acct->active; out: - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); return res; } @@ -172,39 +174,41 @@ out: * Close the old accounting file (if currently open) and then replace * it with file (if non-NULL). * - * NOTE: acct_globals.lock MUST be held on entry and exit. + * NOTE: acct_lock MUST be held on entry and exit. */ -static void acct_file_reopen(struct file *file) +static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, + struct pid_namespace *ns) { struct file *old_acct = NULL; struct pid_namespace *old_ns = NULL; - if (acct_globals.file) { - old_acct = acct_globals.file; - old_ns = acct_globals.ns; - del_timer(&acct_globals.timer); - acct_globals.active = 0; - acct_globals.needcheck = 0; - acct_globals.file = NULL; + if (acct->file) { + old_acct = acct->file; + old_ns = acct->ns; + del_timer(&acct->timer); + acct->active = 0; + acct->needcheck = 0; + acct->file = NULL; + acct->ns = NULL; + list_del(&acct->list); } if (file) { - acct_globals.file = file; - acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); - acct_globals.needcheck = 0; - acct_globals.active = 1; + acct->file = file; + acct->ns = ns; + acct->needcheck = 0; + acct->active = 1; + list_add(&acct->list, &acct_list); /* It's been deleted if it was used before so this is safe */ - init_timer(&acct_globals.timer); - acct_globals.timer.function = acct_timeout; - acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct_globals.timer); + setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); + acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; + add_timer(&acct->timer); } if (old_acct) { mnt_unpin(old_acct->f_path.mnt); - spin_unlock(&acct_globals.lock); - do_acct_process(old_ns, old_acct); + spin_unlock(&acct_lock); + do_acct_process(acct, old_ns, old_acct); filp_close(old_acct, NULL); - put_pid_ns(old_ns); - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); } } @@ -212,6 +216,8 @@ static int acct_on(char *name) { struct file *file; int error; + struct pid_namespace *ns; + struct bsd_acct_struct *acct = NULL; /* Difference from BSD - they don't do O_APPEND */ file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); @@ -228,18 +234,34 @@ static int acct_on(char *name) return -EIO; } + ns = task_active_pid_ns(current); + if (ns->bacct == NULL) { + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (acct == NULL) { + filp_close(file, NULL); + return -ENOMEM; + } + } + error = security_acct(file); if (error) { + kfree(acct); filp_close(file, NULL); return error; } - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); + if (ns->bacct == NULL) { + ns->bacct = acct; + acct = NULL; + } + mnt_pin(file->f_path.mnt); - acct_file_reopen(file); - spin_unlock(&acct_globals.lock); + acct_file_reopen(ns->bacct, file, ns); + spin_unlock(&acct_lock); mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ + kfree(acct); return 0; } @@ -269,11 +291,17 @@ asmlinkage long sys_acct(const char __user *name) error = acct_on(tmp); putname(tmp); } else { + struct bsd_acct_struct *acct; + + acct = task_active_pid_ns(current)->bacct; + if (acct == NULL) + return 0; + error = security_acct(NULL); if (!error) { - spin_lock(&acct_globals.lock); - acct_file_reopen(NULL); - spin_unlock(&acct_globals.lock); + spin_lock(&acct_lock); + acct_file_reopen(acct, NULL, NULL); + spin_unlock(&acct_lock); } } return error; @@ -288,10 +316,16 @@ asmlinkage long sys_acct(const char __user *name) */ void acct_auto_close_mnt(struct vfsmount *m) { - spin_lock(&acct_globals.lock); - if (acct_globals.file && acct_globals.file->f_path.mnt == m) - acct_file_reopen(NULL); - spin_unlock(&acct_globals.lock); + struct bsd_acct_struct *acct; + + spin_lock(&acct_lock); +restart: + list_for_each_entry(acct, &acct_list, list) + if (acct->file && acct->file->f_path.mnt == m) { + acct_file_reopen(acct, NULL, NULL); + goto restart; + } + spin_unlock(&acct_lock); } /** @@ -303,12 +337,31 @@ void acct_auto_close_mnt(struct vfsmount *m) */ void acct_auto_close(struct super_block *sb) { - spin_lock(&acct_globals.lock); - if (acct_globals.file && - acct_globals.file->f_path.mnt->mnt_sb == sb) { - acct_file_reopen(NULL); + struct bsd_acct_struct *acct; + + spin_lock(&acct_lock); +restart: + list_for_each_entry(acct, &acct_list, list) + if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { + acct_file_reopen(acct, NULL, NULL); + goto restart; + } + spin_unlock(&acct_lock); +} + +void acct_exit_ns(struct pid_namespace *ns) +{ + struct bsd_acct_struct *acct; + + spin_lock(&acct_lock); + acct = ns->bacct; + if (acct != NULL) { + if (acct->file != NULL) + acct_file_reopen(acct, NULL, NULL); + + kfree(acct); } - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); } /* @@ -425,7 +478,8 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(struct pid_namespace *ns, struct file *file) +static void do_acct_process(struct bsd_acct_struct *acct, + struct pid_namespace *ns, struct file *file) { struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; @@ -440,7 +494,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file) * First check to see if there is enough free_space to continue * the process accounting system. */ - if (!check_free_space(file)) + if (!check_free_space(acct, file)) return; /* @@ -577,34 +631,46 @@ void acct_collect(long exitcode, int group_dead) spin_unlock_irq(¤t->sighand->siglock); } -/** - * acct_process - now just a wrapper around do_acct_process - * @exitcode: task exit code - * - * handles process accounting for an exiting task - */ -void acct_process(void) +static void acct_process_in_ns(struct pid_namespace *ns) { struct file *file = NULL; - struct pid_namespace *ns; + struct bsd_acct_struct *acct; + acct = ns->bacct; /* * accelerate the common fastpath: */ - if (!acct_globals.file) + if (!acct || !acct->file) return; - spin_lock(&acct_globals.lock); - file = acct_globals.file; + spin_lock(&acct_lock); + file = acct->file; if (unlikely(!file)) { - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); return; } get_file(file); - ns = get_pid_ns(acct_globals.ns); - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); - do_acct_process(ns, file); + do_acct_process(acct, ns, file); fput(file); - put_pid_ns(ns); +} + +/** + * acct_process - now just a wrapper around acct_process_in_ns, + * which in turn is a wrapper around do_acct_process. + * + * handles process accounting for an exiting task + */ +void acct_process(void) +{ + struct pid_namespace *ns; + + /* + * This loop is safe lockless, since current is still + * alive and holds its namespace, which in turn holds + * its parent. + */ + for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) + acct_process_in_ns(ns); } diff --git a/kernel/capability.c b/kernel/capability.c index 901e0fdc3ff..0101e847603 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -115,11 +115,208 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) return 0; } +#ifndef CONFIG_SECURITY_FILE_CAPABILITIES + +/* + * Without filesystem capability support, we nominally support one process + * setting the capabilities of another + */ +static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, + kernel_cap_t *pIp, kernel_cap_t *pPp) +{ + struct task_struct *target; + int ret; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + if (pid && pid != task_pid_vnr(current)) { + target = find_task_by_vpid(pid); + if (!target) { + ret = -ESRCH; + goto out; + } + } else + target = current; + + ret = security_capget(target, pEp, pIp, pPp); + +out: + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + return ret; +} + +/* + * cap_set_pg - set capabilities for all processes in a given process + * group. We call this holding task_capability_lock and tasklist_lock. + */ +static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + struct task_struct *g, *target; + int ret = -EPERM; + int found = 0; + struct pid *pgrp; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + pgrp = find_vpid(pgrp_nr); + do_each_pid_task(pgrp, PIDTYPE_PGID, g) { + target = g; + while_each_thread(g, target) { + if (!security_capset_check(target, effective, + inheritable, permitted)) { + security_capset_set(target, effective, + inheritable, permitted); + ret = 0; + } + found = 1; + } + } while_each_pid_task(pgrp, PIDTYPE_PGID, g); + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + if (!found) + ret = 0; + return ret; +} + /* - * For sys_getproccap() and sys_setproccap(), any of the three - * capability set pointers may be NULL -- indicating that that set is - * uninteresting and/or not to be changed. + * cap_set_all - set capabilities for all processes other than init + * and self. We call this holding task_capability_lock and tasklist_lock. */ +static inline int cap_set_all(kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + struct task_struct *g, *target; + int ret = -EPERM; + int found = 0; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + do_each_thread(g, target) { + if (target == current + || is_container_init(target->group_leader)) + continue; + found = 1; + if (security_capset_check(target, effective, inheritable, + permitted)) + continue; + ret = 0; + security_capset_set(target, effective, inheritable, permitted); + } while_each_thread(g, target); + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + if (!found) + ret = 0; + + return ret; +} + +/* + * Given the target pid does not refer to the current process we + * need more elaborate support... (This support is not present when + * filesystem capabilities are configured.) + */ +static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + struct task_struct *target; + int ret; + + if (!capable(CAP_SETPCAP)) + return -EPERM; + + if (pid == -1) /* all procs other than current and init */ + return cap_set_all(effective, inheritable, permitted); + + else if (pid < 0) /* all procs in process group */ + return cap_set_pg(-pid, effective, inheritable, permitted); + + /* target != current */ + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + target = find_task_by_vpid(pid); + if (!target) + ret = -ESRCH; + else { + ret = security_capset_check(target, effective, inheritable, + permitted); + + /* having verified that the proposed changes are legal, + we now put them into effect. */ + if (!ret) + security_capset_set(target, effective, inheritable, + permitted); + } + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + + return ret; +} + +#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */ + +/* + * If we have configured with filesystem capability support, then the + * only thing that can change the capabilities of the current process + * is the current process. As such, we can't be in this code at the + * same time as we are in the process of setting capabilities in this + * process. The net result is that we can limit our use of locks to + * when we are reading the caps of another process. + */ +static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, + kernel_cap_t *pIp, kernel_cap_t *pPp) +{ + int ret; + + if (pid && (pid != task_pid_vnr(current))) { + struct task_struct *target; + + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + + target = find_task_by_vpid(pid); + if (!target) + ret = -ESRCH; + else + ret = security_capget(target, pEp, pIp, pPp); + + read_unlock(&tasklist_lock); + spin_unlock(&task_capability_lock); + } else + ret = security_capget(current, pEp, pIp, pPp); + + return ret; +} + +/* + * With filesystem capability support configured, the kernel does not + * permit the changing of capabilities in one process by another + * process. (CAP_SETPCAP has much less broad semantics when configured + * this way.) + */ +static inline int do_sys_capset_other_tasks(pid_t pid, + kernel_cap_t *effective, + kernel_cap_t *inheritable, + kernel_cap_t *permitted) +{ + return -EPERM; +} + +#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */ /* * Atomically modify the effective capabilities returning the original @@ -155,7 +352,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { int ret = 0; pid_t pid; - struct task_struct *target; unsigned tocopy; kernel_cap_t pE, pI, pP; @@ -169,23 +365,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) if (pid < 0) return -EINVAL; - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - if (pid && pid != task_pid_vnr(current)) { - target = find_task_by_vpid(pid); - if (!target) { - ret = -ESRCH; - goto out; - } - } else - target = current; - - ret = security_capget(target, &pE, &pI, &pP); - -out: - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); + ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (!ret) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; @@ -216,7 +396,6 @@ out: * before modification is attempted and the application * fails. */ - if (copy_to_user(dataptr, kdata, tocopy * sizeof(struct __user_cap_data_struct))) { return -EFAULT; @@ -226,70 +405,8 @@ out: return ret; } -/* - * cap_set_pg - set capabilities for all processes in a given process - * group. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - struct pid *pgrp; - - pgrp = find_vpid(pgrp_nr); - do_each_pid_task(pgrp, PIDTYPE_PGID, g) { - target = g; - while_each_thread(g, target) { - if (!security_capset_check(target, effective, - inheritable, - permitted)) { - security_capset_set(target, effective, - inheritable, - permitted); - ret = 0; - } - found = 1; - } - } while_each_pid_task(pgrp, PIDTYPE_PGID, g); - - if (!found) - ret = 0; - return ret; -} - -/* - * cap_set_all - set capabilities for all processes other than init - * and self. We call this holding task_capability_lock and tasklist_lock. - */ -static inline int cap_set_all(kernel_cap_t *effective, - kernel_cap_t *inheritable, - kernel_cap_t *permitted) -{ - struct task_struct *g, *target; - int ret = -EPERM; - int found = 0; - - do_each_thread(g, target) { - if (target == current || is_container_init(target->group_leader)) - continue; - found = 1; - if (security_capset_check(target, effective, inheritable, - permitted)) - continue; - ret = 0; - security_capset_set(target, effective, inheritable, permitted); - } while_each_thread(g, target); - - if (!found) - ret = 0; - return ret; -} - /** - * sys_capset - set capabilities for a process or a group of processes + * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, @@ -313,7 +430,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; - struct task_struct *target; int ret; pid_t pid; @@ -324,9 +440,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) if (get_user(pid, &header->pid)) return -EFAULT; - if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP)) - return -EPERM; - if (copy_from_user(&kdata, data, tocopy * sizeof(struct __user_cap_data_struct))) { return -EFAULT; @@ -344,40 +457,31 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) i++; } - spin_lock(&task_capability_lock); - read_lock(&tasklist_lock); - - if (pid > 0 && pid != task_pid_vnr(current)) { - target = find_task_by_vpid(pid); - if (!target) { - ret = -ESRCH; - goto out; - } - } else - target = current; - - ret = 0; - - /* having verified that the proposed changes are legal, - we now put them into effect. */ - if (pid < 0) { - if (pid == -1) /* all procs other than current and init */ - ret = cap_set_all(&effective, &inheritable, &permitted); + if (pid && (pid != task_pid_vnr(current))) + ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, + &permitted); + else { + /* + * This lock is required even when filesystem + * capability support is configured - it protects the + * sys_capget() call from returning incorrect data in + * the case that the targeted process is not the + * current one. + */ + spin_lock(&task_capability_lock); - else /* all procs in process group */ - ret = cap_set_pg(-pid, &effective, &inheritable, - &permitted); - } else { - ret = security_capset_check(target, &effective, &inheritable, + ret = security_capset_check(current, &effective, &inheritable, &permitted); + /* + * Having verified that the proposed changes are + * legal, we now put them into effect. + */ if (!ret) - security_capset_set(target, &effective, &inheritable, + security_capset_set(current, &effective, &inheritable, &permitted); + spin_unlock(&task_capability_lock); } -out: - read_unlock(&tasklist_lock); - spin_unlock(&task_capability_lock); return ret; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15ac0e1e4f4..89bd6fb7894 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -89,11 +89,7 @@ struct cgroupfs_root { /* Hierarchy-specific flags */ unsigned long flags; - /* The path to use for release notifications. No locking - * between setting and use - so if userspace updates this - * while child cgroups exist, you could miss a - * notification. We ensure that it's always a valid - * NUL-terminated string */ + /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; }; @@ -118,7 +114,7 @@ static int root_count; * extra work in the fork/exit path if none of the subsystems need to * be called. */ -static int need_forkexit_callback; +static int need_forkexit_callback __read_mostly; static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ @@ -220,7 +216,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups * compiled into their kernel but not actually in use */ -static int use_task_css_set_links; +static int use_task_css_set_links __read_mostly; /* When we create or destroy a css_set, the operation simply * takes/releases a reference count on all the cgroups referenced @@ -241,17 +237,20 @@ static int use_task_css_set_links; */ static void unlink_css_set(struct css_set *cg) { + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + write_lock(&css_set_lock); hlist_del(&cg->hlist); css_set_count--; - while (!list_empty(&cg->cg_links)) { - struct cg_cgroup_link *link; - link = list_entry(cg->cg_links.next, - struct cg_cgroup_link, cg_link_list); + + list_for_each_entry_safe(link, saved_link, &cg->cg_links, + cg_link_list) { list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); kfree(link); } + write_unlock(&css_set_lock); } @@ -363,15 +362,14 @@ static struct css_set *find_existing_css_set( static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; int i; INIT_LIST_HEAD(tmp); for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - while (!list_empty(tmp)) { - link = list_entry(tmp->next, - struct cg_cgroup_link, - cgrp_link_list); + list_for_each_entry_safe(link, saved_link, tmp, + cgrp_link_list) { list_del(&link->cgrp_link_list); kfree(link); } @@ -384,11 +382,10 @@ static int allocate_cg_links(int count, struct list_head *tmp) static void free_cg_links(struct list_head *tmp) { - while (!list_empty(tmp)) { - struct cg_cgroup_link *link; - link = list_entry(tmp->next, - struct cg_cgroup_link, - cgrp_link_list); + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + + list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { list_del(&link->cgrp_link_list); kfree(link); } @@ -415,11 +412,11 @@ static struct css_set *find_css_set( /* First see if we already have a cgroup group that matches * the desired set */ - write_lock(&css_set_lock); + read_lock(&css_set_lock); res = find_existing_css_set(oldcg, cgrp, template); if (res) get_css_set(res); - write_unlock(&css_set_lock); + read_unlock(&css_set_lock); if (res) return res; @@ -507,10 +504,6 @@ static struct css_set *find_css_set( * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * - * The cgroup_common_file_write handler for operations that modify - * the cgroup hierarchy holds cgroup_mutex across the entire operation, - * single threading all such cgroup modifications across the system. - * * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't * (usually) take cgroup_mutex. These are the two most performance * critical pieces of code here. The exception occurs on cgroup_exit(), @@ -1093,6 +1086,8 @@ static void cgroup_kill_sb(struct super_block *sb) { struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; int ret; + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; BUG_ON(!root); @@ -1112,10 +1107,9 @@ static void cgroup_kill_sb(struct super_block *sb) { * root cgroup */ write_lock(&css_set_lock); - while (!list_empty(&cgrp->css_sets)) { - struct cg_cgroup_link *link; - link = list_entry(cgrp->css_sets.next, - struct cg_cgroup_link, cgrp_link_list); + + list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, + cgrp_link_list) { list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); kfree(link); @@ -1281,18 +1275,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } /* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with - * cgroup_mutex, may take task_lock of task + * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex + * held. May take task_lock of task */ -static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) { - pid_t pid; struct task_struct *tsk; int ret; - if (sscanf(pidbuf, "%d", &pid) != 1) - return -EIO; - if (pid) { rcu_read_lock(); tsk = find_task_by_vpid(pid); @@ -1318,6 +1308,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) return ret; } +static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +{ + int ret; + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + ret = attach_task_by_pid(cgrp, pid); + cgroup_unlock(); + return ret; +} + /* The various types of files and directories in a cgroup file system */ enum cgroup_filetype { FILE_ROOT, @@ -1327,12 +1327,54 @@ enum cgroup_filetype { FILE_RELEASE_AGENT, }; +/** + * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. + * @cgrp: the cgroup to be checked for liveness + * + * On success, returns true; the lock should be later released with + * cgroup_unlock(). On failure returns false with no lock held. + */ +bool cgroup_lock_live_group(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_mutex); + if (cgroup_is_removed(cgrp)) { + mutex_unlock(&cgroup_mutex); + return false; + } + return true; +} + +static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + strcpy(cgrp->root->release_agent_path, buffer); + cgroup_unlock(); + return 0; +} + +static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *seq) +{ + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + seq_puts(seq, cgrp->root->release_agent_path); + seq_putc(seq, '\n'); + cgroup_unlock(); + return 0; +} + +/* A buffer size big enough for numbers or short strings */ +#define CGROUP_LOCAL_BUFFER_SIZE 64 + static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) { - char buffer[64]; + char buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; char *end; @@ -1361,68 +1403,36 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, return retval; } -static ssize_t cgroup_common_file_write(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) { - enum cgroup_filetype type = cft->private; - char *buffer; + char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; + size_t max_bytes = cft->max_write_len; + char *buffer = local_buffer; - if (nbytes >= PATH_MAX) + if (!max_bytes) + max_bytes = sizeof(local_buffer) - 1; + if (nbytes >= max_bytes) return -E2BIG; - - /* +1 for nul-terminator */ - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - - if (copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out1; + /* Allocate a dynamic buffer if we need one */ + if (nbytes >= sizeof(local_buffer)) { + buffer = kmalloc(nbytes + 1, GFP_KERNEL); + if (buffer == NULL) + return -ENOMEM; } - buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); /* strip -just- trailing whitespace */ - - mutex_lock(&cgroup_mutex); + if (nbytes && copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; - /* - * This was already checked for in cgroup_file_write(), but - * check again now we're holding cgroup_mutex. - */ - if (cgroup_is_removed(cgrp)) { - retval = -ENODEV; - goto out2; - } - - switch (type) { - case FILE_TASKLIST: - retval = attach_task_by_pid(cgrp, buffer); - break; - case FILE_NOTIFY_ON_RELEASE: - clear_bit(CGRP_RELEASABLE, &cgrp->flags); - if (simple_strtoul(buffer, NULL, 10) != 0) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - break; - case FILE_RELEASE_AGENT: - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - strcpy(cgrp->root->release_agent_path, buffer); - break; - default: - retval = -EINVAL; - goto out2; - } - - if (retval == 0) + buffer[nbytes] = 0; /* nul-terminate */ + strstrip(buffer); + retval = cft->write_string(cgrp, cft, buffer); + if (!retval) retval = nbytes; -out2: - mutex_unlock(&cgroup_mutex); -out1: - kfree(buffer); + if (buffer != local_buffer) + kfree(buffer); return retval; } @@ -1438,6 +1448,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, return cft->write(cgrp, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + if (cft->write_string) + return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); if (cft->trigger) { int ret = cft->trigger(cgrp, (unsigned int)cft->private); return ret ? ret : nbytes; @@ -1450,7 +1462,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, char __user *buf, size_t nbytes, loff_t *ppos) { - char tmp[64]; + char tmp[CGROUP_LOCAL_BUFFER_SIZE]; u64 val = cft->read_u64(cgrp, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); @@ -1462,56 +1474,13 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, char __user *buf, size_t nbytes, loff_t *ppos) { - char tmp[64]; + char tmp[CGROUP_LOCAL_BUFFER_SIZE]; s64 val = cft->read_s64(cgrp, cft); int len = sprintf(tmp, "%lld\n", (long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } -static ssize_t cgroup_common_file_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, - char __user *buf, - size_t nbytes, loff_t *ppos) -{ - enum cgroup_filetype type = cft->private; - char *page; - ssize_t retval = 0; - char *s; - - if (!(page = (char *)__get_free_page(GFP_KERNEL))) - return -ENOMEM; - - s = page; - - switch (type) { - case FILE_RELEASE_AGENT: - { - struct cgroupfs_root *root; - size_t n; - mutex_lock(&cgroup_mutex); - root = cgrp->root; - n = strnlen(root->release_agent_path, - sizeof(root->release_agent_path)); - n = min(n, (size_t) PAGE_SIZE); - strncpy(s, root->release_agent_path, n); - mutex_unlock(&cgroup_mutex); - s += n; - break; - } - default: - retval = -EINVAL; - goto out; - } - *s++ = '\n'; - - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); -out: - free_page((unsigned long)page); - return retval; -} - static ssize_t cgroup_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { @@ -1560,7 +1529,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) return cft->read_seq_string(state->cgroup, cft, m); } -int cgroup_seqfile_release(struct inode *inode, struct file *file) +static int cgroup_seqfile_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); @@ -1569,6 +1538,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file) static struct file_operations cgroup_seqfile_operations = { .read = seq_read, + .write = cgroup_file_write, .llseek = seq_lseek, .release = cgroup_seqfile_release, }; @@ -1756,15 +1726,11 @@ int cgroup_add_files(struct cgroup *cgrp, int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; - struct list_head *l; + struct cg_cgroup_link *link; read_lock(&css_set_lock); - l = cgrp->css_sets.next; - while (l != &cgrp->css_sets) { - struct cg_cgroup_link *link = - list_entry(l, struct cg_cgroup_link, cgrp_link_list); + list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { count += atomic_read(&link->cg->ref.refcount); - l = l->next; } read_unlock(&css_set_lock); return count; @@ -2227,6 +2193,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, return notify_on_release(cgrp); } +static int cgroup_write_notify_on_release(struct cgroup *cgrp, + struct cftype *cft, + u64 val) +{ + clear_bit(CGRP_RELEASABLE, &cgrp->flags); + if (val) + set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + else + clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + return 0; +} + /* * for the common functions, 'private' gives the type of file */ @@ -2235,7 +2213,7 @@ static struct cftype files[] = { .name = "tasks", .open = cgroup_tasks_open, .read = cgroup_tasks_read, - .write = cgroup_common_file_write, + .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, }, @@ -2243,15 +2221,16 @@ static struct cftype files[] = { { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, - .write = cgroup_common_file_write, + .write_u64 = cgroup_write_notify_on_release, .private = FILE_NOTIFY_ON_RELEASE, }, }; static struct cftype cft_release_agent = { .name = "release_agent", - .read = cgroup_common_file_read, - .write = cgroup_common_file_write, + .read_seq_string = cgroup_release_agent_show, + .write_string = cgroup_release_agent_write, + .max_write_len = PATH_MAX, .private = FILE_RELEASE_AGENT, }; @@ -2869,16 +2848,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) * cgroup_clone - clone the cgroup the given subsystem is attached to * @tsk: the task to be moved * @subsys: the given subsystem + * @nodename: the name for the new cgroup * * Duplicate the current cgroup in the hierarchy that the given * subsystem is attached to, and move this task into the new * child. */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) +int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, + char *nodename) { struct dentry *dentry; int ret = 0; - char nodename[MAX_CGROUP_TYPE_NAMELEN]; struct cgroup *parent, *child; struct inode *inode; struct css_set *cg; @@ -2903,8 +2883,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); - snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid); - /* Pin the hierarchy */ atomic_inc(&parent->root->sb->s_active); @@ -3078,27 +3056,24 @@ static void cgroup_release_agent(struct work_struct *work) while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; - char *pathbuf; + char *pathbuf = NULL, *agentbuf = NULL; struct cgroup *cgrp = list_entry(release_list.next, struct cgroup, release_list); list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!pathbuf) { - spin_lock(&release_list_lock); - continue; - } - - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) { - kfree(pathbuf); - spin_lock(&release_list_lock); - continue; - } + if (!pathbuf) + goto continue_free; + if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) + goto continue_free; + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!agentbuf) + goto continue_free; i = 0; - argv[i++] = cgrp->root->release_agent_path; - argv[i++] = (char *)pathbuf; + argv[i++] = agentbuf; + argv[i++] = pathbuf; argv[i] = NULL; i = 0; @@ -3112,8 +3087,10 @@ static void cgroup_release_agent(struct work_struct *work) * be a slow process */ mutex_unlock(&cgroup_mutex); call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - kfree(pathbuf); mutex_lock(&cgroup_mutex); + continue_free: + kfree(pathbuf); + kfree(agentbuf); spin_lock(&release_list_lock); } spin_unlock(&release_list_lock); diff --git a/kernel/cpu.c b/kernel/cpu.c index 2cc409ce0a8..10ba5f1004a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -285,6 +285,11 @@ out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: cpu_hotplug_done(); + if (!err) { + if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, + hcpu) == NOTIFY_BAD) + BUG(); + } return err; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d5738910c34..91cf85b36dd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -227,10 +227,6 @@ static struct cpuset top_cpuset = { * The task_struct fields mems_allowed and mems_generation may only * be accessed in the context of that task, so require no locks. * - * The cpuset_common_file_write handler for operations that modify - * the cpuset hierarchy holds cgroup_mutex across the entire operation, - * single threading all such cpuset modifications across the system. - * * The cpuset_common_file_read() handlers only hold callback_mutex across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. @@ -369,7 +365,7 @@ void cpuset_update_task_memory_state(void) my_cpusets_mem_gen = top_cpuset.mems_generation; } else { rcu_read_lock(); - my_cpusets_mem_gen = task_cs(current)->mems_generation; + my_cpusets_mem_gen = task_cs(tsk)->mems_generation; rcu_read_unlock(); } @@ -500,11 +496,16 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) /* * rebuild_sched_domains() * - * If the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset - * which has that flag enabled, or if any cpuset with a non-empty - * 'cpus' is removed, then call this routine to rebuild the - * scheduler's dynamic sched domains. + * This routine will be called to rebuild the scheduler's dynamic + * sched domains: + * - if the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, + * - or if the 'cpus' allowed changes in any cpuset which has that + * flag enabled, + * - or if the 'sched_relax_domain_level' of any cpuset which has + * that flag enabled and with non-empty 'cpus' changes, + * - or if any cpuset with non-empty 'cpus' is removed, + * - or if a cpu gets offlined. * * This routine builds a partial partition of the systems CPUs * (the set of non-overlappping cpumask_t's in the array 'part' @@ -609,8 +610,13 @@ void rebuild_sched_domains(void) while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { struct cgroup *cont; struct cpuset *child; /* scans child cpusets of cp */ + + if (cpus_empty(cp->cpus_allowed)) + continue; + if (is_sched_load_balance(cp)) csa[csn++] = cp; + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); __kfifo_put(q, (void *)&child, sizeof(cp)); @@ -703,36 +709,6 @@ done: /* Don't kfree(dattr) -- partition_sched_domains() does that. */ } -static inline int started_after_time(struct task_struct *t1, - struct timespec *time, - struct task_struct *t2) -{ - int start_diff = timespec_compare(&t1->start_time, time); - if (start_diff > 0) { - return 1; - } else if (start_diff < 0) { - return 0; - } else { - /* - * Arbitrarily, if two processes started at the same - * time, we'll say that the lower pointer value - * started first. Note that t2 may have exited by now - * so this may not be a valid pointer any longer, but - * that's fine - it still serves to distinguish - * between two tasks started (effectively) - * simultaneously. - */ - return t1 > t2; - } -} - -static inline int started_after(void *p1, void *p2) -{ - struct task_struct *t1 = p1; - struct task_struct *t2 = p2; - return started_after_time(t1, &t2->start_time, t2); -} - /** * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's * @tsk: task to test @@ -768,15 +744,49 @@ static void cpuset_change_cpumask(struct task_struct *tsk, } /** + * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * + * Called with cgroup_mutex held + * + * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * calling callback functions for each. + * + * Return 0 if successful, -errno if not. + */ +static int update_tasks_cpumask(struct cpuset *cs) +{ + struct cgroup_scanner scan; + struct ptr_heap heap; + int retval; + + /* + * cgroup_scan_tasks() will initialize heap->gt for us. + * heap_init() is still needed here for we should not change + * cs->cpus_allowed when heap_init() fails. + */ + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); + if (retval) + return retval; + + scan.cg = cs->css.cgroup; + scan.test_task = cpuset_test_cpumask; + scan.process_task = cpuset_change_cpumask; + scan.heap = &heap; + retval = cgroup_scan_tasks(&scan); + + heap_free(&heap); + return retval; +} + +/** * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it * @cs: the cpuset to consider * @buf: buffer of cpu numbers written to this cpuset */ -static int update_cpumask(struct cpuset *cs, char *buf) +static int update_cpumask(struct cpuset *cs, const char *buf) { struct cpuset trialcs; - struct cgroup_scanner scan; - struct ptr_heap heap; int retval; int is_load_balanced; @@ -792,7 +802,6 @@ static int update_cpumask(struct cpuset *cs, char *buf) * that parsing. The validate_change() call ensures that cpusets * with tasks have cpus. */ - buf = strstrip(buf); if (!*buf) { cpus_clear(trialcs.cpus_allowed); } else { @@ -811,10 +820,6 @@ static int update_cpumask(struct cpuset *cs, char *buf) if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) return 0; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); - if (retval) - return retval; - is_load_balanced = is_sched_load_balance(&trialcs); mutex_lock(&callback_mutex); @@ -825,12 +830,9 @@ static int update_cpumask(struct cpuset *cs, char *buf) * Scan tasks in the cpuset, and update the cpumasks of any * that need an update. */ - scan.cg = cs->css.cgroup; - scan.test_task = cpuset_test_cpumask; - scan.process_task = cpuset_change_cpumask; - scan.heap = &heap; - cgroup_scan_tasks(&scan); - heap_free(&heap); + retval = update_tasks_cpumask(cs); + if (retval < 0) + return retval; if (is_load_balanced) rebuild_sched_domains(); @@ -886,74 +888,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, mutex_unlock(&callback_mutex); } -/* - * Handle user request to change the 'mems' memory placement - * of a cpuset. Needs to validate the request, update the - * cpusets mems_allowed and mems_generation, and for each - * task in the cpuset, rebind any vma mempolicies and if - * the cpuset is marked 'memory_migrate', migrate the tasks - * pages to the new memory. - * - * Call with cgroup_mutex held. May take callback_mutex during call. - * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, - * lock each such tasks mm->mmap_sem, scan its vma's and rebind - * their mempolicies to the cpusets new mems_allowed. - */ - static void *cpuset_being_rebound; -static int update_nodemask(struct cpuset *cs, char *buf) +/** + * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * @cs: the cpuset in which each task's mems_allowed mask needs to be changed + * @oldmem: old mems_allowed of cpuset cs + * + * Called with cgroup_mutex held + * Return 0 if successful, -errno if not. + */ +static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) { - struct cpuset trialcs; - nodemask_t oldmem; struct task_struct *p; struct mm_struct **mmarray; int i, n, ntasks; int migrate; int fudge; - int retval; struct cgroup_iter it; - - /* - * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; - * it's read-only - */ - if (cs == &top_cpuset) - return -EACCES; - - trialcs = *cs; - - /* - * An empty mems_allowed is ok iff there are no tasks in the cpuset. - * Since nodelist_parse() fails on an empty mask, we special case - * that parsing. The validate_change() call ensures that cpusets - * with tasks have memory. - */ - buf = strstrip(buf); - if (!*buf) { - nodes_clear(trialcs.mems_allowed); - } else { - retval = nodelist_parse(buf, trialcs.mems_allowed); - if (retval < 0) - goto done; - - if (!nodes_subset(trialcs.mems_allowed, - node_states[N_HIGH_MEMORY])) - return -EINVAL; - } - oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs.mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } - retval = validate_change(cs, &trialcs); - if (retval < 0) - goto done; - - mutex_lock(&callback_mutex); - cs->mems_allowed = trialcs.mems_allowed; - cs->mems_generation = cpuset_mems_generation++; - mutex_unlock(&callback_mutex); + int retval; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ @@ -1020,7 +973,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) - cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); + cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); mmput(mm); } @@ -1032,6 +985,70 @@ done: return retval; } +/* + * Handle user request to change the 'mems' memory placement + * of a cpuset. Needs to validate the request, update the + * cpusets mems_allowed and mems_generation, and for each + * task in the cpuset, rebind any vma mempolicies and if + * the cpuset is marked 'memory_migrate', migrate the tasks + * pages to the new memory. + * + * Call with cgroup_mutex held. May take callback_mutex during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. + */ +static int update_nodemask(struct cpuset *cs, const char *buf) +{ + struct cpuset trialcs; + nodemask_t oldmem; + int retval; + + /* + * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; + * it's read-only + */ + if (cs == &top_cpuset) + return -EACCES; + + trialcs = *cs; + + /* + * An empty mems_allowed is ok iff there are no tasks in the cpuset. + * Since nodelist_parse() fails on an empty mask, we special case + * that parsing. The validate_change() call ensures that cpusets + * with tasks have memory. + */ + if (!*buf) { + nodes_clear(trialcs.mems_allowed); + } else { + retval = nodelist_parse(buf, trialcs.mems_allowed); + if (retval < 0) + goto done; + + if (!nodes_subset(trialcs.mems_allowed, + node_states[N_HIGH_MEMORY])) + return -EINVAL; + } + oldmem = cs->mems_allowed; + if (nodes_equal(oldmem, trialcs.mems_allowed)) { + retval = 0; /* Too easy - nothing to do */ + goto done; + } + retval = validate_change(cs, &trialcs); + if (retval < 0) + goto done; + + mutex_lock(&callback_mutex); + cs->mems_allowed = trialcs.mems_allowed; + cs->mems_generation = cpuset_mems_generation++; + mutex_unlock(&callback_mutex); + + retval = update_tasks_nodemask(cs, &oldmem); +done: + return retval; +} + int current_cpuset_is_being_rebound(void) { return task_cs(current) == cpuset_being_rebound; @@ -1044,7 +1061,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; - rebuild_sched_domains(); + if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) + rebuild_sched_domains(); } return 0; @@ -1256,72 +1274,14 @@ typedef enum { FILE_SPREAD_SLAB, } cpuset_filetype_t; -static ssize_t cpuset_common_file_write(struct cgroup *cont, - struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) -{ - struct cpuset *cs = cgroup_cs(cont); - cpuset_filetype_t type = cft->private; - char *buffer; - int retval = 0; - - /* Crude upper limit on largest legitimate cpulist user might write. */ - if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES)) - return -E2BIG; - - /* +1 for nul-terminator */ - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - if (copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out1; - } - buffer[nbytes] = 0; /* nul-terminate */ - - cgroup_lock(); - - if (cgroup_is_removed(cont)) { - retval = -ENODEV; - goto out2; - } - - switch (type) { - case FILE_CPULIST: - retval = update_cpumask(cs, buffer); - break; - case FILE_MEMLIST: - retval = update_nodemask(cs, buffer); - break; - default: - retval = -EINVAL; - goto out2; - } - - if (retval == 0) - retval = nbytes; -out2: - cgroup_unlock(); -out1: - kfree(buffer); - return retval; -} - static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) { int retval = 0; struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; - cgroup_lock(); - - if (cgroup_is_removed(cgrp)) { - cgroup_unlock(); + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; - } switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1367,12 +1327,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) struct cpuset *cs = cgroup_cs(cgrp); cpuset_filetype_t type = cft->private; - cgroup_lock(); - - if (cgroup_is_removed(cgrp)) { - cgroup_unlock(); + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; - } + switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: retval = update_relax_domain_level(cs, val); @@ -1386,6 +1343,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) } /* + * Common handling for a write to a "cpus" or "mems" file. + */ +static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + int retval = 0; + + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + + switch (cft->private) { + case FILE_CPULIST: + retval = update_cpumask(cgroup_cs(cgrp), buf); + break; + case FILE_MEMLIST: + retval = update_nodemask(cgroup_cs(cgrp), buf); + break; + default: + retval = -EINVAL; + break; + } + cgroup_unlock(); + return retval; +} + +/* * These ascii lists should be read in a single call, by using a user * buffer large enough to hold the entire map. If read in smaller * chunks, there is no guarantee of atomicity. Since the display format @@ -1504,14 +1487,16 @@ static struct cftype files[] = { { .name = "cpus", .read = cpuset_common_file_read, - .write = cpuset_common_file_write, + .write_string = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, }, { .name = "mems", .read = cpuset_common_file_read, - .write = cpuset_common_file_write, + .write_string = cpuset_write_resmask, + .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, }, @@ -1792,7 +1777,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) scan.scan.heap = NULL; scan.to = to->css.cgroup; - if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) + if (cgroup_scan_tasks(&scan.scan)) printk(KERN_ERR "move_member_tasks_to_cpuset: " "cgroup_scan_tasks failed\n"); } @@ -1852,6 +1837,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root) struct cpuset *child; /* scans child cpusets of cp */ struct list_head queue; struct cgroup *cont; + nodemask_t oldmems; INIT_LIST_HEAD(&queue); @@ -1871,6 +1857,8 @@ static void scan_for_empty_cpusets(const struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; + oldmems = cp->mems_allowed; + /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); @@ -1882,6 +1870,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root) if (cpus_empty(cp->cpus_allowed) || nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); + else { + update_tasks_cpumask(cp); + update_tasks_nodemask(cp, &oldmems); + } } } @@ -1974,7 +1966,6 @@ void __init cpuset_init_smp(void) } /** - * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 10e43fd8b72..b3179dad71b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -145,8 +145,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; + tmp = d->freepages_delay_total + tsk->delays->freepages_delay; + d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; + d->freepages_count += tsk->delays->freepages_count; spin_unlock_irqrestore(&tsk->delays->lock, flags); done: @@ -165,3 +168,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) return ret; } +void __delayacct_freepages_start(void) +{ + delayacct_start(¤t->delays->freepages_start); +} + +void __delayacct_freepages_end(void) +{ + delayacct_end(¤t->delays->freepages_start, + ¤t->delays->freepages_end, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); +} + diff --git a/kernel/exit.c b/kernel/exit.c index 93d2711b938..6cdf60712bd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include <linux/resource.h> #include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> +#include <linux/tracehook.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -85,7 +86,6 @@ static void __exit_signal(struct task_struct *tsk) BUG_ON(!sig); BUG_ON(!atomic_read(&sig->count)); - rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); @@ -121,6 +121,18 @@ static void __exit_signal(struct task_struct *tsk) sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); +#ifdef CONFIG_TASK_XACCT + sig->rchar += tsk->rchar; + sig->wchar += tsk->wchar; + sig->syscr += tsk->syscr; + sig->syscw += tsk->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + sig->ioac.read_bytes += tsk->ioac.read_bytes; + sig->ioac.write_bytes += tsk->ioac.write_bytes; + sig->ioac.cancelled_write_bytes += + tsk->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -136,7 +148,6 @@ static void __exit_signal(struct task_struct *tsk) tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); - rcu_read_unlock(); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); @@ -152,27 +163,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(container_of(rhp, struct task_struct, rcu)); } -/* - * Do final ptrace-related cleanup of a zombie being reaped. - * - * Called with write_lock(&tasklist_lock) held. - */ -static void ptrace_release_task(struct task_struct *p) -{ - BUG_ON(!list_empty(&p->ptraced)); - ptrace_unlink(p); - BUG_ON(!list_empty(&p->ptrace_entry)); -} void release_task(struct task_struct * p) { struct task_struct *leader; int zap_leader; repeat: + tracehook_prepare_release_task(p); atomic_dec(&p->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); - ptrace_release_task(p); + tracehook_finish_release_task(p); __exit_signal(p); /* @@ -194,6 +195,13 @@ repeat: * that case. */ zap_leader = task_detached(leader); + + /* + * This maintains the invariant that release_task() + * only runs on a task in EXIT_DEAD, just for sanity. + */ + if (zap_leader) + leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); @@ -432,7 +440,7 @@ void daemonize(const char *name, ...) * We don't want to have TIF_FREEZE set if the system-wide hibernation * or suspend transition begins right now. */ - current->flags |= PF_NOFREEZE; + current->flags |= (PF_NOFREEZE | PF_KTHREAD); if (current->nsproxy != &init_nsproxy) { get_nsproxy(&init_nsproxy); @@ -666,26 +674,40 @@ assign_new_owner: static void exit_mm(struct task_struct * tsk) { struct mm_struct *mm = tsk->mm; + struct core_state *core_state; mm_release(tsk, mm); if (!mm) return; /* * Serialize with any possible pending coredump. - * We must hold mmap_sem around checking core_waiters + * We must hold mmap_sem around checking core_state * and clearing tsk->mm. The core-inducing thread - * will increment core_waiters for each thread in the + * will increment ->nr_threads for each thread in the * group with ->mm != NULL. */ down_read(&mm->mmap_sem); - if (mm->core_waiters) { + core_state = mm->core_state; + if (core_state) { + struct core_thread self; up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - if (!--mm->core_waiters) - complete(mm->core_startup_done); - up_write(&mm->mmap_sem); - wait_for_completion(&mm->core_done); + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. + */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); + + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); @@ -863,7 +885,8 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int state; + int signal; + void *cookie; /* * This does two things: @@ -900,22 +923,11 @@ static void exit_notify(struct task_struct *tsk, int group_dead) !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; - /* If something other than our normal parent is ptracing us, then - * send it a SIGCHLD instead of honoring exit_signal. exit_signal - * only has special meaning to our real parent. - */ - if (!task_detached(tsk) && thread_group_empty(tsk)) { - int signal = ptrace_reparented(tsk) ? - SIGCHLD : tsk->exit_signal; - do_notify_parent(tsk, signal); - } else if (tsk->ptrace) { - do_notify_parent(tsk, SIGCHLD); - } + signal = tracehook_notify_death(tsk, &cookie, group_dead); + if (signal > 0) + signal = do_notify_parent(tsk, signal); - state = EXIT_ZOMBIE; - if (task_detached(tsk) && likely(!tsk->ptrace)) - state = EXIT_DEAD; - tsk->exit_state = state; + tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -925,8 +937,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) write_unlock_irq(&tasklist_lock); + tracehook_report_death(tsk, signal, cookie, group_dead); + /* If the process is dead, release it - nobody will wait for it */ - if (state == EXIT_DEAD) + if (signal < 0) release_task(tsk); } @@ -1005,10 +1019,7 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(current->ptrace & PT_TRACE_EXIT)) { - current->ptrace_message = code; - ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); - } + tracehook_report_exit(&code); /* * We're taking recursive faults here in do_exit. Safest is to just @@ -1354,6 +1365,21 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; +#ifdef CONFIG_TASK_XACCT + psig->rchar += p->rchar + sig->rchar; + psig->wchar += p->wchar + sig->wchar; + psig->syscr += p->syscr + sig->syscr; + psig->syscw += p->syscw + sig->syscw; +#endif /* CONFIG_TASK_XACCT */ +#ifdef CONFIG_TASK_IO_ACCOUNTING + psig->ioac.read_bytes += + p->ioac.read_bytes + sig->ioac.read_bytes; + psig->ioac.write_bytes += + p->ioac.write_bytes + sig->ioac.write_bytes; + psig->ioac.cancelled_write_bytes += + p->ioac.cancelled_write_bytes + + sig->ioac.cancelled_write_bytes; +#endif /* CONFIG_TASK_IO_ACCOUNTING */ spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index adefc1131f2..abb3ed6298f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -33,9 +33,11 @@ #include <linux/cpu.h> #include <linux/cgroup.h> #include <linux/security.h> +#include <linux/hugetlb.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> +#include <linux/tracehook.h> #include <linux/futex.h> #include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> @@ -92,6 +94,23 @@ int nr_processes(void) static struct kmem_cache *task_struct_cachep; #endif +#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR +static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +{ +#ifdef CONFIG_DEBUG_STACK_USAGE + gfp_t mask = GFP_KERNEL | __GFP_ZERO; +#else + gfp_t mask = GFP_KERNEL; +#endif + return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); +} + +static inline void free_thread_info(struct thread_info *ti) +{ + free_pages((unsigned long)ti, THREAD_SIZE_ORDER); +} +#endif + /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -307,6 +326,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* * Link in the new vma and copy the page table entries. */ *pprev = tmp; @@ -374,7 +401,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : MMF_DUMP_FILTER_DEFAULT; - mm->core_waiters = 0; + mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); @@ -448,7 +475,7 @@ EXPORT_SYMBOL_GPL(mmput); /** * get_task_mm - acquire a reference to the task's mm * - * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning + * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() @@ -461,7 +488,7 @@ struct mm_struct *get_task_mm(struct task_struct *task) task_lock(task); mm = task->mm; if (mm) { - if (task->flags & PF_BORROWED_MM) + if (task->flags & PF_KTHREAD) mm = NULL; else atomic_inc(&mm->mm_users); @@ -786,6 +813,12 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; +#ifdef CONFIG_TASK_XACCT + sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; +#endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + memset(&sig->ioac, 0, sizeof(sig->ioac)); +#endif sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); @@ -833,8 +866,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags &= ~PF_SUPERPRIV; new_flags |= PF_FORKNOEXEC; - if (!(clone_flags & CLONE_PTRACE)) - p->ptrace = 0; + new_flags |= PF_STARTING; p->flags = new_flags; clear_freeze_flag(p); } @@ -875,7 +907,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, - struct pid *pid) + struct pid *pid, + int trace) { int retval; struct task_struct *p; @@ -1081,6 +1114,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; + if (current->nsproxy != p->nsproxy) { + retval = ns_cgroup_clone(p, pid); + if (retval) + goto bad_fork_free_pid; + } + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? @@ -1125,8 +1164,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_LIST_HEAD(&p->ptrace_entry); - INIT_LIST_HEAD(&p->ptraced); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible @@ -1157,7 +1194,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_parent = current->real_parent; else p->real_parent = current; - p->parent = p->real_parent; spin_lock(¤t->sighand->siglock); @@ -1199,8 +1235,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (likely(p->pid)) { list_add_tail(&p->sibling, &p->real_parent->children); - if (unlikely(p->ptrace & PT_PTRACED)) - __ptrace_link(p, current->parent); + tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { if (clone_flags & CLONE_NEWPID) @@ -1285,29 +1320,13 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, - &init_struct_pid); + &init_struct_pid, 0); if (!IS_ERR(task)) init_idle(task, cpu); return task; } -static int fork_traceflag(unsigned clone_flags) -{ - if (clone_flags & CLONE_UNTRACED) - return 0; - else if (clone_flags & CLONE_VFORK) { - if (current->ptrace & PT_TRACE_VFORK) - return PTRACE_EVENT_VFORK; - } else if ((clone_flags & CSIGNAL) != SIGCHLD) { - if (current->ptrace & PT_TRACE_CLONE) - return PTRACE_EVENT_CLONE; - } else if (current->ptrace & PT_TRACE_FORK) - return PTRACE_EVENT_FORK; - - return 0; -} - /* * Ok, this is the main fork-routine. * @@ -1342,14 +1361,14 @@ long do_fork(unsigned long clone_flags, } } - if (unlikely(current->ptrace)) { - trace = fork_traceflag (clone_flags); - if (trace) - clone_flags |= CLONE_PTRACE; - } + /* + * When called from kernel_thread, don't do user tracing stuff. + */ + if (likely(user_mode(regs))) + trace = tracehook_prepare_clone(clone_flags); p = copy_process(clone_flags, stack_start, regs, stack_size, - child_tidptr, NULL); + child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1367,32 +1386,35 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { + tracehook_report_clone(trace, regs, clone_flags, nr, p); + + /* + * We set PF_STARTING at creation in case tracing wants to + * use this to distinguish a fully live task from one that + * hasn't gotten to tracehook_report_clone() yet. Now we + * clear it and set the child going. + */ + p->flags &= ~PF_STARTING; + + if (unlikely(clone_flags & CLONE_STOPPED)) { /* * We'll start up with an immediate SIGSTOP. */ sigaddset(&p->pending.signal, SIGSTOP); set_tsk_thread_flag(p, TIF_SIGPENDING); - } - - if (!(clone_flags & CLONE_STOPPED)) - wake_up_new_task(p, clone_flags); - else __set_task_state(p, TASK_STOPPED); - - if (unlikely (trace)) { - current->ptrace_message = nr; - ptrace_notify ((trace << 8) | SIGTRAP); + } else { + wake_up_new_task(p, clone_flags); } + tracehook_report_clone_complete(trace, regs, + clone_flags, nr, p); + if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { - current->ptrace_message = nr; - ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); - } + tracehook_report_vfork_done(p, nr); } } else { nr = PTR_ERR(p); @@ -1404,7 +1426,7 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(struct kmem_cache *cachep, void *data) +static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 964964baefa..3cd441ebf5d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -28,8 +28,7 @@ void dynamic_irq_init(unsigned int irq) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; } @@ -62,8 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; } @@ -71,9 +69,8 @@ void dynamic_irq_cleanup(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); - printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n", + WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", irq); - WARN_ON(1); return; } desc->msi_desc = NULL; @@ -96,8 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3cfc0fefb5e..152abfd3589 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -177,8 +177,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) { switch (desc->depth) { case 0: - printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); - WARN_ON(1); + WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { unsigned int status = desc->status & ~IRQ_DISABLED; @@ -260,9 +259,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) } } else { if (desc->wake_depth == 0) { - printk(KERN_WARNING "Unbalanced IRQ %d " - "wake disable\n", irq); - WARN_ON(1); + WARN(1, "Unbalanced IRQ %d wake disable\n", irq); } else if (--desc->wake_depth == 0) { ret = set_irq_wake_real(irq, on); if (ret) @@ -308,6 +305,30 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) desc->handle_irq = NULL; } +static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, + unsigned long flags) +{ + int ret; + + if (!chip || !chip->set_type) { + /* + * IRQF_TRIGGER_* but the PIC does not support multiple + * flow-types? + */ + pr_warning("No set_type function for IRQ %d (%s)\n", irq, + chip ? (chip->name ? : "unknown") : "unknown"); + return 0; + } + + ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); + + if (ret) + pr_err("setting flow type for irq %u failed (%pF)\n", + irq, chip->set_type); + + return ret; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -319,6 +340,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) const char *old_name = NULL; unsigned long flags; int shared = 0; + int ret; if (irq >= NR_IRQS) return -EINVAL; @@ -376,35 +398,23 @@ int setup_irq(unsigned int irq, struct irqaction *new) shared = 1; } - *p = new; - - /* Exclude IRQ from balancing */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; - if (!shared) { irq_chip_set_defaults(desc->chip); -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif - /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - if (desc->chip->set_type) - desc->chip->set_type(irq, - new->flags & IRQF_TRIGGER_MASK); - else - /* - * IRQF_TRIGGER_* but the PIC does not support - * multiple flow-types? - */ - printk(KERN_WARNING "No IRQF_TRIGGER set_type " - "function for IRQ %d (%s)\n", irq, - desc->chip->name); + ret = __irq_set_trigger(desc->chip, irq, new->flags); + + if (ret) { + spin_unlock_irqrestore(&desc->lock, flags); + return ret; + } } else compat_irq_chip_set_default_handler(desc); +#if defined(CONFIG_IRQ_PER_CPU) + if (new->flags & IRQF_PERCPU) + desc->status |= IRQ_PER_CPU; +#endif desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); @@ -423,6 +433,13 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* Set default affinity mask once everything is setup */ irq_select_affinity(irq); } + + *p = new; + + /* Exclude IRQ from balancing */ + if (new->flags & IRQF_NOBALANCING) + desc->status |= IRQ_NO_BALANCING; + /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6fc0040f3e3..38fc10ac754 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -176,7 +176,7 @@ static unsigned long get_symbol_pos(unsigned long addr, high = kallsyms_num_syms; while (high - low > 1) { - mid = (low + high) / 2; + mid = low + (high - low) / 2; if (kallsyms_addresses[mid] <= addr) low = mid; else diff --git a/kernel/kexec.c b/kernel/kexec.c index 1c5fcacbcf3..c8a4370e2a3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -24,6 +24,12 @@ #include <linux/utsrelease.h> #include <linux/utsname.h> #include <linux/numa.h> +#include <linux/suspend.h> +#include <linux/device.h> +#include <linux/freezer.h> +#include <linux/pm.h> +#include <linux/cpu.h> +#include <linux/console.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -242,6 +248,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, goto out; } + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + printk(KERN_ERR "Could not allocate swap buffer\n"); + goto out; + } + result = 0; out: if (result == 0) @@ -589,14 +601,12 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->unuseable_pages); } -static int kimage_terminate(struct kimage *image) +static void kimage_terminate(struct kimage *image) { if (*image->entry != 0) image->entry++; *image->entry = IND_DONE; - - return 0; } #define for_each_kimage_entry(image, ptr, entry) \ @@ -988,6 +998,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, if (result) goto out; + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; result = machine_kexec_prepare(image); if (result) goto out; @@ -997,9 +1009,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, if (result) goto out; } - result = kimage_terminate(image); - if (result) - goto out; + kimage_terminate(image); } /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); @@ -1415,3 +1425,85 @@ static int __init crash_save_vmcoreinfo_init(void) } module_init(crash_save_vmcoreinfo_init) + +/** + * kernel_kexec - reboot the system + * + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ +int kernel_kexec(void) +{ + int error = 0; + + if (xchg(&kexec_lock, 1)) + return -EBUSY; + if (!kexec_image) { + error = -EINVAL; + goto Unlock; + } + + if (kexec_image->preserve_context) { +#ifdef CONFIG_KEXEC_JUMP + mutex_lock(&pm_mutex); + pm_prepare_console(); + error = freeze_processes(); + if (error) { + error = -EBUSY; + goto Restore_console; + } + suspend_console(); + error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_console; + error = disable_nonboot_cpus(); + if (error) + goto Resume_devices; + local_irq_disable(); + /* At this point, device_suspend() has been called, + * but *not* device_power_down(). We *must* + * device_power_down() now. Otherwise, drivers for + * some devices (e.g. interrupt controllers) become + * desynchronized with the actual state of the + * hardware at resume time, and evil weirdness ensues. + */ + error = device_power_down(PMSG_FREEZE); + if (error) + goto Enable_irqs; + save_processor_state(); +#endif + } else { + blocking_notifier_call_chain(&reboot_notifier_list, + SYS_RESTART, NULL); + system_state = SYSTEM_RESTART; + device_shutdown(); + sysdev_shutdown(); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + } + + machine_kexec(kexec_image); + + if (kexec_image->preserve_context) { +#ifdef CONFIG_KEXEC_JUMP + restore_processor_state(); + device_power_up(PMSG_RESTORE); + Enable_irqs: + local_irq_enable(); + enable_nonboot_cpus(); + Resume_devices: + device_resume(PMSG_RESTORE); + Resume_console: + resume_console(); + thaw_processes(); + Restore_console: + pm_restore_console(); + mutex_unlock(&pm_mutex); +#endif + } + + Unlock: + xchg(&kexec_lock, 0); + + return error; +} diff --git a/kernel/kmod.c b/kernel/kmod.c index 90d7af1c165..2456d1a0bef 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -352,16 +352,17 @@ static inline void register_pm_notifier_callback(void) {} * @path: path to usermode executable * @argv: arg vector for process * @envp: environment for process + * @gfp_mask: gfp mask for memory allocation * * Returns either %NULL on allocation failure, or a subprocess_info * structure. This should be passed to call_usermodehelper_exec to * exec the process and free the structure. */ -struct subprocess_info *call_usermodehelper_setup(char *path, - char **argv, char **envp) +struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, + char **envp, gfp_t gfp_mask) { struct subprocess_info *sub_info; - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); + sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); if (!sub_info) goto out; @@ -417,12 +418,12 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, { struct file *f; - f = create_write_pipe(); + f = create_write_pipe(0); if (IS_ERR(f)) return PTR_ERR(f); *filp = f; - f = create_read_pipe(f); + f = create_read_pipe(f, 0); if (IS_ERR(f)) { free_write_pipe(*filp); return PTR_ERR(f); @@ -494,7 +495,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, struct subprocess_info *sub_info; int ret; - sub_info = call_usermodehelper_setup(path, argv, envp); + sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL); if (sub_info == NULL) return -ENOMEM; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1485ca8d0e0..75bc2cd9ebc 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -62,6 +62,7 @@ addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) #endif +static int kprobes_initialized; static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; @@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static bool kprobe_enabled; DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ -DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static struct { + spinlock_t lock ____cacheline_aligned; +} kretprobe_table_locks[KPROBE_TABLE_SIZE]; + +static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) +{ + return &(kretprobe_table_locks[hash].lock); +} /* * Normally, functions that we'd want to prohibit kprobes in, are marked @@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) return; } -/* Called with kretprobe_lock held */ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head) { + struct kretprobe *rp = ri->rp; + /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); - if (ri->rp) { - /* remove rp inst off the used list */ - hlist_del(&ri->uflist); - /* put rp inst back onto the free list */ - INIT_HLIST_NODE(&ri->uflist); - hlist_add_head(&ri->uflist, &ri->rp->free_instances); + INIT_HLIST_NODE(&ri->hlist); + if (likely(rp)) { + spin_lock(&rp->lock); + hlist_add_head(&ri->hlist, &rp->free_instances); + spin_unlock(&rp->lock); } else /* Unregistering */ hlist_add_head(&ri->hlist, head); } -struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) +void kretprobe_hash_lock(struct task_struct *tsk, + struct hlist_head **head, unsigned long *flags) { - return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); + spinlock_t *hlist_lock; + + *head = &kretprobe_inst_table[hash]; + hlist_lock = kretprobe_table_lock_ptr(hash); + spin_lock_irqsave(hlist_lock, *flags); +} + +void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +{ + spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + spin_lock_irqsave(hlist_lock, *flags); +} + +void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +{ + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); + spinlock_t *hlist_lock; + + hlist_lock = kretprobe_table_lock_ptr(hash); + spin_unlock_irqrestore(hlist_lock, *flags); +} + +void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +{ + spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + spin_unlock_irqrestore(hlist_lock, *flags); } /* @@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) struct kretprobe_instance *ri; struct hlist_head *head, empty_rp; struct hlist_node *node, *tmp; - unsigned long flags = 0; + unsigned long hash, flags = 0; - INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(tk); + if (unlikely(!kprobes_initialized)) + /* Early boot. kretprobe_table_locks not yet initialized. */ + return; + + hash = hash_ptr(tk, KPROBE_HASH_BITS); + head = &kretprobe_inst_table[hash]; + kretprobe_table_lock(hash, &flags); hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { if (ri->task == tk) recycle_rp_inst(ri, &empty_rp); } - spin_unlock_irqrestore(&kretprobe_lock, flags); - + kretprobe_table_unlock(hash, &flags); + INIT_HLIST_HEAD(&empty_rp); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); @@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp) struct kretprobe_instance *ri; struct hlist_node *pos, *next; - hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { - hlist_del(&ri->uflist); + hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { + hlist_del(&ri->hlist); kfree(ri); } } static void __kprobes cleanup_rp_inst(struct kretprobe *rp) { - unsigned long flags; + unsigned long flags, hash; struct kretprobe_instance *ri; struct hlist_node *pos, *next; + struct hlist_head *head; + /* No race here */ - spin_lock_irqsave(&kretprobe_lock, flags); - hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { - ri->rp = NULL; - hlist_del(&ri->uflist); + for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { + kretprobe_table_lock(hash, &flags); + head = &kretprobe_inst_table[hash]; + hlist_for_each_entry_safe(ri, pos, next, head, hlist) { + if (ri->rp == rp) + ri->rp = NULL; + } + kretprobe_table_unlock(hash, &flags); } - spin_unlock_irqrestore(&kretprobe_lock, flags); free_rp_inst(rp); } @@ -831,32 +875,37 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long flags = 0; + unsigned long hash, flags = 0; + struct kretprobe_instance *ri; /*TODO: consider to only swap the RA after the last pre_handler fired */ - spin_lock_irqsave(&kretprobe_lock, flags); + hash = hash_ptr(current, KPROBE_HASH_BITS); + spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { - struct kretprobe_instance *ri; - ri = hlist_entry(rp->free_instances.first, - struct kretprobe_instance, uflist); + struct kretprobe_instance, hlist); + hlist_del(&ri->hlist); + spin_unlock_irqrestore(&rp->lock, flags); + ri->rp = rp; ri->task = current; if (rp->entry_handler && rp->entry_handler(ri, regs)) { - spin_unlock_irqrestore(&kretprobe_lock, flags); + spin_unlock_irqrestore(&rp->lock, flags); return 0; } arch_prepare_kretprobe(ri, regs); /* XXX(hch): why is there no hlist_move_head? */ - hlist_del(&ri->uflist); - hlist_add_head(&ri->uflist, &ri->rp->used_instances); - hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); - } else + INIT_HLIST_NODE(&ri->hlist); + kretprobe_table_lock(hash, &flags); + hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]); + kretprobe_table_unlock(hash, &flags); + } else { rp->nmissed++; - spin_unlock_irqrestore(&kretprobe_lock, flags); + spin_unlock_irqrestore(&rp->lock, flags); + } return 0; } @@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, rp->maxactive = NR_CPUS; #endif } - INIT_HLIST_HEAD(&rp->used_instances); + spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { inst = kmalloc(sizeof(struct kretprobe_instance) + @@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp, free_rp_inst(rp); return -ENOMEM; } - INIT_HLIST_NODE(&inst->uflist); - hlist_add_head(&inst->uflist, &rp->free_instances); + INIT_HLIST_NODE(&inst->hlist); + hlist_add_head(&inst->hlist, &rp->free_instances); } rp->nmissed = 0; @@ -1009,6 +1058,7 @@ static int __init init_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); + spin_lock_init(&(kretprobe_table_locks[i].lock)); } /* @@ -1050,6 +1100,7 @@ static int __init init_kprobes(void) err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); + kprobes_initialized = (err == 0); if (!err) init_test_probes(); @@ -1286,13 +1337,8 @@ EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); EXPORT_SYMBOL_GPL(register_jprobes); EXPORT_SYMBOL_GPL(unregister_jprobes); -#ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(jprobe_return); -#endif - -#ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(register_kretprobe); EXPORT_SYMBOL_GPL(unregister_kretprobe); EXPORT_SYMBOL_GPL(register_kretprobes); EXPORT_SYMBOL_GPL(unregister_kretprobes); -#endif diff --git a/kernel/kthread.c b/kernel/kthread.c index ac3fb732664..96cff2f8710 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -106,7 +106,7 @@ static void create_kthread(struct kthread_create_info *create) */ sched_setscheduler(create->result, SCHED_NORMAL, ¶m); set_user_nice(create->result, KTHREAD_NICE_LEVEL); - set_cpus_allowed(create->result, CPU_MASK_ALL); + set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR); } complete(&create->done); } @@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) return; } /* Must have done schedule() in kthread() before we set_task_cpu */ - wait_task_inactive(k); + wait_task_inactive(k, 0); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; @@ -233,7 +233,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_user_nice(tsk, KTHREAD_NICE_LEVEL); - set_cpus_allowed(tsk, CPU_MASK_ALL); + set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; diff --git a/kernel/marker.c b/kernel/marker.c index 1abfb923b76..971da531790 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -441,7 +441,7 @@ static int remove_marker(const char *name) hlist_del(&e->hlist); /* Make sure the call_rcu has been executed */ if (e->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); kfree(e); return 0; } @@ -476,7 +476,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format) hlist_del(&(*entry)->hlist); /* Make sure the call_rcu has been executed */ if ((*entry)->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); kfree(*entry); *entry = e; trace_mark(core_marker_format, "name %s format %s", @@ -655,7 +655,7 @@ int marker_probe_register(const char *name, const char *format, * make sure it's executed now. */ if (entry->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); old = marker_entry_add_probe(entry, probe, probe_private); if (IS_ERR(old)) { ret = PTR_ERR(old); @@ -670,10 +670,7 @@ int marker_probe_register(const char *name, const char *format, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); -#ifdef CONFIG_PREEMPT_RCU - synchronize_sched(); /* Until we have the call_rcu_sched() */ -#endif - call_rcu(&entry->rcu, free_old_closure); + call_rcu_sched(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); return ret; @@ -704,7 +701,7 @@ int marker_probe_unregister(const char *name, if (!entry) goto end; if (entry->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); marker_update_probes(); /* may update entry */ @@ -716,10 +713,7 @@ int marker_probe_unregister(const char *name, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); -#ifdef CONFIG_PREEMPT_RCU - synchronize_sched(); /* Until we have the call_rcu_sched() */ -#endif - call_rcu(&entry->rcu, free_old_closure); + call_rcu_sched(&entry->rcu, free_old_closure); remove_marker(name); /* Ignore busy error message */ ret = 0; end: @@ -786,7 +780,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, goto end; } if (entry->rcu_pending) - rcu_barrier(); + rcu_barrier_sched(); old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); marker_update_probes(); /* may update entry */ @@ -797,10 +791,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); -#ifdef CONFIG_PREEMPT_RCU - synchronize_sched(); /* Until we have the call_rcu_sched() */ -#endif - call_rcu(&entry->rcu, free_old_closure); + call_rcu_sched(&entry->rcu, free_old_closure); remove_marker(entry->name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 48d7ed6fc3a..43c2111cd54 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/cgroup.h> #include <linux/fs.h> +#include <linux/proc_fs.h> #include <linux/slab.h> #include <linux/nsproxy.h> @@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns( struct ns_cgroup, css); } -int ns_cgroup_clone(struct task_struct *task) +int ns_cgroup_clone(struct task_struct *task, struct pid *pid) { - return cgroup_clone(task, &ns_subsys); + char name[PROC_NUMBUF]; + + snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); + return cgroup_clone(task, &ns_subsys, name); } /* diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index adc785146a1..21575fc46d0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -157,12 +157,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } - err = ns_cgroup_clone(tsk); - if (err) { - put_nsproxy(new_ns); - goto out; - } - tsk->nsproxy = new_ns; out: @@ -209,7 +203,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, goto out; } - err = ns_cgroup_clone(current); + err = ns_cgroup_clone(current, task_pid(current)); if (err) put_nsproxy(*new_nsp); diff --git a/kernel/panic.c b/kernel/panic.c index 425567f45b9..12c5a0a6c89 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -318,6 +318,28 @@ void warn_on_slowpath(const char *file, int line) add_taint(TAINT_WARN); } EXPORT_SYMBOL(warn_on_slowpath); + + +void warn_slowpath(const char *file, int line, const char *fmt, ...) +{ + va_list args; + char function[KSYM_SYMBOL_LEN]; + unsigned long caller = (unsigned long)__builtin_return_address(0); + sprint_symbol(function, caller); + + printk(KERN_WARNING "------------[ cut here ]------------\n"); + printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, + line, function); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + print_modules(); + dump_stack(); + print_oops_end_marker(); + add_taint(TAINT_WARN); +} +EXPORT_SYMBOL(warn_slowpath); #endif #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/kernel/pid.c b/kernel/pid.c index 30bd5d4b2ac..064e76afa50 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -309,12 +309,6 @@ struct pid *find_vpid(int nr) } EXPORT_SYMBOL_GPL(find_vpid); -struct pid *find_pid(int nr) -{ - return find_pid_ns(nr, &init_pid_ns); -} -EXPORT_SYMBOL_GPL(find_pid); - /* * attach_pid() must be called with the tasklist_lock write-held. */ @@ -435,6 +429,7 @@ struct pid *find_get_pid(pid_t nr) return pid; } +EXPORT_SYMBOL_GPL(find_get_pid); pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) { @@ -482,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns); /* * Used by proc to find the first pid that is greater then or equal to nr. * - * If there is a pid at nr this function is exactly the same as find_pid. + * If there is a pid at nr this function is exactly the same as find_pid_ns. */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { @@ -497,7 +492,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return pid; } -EXPORT_SYMBOL_GPL(find_get_pid); /* * The pid hash table is scaled according to the amount of memory in the diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 98702b4b885..ea567b78d1a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -12,6 +12,7 @@ #include <linux/pid_namespace.h> #include <linux/syscalls.h> #include <linux/err.h> +#include <linux/acct.h> #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -71,7 +72,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level) struct pid_namespace *ns; int i; - ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); + ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out; @@ -84,17 +85,13 @@ static struct pid_namespace *create_pid_namespace(unsigned int level) goto out_free_map; kref_init(&ns->kref); - ns->last_pid = 0; - ns->child_reaper = NULL; ns->level = level; set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - for (i = 1; i < PIDMAP_ENTRIES; i++) { - ns->pidmap[i].page = NULL; + for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - } return ns; @@ -185,6 +182,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* Child reaper for the pid namespace is going away */ pid_ns->child_reaper = NULL; + acct_exit_ns(pid_ns); return; } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index dbd8398ddb0..9a21681aa80 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -449,9 +449,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) spin_unlock_irqrestore(&idr_lock, flags); } sigqueue_free(tmr->sigq); - if (unlikely(tmr->it_process) && - tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(tmr->it_process); kmem_cache_free(posix_timers_cache, tmr); } @@ -856,11 +853,10 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_process) { - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); - timer->it_process = NULL; - } + if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) + put_task_struct(timer->it_process); + timer->it_process = NULL; + unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); return 0; @@ -885,11 +881,10 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_process) { - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); - timer->it_process = NULL; - } + if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) + put_task_struct(timer->it_process); + timer->it_process = NULL; + unlock_timer(timer, flags); release_posix_timer(timer, IT_ID_SET); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 59dfdf1e1d2..dcd165f92a8 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -94,6 +94,17 @@ config SUSPEND powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_LIB=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + config SUSPEND_FREEZER bool "Enable freezer for suspend to RAM/standby" \ if ARCH_WANTS_FREEZER_CONTROL || BROKEN diff --git a/kernel/power/main.c b/kernel/power/main.c index 3398f4651aa..0b7476f5d2a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -132,6 +132,61 @@ static inline int suspend_test(int level) { return 0; } #ifdef CONFIG_SUSPEND +#ifdef CONFIG_PM_TEST_SUSPEND + +/* + * We test the system suspend code by setting an RTC wakealarm a short + * time in the future, then suspending. Suspending the devices won't + * normally take long ... some systems only need a few milliseconds. + * + * The time it takes is system-specific though, so when we test this + * during system bootup we allow a LOT of time. + */ +#define TEST_SUSPEND_SECONDS 5 + +static unsigned long suspend_test_start_time; + +static void suspend_test_start(void) +{ + /* FIXME Use better timebase than "jiffies", ideally a clocksource. + * What we want is a hardware counter that will work correctly even + * during the irqs-are-off stages of the suspend/resume cycle... + */ + suspend_test_start_time = jiffies; +} + +static void suspend_test_finish(const char *label) +{ + long nj = jiffies - suspend_test_start_time; + unsigned msec; + + msec = jiffies_to_msecs(abs(nj)); + pr_info("PM: %s took %d.%03d seconds\n", label, + msec / 1000, msec % 1000); + + /* Warning on suspend means the RTC alarm period needs to be + * larger -- the system was sooo slooowwww to suspend that the + * alarm (should have) fired before the system went to sleep! + * + * Warning on either suspend or resume also means the system + * has some performance issues. The stack dump of a WARN_ON + * is more likely to get the right attention than a printk... + */ + WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); +} + +#else + +static void suspend_test_start(void) +{ +} + +static void suspend_test_finish(const char *label) +{ +} + +#endif + /* This is just an arbitrary number */ #define FREE_PAGE_NUMBER (100) @@ -266,12 +321,13 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); goto Recover_platform; } - + suspend_test_finish("suspend devices"); if (suspend_test(TEST_DEVICES)) goto Recover_platform; @@ -293,7 +349,9 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: + suspend_test_start(); device_resume(PMSG_RESUME); + suspend_test_finish("resume devices"); resume_console(); Close: if (suspend_ops->end) @@ -521,3 +579,144 @@ static int __init pm_init(void) } core_initcall(pm_init); + + +#ifdef CONFIG_PM_TEST_SUSPEND + +#include <linux/rtc.h> + +/* + * To test system suspend, we need a hands-off mechanism to resume the + * system. RTCs wake alarms are a common self-contained mechanism. + */ + +static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) +{ + static char err_readtime[] __initdata = + KERN_ERR "PM: can't read %s time, err %d\n"; + static char err_wakealarm [] __initdata = + KERN_ERR "PM: can't set %s wakealarm, err %d\n"; + static char err_suspend[] __initdata = + KERN_ERR "PM: suspend test failed, error %d\n"; + static char info_test[] __initdata = + KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; + + unsigned long now; + struct rtc_wkalrm alm; + int status; + + /* this may fail if the RTC hasn't been initialized */ + status = rtc_read_time(rtc, &alm.time); + if (status < 0) { + printk(err_readtime, rtc->dev.bus_id, status); + return; + } + rtc_tm_to_time(&alm.time, &now); + + memset(&alm, 0, sizeof alm); + rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + alm.enabled = true; + + status = rtc_set_alarm(rtc, &alm); + if (status < 0) { + printk(err_wakealarm, rtc->dev.bus_id, status); + return; + } + + if (state == PM_SUSPEND_MEM) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + if (status == -ENODEV) + state = PM_SUSPEND_STANDBY; + } + if (state == PM_SUSPEND_STANDBY) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + if (status < 0) + printk(err_suspend, status); + + /* Some platforms can't detect that the alarm triggered the + * wakeup, or (accordingly) disable it after it afterwards. + * It's supposed to give oneshot behavior; cope. + */ + alm.enabled = false; + rtc_set_alarm(rtc, &alm); +} + +static int __init has_wakealarm(struct device *dev, void *name_ptr) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + *(char **)name_ptr = dev->bus_id; + return 1; +} + +/* + * Kernel options like "test_suspend=mem" force suspend/resume sanity tests + * at startup time. They're normally disabled, for faster boot and because + * we can't know which states really work on this particular system. + */ +static suspend_state_t test_state __initdata = PM_SUSPEND_ON; + +static char warn_bad_state[] __initdata = + KERN_WARNING "PM: can't test '%s' suspend state\n"; + +static int __init setup_test_suspend(char *value) +{ + unsigned i; + + /* "=mem" ==> "mem" */ + value++; + for (i = 0; i < PM_SUSPEND_MAX; i++) { + if (!pm_states[i]) + continue; + if (strcmp(pm_states[i], value) != 0) + continue; + test_state = (__force suspend_state_t) i; + return 0; + } + printk(warn_bad_state, value); + return 0; +} +__setup("test_suspend", setup_test_suspend); + +static int __init test_suspend(void) +{ + static char warn_no_rtc[] __initdata = + KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; + + char *pony = NULL; + struct rtc_device *rtc = NULL; + + /* PM is initialized by now; is that state testable? */ + if (test_state == PM_SUSPEND_ON) + goto done; + if (!valid_state(test_state)) { + printk(warn_bad_state, pm_states[test_state]); + goto done; + } + + /* RTCs have initialized by now too ... can we use one? */ + class_find_device(rtc_class, NULL, &pony, has_wakealarm); + if (pony) + rtc = rtc_class_open(pony); + if (!rtc) { + printk(warn_no_rtc); + goto done; + } + + /* go for it */ + test_wakealarm(rtc, test_state); + rtc_class_close(rtc); +done: + return 0; +} +late_initcall(test_suspend); + +#endif /* CONFIG_PM_TEST_SUSPEND */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 700f44ec840..acc0c101dbd 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void); extern int pfn_is_nosave(unsigned long); -extern struct mutex pm_mutex; - #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 678ec736076..72016f05147 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -10,6 +10,7 @@ #include <linux/pm.h> #include <linux/workqueue.h> #include <linux/reboot.h> +#include <linux/cpumask.h> /* * When the user hits Sys-Rq o to power down the machine this is the @@ -25,7 +26,8 @@ static DECLARE_WORK(poweroff_work, do_poweroff); static void handle_poweroff(int key, struct tty_struct *tty) { - schedule_work(&poweroff_work); + /* run sysrq poweroff on boot cpu */ + schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); } static struct sysrq_key_op sysrq_poweroff_op = { diff --git a/kernel/power/process.c b/kernel/power/process.c index 5fb87652f21..278946aecaf 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -149,7 +149,7 @@ static int try_to_freeze_tasks(bool sig_only) unsigned long end_time; unsigned int todo; struct timeval start, end; - s64 elapsed_csecs64; + u64 elapsed_csecs64; unsigned int elapsed_csecs; do_gettimeofday(&start); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5f91a07c4ea..5d2ab836e99 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -205,8 +205,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * objects. The main list's elements are of type struct zone_bitmap * and each of them corresonds to one zone. For each zone bitmap * object there is a list of objects of type struct bm_block that - * represent each blocks of bit chunks in which information is - * stored. + * represent each blocks of bitmap in which information is stored. * * struct memory_bitmap contains a pointer to the main list of zone * bitmap objects, a struct bm_position used for browsing the bitmap, @@ -224,26 +223,27 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * pfns that correspond to the start and end of the represented zone. * * struct bm_block contains a pointer to the memory page in which - * information is stored (in the form of a block of bit chunks - * of type unsigned long each). It also contains the pfns that - * correspond to the start and end of the represented memory area and - * the number of bit chunks in the block. + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. */ #define BM_END_OF_MAP (~0UL) -#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long)) -#define BM_BITS_PER_CHUNK (sizeof(long) << 3) #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) struct bm_block { struct bm_block *next; /* next element of the list */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ - unsigned int size; /* number of bit chunks */ - unsigned long *data; /* chunks of bits representing pages */ + unsigned long *data; /* bitmap representing pages */ }; +static inline unsigned long bm_block_bits(struct bm_block *bb) +{ + return bb->end_pfn - bb->start_pfn; +} + struct zone_bitmap { struct zone_bitmap *next; /* next element of the list */ unsigned long start_pfn; /* minimal pfn in this zone */ @@ -257,7 +257,6 @@ struct zone_bitmap { struct bm_position { struct zone_bitmap *zone_bm; struct bm_block *block; - int chunk; int bit; }; @@ -272,12 +271,6 @@ struct memory_bitmap { /* Functions that operate on memory bitmaps */ -static inline void memory_bm_reset_chunk(struct memory_bitmap *bm) -{ - bm->cur.chunk = 0; - bm->cur.bit = -1; -} - static void memory_bm_position_reset(struct memory_bitmap *bm) { struct zone_bitmap *zone_bm; @@ -285,7 +278,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm) zone_bm = bm->zone_bm_list; bm->cur.zone_bm = zone_bm; bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); @@ -394,12 +387,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) bb->start_pfn = pfn; if (nr >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - bb->size = BM_CHUNKS_PER_BLOCK; nr -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ pfn += nr; - bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK); } bb->end_pfn = pfn; bb = bb->next; @@ -478,8 +469,8 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, } zone_bm->cur_block = bb; pfn -= bb->start_pfn; - *bit_nr = pfn % BM_BITS_PER_CHUNK; - *addr = bb->data + pfn / BM_BITS_PER_CHUNK; + *bit_nr = pfn; + *addr = bb->data; return 0; } @@ -528,36 +519,6 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } -/* Two auxiliary functions for memory_bm_next_pfn */ - -/* Find the first set bit in the given chunk, if there is one */ - -static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p) -{ - bit++; - while (bit < BM_BITS_PER_CHUNK) { - if (test_bit(bit, chunk_p)) - return bit; - - bit++; - } - return -1; -} - -/* Find a chunk containing some bits set in given block of bits */ - -static inline int next_chunk_in_block(int n, struct bm_block *bb) -{ - n++; - while (n < bb->size) { - if (bb->data[n]) - return n; - - n++; - } - return -1; -} - /** * memory_bm_next_pfn - find the pfn that corresponds to the next set bit * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is @@ -571,40 +532,33 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { struct zone_bitmap *zone_bm; struct bm_block *bb; - int chunk; int bit; do { bb = bm->cur.block; do { - chunk = bm->cur.chunk; bit = bm->cur.bit; - do { - bit = next_bit_in_chunk(bit, bb->data + chunk); - if (bit >= 0) - goto Return_pfn; - - chunk = next_chunk_in_block(chunk, bb); - bit = -1; - } while (chunk >= 0); + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + bb = bb->next; bm->cur.block = bb; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } while (bb); zone_bm = bm->cur.zone_bm->next; if (zone_bm) { bm->cur.zone_bm = zone_bm; bm->cur.block = zone_bm->bm_blocks; - memory_bm_reset_chunk(bm); + bm->cur.bit = 0; } } while (zone_bm); memory_bm_position_reset(bm); return BM_END_OF_MAP; Return_pfn: - bm->cur.chunk = chunk; - bm->cur.bit = bit; - return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; + bm->cur.bit = bit + 1; + return bb->start_pfn + bit; } /** diff --git a/kernel/printk.c b/kernel/printk.c index 07ad9e7f7a6..a7f7559c5f6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -933,7 +933,7 @@ void suspend_console(void) { if (!console_suspend_enabled) return; - printk("Suspending console(s)\n"); + printk("Suspending console(s) (use no_console_suspend to debug)\n"); acquire_console_sem(); console_suspended = 1; } @@ -1308,6 +1308,8 @@ void tty_write_message(struct tty_struct *tty, char *msg) } #if defined CONFIG_PRINTK + +DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); /* * printk rate limiting, lifted from the networking subsystem. * @@ -1315,22 +1317,9 @@ void tty_write_message(struct tty_struct *tty, char *msg) * every printk_ratelimit_jiffies to make a denial-of-service * attack impossible. */ -int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) -{ - return __ratelimit(ratelimit_jiffies, ratelimit_burst); -} -EXPORT_SYMBOL(__printk_ratelimit); - -/* minimum time in jiffies between messages */ -int printk_ratelimit_jiffies = 5 * HZ; - -/* number of messages we send before ratelimiting */ -int printk_ratelimit_burst = 10; - int printk_ratelimit(void) { - return __printk_ratelimit(printk_ratelimit_jiffies, - printk_ratelimit_burst); + return __ratelimit(&printk_ratelimit_state); } EXPORT_SYMBOL(printk_ratelimit); diff --git a/kernel/profile.c b/kernel/profile.c index 58926411eb2..cd26bed4cc2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -112,8 +112,6 @@ void __init profile_init(void) /* Profile event notifications */ -#ifdef CONFIG_PROFILING - static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); static ATOMIC_NOTIFIER_HEAD(task_free_notifier); static BLOCKING_NOTIFIER_HEAD(munmap_notifier); @@ -203,8 +201,6 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *)) } EXPORT_SYMBOL_GPL(unregister_timer_hook); -#endif /* CONFIG_PROFILING */ - #ifdef CONFIG_SMP /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8392a9da645..082b3fcb32a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) read_unlock(&tasklist_lock); if (!ret && !kill) - wait_task_inactive(child); + ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; /* All systems go.. */ return ret; diff --git a/kernel/relay.c b/kernel/relay.c index 7de644cdec4..04006ef970b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan) } EXPORT_SYMBOL_GPL(relay_reset); +static inline void relay_set_buf_dentry(struct rchan_buf *buf, + struct dentry *dentry) +{ + buf->dentry = dentry; + buf->dentry->d_inode->i_size = buf->early_bytes; +} + +static struct dentry *relay_create_buf_file(struct rchan *chan, + struct rchan_buf *buf, + unsigned int cpu) +{ + struct dentry *dentry; + char *tmpname; + + tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmpname) + return NULL; + snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); + + /* Create file in fs */ + dentry = chan->cb->create_buf_file(tmpname, chan->parent, + S_IRUSR, buf, + &chan->is_global); + + kfree(tmpname); + + return dentry; +} + /* * relay_open_buf - create a new relay channel buffer * @@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) { struct rchan_buf *buf = NULL; struct dentry *dentry; - char *tmpname; if (chan->is_global) return chan->buf[0]; - tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmpname) - goto end; - snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); - buf = relay_create_buf(chan); if (!buf) - goto free_name; + return NULL; + + if (chan->has_base_filename) { + dentry = relay_create_buf_file(chan, buf, cpu); + if (!dentry) + goto free_buf; + relay_set_buf_dentry(buf, dentry); + } buf->cpu = cpu; __relay_reset(buf, 1); - /* Create file in fs */ - dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, - buf, &chan->is_global); - if (!dentry) - goto free_buf; - - buf->dentry = dentry; - if(chan->is_global) { chan->buf[0] = buf; buf->cpu = 0; } - goto free_name; + return buf; free_buf: relay_destroy_buf(buf); - buf = NULL; -free_name: - kfree(tmpname); -end: - return buf; + return NULL; } /** @@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, /** * relay_open - create a new relay channel - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory + * @base_filename: base name of files to create, %NULL for buffering only + * @parent: dentry of parent directory, %NULL for root directory or buffer * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers * @cb: client callback functions @@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename, { unsigned int i; struct rchan *chan; - if (!base_filename) - return NULL; if (!(subbuf_size && n_subbufs)) return NULL; @@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename, chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); chan->parent = parent; chan->private_data = private_data; - strlcpy(chan->base_filename, base_filename, NAME_MAX); + if (base_filename) { + chan->has_base_filename = 1; + strlcpy(chan->base_filename, base_filename, NAME_MAX); + } setup_callbacks(chan, cb); kref_init(&chan->kref); @@ -604,6 +623,94 @@ free_bufs: } EXPORT_SYMBOL_GPL(relay_open); +struct rchan_percpu_buf_dispatcher { + struct rchan_buf *buf; + struct dentry *dentry; +}; + +/* Called in atomic context. */ +static void __relay_set_buf_dentry(void *info) +{ + struct rchan_percpu_buf_dispatcher *p = info; + + relay_set_buf_dentry(p->buf, p->dentry); +} + +/** + * relay_late_setup_files - triggers file creation + * @chan: channel to operate on + * @base_filename: base name of files to create + * @parent: dentry of parent directory, %NULL for root directory + * + * Returns 0 if successful, non-zero otherwise. + * + * Use to setup files for a previously buffer-only channel. + * Useful to do early tracing in kernel, before VFS is up, for example. + */ +int relay_late_setup_files(struct rchan *chan, + const char *base_filename, + struct dentry *parent) +{ + int err = 0; + unsigned int i, curr_cpu; + unsigned long flags; + struct dentry *dentry; + struct rchan_percpu_buf_dispatcher disp; + + if (!chan || !base_filename) + return -EINVAL; + + strlcpy(chan->base_filename, base_filename, NAME_MAX); + + mutex_lock(&relay_channels_mutex); + /* Is chan already set up? */ + if (unlikely(chan->has_base_filename)) + return -EEXIST; + chan->has_base_filename = 1; + chan->parent = parent; + curr_cpu = get_cpu(); + /* + * The CPU hotplug notifier ran before us and created buffers with + * no files associated. So it's safe to call relay_setup_buf_file() + * on all currently online CPUs. + */ + for_each_online_cpu(i) { + if (unlikely(!chan->buf[i])) { + printk(KERN_ERR "relay_late_setup_files: CPU %u " + "has no buffer, it must have!\n", i); + BUG(); + err = -EINVAL; + break; + } + + dentry = relay_create_buf_file(chan, chan->buf[i], i); + if (unlikely(!dentry)) { + err = -EINVAL; + break; + } + + if (curr_cpu == i) { + local_irq_save(flags); + relay_set_buf_dentry(chan->buf[i], dentry); + local_irq_restore(flags); + } else { + disp.buf = chan->buf[i]; + disp.dentry = dentry; + smp_mb(); + /* relay_channels_mutex must be held, so wait. */ + err = smp_call_function_single(i, + __relay_set_buf_dentry, + &disp, 1); + } + if (unlikely(err)) + break; + } + put_cpu(); + mutex_unlock(&relay_channels_mutex); + + return err; +} + /** * relay_switch_subbuf - switch to a new sub-buffer * @buf: channel buffer @@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; buf->padding[old_subbuf] = buf->prev_padding; buf->subbufs_produced++; - buf->dentry->d_inode->i_size += buf->chan->subbuf_size - - buf->padding[old_subbuf]; + if (buf->dentry) + buf->dentry->d_inode->i_size += + buf->chan->subbuf_size - + buf->padding[old_subbuf]; + else + buf->early_bytes += buf->chan->subbuf_size - + buf->padding[old_subbuf]; smp_mb(); if (waitqueue_active(&buf->read_wait)) /* @@ -1237,4 +1349,4 @@ static __init int relay_init(void) return 0; } -module_init(relay_init); +early_initcall(relay_init); diff --git a/kernel/res_counter.c b/kernel/res_counter.c index d3c61b4ebef..f275c8eca77 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/res_counter.h> #include <linux/uaccess.h> +#include <linux/mm.h> void res_counter_init(struct res_counter *counter) { @@ -102,44 +103,37 @@ u64 res_counter_read_u64(struct res_counter *counter, int member) return *res_counter_member(counter, member); } -ssize_t res_counter_write(struct res_counter *counter, int member, - const char __user *userbuf, size_t nbytes, loff_t *pos, - int (*write_strategy)(char *st_buf, unsigned long long *val)) +int res_counter_memparse_write_strategy(const char *buf, + unsigned long long *res) { - int ret; - char *buf, *end; - unsigned long flags; - unsigned long long tmp, *val; - - buf = kmalloc(nbytes + 1, GFP_KERNEL); - ret = -ENOMEM; - if (buf == NULL) - goto out; + char *end; + /* FIXME - make memparse() take const char* args */ + *res = memparse((char *)buf, &end); + if (*end != '\0') + return -EINVAL; - buf[nbytes] = '\0'; - ret = -EFAULT; - if (copy_from_user(buf, userbuf, nbytes)) - goto out_free; + *res = PAGE_ALIGN(*res); + return 0; +} - ret = -EINVAL; +int res_counter_write(struct res_counter *counter, int member, + const char *buf, write_strategy_fn write_strategy) +{ + char *end; + unsigned long flags; + unsigned long long tmp, *val; - strstrip(buf); if (write_strategy) { - if (write_strategy(buf, &tmp)) { - goto out_free; - } + if (write_strategy(buf, &tmp)) + return -EINVAL; } else { tmp = simple_strtoull(buf, &end, 10); if (*end != '\0') - goto out_free; + return -EINVAL; } spin_lock_irqsave(&counter->lock, flags); val = res_counter_member(counter, member); *val = tmp; spin_unlock_irqrestore(&counter->lock, flags); - ret = nbytes; -out_free: - kfree(buf); -out: - return ret; + return 0; } diff --git a/kernel/sched.c b/kernel/sched.c index 6acf749d333..0236958addc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1867,16 +1867,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * wait_task_inactive - wait for a thread to unschedule. * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't * be called with interrupts off, or it may introduce deadlock with * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(struct task_struct *p) +unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; int running, on_rq; + unsigned long ncsw; struct rq *rq; for (;;) { @@ -1899,8 +1907,11 @@ void wait_task_inactive(struct task_struct *p) * return false if the runqueue has changed and p * is actually now running somewhere else! */ - while (task_running(rq, p)) + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; cpu_relax(); + } /* * Ok, time to look more closely! We need the rq @@ -1910,9 +1921,21 @@ void wait_task_inactive(struct task_struct *p) rq = task_rq_lock(p, &flags); running = task_running(rq, p); on_rq = p->se.on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) { + ncsw = p->nivcsw + p->nvcsw; + if (unlikely(!ncsw)) + ncsw = 1; + } task_rq_unlock(rq, &flags); /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* * Was it really running after all now that we * checked with the proper locks actually held? * @@ -1944,6 +1967,8 @@ void wait_task_inactive(struct task_struct *p) */ break; } + + return ncsw; } /*** @@ -4046,6 +4071,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); + /* Account for user time used */ + acct_update_integrals(p); } /* @@ -6387,7 +6414,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = 10 }; -void __init migration_init(void) +static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -6397,7 +6424,10 @@ void __init migration_init(void) BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + + return err; } +early_initcall(migration_init); #endif #ifdef CONFIG_SMP diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f85a76363ee..908c04f9dad 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -253,7 +253,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq) diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { - do_div(diff, weight); + diff = div_u64((u64)diff, weight); if (rt_rq->rt_runtime + diff > rt_period) diff = rt_period - rt_rq->rt_runtime; iter->rt_runtime -= diff; diff --git a/kernel/signal.c b/kernel/signal.c index 6c0958e52ea..954f77d7e3b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,6 +22,7 @@ #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/signalfd.h> +#include <linux/tracehook.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> @@ -39,24 +40,21 @@ static struct kmem_cache *sigqueue_cachep; -static int __sig_ignored(struct task_struct *t, int sig) +static void __user *sig_handler(struct task_struct *t, int sig) { - void __user *handler; + return t->sighand->action[sig - 1].sa.sa_handler; +} +static int sig_handler_ignored(void __user *handler, int sig) +{ /* Is it explicitly or implicitly ignored? */ - - handler = t->sighand->action[sig - 1].sa.sa_handler; return handler == SIG_IGN || (handler == SIG_DFL && sig_kernel_ignore(sig)); } static int sig_ignored(struct task_struct *t, int sig) { - /* - * Tracers always want to know about signals.. - */ - if (t->ptrace & PT_PTRACED) - return 0; + void __user *handler; /* * Blocked signals are never ignored, since the @@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - return __sig_ignored(t, sig); + handler = sig_handler(t, sig); + if (!sig_handler_ignored(handler, sig)) + return 0; + + /* + * Tracers may want to know about even ignored signals. + */ + return !tracehook_consider_ignored_signal(t, sig, handler); } /* @@ -129,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) + if (unlikely(tracehook_force_sigpending())) + set_thread_flag(TIF_SIGPENDING); + else if (!recalc_sigpending_tsk(current) && !freezing(current)) clear_thread_flag(TIF_SIGPENDING); } @@ -295,12 +302,12 @@ flush_signal_handlers(struct task_struct *t, int force_default) int unhandled_signal(struct task_struct *tsk, int sig) { + void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) return 1; - if (tsk->ptrace & PT_PTRACED) + if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || - (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); + return !tracehook_consider_fatal_signal(tsk, sig, handler); } @@ -338,13 +345,9 @@ unblock_all_signals(void) spin_unlock_irqrestore(¤t->sighand->siglock, flags); } -static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) +static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) { struct sigqueue *q, *first = NULL; - int still_pending = 0; - - if (unlikely(!sigismember(&list->signal, sig))) - return 0; /* * Collect the siginfo appropriate to this signal. Check if @@ -352,33 +355,30 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) */ list_for_each_entry(q, &list->list, list) { if (q->info.si_signo == sig) { - if (first) { - still_pending = 1; - break; - } + if (first) + goto still_pending; first = q; } } + + sigdelset(&list->signal, sig); + if (first) { +still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); __sigqueue_free(first); - if (!still_pending) - sigdelset(&list->signal, sig); } else { - /* Ok, it wasn't in the queue. This must be a fast-pathed signal or we must have been out of queue space. So zero out the info. */ - sigdelset(&list->signal, sig); info->si_signo = sig; info->si_errno = 0; info->si_code = 0; info->si_pid = 0; info->si_uid = 0; } - return 1; } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, @@ -396,8 +396,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, } } - if (!collect_signal(sig, pending, info)) - sig = 0; + collect_signal(sig, pending, info); } return sig; @@ -462,8 +461,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ - if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; } if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* @@ -600,9 +598,6 @@ static int check_kill_permission(int sig, struct siginfo *info, return security_task_kill(t, info, sig, 0); } -/* forward decl */ -static void do_notify_parent_cldstop(struct task_struct *tsk, int why); - /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation @@ -765,7 +760,8 @@ static void complete_signal(int sig, struct task_struct *p, int group) if (sig_fatal(p, sig) && !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { + (sig == SIGKILL || + !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { /* * This signal will be fatal to the whole group. */ @@ -1125,7 +1121,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); * is probably wrong. Should make it like BSD or SYSV. */ -static int kill_something_info(int sig, struct siginfo *info, int pid) +static int kill_something_info(int sig, struct siginfo *info, pid_t pid) { int ret; @@ -1237,17 +1233,6 @@ int kill_pid(struct pid *pid, int sig, int priv) } EXPORT_SYMBOL(kill_pid); -int -kill_proc(pid_t pid, int sig, int priv) -{ - int ret; - - rcu_read_lock(); - ret = kill_pid_info(sig, __si_special(priv), find_pid(pid)); - rcu_read_unlock(); - return ret; -} - /* * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot @@ -1343,9 +1328,11 @@ static inline void __wake_up_parent(struct task_struct *p, /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. + * + * Returns -1 if our parent ignored us and so we've switched to + * self-reaping, or else @sig. */ - -void do_notify_parent(struct task_struct *tsk, int sig) +int do_notify_parent(struct task_struct *tsk, int sig) { struct siginfo info; unsigned long flags; @@ -1379,10 +1366,9 @@ void do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = tsk->uid; - /* FIXME: find out whether or not this is supposed to be c*time. */ - info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime, + info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, tsk->signal->utime)); - info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime, + info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, tsk->signal->stime)); info.si_status = tsk->exit_code & 0x7f; @@ -1417,12 +1403,14 @@ void do_notify_parent(struct task_struct *tsk, int sig) */ tsk->exit_signal = -1; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = 0; + sig = -1; } if (valid_signal(sig) && sig > 0) __group_send_sig_info(sig, &info, tsk->parent); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); + + return sig; } static void do_notify_parent_cldstop(struct task_struct *tsk, int why) @@ -1450,9 +1438,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_uid = tsk->uid; - /* FIXME: find out whether or not this is supposed to be c*time. */ - info.si_utime = cputime_to_jiffies(tsk->utime); - info.si_stime = cputime_to_jiffies(tsk->stime); + info.si_utime = cputime_to_clock_t(tsk->utime); + info.si_stime = cputime_to_clock_t(tsk->stime); info.si_code = why; switch (why) { @@ -1491,10 +1478,10 @@ static inline int may_ptrace_stop(void) * is a deadlock situation, and pointless because our tracer * is dead so don't allow us to stop. * If SIGKILL was already sent before the caller unlocked - * ->siglock we must see ->core_waiters != 0. Otherwise it + * ->siglock we must see ->core_state != NULL. Otherwise it * is safe to enter schedule(). */ - if (unlikely(current->mm->core_waiters) && + if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) return 0; @@ -1507,9 +1494,8 @@ static inline int may_ptrace_stop(void) */ static int sigkill_pending(struct task_struct *tsk) { - return ((sigismember(&tsk->pending.signal, SIGKILL) || - sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) && - !unlikely(sigismember(&tsk->blocked, SIGKILL))); + return sigismember(&tsk->pending.signal, SIGKILL) || + sigismember(&tsk->signal->shared_pending.signal, SIGKILL); } /* @@ -1525,8 +1511,6 @@ static int sigkill_pending(struct task_struct *tsk) */ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) { - int killed = 0; - if (arch_ptrace_stop_needed(exit_code, info)) { /* * The arch code has something special to do before a @@ -1542,7 +1526,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) spin_unlock_irq(¤t->sighand->siglock); arch_ptrace_stop(exit_code, info); spin_lock_irq(¤t->sighand->siglock); - killed = sigkill_pending(current); + if (sigkill_pending(current)) + return; } /* @@ -1559,7 +1544,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) __set_current_state(TASK_TRACED); spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); - if (!unlikely(killed) && may_ptrace_stop()) { + if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); @@ -1623,7 +1608,7 @@ finish_stop(int stop_count) * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. */ - if (stop_count == 0 || (current->ptrace & PT_PTRACED)) { + if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(current, CLD_STOPPED); read_unlock(&tasklist_lock); @@ -1658,8 +1643,7 @@ static int do_signal_stop(int signr) } else { struct task_struct *t; - if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE)) - != SIGNAL_STOP_DEQUEUED) || + if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || unlikely(signal_group_exit(sig))) return 0; /* @@ -1760,6 +1744,9 @@ relock: signal->flags &= ~SIGNAL_CLD_MASK; spin_unlock_irq(&sighand->siglock); + if (unlikely(!tracehook_notify_jctl(1, why))) + goto relock; + read_lock(&tasklist_lock); do_notify_parent_cldstop(current->group_leader, why); read_unlock(&tasklist_lock); @@ -1773,17 +1760,33 @@ relock: do_signal_stop(0)) goto relock; - signr = dequeue_signal(current, ¤t->blocked, info); - if (!signr) - break; /* will return 0 */ + /* + * Tracing can induce an artifical signal and choose sigaction. + * The return value in @signr determines the default action, + * but @info->si_signo is the signal number we will report. + */ + signr = tracehook_get_signal(current, regs, info, return_ka); + if (unlikely(signr < 0)) + goto relock; + if (unlikely(signr != 0)) + ka = return_ka; + else { + signr = dequeue_signal(current, ¤t->blocked, + info); - if (signr != SIGKILL) { - signr = ptrace_signal(signr, info, regs, cookie); if (!signr) - continue; + break; /* will return 0 */ + + if (signr != SIGKILL) { + signr = ptrace_signal(signr, info, + regs, cookie); + if (!signr) + continue; + } + + ka = &sighand->action[signr-1]; } - ka = &sighand->action[signr-1]; if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { @@ -1831,7 +1834,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(signr))) { + if (likely(do_signal_stop(info->si_signo))) { /* It released the siglock. */ goto relock; } @@ -1852,7 +1855,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(regs, signr); + print_fatal_signal(regs, info->si_signo); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with @@ -1861,13 +1864,13 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump((long)signr, signr, regs); + do_coredump(info->si_signo, info->si_signo, regs); } /* * Death signals, no core dump. */ - do_group_exit(signr); + do_group_exit(info->si_signo); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); @@ -1909,7 +1912,7 @@ void exit_signals(struct task_struct *tsk) out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop)) { + if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(tsk, CLD_STOPPED); read_unlock(&tasklist_lock); @@ -1920,8 +1923,6 @@ EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(kill_proc); -EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); @@ -2196,7 +2197,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, } asmlinkage long -sys_kill(int pid, int sig) +sys_kill(pid_t pid, int sig) { struct siginfo info; @@ -2209,7 +2210,7 @@ sys_kill(int pid, int sig) return kill_something_info(sig, &info, pid); } -static int do_tkill(int tgid, int pid, int sig) +static int do_tkill(pid_t tgid, pid_t pid, int sig) { int error; struct siginfo info; @@ -2255,7 +2256,7 @@ static int do_tkill(int tgid, int pid, int sig) * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ -asmlinkage long sys_tgkill(int tgid, int pid, int sig) +asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) @@ -2268,7 +2269,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) * Send a signal to only one task, even if it's a CLONE_THREAD task. */ asmlinkage long -sys_tkill(int pid, int sig) +sys_tkill(pid_t pid, int sig) { /* This is only valid for single tasks */ if (pid <= 0) @@ -2278,7 +2279,7 @@ sys_tkill(int pid, int sig) } asmlinkage long -sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) +sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo) { siginfo_t info; @@ -2325,7 +2326,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ - if (__sig_ignored(t, sig)) { + if (sig_handler_ignored(sig_handler(t, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); rm_from_queue_full(&mask, &t->signal->shared_pending); diff --git a/kernel/smp.c b/kernel/smp.c index 462c785ca1e..96fc7c0edc5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -33,7 +33,7 @@ struct call_single_queue { spinlock_t lock; }; -void __cpuinit init_call_single_data(void) +static int __cpuinit init_call_single_data(void) { int i; @@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void) spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } + return 0; } +early_initcall(init_call_single_data); static void csd_flag_wait(struct call_single_data *data) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 81e2fe0f983..c506f266a6b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -286,7 +286,7 @@ void irq_exit(void) #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(0); rcu_irq_exit(); #endif preempt_enable_no_resched(); @@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -__init int spawn_ksoftirqd(void) +static __init int spawn_ksoftirqd(void) { void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); @@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void) register_cpu_notifier(&cpu_nfb); return 0; } +early_initcall(spawn_ksoftirqd); #ifdef CONFIG_SMP /* diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 7bd8d1aadd5..b75b492fbfc 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -338,14 +338,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -__init void spawn_softlockup_task(void) +static int __initdata nosoftlockup; + +static int __init nosoftlockup_setup(char *str) +{ + nosoftlockup = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); + +static int __init spawn_softlockup_task(void) { void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + int err; - BUG_ON(err == NOTIFY_BAD); + if (nosoftlockup) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + if (err == NOTIFY_BAD) { + BUG(); + return 1; + } cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; } +early_initcall(spawn_softlockup_task); diff --git a/kernel/sys.c b/kernel/sys.c index 14e97282eb6..c01858090a9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -301,26 +301,6 @@ void kernel_restart(char *cmd) } EXPORT_SYMBOL_GPL(kernel_restart); -/** - * kernel_kexec - reboot the system - * - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -static void kernel_kexec(void) -{ -#ifdef CONFIG_KEXEC - struct kimage *image; - image = xchg(&kexec_image, NULL); - if (!image) - return; - kernel_restart_prepare(NULL); - printk(KERN_EMERG "Starting new kernel\n"); - machine_shutdown(); - machine_kexec(image); -#endif -} - static void kernel_shutdown_prepare(enum system_states state) { blocking_notifier_call_chain(&reboot_notifier_list, @@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user kernel_restart(buffer); break; +#ifdef CONFIG_KEXEC case LINUX_REBOOT_CMD_KEXEC: - kernel_kexec(); - unlock_kernel(); - return -EINVAL; + { + int ret; + ret = kernel_kexec(); + unlock_kernel(); + return ret; + } +#endif #ifdef CONFIG_HIBERNATION case LINUX_REBOOT_CMD_SW_SUSPEND: @@ -1343,8 +1328,6 @@ EXPORT_SYMBOL(in_egroup_p); DECLARE_RWSEM(uts_sem); -EXPORT_SYMBOL(uts_sem); - asmlinkage long sys_newuname(struct new_utsname __user * name) { int errno = 0; @@ -1795,7 +1778,7 @@ int orderly_poweroff(bool force) goto out; } - info = call_usermodehelper_setup(argv[0], argv, envp); + info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); if (info == NULL) { argv_free(argv); goto out; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0fea0ee12da..08d6e1bb99a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -31,6 +31,7 @@ cond_syscall(sys_socketpair); cond_syscall(sys_bind); cond_syscall(sys_listen); cond_syscall(sys_accept); +cond_syscall(sys_paccept); cond_syscall(sys_connect); cond_syscall(sys_getsockname); cond_syscall(sys_getpeername); @@ -56,6 +57,7 @@ cond_syscall(compat_sys_set_robust_list); cond_syscall(sys_get_robust_list); cond_syscall(compat_sys_get_robust_list); cond_syscall(sys_epoll_create); +cond_syscall(sys_epoll_create1); cond_syscall(sys_epoll_ctl); cond_syscall(sys_epoll_wait); cond_syscall(sys_epoll_pwait); @@ -95,6 +97,7 @@ cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); cond_syscall(sys_inotify_init); +cond_syscall(sys_inotify_init1); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); @@ -155,10 +158,13 @@ cond_syscall(sys_ioprio_get); /* New file descriptors */ cond_syscall(sys_signalfd); +cond_syscall(sys_signalfd4); cond_syscall(compat_sys_signalfd); +cond_syscall(compat_sys_signalfd4); cond_syscall(sys_timerfd_create); cond_syscall(sys_timerfd_settime); cond_syscall(sys_timerfd_gettime); cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); +cond_syscall(sys_eventfd2); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2a7b9d88706..35a50db9b6c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -43,6 +43,7 @@ #include <linux/limits.h> #include <linux/dcache.h> #include <linux/syscalls.h> +#include <linux/vmstat.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> #include <linux/reboot.h> @@ -80,7 +81,6 @@ extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; -extern int sysctl_stat_interval; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifdef CONFIG_RCU_TORTURE_TEST @@ -624,7 +624,7 @@ static struct ctl_table kern_table[] = { { .ctl_name = KERN_PRINTK_RATELIMIT, .procname = "printk_ratelimit", - .data = &printk_ratelimit_jiffies, + .data = &printk_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_jiffies, @@ -633,7 +633,7 @@ static struct ctl_table kern_table[] = { { .ctl_name = KERN_PRINTK_RATELIMIT_BURST, .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_burst, + .data = &printk_ratelimit_state.burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -959,7 +959,7 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_HUGETLB_PAGE { .procname = "nr_hugepages", - .data = &max_huge_pages, + .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_sysctl_handler, @@ -985,10 +985,12 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", - .data = &sysctl_overcommit_huge_pages, - .maxlen = sizeof(sysctl_overcommit_huge_pages), + .data = NULL, + .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_overcommit_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, }, #endif { diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index c09350d564f..c35da23ab8f 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1532,6 +1532,8 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) sysctl_check_leaf(namespaces, table, &fail); } sysctl_check_bin_path(table, &fail); + if (table->mode > 0777) + set_fail(&fail, table, "bogus .mode"); if (fail) { set_fail(&fail, table, NULL); error = -EINVAL; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 06b17547f4e..bd6be76303c 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -35,7 +35,7 @@ */ #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) -static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; +static DEFINE_PER_CPU(__u32, taskstats_seqnum); static int family_registered; struct kmem_cache *taskstats_cache; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 942fc7c8528..825b4c00fe4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -195,7 +195,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) * Called either from the idle loop or from irq_exit() when an idle period was * just interrupted by an interrupt which did not cause a reschedule. */ -void tick_nohz_stop_sched_tick(void) +void tick_nohz_stop_sched_tick(int inidle) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; struct tick_sched *ts; @@ -224,6 +224,11 @@ void tick_nohz_stop_sched_tick(void) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; + if (!inidle && !ts->inidle) + goto end; + + ts->inidle = 1; + if (need_resched()) goto end; @@ -373,11 +378,14 @@ void tick_nohz_restart_sched_tick(void) local_irq_disable(); tick_nohz_stop_idle(cpu); - if (!ts->tick_stopped) { + if (!ts->inidle || !ts->tick_stopped) { + ts->inidle = 0; local_irq_enable(); return; } + ts->inidle = 0; + rcu_exit_nohz(); /* Update jiffies first */ diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 63528086337..ce2d723c10e 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -161,7 +161,7 @@ static void timer_notify(struct pt_regs *regs, int cpu) __trace_special(tr, data, 2, regs->ip, 0); while (i < sample_max_depth) { - frame.next_fp = 0; + frame.next_fp = NULL; frame.return_address = 0; if (!copy_stack_frame(fp, &frame)) break; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 4ab1b584961..3da47ccdc5e 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -28,14 +28,14 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) { struct timespec uptime, ts; - s64 ac_etime; + u64 ac_etime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in timespec */ do_posix_clock_monotonic_gettime(&uptime); ts = timespec_sub(uptime, tsk->start_time); - /* rebase elapsed time to usec */ + /* rebase elapsed time to usec (should never be negative) */ ac_etime = timespec_to_ns(&ts); do_div(ac_etime, NSEC_PER_USEC); stats->ac_etime = ac_etime; @@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-jiffies to Mbyte-usec */ - stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; - stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; + /* convert pages-usec to Mbyte-usec */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) void acct_update_integrals(struct task_struct *tsk) { if (likely(tsk->mm)) { - long delta = cputime_to_jiffies( - cputime_sub(tsk->stime, tsk->acct_stimexpd)); + cputime_t time, dtime; + struct timeval value; + u64 delta; + + time = tsk->stime + tsk->utime; + dtime = cputime_sub(time, tsk->acct_timexpd); + jiffies_to_timeval(cputime_to_jiffies(dtime), &value); + delta = value.tv_sec; + delta = delta * USEC_PER_SEC + value.tv_usec; if (delta == 0) return; - tsk->acct_stimexpd = tsk->stime; + tsk->acct_timexpd = time; tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; } @@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk) */ void acct_clear_integrals(struct task_struct *tsk) { - tsk->acct_stimexpd = 0; + tsk->acct_timexpd = 0; tsk->acct_rss_mem1 = 0; tsk->acct_vm_mem1 = 0; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a6d36346d10..ec7e4f62aaf 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -125,7 +125,7 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) } static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, int tail) + struct work_struct *work, struct list_head *head) { set_wq_data(work, cwq); /* @@ -133,21 +133,17 @@ static void insert_work(struct cpu_workqueue_struct *cwq, * result of list_add() below, see try_to_grab_pending(). */ smp_wmb(); - if (tail) - list_add_tail(&work->entry, &cwq->worklist); - else - list_add(&work->entry, &cwq->worklist); + list_add_tail(&work->entry, head); wake_up(&cwq->more_work); } -/* Preempt must be disabled. */ static void __queue_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) { unsigned long flags; spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, 1); + insert_work(cwq, work, &cwq->worklist); spin_unlock_irqrestore(&cwq->lock, flags); } @@ -163,17 +159,39 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, */ int queue_work(struct workqueue_struct *wq, struct work_struct *work) { + int ret; + + ret = queue_work_on(get_cpu(), wq, work); + put_cpu(); + + return ret; +} +EXPORT_SYMBOL_GPL(queue_work); + +/** + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to a specific CPU, the caller must ensure it + * can't go away. + */ +int +queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +{ int ret = 0; if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, get_cpu()), work); - put_cpu(); + __queue_work(wq_per_cpu(wq, cpu), work); ret = 1; } return ret; } -EXPORT_SYMBOL_GPL(queue_work); +EXPORT_SYMBOL_GPL(queue_work_on); static void delayed_work_timer_fn(unsigned long __data) { @@ -337,14 +355,14 @@ static void wq_barrier_func(struct work_struct *work) } static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, int tail) + struct wq_barrier *barr, struct list_head *head) { INIT_WORK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); init_completion(&barr->done); - insert_work(cwq, &barr->work, tail); + insert_work(cwq, &barr->work, head); } static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) @@ -364,7 +382,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) active = 0; spin_lock_irq(&cwq->lock); if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, 1); + insert_wq_barrier(cwq, &barr, &cwq->worklist); active = 1; } spin_unlock_irq(&cwq->lock); @@ -402,6 +420,57 @@ void flush_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(flush_workqueue); +/** + * flush_work - block until a work_struct's callback has terminated + * @work: the work which is to be flushed + * + * Returns false if @work has already terminated. + * + * It is expected that, prior to calling flush_work(), the caller has + * arranged for the work to not be requeued, otherwise it doesn't make + * sense to use this function. + */ +int flush_work(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq; + struct list_head *prev; + struct wq_barrier barr; + + might_sleep(); + cwq = get_wq_data(work); + if (!cwq) + return 0; + + lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); + lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + + prev = NULL; + spin_lock_irq(&cwq->lock); + if (!list_empty(&work->entry)) { + /* + * See the comment near try_to_grab_pending()->smp_rmb(). + * If it was re-queued under us we are not going to wait. + */ + smp_rmb(); + if (unlikely(cwq != get_wq_data(work))) + goto out; + prev = &work->entry; + } else { + if (cwq->current_work != work) + goto out; + prev = &cwq->worklist; + } + insert_wq_barrier(cwq, &barr, prev->next); +out: + spin_unlock_irq(&cwq->lock); + if (!prev) + return 0; + + wait_for_completion(&barr.done); + return 1; +} +EXPORT_SYMBOL_GPL(flush_work); + /* * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, * so this work can't be re-armed in any way. @@ -449,7 +518,7 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, spin_lock_irq(&cwq->lock); if (unlikely(cwq->current_work == work)) { - insert_wq_barrier(cwq, &barr, 0); + insert_wq_barrier(cwq, &barr, cwq->worklist.next); running = 1; } spin_unlock_irq(&cwq->lock); @@ -553,6 +622,19 @@ int schedule_work(struct work_struct *work) } EXPORT_SYMBOL(schedule_work); +/* + * schedule_work_on - put work task on a specific cpu + * @cpu: cpu to put the work task on + * @work: job to be done + * + * This puts a job on a specific cpu + */ +int schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, keventd_wq, work); +} +EXPORT_SYMBOL(schedule_work_on); + /** * schedule_delayed_work - put work task in global workqueue after delay * @dwork: job to be done @@ -607,10 +689,10 @@ int schedule_on_each_cpu(work_func_t func) struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); - __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); + schedule_work_on(cpu, work); } - flush_workqueue(keventd_wq); + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); put_online_cpus(); free_percpu(works); return 0; @@ -747,7 +829,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, err = create_workqueue_thread(cwq, singlethread_cpu); start_workqueue_thread(cwq, -1); } else { - get_online_cpus(); + cpu_maps_update_begin(); spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); spin_unlock(&workqueue_lock); @@ -759,7 +841,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, err = create_workqueue_thread(cwq, cpu); start_workqueue_thread(cwq, cpu); } - put_online_cpus(); + cpu_maps_update_done(); } if (err) { @@ -773,8 +855,8 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key); static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* - * Our caller is either destroy_workqueue() or CPU_DEAD, - * get_online_cpus() protects cwq->thread. + * Our caller is either destroy_workqueue() or CPU_POST_DEAD, + * cpu_add_remove_lock protects cwq->thread. */ if (cwq->thread == NULL) return; @@ -784,7 +866,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) flush_cpu_workqueue(cwq); /* - * If the caller is CPU_DEAD and cwq->worklist was not empty, + * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, * a concurrent flush_workqueue() can insert a barrier after us. * However, in that case run_workqueue() won't return and check * kthread_should_stop() until it flushes all work_struct's. @@ -808,14 +890,14 @@ void destroy_workqueue(struct workqueue_struct *wq) const cpumask_t *cpu_map = wq_cpu_map(wq); int cpu; - get_online_cpus(); + cpu_maps_update_begin(); spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); for_each_cpu_mask_nr(cpu, *cpu_map) cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); - put_online_cpus(); + cpu_maps_update_done(); free_percpu(wq->cpu_wq); kfree(wq); @@ -829,6 +911,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned int cpu = (unsigned long)hcpu; struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; + int ret = NOTIFY_OK; action &= ~CPU_TASKS_FROZEN; @@ -836,7 +919,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: cpu_set(cpu, cpu_populated_map); } - +undo: list_for_each_entry(wq, &workqueues, list) { cwq = per_cpu_ptr(wq->cpu_wq, cpu); @@ -846,7 +929,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, break; printk(KERN_ERR "workqueue [%s] for %i failed\n", wq->name, cpu); - return NOTIFY_BAD; + action = CPU_UP_CANCELED; + ret = NOTIFY_BAD; + goto undo; case CPU_ONLINE: start_workqueue_thread(cwq, cpu); @@ -854,7 +939,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: start_workqueue_thread(cwq, -1); - case CPU_DEAD: + case CPU_POST_DEAD: cleanup_workqueue_thread(cwq); break; } @@ -862,11 +947,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_CANCELED: - case CPU_DEAD: + case CPU_POST_DEAD: cpu_clear(cpu, cpu_populated_map); } - return NOTIFY_OK; + return ret; } void __init init_workqueues(void) |