aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c12
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/cpuset.c69
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/fork.c28
-rw-r--r--kernel/futex.c4
-rw-r--r--kernel/futex_compat.c4
-rw-r--r--kernel/hrtimer.c49
-rw-r--r--kernel/module.c1
-rw-r--r--kernel/pid.c212
-rw-r--r--kernel/power/process.c3
-rw-r--r--kernel/sched.c84
-rw-r--r--kernel/signal.c1
-rw-r--r--kernel/sys.c19
-rw-r--r--kernel/timer.c92
15 files changed, 376 insertions, 211 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 065d8b4e51e..b327f4d2010 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -449,8 +449,8 @@ static void do_acct_process(long exitcode, struct file *file)
/* calculate run_time in nsec*/
do_posix_clock_monotonic_gettime(&uptime);
run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec;
- run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC
- + current->start_time.tv_nsec;
+ run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC
+ + current->group_leader->start_time.tv_nsec;
/* convert nsec -> AHZ */
elapsed = nsec_to_AHZ(run_time);
#if ACCT_VERSION==3
@@ -469,10 +469,10 @@ static void do_acct_process(long exitcode, struct file *file)
#endif
do_div(elapsed, AHZ);
ac.ac_btime = xtime.tv_sec - elapsed;
- jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
+ jiffies = cputime_to_jiffies(cputime_add(current->utime,
current->signal->utime));
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
- jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
+ jiffies = cputime_to_jiffies(cputime_add(current->stime,
current->signal->stime));
ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
/* we really need to bite the bullet and change layout */
@@ -522,9 +522,9 @@ static void do_acct_process(long exitcode, struct file *file)
ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
ac.ac_minflt = encode_comp_t(current->signal->min_flt +
- current->group_leader->min_flt);
+ current->min_flt);
ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
- current->group_leader->maj_flt);
+ current->maj_flt);
ac.ac_swaps = encode_comp_t(0);
ac.ac_exitcode = exitcode;
diff --git a/kernel/audit.c b/kernel/audit.c
index 04fe2e301b6..c8ccbd09048 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -578,7 +578,7 @@ static int __init audit_enable(char *str)
audit_initialized ? "" : " (after initialization)");
if (audit_initialized)
audit_enabled = audit_default;
- return 0;
+ return 1;
}
__setup("audit=", audit_enable);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 18aea1bd128..72248d1b9e3 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -616,12 +616,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
* current->cpuset if a task has its memory placement changed.
* Do not call this routine if in_interrupt().
*
- * Call without callback_mutex or task_lock() held. May be called
- * with or without manage_mutex held. Doesn't need task_lock to guard
- * against another task changing a non-NULL cpuset pointer to NULL,
- * as that is only done by a task on itself, and if the current task
- * is here, it is not simultaneously in the exit code NULL'ing its
- * cpuset pointer. This routine also might acquire callback_mutex and
+ * Call without callback_mutex or task_lock() held. May be
+ * called with or without manage_mutex held. Thanks in part to
+ * 'the_top_cpuset_hack', the tasks cpuset pointer will never
+ * be NULL. This routine also might acquire callback_mutex and
* current->mm->mmap_sem during call.
*
* Reading current->cpuset->mems_generation doesn't need task_lock
@@ -836,6 +834,55 @@ static int update_cpumask(struct cpuset *cs, char *buf)
}
/*
+ * cpuset_migrate_mm
+ *
+ * Migrate memory region from one set of nodes to another.
+ *
+ * Temporarilly set tasks mems_allowed to target nodes of migration,
+ * so that the migration code can allocate pages on these nodes.
+ *
+ * Call holding manage_mutex, so our current->cpuset won't change
+ * during this call, as manage_mutex holds off any attach_task()
+ * calls. Therefore we don't need to take task_lock around the
+ * call to guarantee_online_mems(), as we know no one is changing
+ * our tasks cpuset.
+ *
+ * Hold callback_mutex around the two modifications of our tasks
+ * mems_allowed to synchronize with cpuset_mems_allowed().
+ *
+ * While the mm_struct we are migrating is typically from some
+ * other task, the task_struct mems_allowed that we are hacking
+ * is for our current task, which must allocate new pages for that
+ * migrating memory region.
+ *
+ * We call cpuset_update_task_memory_state() before hacking
+ * our tasks mems_allowed, so that we are assured of being in
+ * sync with our tasks cpuset, and in particular, callbacks to
+ * cpuset_update_task_memory_state() from nested page allocations
+ * won't see any mismatch of our cpuset and task mems_generation
+ * values, so won't overwrite our hacked tasks mems_allowed
+ * nodemask.
+ */
+
+static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+ const nodemask_t *to)
+{
+ struct task_struct *tsk = current;
+
+ cpuset_update_task_memory_state();
+
+ mutex_lock(&callback_mutex);
+ tsk->mems_allowed = *to;
+ mutex_unlock(&callback_mutex);
+
+ do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+
+ mutex_lock(&callback_mutex);
+ guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
+ mutex_unlock(&callback_mutex);
+}
+
+/*
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
* cpusets mems_allowed and mems_generation, and for each
@@ -947,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
struct mm_struct *mm = mmarray[i];
mpol_rebind_mm(mm, &cs->mems_allowed);
- if (migrate) {
- do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
- MPOL_MF_MOVE_ALL);
- }
+ if (migrate)
+ cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
mmput(mm);
}
@@ -1185,11 +1230,11 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
mm = get_task_mm(tsk);
if (mm) {
mpol_rebind_mm(mm, &to);
+ if (is_memory_migrate(cs))
+ cpuset_migrate_mm(mm, &from, &to);
mmput(mm);
}
- if (is_memory_migrate(cs))
- do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
put_task_struct(tsk);
synchronize_rcu();
if (atomic_dec_and_test(&oldcs->count))
diff --git a/kernel/exit.c b/kernel/exit.c
index bc0ec674d3f..6c2eeb8f639 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -127,6 +127,11 @@ static void __exit_signal(struct task_struct *tsk)
}
}
+static void delayed_put_task_struct(struct rcu_head *rhp)
+{
+ put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
void release_task(struct task_struct * p)
{
int zap_leader;
@@ -168,7 +173,7 @@ repeat:
spin_unlock(&p->proc_lock);
proc_pid_flush(proc_dentry);
release_thread(p);
- put_task_struct(p);
+ call_rcu(&p->rcu, delayed_put_task_struct);
p = leader;
if (unlikely(zap_leader))
diff --git a/kernel/fork.c b/kernel/fork.c
index b3f7a1bb5e5..3384eb89cb1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -108,10 +108,8 @@ void free_task(struct task_struct *tsk)
}
EXPORT_SYMBOL(free_task);
-void __put_task_struct_cb(struct rcu_head *rhp)
+void __put_task_struct(struct task_struct *tsk)
{
- struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
-
WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
@@ -126,6 +124,12 @@ void __put_task_struct_cb(struct rcu_head *rhp)
free_task(tsk);
}
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+ __put_task_struct(tsk);
+}
+
void __init fork_init(unsigned long mempages)
{
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
@@ -721,7 +725,7 @@ out_release:
free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
free_fd_array(new_fdt->fd, new_fdt->max_fds);
kmem_cache_free(files_cachep, newf);
- goto out;
+ return NULL;
}
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
@@ -1311,17 +1315,19 @@ long do_fork(unsigned long clone_flags,
{
struct task_struct *p;
int trace = 0;
- long pid = alloc_pidmap();
+ struct pid *pid = alloc_pid();
+ long nr;
- if (pid < 0)
+ if (!pid)
return -EAGAIN;
+ nr = pid->nr;
if (unlikely(current->ptrace)) {
trace = fork_traceflag (clone_flags);
if (trace)
clone_flags |= CLONE_PTRACE;
}
- p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
+ p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, nr);
/*
* Do this prior waking up the new thread - the thread pointer
* might get invalid after that point, if the thread exits quickly.
@@ -1348,7 +1354,7 @@ long do_fork(unsigned long clone_flags,
p->state = TASK_STOPPED;
if (unlikely (trace)) {
- current->ptrace_message = pid;
+ current->ptrace_message = nr;
ptrace_notify ((trace << 8) | SIGTRAP);
}
@@ -1358,10 +1364,10 @@ long do_fork(unsigned long clone_flags,
ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
}
} else {
- free_pidmap(pid);
- pid = PTR_ERR(p);
+ free_pid(pid);
+ nr = PTR_ERR(p);
}
- return pid;
+ return nr;
}
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
diff --git a/kernel/futex.c b/kernel/futex.c
index 9c9b2b6b22d..5699c512057 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1039,9 +1039,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
int val2 = 0;
- if ((op == FUTEX_WAIT) && utime) {
+ if (utime && (op == FUTEX_WAIT)) {
if (copy_from_user(&t, utime, sizeof(t)) != 0)
return -EFAULT;
+ if (!timespec_valid(&t))
+ return -EINVAL;
timeout = timespec_to_jiffies(&t) + 1;
}
/*
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 54274fc8532..1ab6a0ea3d1 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,9 +129,11 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
int val2 = 0;
- if ((op == FUTEX_WAIT) && utime) {
+ if (utime && (op == FUTEX_WAIT)) {
if (get_compat_timespec(&t, utime))
return -EFAULT;
+ if (!timespec_valid(&t))
+ return -EINVAL;
timeout = timespec_to_jiffies(&t) + 1;
}
if (op >= FUTEX_REQUEUE)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0237a556eb1..f181ff4dd32 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -606,6 +606,9 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
{
struct rb_node *node;
+ if (!base->first)
+ return;
+
if (base->get_softirq_time)
base->softirq_time = base->get_softirq_time();
@@ -655,29 +658,28 @@ void hrtimer_run_queues(void)
/*
* Sleep related functions:
*/
-
-struct sleep_hrtimer {
- struct hrtimer timer;
- struct task_struct *task;
- int expired;
-};
-
-static int nanosleep_wakeup(struct hrtimer *timer)
+static int hrtimer_wakeup(struct hrtimer *timer)
{
- struct sleep_hrtimer *t =
- container_of(timer, struct sleep_hrtimer, timer);
+ struct hrtimer_sleeper *t =
+ container_of(timer, struct hrtimer_sleeper, timer);
+ struct task_struct *task = t->task;
- t->expired = 1;
- wake_up_process(t->task);
+ t->task = NULL;
+ if (task)
+ wake_up_process(task);
return HRTIMER_NORESTART;
}
-static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
{
- t->timer.function = nanosleep_wakeup;
- t->task = current;
- t->expired = 0;
+ sl->timer.function = hrtimer_wakeup;
+ sl->task = task;
+}
+
+static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
+{
+ hrtimer_init_sleeper(t, current);
do {
set_current_state(TASK_INTERRUPTIBLE);
@@ -685,18 +687,17 @@ static int __sched do_nanosleep(struct sleep_hrtimer *t, enum hrtimer_mode mode)
schedule();
- if (unlikely(!t->expired)) {
- hrtimer_cancel(&t->timer);
- mode = HRTIMER_ABS;
- }
- } while (!t->expired && !signal_pending(current));
+ hrtimer_cancel(&t->timer);
+ mode = HRTIMER_ABS;
+
+ } while (t->task && !signal_pending(current));
- return t->expired;
+ return t->task == NULL;
}
static long __sched nanosleep_restart(struct restart_block *restart)
{
- struct sleep_hrtimer t;
+ struct hrtimer_sleeper t;
struct timespec __user *rmtp;
struct timespec tu;
ktime_t time;
@@ -729,7 +730,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
const enum hrtimer_mode mode, const clockid_t clockid)
{
struct restart_block *restart;
- struct sleep_hrtimer t;
+ struct hrtimer_sleeper t;
struct timespec tu;
ktime_t rem;
diff --git a/kernel/module.c b/kernel/module.c
index bd088a7c149..d24deb0dbbc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1254,6 +1254,7 @@ static inline int license_is_gpl_compatible(const char *license)
|| strcmp(license, "GPL v2") == 0
|| strcmp(license, "GPL and additional rights") == 0
|| strcmp(license, "Dual BSD/GPL") == 0
+ || strcmp(license, "Dual MIT/GPL") == 0
|| strcmp(license, "Dual MPL/GPL") == 0);
}
diff --git a/kernel/pid.c b/kernel/pid.c
index a9f2dfd006d..eeb836b65ca 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -28,8 +28,9 @@
#include <linux/hash.h>
#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
-static struct hlist_head *pid_hash[PIDTYPE_MAX];
+static struct hlist_head *pid_hash;
static int pidhash_shift;
+static kmem_cache_t *pid_cachep;
int pid_max = PID_MAX_DEFAULT;
int last_pid;
@@ -60,9 +61,22 @@ typedef struct pidmap {
static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
{ [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
+/*
+ * Note: disable interrupts while the pidmap_lock is held as an
+ * interrupt might come in and do read_lock(&tasklist_lock).
+ *
+ * If we don't disable interrupts there is a nasty deadlock between
+ * detach_pid()->free_pid() and another cpu that does
+ * spin_lock(&pidmap_lock) followed by an interrupt routine that does
+ * read_lock(&tasklist_lock);
+ *
+ * After we clean up the tasklist_lock and know there are no
+ * irq handlers that take it we can leave the interrupts enabled.
+ * For now it is easier to be safe than to prove it can't happen.
+ */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-fastcall void free_pidmap(int pid)
+static fastcall void free_pidmap(int pid)
{
pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
int offset = pid & BITS_PER_PAGE_MASK;
@@ -71,7 +85,7 @@ fastcall void free_pidmap(int pid)
atomic_inc(&map->nr_free);
}
-int alloc_pidmap(void)
+static int alloc_pidmap(void)
{
int i, offset, max_scan, pid, last = last_pid;
pidmap_t *map;
@@ -89,12 +103,12 @@ int alloc_pidmap(void)
* Free the page if someone raced with us
* installing it:
*/
- spin_lock(&pidmap_lock);
+ spin_lock_irq(&pidmap_lock);
if (map->page)
free_page(page);
else
map->page = (void *)page;
- spin_unlock(&pidmap_lock);
+ spin_unlock_irq(&pidmap_lock);
if (unlikely(!map->page))
break;
}
@@ -131,13 +145,73 @@ int alloc_pidmap(void)
return -1;
}
-struct pid * fastcall find_pid(enum pid_type type, int nr)
+fastcall void put_pid(struct pid *pid)
+{
+ if (!pid)
+ return;
+ if ((atomic_read(&pid->count) == 1) ||
+ atomic_dec_and_test(&pid->count))
+ kmem_cache_free(pid_cachep, pid);
+}
+
+static void delayed_put_pid(struct rcu_head *rhp)
+{
+ struct pid *pid = container_of(rhp, struct pid, rcu);
+ put_pid(pid);
+}
+
+fastcall void free_pid(struct pid *pid)
+{
+ /* We can be called with write_lock_irq(&tasklist_lock) held */
+ unsigned long flags;
+
+ spin_lock_irqsave(&pidmap_lock, flags);
+ hlist_del_rcu(&pid->pid_chain);
+ spin_unlock_irqrestore(&pidmap_lock, flags);
+
+ free_pidmap(pid->nr);
+ call_rcu(&pid->rcu, delayed_put_pid);
+}
+
+struct pid *alloc_pid(void)
+{
+ struct pid *pid;
+ enum pid_type type;
+ int nr = -1;
+
+ pid = kmem_cache_alloc(pid_cachep, GFP_KERNEL);
+ if (!pid)
+ goto out;
+
+ nr = alloc_pidmap();
+ if (nr < 0)
+ goto out_free;
+
+ atomic_set(&pid->count, 1);
+ pid->nr = nr;
+ for (type = 0; type < PIDTYPE_MAX; ++type)
+ INIT_HLIST_HEAD(&pid->tasks[type]);
+
+ spin_lock_irq(&pidmap_lock);
+ hlist_add_head_rcu(&pid->pid_chain, &pid_hash[pid_hashfn(pid->nr)]);
+ spin_unlock_irq(&pidmap_lock);
+
+out:
+ return pid;
+
+out_free:
+ kmem_cache_free(pid_cachep, pid);
+ pid = NULL;
+ goto out;
+}
+
+struct pid * fastcall find_pid(int nr)
{
struct hlist_node *elem;
struct pid *pid;
hlist_for_each_entry_rcu(pid, elem,
- &pid_hash[type][pid_hashfn(nr)], pid_chain) {
+ &pid_hash[pid_hashfn(nr)], pid_chain) {
if (pid->nr == nr)
return pid;
}
@@ -146,77 +220,82 @@ struct pid * fastcall find_pid(enum pid_type type, int nr)
int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
{
- struct pid *pid, *task_pid;
-
- task_pid = &task->pids[type];
- pid = find_pid(type, nr);
- task_pid->nr = nr;
- if (pid == NULL) {
- INIT_LIST_HEAD(&task_pid->pid_list);
- hlist_add_head_rcu(&task_pid->pid_chain,
- &pid_hash[type][pid_hashfn(nr)]);
- } else {
- INIT_HLIST_NODE(&task_pid->pid_chain);
- list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
- }
+ struct pid_link *link;
+ struct pid *pid;
+
+ WARN_ON(!task->pid); /* to be removed soon */
+ WARN_ON(!nr); /* to be removed soon */
+
+ link = &task->pids[type];
+ link->pid = pid = find_pid(nr);
+ hlist_add_head_rcu(&link->node, &pid->tasks[type]);
return 0;
}
-static fastcall int __detach_pid(task_t *task, enum pid_type type)
+void fastcall detach_pid(task_t *task, enum pid_type type)
{
- struct pid *pid, *pid_next;
- int nr = 0;
+ struct pid_link *link;
+ struct pid *pid;
+ int tmp;
- pid = &task->pids[type];
- if (!hlist_unhashed(&pid->pid_chain)) {
+ link = &task->pids[type];
+ pid = link->pid;
- if (list_empty(&pid->pid_list)) {
- nr = pid->nr;
- hlist_del_rcu(&pid->pid_chain);
- } else {
- pid_next = list_entry(pid->pid_list.next,
- struct pid, pid_list);
- /* insert next pid from pid_list to hash */
- hlist_replace_rcu(&pid->pid_chain,
- &pid_next->pid_chain);
- }
- }
+ hlist_del_rcu(&link->node);
+ link->pid = NULL;
- list_del_rcu(&pid->pid_list);
- pid->nr = 0;
+ for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+ if (!hlist_empty(&pid->tasks[tmp]))
+ return;
- return nr;
+ free_pid(pid);
}
-void fastcall detach_pid(task_t *task, enum pid_type type)
+struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
{
- int tmp, nr;
+ struct task_struct *result = NULL;
+ if (pid) {
+ struct hlist_node *first;
+ first = rcu_dereference(pid->tasks[type].first);
+ if (first)
+ result = hlist_entry(first, struct task_struct, pids[(type)].node);
+ }
+ return result;
+}
- nr = __detach_pid(task, type);
- if (!nr)
- return;
+/*
+ * Must be called under rcu_read_lock() or with tasklist_lock read-held.
+ */
+task_t *find_task_by_pid_type(int type, int nr)
+{
+ return pid_task(find_pid(nr), type);
+}
- for (tmp = PIDTYPE_MAX; --tmp >= 0; )
- if (tmp != type && find_pid(tmp, nr))
- return;
+EXPORT_SYMBOL(find_task_by_pid_type);
- free_pidmap(nr);
+struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
+{
+ struct task_struct *result;
+ rcu_read_lock();
+ result = pid_task(pid, type);
+ if (result)
+ get_task_struct(result);
+ rcu_read_unlock();
+ return result;
}
-task_t *find_task_by_pid_type(int type, int nr)
+struct pid *find_get_pid(pid_t nr)
{
struct pid *pid;
- pid = find_pid(type, nr);
- if (!pid)
- return NULL;
+ rcu_read_lock();
+ pid = get_pid(find_pid(nr));
+ rcu_read_unlock();
- return pid_task(&pid->pid_list, type);
+ return pid;
}
-EXPORT_SYMBOL(find_task_by_pid_type);
-
/*
* The pid hash table is scaled according to the amount of memory in the
* machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -224,7 +303,7 @@ EXPORT_SYMBOL(find_task_by_pid_type);
*/
void __init pidhash_init(void)
{
- int i, j, pidhash_size;
+ int i, pidhash_size;
unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
pidhash_shift = max(4, fls(megabytes * 4));
@@ -233,16 +312,13 @@ void __init pidhash_init(void)
printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
pidhash_size, pidhash_shift,
- PIDTYPE_MAX * pidhash_size * sizeof(struct hlist_head));
-
- for (i = 0; i < PIDTYPE_MAX; i++) {
- pid_hash[i] = alloc_bootmem(pidhash_size *
- sizeof(*(pid_hash[i])));
- if (!pid_hash[i])
- panic("Could not alloc pidhash!\n");
- for (j = 0; j < pidhash_size; j++)
- INIT_HLIST_HEAD(&pid_hash[i][j]);
- }
+ pidhash_size * sizeof(struct hlist_head));
+
+ pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
+ if (!pid_hash)
+ panic("Could not alloc pidhash!\n");
+ for (i = 0; i < pidhash_size; i++)
+ INIT_HLIST_HEAD(&pid_hash[i]);
}
void __init pidmap_init(void)
@@ -251,4 +327,8 @@ void __init pidmap_init(void)
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, pidmap_array->page);
atomic_dec(&pidmap_array->nr_free);
+
+ pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
+ __alignof__(struct pid),
+ SLAB_PANIC, NULL, NULL);
}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 8ac7c35fad7..b2a5f671d6c 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -26,8 +26,7 @@ static inline int freezeable(struct task_struct * p)
(p->flags & PF_NOFREEZE) ||
(p->exit_state == EXIT_ZOMBIE) ||
(p->exit_state == EXIT_DEAD) ||
- (p->state == TASK_STOPPED) ||
- (p->state == TASK_TRACED))
+ (p->state == TASK_STOPPED))
return 0;
return 1;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index a9ecac398bb..dd153d6f8a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -667,9 +667,13 @@ static int effective_prio(task_t *p)
/*
* __activate_task - move a task to the runqueue.
*/
-static inline void __activate_task(task_t *p, runqueue_t *rq)
+static void __activate_task(task_t *p, runqueue_t *rq)
{
- enqueue_task(p, rq->active);
+ prio_array_t *target = rq->active;
+
+ if (batch_task(p))
+ target = rq->expired;
+ enqueue_task(p, target);
rq->nr_running++;
}
@@ -688,7 +692,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
unsigned long long __sleep_time = now - p->timestamp;
unsigned long sleep_time;
- if (unlikely(p->policy == SCHED_BATCH))
+ if (batch_task(p))
sleep_time = 0;
else {
if (__sleep_time > NS_MAX_SLEEP_AVG)
@@ -700,21 +704,25 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
if (likely(sleep_time > 0)) {
/*
* User tasks that sleep a long time are categorised as
- * idle and will get just interactive status to stay active &
- * prevent them suddenly becoming cpu hogs and starving
- * other processes.
+ * idle. They will only have their sleep_avg increased to a
+ * level that makes them just interactive priority to stay
+ * active yet prevent them suddenly becoming cpu hogs and
+ * starving other processes.
*/
- if (p->mm && p->activated != -1 &&
- sleep_time > INTERACTIVE_SLEEP(p)) {
- p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG -
- DEF_TIMESLICE);
+ if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) {
+ unsigned long ceiling;
+
+ ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG -
+ DEF_TIMESLICE);
+ if (p->sleep_avg < ceiling)
+ p->sleep_avg = ceiling;
} else {
/*
* Tasks waking from uninterruptible sleep are
* limited in their sleep_avg rise as they
* are likely to be waiting on I/O
*/
- if (p->activated == -1 && p->mm) {
+ if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
if (p->sleep_avg >= INTERACTIVE_SLEEP(p))
sleep_time = 0;
else if (p->sleep_avg + sleep_time >=
@@ -769,7 +777,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
* This checks to make sure it's not an uninterruptible task
* that is now waking up.
*/
- if (!p->activated) {
+ if (p->sleep_type == SLEEP_NORMAL) {
/*
* Tasks which were woken up by interrupts (ie. hw events)
* are most likely of interactive nature. So we give them
@@ -778,13 +786,13 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
* on a CPU, first time around:
*/
if (in_interrupt())
- p->activated = 2;
+ p->sleep_type = SLEEP_INTERRUPTED;
else {
/*
* Normal first-time wakeups get a credit too for
* on-runqueue time, but it will be weighted down:
*/
- p->activated = 1;
+ p->sleep_type = SLEEP_INTERACTIVE;
}
}
p->timestamp = now;
@@ -1272,19 +1280,19 @@ out_activate:
* Tasks on involuntary sleep don't earn
* sleep_avg beyond just interactive state.
*/
- p->activated = -1;
- }
+ p->sleep_type = SLEEP_NONINTERACTIVE;
+ } else
/*
* Tasks that have marked their sleep as noninteractive get
- * woken up without updating their sleep average. (i.e. their
- * sleep is handled in a priority-neutral manner, no priority
- * boost and no penalty.)
+ * woken up with their sleep average not weighted in an
+ * interactive way.
*/
- if (old_state & TASK_NONINTERACTIVE)
- __activate_task(p, rq);
- else
- activate_task(p, rq, cpu == this_cpu);
+ if (old_state & TASK_NONINTERACTIVE)
+ p->sleep_type = SLEEP_NONINTERACTIVE;
+
+
+ activate_task(p, rq, cpu == this_cpu);
/*
* Sync wakeups (i.e. those types of wakeups where the waker
* has indicated that it will leave the CPU in short order)
@@ -1658,6 +1666,21 @@ unsigned long nr_iowait(void)
return sum;
}
+unsigned long nr_active(void)
+{
+ unsigned long i, running = 0, uninterruptible = 0;
+
+ for_each_online_cpu(i) {
+ running += cpu_rq(i)->nr_running;
+ uninterruptible += cpu_rq(i)->nr_uninterruptible;
+ }
+
+ if (unlikely((long)uninterruptible < 0))
+ uninterruptible = 0;
+
+ return running + uninterruptible;
+}
+
#ifdef CONFIG_SMP
/*
@@ -2860,6 +2883,12 @@ EXPORT_SYMBOL(sub_preempt_count);
#endif
+static inline int interactive_sleep(enum sleep_type sleep_type)
+{
+ return (sleep_type == SLEEP_INTERACTIVE ||
+ sleep_type == SLEEP_INTERRUPTED);
+}
+
/*
* schedule() is the main scheduler function.
*/
@@ -2983,12 +3012,12 @@ go_idle:
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
- if (!rt_task(next) && next->activated > 0) {
+ if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
unsigned long long delta = now - next->timestamp;
if (unlikely((long long)(now - next->timestamp) < 0))
delta = 0;
- if (next->activated == 1)
+ if (next->sleep_type == SLEEP_INTERACTIVE)
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
@@ -2998,10 +3027,9 @@ go_idle:
dequeue_task(next, array);
next->prio = new_prio;
enqueue_task(next, array);
- } else
- requeue_task(next, array);
+ }
}
- next->activated = 0;
+ next->sleep_type = SLEEP_NORMAL;
switch_tasks:
if (next == rq->idle)
schedstat_inc(rq, sched_goidle);
diff --git a/kernel/signal.c b/kernel/signal.c
index 4922928d91f..92025b10879 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1560,6 +1560,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
/* Let the debugger run. */
set_current_state(TASK_TRACED);
spin_unlock_irq(&current->sighand->siglock);
+ try_to_freeze();
read_lock(&tasklist_lock);
if (likely(current->ptrace & PT_PTRACED) &&
likely(current->parent != current->real_parent ||
diff --git a/kernel/sys.c b/kernel/sys.c
index 7ef7f6054c2..0b6ec0e7936 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1372,18 +1372,29 @@ asmlinkage long sys_getsid(pid_t pid)
asmlinkage long sys_setsid(void)
{
struct task_struct *group_leader = current->group_leader;
- struct pid *pid;
+ pid_t session;
int err = -EPERM;
mutex_lock(&tty_mutex);
write_lock_irq(&tasklist_lock);
- pid = find_pid(PIDTYPE_PGID, group_leader->pid);
- if (pid)
+ /* Fail if I am already a session leader */
+ if (group_leader->signal->leader)
+ goto out;
+
+ session = group_leader->pid;
+ /* Fail if a process group id already exists that equals the
+ * proposed session id.
+ *
+ * Don't check if session id == 1 because kernel threads use this
+ * session id and so the check will always fail and make it so
+ * init cannot successfully call setsid.
+ */
+ if (session > 1 && find_task_by_pid_type(PIDTYPE_PGID, session))
goto out;
group_leader->signal->leader = 1;
- __set_special_pids(group_leader->pid, group_leader->pid);
+ __set_special_pids(session, session);
group_leader->signal->tty = NULL;
group_leader->signal->tty_old_pgrp = 0;
err = process_group(group_leader);
diff --git a/kernel/timer.c b/kernel/timer.c
index ab189dd187c..6b812c04737 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64);
/*
* per-CPU timer vector definitions:
*/
-
#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64);
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
-struct timer_base_s {
- spinlock_t lock;
- struct timer_list *running_timer;
-};
-
typedef struct tvec_s {
struct list_head vec[TVN_SIZE];
} tvec_t;
@@ -76,7 +70,8 @@ typedef struct tvec_root_s {
} tvec_root_t;
struct tvec_t_base_s {
- struct timer_base_s t_base;
+ spinlock_t lock;
+ struct timer_list *running_timer;
unsigned long timer_jiffies;
tvec_root_t tv1;
tvec_t tv2;
@@ -87,13 +82,14 @@ struct tvec_t_base_s {
typedef struct tvec_t_base_s tvec_base_t;
static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
-static tvec_base_t boot_tvec_bases;
+tvec_base_t boot_tvec_bases;
+EXPORT_SYMBOL(boot_tvec_bases);
static inline void set_running_timer(tvec_base_t *base,
struct timer_list *timer)
{
#ifdef CONFIG_SMP
- base->t_base.running_timer = timer;
+ base->running_timer = timer;
#endif
}
@@ -139,15 +135,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
list_add_tail(&timer->entry, vec);
}
-typedef struct timer_base_s timer_base_t;
-/*
- * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
- * at compile time, and we need timer->base to lock the timer.
- */
-timer_base_t __init_timer_base
- ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
-EXPORT_SYMBOL(__init_timer_base);
-
/***
* init_timer - initialize a timer.
* @timer: the timer to be initialized
@@ -158,7 +145,7 @@ EXPORT_SYMBOL(__init_timer_base);
void fastcall init_timer(struct timer_list *timer)
{
timer->entry.next = NULL;
- timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base;
+ timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
}
EXPORT_SYMBOL(init_timer);
@@ -174,7 +161,7 @@ static inline void detach_timer(struct timer_list *timer,
}
/*
- * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
+ * We are using hashed locking: holding per_cpu(tvec_bases).lock
* means that all timers which are tied to this base via timer->base are
* locked, and the base itself is locked too.
*
@@ -185,10 +172,10 @@ static inline void detach_timer(struct timer_list *timer,
* possible to set timer->base = NULL and drop the lock: the timer remains
* locked.
*/
-static timer_base_t *lock_timer_base(struct timer_list *timer,
+static tvec_base_t *lock_timer_base(struct timer_list *timer,
unsigned long *flags)
{
- timer_base_t *base;
+ tvec_base_t *base;
for (;;) {
base = timer->base;
@@ -205,8 +192,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer,
int __mod_timer(struct timer_list *timer, unsigned long expires)
{
- timer_base_t *base;
- tvec_base_t *new_base;
+ tvec_base_t *base, *new_base;
unsigned long flags;
int ret = 0;
@@ -221,7 +207,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
new_base = __get_cpu_var(tvec_bases);
- if (base != &new_base->t_base) {
+ if (base != new_base) {
/*
* We are trying to schedule the timer on the local CPU.
* However we can't change timer's base while it is running,
@@ -229,21 +215,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
* handler yet has not finished. This also guarantees that
* the timer is serialized wrt itself.
*/
- if (unlikely(base->running_timer == timer)) {
- /* The timer remains on a former base */
- new_base = container_of(base, tvec_base_t, t_base);
- } else {
+ if (likely(base->running_timer != timer)) {
/* See the comment in lock_timer_base() */
timer->base = NULL;
spin_unlock(&base->lock);
- spin_lock(&new_base->t_base.lock);
- timer->base = &new_base->t_base;
+ base = new_base;
+ spin_lock(&base->lock);
+ timer->base = base;
}
}
timer->expires = expires;
- internal_add_timer(new_base, timer);
- spin_unlock_irqrestore(&new_base->t_base.lock, flags);
+ internal_add_timer(base, timer);
+ spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
@@ -263,10 +247,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
unsigned long flags;
BUG_ON(timer_pending(timer) || !timer->function);
- spin_lock_irqsave(&base->t_base.lock, flags);
- timer->base = &base->t_base;
+ spin_lock_irqsave(&base->lock, flags);
+ timer->base = base;
internal_add_timer(base, timer);
- spin_unlock_irqrestore(&base->t_base.lock, flags);
+ spin_unlock_irqrestore(&base->lock, flags);
}
@@ -319,7 +303,7 @@ EXPORT_SYMBOL(mod_timer);
*/
int del_timer(struct timer_list *timer)
{
- timer_base_t *base;
+ tvec_base_t *base;
unsigned long flags;
int ret = 0;
@@ -346,7 +330,7 @@ EXPORT_SYMBOL(del_timer);
*/
int try_to_del_timer_sync(struct timer_list *timer)
{
- timer_base_t *base;
+ tvec_base_t *base;
unsigned long flags;
int ret = -1;
@@ -410,7 +394,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
struct timer_list *tmp;
tmp = list_entry(curr, struct timer_list, entry);
- BUG_ON(tmp->base != &base->t_base);
+ BUG_ON(tmp->base != base);
curr = curr->next;
internal_add_timer(base, tmp);
}
@@ -432,7 +416,7 @@ static inline void __run_timers(tvec_base_t *base)
{
struct timer_list *timer;
- spin_lock_irq(&base->t_base.lock);
+ spin_lock_irq(&base->lock);
while (time_after_eq(jiffies, base->timer_jiffies)) {
struct list_head work_list = LIST_HEAD_INIT(work_list);
struct list_head *head = &work_list;
@@ -458,7 +442,7 @@ static inline void __run_timers(tvec_base_t *base)
set_running_timer(base, timer);
detach_timer(timer, 1);
- spin_unlock_irq(&base->t_base.lock);
+ spin_unlock_irq(&base->lock);
{
int preempt_count = preempt_count();
fn(data);
@@ -471,11 +455,11 @@ static inline void __run_timers(tvec_base_t *base)
BUG();
}
}
- spin_lock_irq(&base->t_base.lock);
+ spin_lock_irq(&base->lock);
}
}
set_running_timer(base, NULL);
- spin_unlock_irq(&base->t_base.lock);
+ spin_unlock_irq(&base->lock);
}
#ifdef CONFIG_NO_IDLE_HZ
@@ -506,7 +490,7 @@ unsigned long next_timer_interrupt(void)
hr_expires += jiffies;
base = __get_cpu_var(tvec_bases);
- spin_lock(&base->t_base.lock);
+ spin_lock(&base->lock);
expires = base->timer_jiffies + (LONG_MAX >> 1);
list = NULL;
@@ -554,7 +538,7 @@ found:
expires = nte->expires;
}
}
- spin_unlock(&base->t_base.lock);
+ spin_unlock(&base->lock);
if (time_before(hr_expires, expires))
return hr_expires;
@@ -841,7 +825,7 @@ void update_process_times(int user_tick)
*/
static unsigned long count_active_tasks(void)
{
- return (nr_running() + nr_uninterruptible()) * FIXED_1;
+ return nr_active() * FIXED_1;
}
/*
@@ -1262,7 +1246,7 @@ static int __devinit init_timers_cpu(int cpu)
}
per_cpu(tvec_bases, cpu) = base;
}
- spin_lock_init(&base->t_base.lock);
+ spin_lock_init(&base->lock);
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1284,7 +1268,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
while (!list_empty(head)) {
timer = list_entry(head->next, struct timer_list, entry);
detach_timer(timer, 0);
- timer->base = &new_base->t_base;
+ timer->base = new_base;
internal_add_timer(new_base, timer);
}
}
@@ -1300,11 +1284,11 @@ static void __devinit migrate_timers(int cpu)
new_base = get_cpu_var(tvec_bases);
local_irq_disable();
- spin_lock(&new_base->t_base.lock);
- spin_lock(&old_base->t_base.lock);
+ spin_lock(&new_base->lock);
+ spin_lock(&old_base->lock);
+
+ BUG_ON(old_base->running_timer);
- if (old_base->t_base.running_timer)
- BUG();
for (i = 0; i < TVR_SIZE; i++)
migrate_timer_list(new_base, old_base->tv1.vec + i);
for (i = 0; i < TVN_SIZE; i++) {
@@ -1314,8 +1298,8 @@ static void __devinit migrate_timers(int cpu)
migrate_timer_list(new_base, old_base->tv5.vec + i);
}
- spin_unlock(&old_base->t_base.lock);
- spin_unlock(&new_base->t_base.lock);
+ spin_unlock(&old_base->lock);
+ spin_unlock(&new_base->lock);
local_irq_enable();
put_cpu_var(tvec_bases);
}