From 8c9843e57a7d9d7a090d6467a0f1f3afb8031527 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 18 Apr 2008 16:56:15 +1000 Subject: [POWERPC] Add thread_info_cache_init() weak hook Some architectures need to maintain a kmem cache for thread info structures. The next commit adds that to powerpc to fix an alignment problem. There is no good arch callback to use to initialize that cache that I can find, so this adds a new one in the form of a weak function whose default is empty. Signed-off-by: Benjamin Herrenschmidt Acked-by: Andrew Morton Signed-off-by: Paul Mackerras --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 311380e5fe8..d0bd97044ab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1926,6 +1926,8 @@ static inline unsigned long *end_of_stack(struct task_struct *p) #endif +extern void thread_info_cache_init(void); + /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available */ -- cgit v1.2.3 From 402b08622d9ac6e32e25289573272e0f21bb58a7 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Tue, 25 Mar 2008 18:47:10 +0100 Subject: s390: KVM preparation: provide hook to enable pgstes in user pagetable The SIE instruction on s390 uses the 2nd half of the page table page to virtualize the storage keys of a guest. This patch offers the s390_enable_sie function, which reorganizes the page tables of a single-threaded process to reserve space in the page table: s390_enable_sie makes sure that the process is single threaded and then uses dup_mm to create a new mm with reorganized page tables. The old mm is freed and the process has now a page status extended field after every page table. Code that wants to exploit pgstes should SELECT CONFIG_PGSTE. This patch has a small common code hit, namely making dup_mm non-static. Edit (Carsten): I've modified Martin's patch, following Jeremy Fitzhardinge's review feedback. Now we do have the prototype for dup_mm in include/linux/sched.h. Following Martin's suggestion, s390_enable_sie() does now call task_lock() to prevent race against ptrace modification of mm_users. Signed-off-by: Martin Schwidefsky Signed-off-by: Carsten Otte Acked-by: Andrew Morton Signed-off-by: Avi Kivity --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index d0bd97044ab..9a4f3e63e3b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1798,6 +1798,8 @@ extern void mmput(struct mm_struct *); extern struct mm_struct *get_task_mm(struct task_struct *task); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); +/* Allocate a new mm structure and copy contents from tsk->mm */ +extern struct mm_struct *dup_mm(struct task_struct *tsk); extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); extern void flush_thread(void); -- cgit v1.2.3 From 3898b1b4ebff8dcfbcf1807e0661585e06c9a91c Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Mon, 28 Apr 2008 02:13:40 -0700 Subject: capabilities: implement per-process securebits Filesystem capability support makes it possible to do away with (set)uid-0 based privilege and use capabilities instead. That is, with filesystem support for capabilities but without this present patch, it is (conceptually) possible to manage a system with capabilities alone and never need to obtain privilege via (set)uid-0. Of course, conceptually isn't quite the same as currently possible since few user applications, certainly not enough to run a viable system, are currently prepared to leverage capabilities to exercise privilege. Further, many applications exist that may never get upgraded in this way, and the kernel will continue to want to support their setuid-0 base privilege needs. Where pure-capability applications evolve and replace setuid-0 binaries, it is desirable that there be a mechanisms by which they can contain their privilege. In addition to leveraging the per-process bounding and inheritable sets, this should include suppressing the privilege of the uid-0 superuser from the process' tree of children. The feature added by this patch can be leveraged to suppress the privilege associated with (set)uid-0. This suppression requires CAP_SETPCAP to initiate, and only immediately affects the 'current' process (it is inherited through fork()/exec()). This reimplementation differs significantly from the historical support for securebits which was system-wide, unwieldy and which has ultimately withered to a dead relic in the source of the modern kernel. With this patch applied a process, that is capable(CAP_SETPCAP), can now drop all legacy privilege (through uid=0) for itself and all subsequently fork()'d/exec()'d children with: prctl(PR_SET_SECUREBITS, 0x2f); This patch represents a no-op unless CONFIG_SECURITY_FILE_CAPABILITIES is enabled at configure time. [akpm@linux-foundation.org: fix uninitialised var warning] [serue@us.ibm.com: capabilities: use cap_task_prctl when !CONFIG_SECURITY] Signed-off-by: Andrew G. Morgan Acked-by: Serge Hallyn Reviewed-by: James Morris Cc: Stephen Smalley Cc: Paul Moore Signed-off-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9a4f3e63e3b..024d72b47a0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -68,7 +68,6 @@ struct sched_param { #include #include #include -#include #include #include #include @@ -1133,7 +1132,7 @@ struct task_struct { gid_t gid,egid,sgid,fsgid; struct group_info *group_info; kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - unsigned keep_capabilities:1; + unsigned securebits; struct user_struct *user; #ifdef CONFIG_KEYS struct key *request_key_auth; /* assumed request_key authority */ -- cgit v1.2.3 From cf475ad28ac35cc9ba612d67158f29b73b38b05d Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 29 Apr 2008 01:00:16 -0700 Subject: cgroups: add an owner to the mm_struct Remove the mem_cgroup member from mm_struct and instead adds an owner. This approach was suggested by Paul Menage. The advantage of this approach is that, once the mm->owner is known, using the subsystem id, the cgroup can be determined. It also allows several control groups that are virtually grouped by mm_struct, to exist independent of the memory controller i.e., without adding mem_cgroup's for each controller, to mm_struct. A new config option CONFIG_MM_OWNER is added and the memory resource controller selects this config option. This patch also adds cgroup callbacks to notify subsystems when mm->owner changes. The mm_cgroup_changed callback is called with the task_lock() of the new task held and is called just prior to changing the mm->owner. I am indebted to Paul Menage for the several reviews of this patchset and helping me make it lighter and simpler. This patch was tested on a powerpc box, it was compiled with both the MM_OWNER config turned on and off. After the thread group leader exits, it's moved to init_css_state by cgroup_exit(), thus all future charges from runnings threads would be redirected to the init_css_set's subsystem. Signed-off-by: Balbir Singh Cc: Pavel Emelianov Cc: Hugh Dickins Cc: Sudhir Kumar Cc: YAMAMOTO Takashi Cc: Hirokazu Takahashi Cc: David Rientjes , Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Acked-by: Pekka Enberg Reviewed-by: Paul Menage Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 024d72b47a0..1d02babdb2c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2148,6 +2148,19 @@ static inline void migration_init(void) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +#ifdef CONFIG_MM_OWNER +extern void mm_update_next_owner(struct mm_struct *mm); +extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); +#else +static inline void mm_update_next_owner(struct mm_struct *mm) +{ +} + +static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +} +#endif /* CONFIG_MM_OWNER */ + #endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From e442055193e4584218006e616c9bdce0c5e9ae5c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2008 00:52:44 -0700 Subject: signals: re-assign CLD_CONTINUED notification from the sender to reciever Based on discussion with Jiri and Roland. In short: currently handle_stop_signal(SIGCONT, p) sends the notification to p->parent, with this patch p itself notifies its parent when it becomes running. handle_stop_signal(SIGCONT) has to drop ->siglock temporary in order to notify the parent with do_notify_parent_cldstop(). This leads to multiple problems: - as Jiri Kosina pointed out, the stopped task can resume without actually seeing SIGCONT which may have a handler. - we race with another sig_kernel_stop() signal which may come in that window. - we race with sig_fatal() signals which may set SIGNAL_GROUP_EXIT in that window. - we can't avoid taking tasklist_lock() while sending SIGCONT. With this patch handle_stop_signal() just sets the new SIGNAL_CLD_CONTINUED flag in p->signal->flags and returns. The notification is sent by the first task which returns from finish_stop() (there should be at least one) or any other signalled thread from get_signal_to_deliver(). This is a user-visible change. Say, currently kill(SIGCONT, stopped_child) can't return without seeing SIGCHLD, with this patch SIGCHLD can be delayed unpredictably. Another difference is that if the child is ptraced by another process, CLD_CONTINUED may be delivered to ->real_parent after ptrace_detach() while currently it always goes to the tracer which doesn't actually need this notification. Hopefully not a problem. The patch asks for the futher obvious cleanups, I'll send them separately. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1d02babdb2c..ef561527034 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -554,6 +554,12 @@ struct signal_struct { #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ +/* + * Pending notifications to parent. + */ +#define SIGNAL_CLD_STOPPED 0x00000010 +#define SIGNAL_CLD_CONTINUED 0x00000020 +#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) -- cgit v1.2.3 From ac5c215383f43a106ba4ef298126bf78c126f5e9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2008 00:52:57 -0700 Subject: signals: join send_sigqueue() with send_group_sigqueue() We export send_sigqueue() and send_group_sigqueue() for the only user, posix_timer_event(). This is a bit silly, because both are just trivial helpers on top of do_send_sigqueue() and because the we pass the unused .si_signo parameter. Kill them both, rename do_send_sigqueue() to send_sigqueue(), and export it. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index ef561527034..0917b3df12d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1751,8 +1751,7 @@ extern void zap_other_threads(struct task_struct *p); extern int kill_proc(pid_t, int, int); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); -extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); -extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); +extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); -- cgit v1.2.3 From fae5fa44f1fd079ffbed8e0add929dd7bbd1347f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2008 00:53:03 -0700 Subject: signals: fix /sbin/init protection from unwanted signals The global init has a lot of long standing problems with the unhandled fatal signals. - The "is_global_init(current)" check in get_signal_to_deliver() protects only the main thread. Sub-thread can dequee the fatal signal and shutdown the whole thread group except the main thread. If it dequeues SIGSTOP /sbin/init will be stopped, this is not right too. Note that we can't use is_global_init(->group_leader), this breaks exec and this can't solve other problems we have. - Even if afterwards ignored, the fatal signals sets SIGNAL_GROUP_EXIT on delivery. This breaks exec, has other bad implications, and this is just wrong. Introduce the new SIGNAL_UNKILLABLE flag to fix these problems. It also helps to solve some other problems addressed by the subsequent patches. Currently we use this flag for the global init only, but it could also be used by kthreads and (perhaps) by the sub-namespace inits. Signed-off-by: Oleg Nesterov Acked-by: "Eric W. Biederman" Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0917b3df12d..fe970cdca83 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -561,6 +561,8 @@ struct signal_struct { #define SIGNAL_CLD_CONTINUED 0x00000020 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) +#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ + /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { -- cgit v1.2.3 From f3de272b821accbc8387211977c2de4f38468d05 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 30 Apr 2008 00:53:09 -0700 Subject: signals: use HAVE_SET_RESTORE_SIGMASK Change all the #ifdef TIF_RESTORE_SIGMASK conditionals in non-arch code to #ifdef HAVE_SET_RESTORE_SIGMASK. If arch code defines it first, the generic set_restore_sigmask() using TIF_RESTORE_SIGMASK is not defined. Signed-off-by: Roland McGrath Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index fe970cdca83..86e60796db6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1175,7 +1175,7 @@ struct task_struct { struct sighand_struct *sighand; sigset_t blocked, real_blocked; - sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ + sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ struct sigpending pending; unsigned long sas_ss_sp; -- cgit v1.2.3 From 5cd204550b1a006f2b0c986b0e0f53220ebfd391 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 30 Apr 2008 00:54:24 -0700 Subject: Deprecate find_task_by_pid() There are some places that are known to operate on tasks' global pids only: * the rest_init() call (called on boot) * the kgdb's getthread * the create_kthread() (since the kthread is run in init ns) So use the find_task_by_pid_ns(..., &init_pid_ns) there and schedule the find_task_by_pid for removal. [sukadev@us.ibm.com: Fix warning in kernel/pid.c] Signed-off-by: Pavel Emelyanov Cc: "Eric W. Biederman" Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 86e60796db6..03c238088ae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1677,7 +1677,10 @@ extern struct pid_namespace init_pid_ns; extern struct task_struct *find_task_by_pid_type_ns(int type, int pid, struct pid_namespace *ns); -extern struct task_struct *find_task_by_pid(pid_t nr); +static inline struct task_struct *__deprecated find_task_by_pid(pid_t nr) +{ + return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns); +} extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); -- cgit v1.2.3 From 8ae121ac8666b0421aa20fd80d4597ec66fa54bc Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 23 Apr 2008 07:13:29 -0400 Subject: sched: fix RT task-wakeup logic Dmitry Adamushko pointed out a logic error in task_wake_up_rt() where we will always evaluate to "true". You can find the thread here: http://lkml.org/lkml/2008/4/22/296 In reality, we only want to try to push tasks away when a wake up request is not going to preempt the current task. So lets fix it. Note: We introduce test_tsk_need_resched() instead of open-coding the flag check so that the merge-conflict with -rt should help remind us that we may need to support NEEDS_RESCHED_DELAYED in the future, too. Signed-off-by: Gregory Haskins CC: Dmitry Adamushko CC: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 03c238088ae..698b5a4d25a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1977,6 +1977,11 @@ static inline void clear_tsk_need_resched(struct task_struct *tsk) clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } +static inline int test_tsk_need_resched(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); +} + static inline int signal_pending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); @@ -1991,7 +1996,7 @@ static inline int fatal_signal_pending(struct task_struct *p) static inline int need_resched(void) { - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); + return unlikely(test_tsk_need_resched(current)); } /* -- cgit v1.2.3 From 690229a0912ca2fef8b542fe4d8b73acfcdc6e24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Apr 2008 09:31:35 +0200 Subject: sched: make clock sync tunable by architecture code make time_sync_thresh tunable to architecture code. Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 698b5a4d25a..54c9ca26b7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -158,6 +158,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) } #endif +extern unsigned long long time_sync_thresh; + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). -- cgit v1.2.3 From 3e51f33fcc7f55e6df25d15b55ed10c8b4da84cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 3 May 2008 18:29:28 +0200 Subject: sched: add optional support for CONFIG_HAVE_UNSTABLE_SCHED_CLOCK this replaces the rq->clock stuff (and possibly cpu_clock()). - architectures that have an 'imperfect' hardware clock can set CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - the 'jiffie' window might be superfulous when we update tick_gtod before the __update_sched_clock() call in sched_clock_tick() - cpu_clock() might be implemented as: sched_clock_cpu(smp_processor_id()) if the accuracy proves good enough - how far can TSC drift in a single jiffie when considering the filtering and idle hooks? [ mingo@elte.hu: various fixes and cleanups ] Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 54c9ca26b7d..0c35b0343a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1553,6 +1553,35 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) extern unsigned long long sched_clock(void); +#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_init(void) +{ +} + +static inline u64 sched_clock_cpu(int cpu) +{ + return sched_clock(); +} + +static inline void sched_clock_tick(void) +{ +} + +static inline void sched_clock_idle_sleep_event(void) +{ +} + +static inline void sched_clock_idle_wakeup_event(u64 delta_ns) +{ +} +#else +extern void sched_clock_init(void); +extern u64 sched_clock_cpu(int cpu); +extern void sched_clock_tick(void); +extern void sched_clock_idle_sleep_event(void); +extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#endif + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): -- cgit v1.2.3