From eccdaeafaea3ed115068ba55d01f22e486e5437d Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Mon, 24 Nov 2008 15:46:31 +0100 Subject: posix-cpu-timers: fix clock_gettime with CLOCK_PROCESS_CPUTIME_ID Since CLOCK_PROCESS_CPUTIME_ID is in fact translated to -6, the switch statement in cpu_clock_sample_group() must first mask off the irrelevant bits, similar to cpu_clock_sample(). Signed-off-by: Petr Tesarik Signed-off-by: Thomas Gleixner -- posix-cpu-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- kernel/posix-cpu-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 895337b16a2..4e5288a831d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -311,7 +311,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_cputime cputime; thread_group_cputime(p, &cputime); - switch (which_clock) { + switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: -- cgit v1.2.3 From 201955463a5c1a70d3f70d1598b27e4c2c402642 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 2 Dec 2008 22:55:38 +0100 Subject: check_hung_task(): unsigned sysctl_hung_task_warnings cannot be less than 0 Impact: fix warnings-limit cutoff check for debug feature unsigned sysctl_hung_task_warnings cannot be less than 0 Signed-off-by: Roel Kluin Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 3953e4aed73..dc0b3be6b7d 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -188,7 +188,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) if ((long)(now - t->last_switch_timestamp) < sysctl_hung_task_timeout_secs) return; - if (sysctl_hung_task_warnings < 0) + if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; -- cgit v1.2.3 From 6c9bacb41c10ba84ff68f238e234d96f35fb64f7 Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 1 Dec 2008 18:34:41 -0800 Subject: time: catch xtime_nsec underflows and fix them Impact: fix time warp bug Alex Shi, along with Yanmin Zhang have been noticing occasional time inconsistencies recently. Through their great diagnosis, they found that the xtime_nsec value used in update_wall_time was occasionally going negative. After looking through the code for awhile, I realized we have the possibility for an underflow when three conditions are met in update_wall_time(): 1) We have accumulated a second's worth of nanoseconds, so we incremented xtime.tv_sec and appropriately decrement xtime_nsec. (This doesn't cause xtime_nsec to go negative, but it can cause it to be small). 2) The remaining offset value is large, but just slightly less then cycle_interval. 3) clocksource_adjust() is speeding up the clock, causing a corrective amount (compensating for the increase in the multiplier being multiplied against the unaccumulated offset value) to be subtracted from xtime_nsec. This can cause xtime_nsec to underflow. Unfortunately, since we notify the NTP subsystem via second_overflow() whenever we accumulate a full second, and this effects the error accumulation that has already occured, we cannot simply revert the accumulated second from xtime nor move the second accumulation to after the clocksource_adjust call without a change in behavior. This leaves us with (at least) two options: 1) Simply return from clocksource_adjust() without making a change if we notice the adjustment would cause xtime_nsec to go negative. This would work, but I'm concerned that if a large adjustment was needed (due to the error being large), it may be possible to get stuck with an ever increasing error that becomes too large to correct (since it may always force xtime_nsec negative). This may just be paranoia on my part. 2) Catch xtime_nsec if it is negative, then add back the amount its negative to both xtime_nsec and the error. This second method is consistent with how we've handled earlier rounding issues, and also has the benefit that the error being added is always in the oposite direction also always equal or smaller then the correction being applied. So the risk of a corner case where things get out of control is lessened. This patch fixes bug 11970, as tested by Yanmin Zhang http://bugzilla.kernel.org/show_bug.cgi?id=11970 Reported-by: alex.shi@intel.com Signed-off-by: John Stultz Acked-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e7acfb482a6..fa05e88aa76 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -518,6 +518,28 @@ void update_wall_time(void) /* correct the clock when NTP error is too big */ clocksource_adjust(offset); + /* + * Since in the loop above, we accumulate any amount of time + * in xtime_nsec over a second into xtime.tv_sec, its possible for + * xtime_nsec to be fairly small after the loop. Further, if we're + * slightly speeding the clocksource up in clocksource_adjust(), + * its possible the required corrective factor to xtime_nsec could + * cause it to underflow. + * + * Now, we cannot simply roll the accumulated second back, since + * the NTP subsystem has been notified via second_overflow. So + * instead we push xtime_nsec forward by the amount we underflowed, + * and add that amount into the error. + * + * We'll correct this error next time through this function, when + * xtime_nsec is not as small. + */ + if (unlikely((s64)clock->xtime_nsec < 0)) { + s64 neg = -(s64)clock->xtime_nsec; + clock->xtime_nsec = 0; + clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); + } + /* store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ -- cgit v1.2.3 From 50c396d38c1a7f0c693579ec88cb4be3c0b0645e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 30 Nov 2008 01:47:12 -0500 Subject: [PATCH] kill obsolete temporary comment in swsusp_close() it had been put there to mark the call of blkdev_put() that needed proper argument propagated to it; later patch in the same series had done just that. Signed-off-by: Al Viro --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b7713b53d07..6da14358537 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -633,7 +633,7 @@ void swsusp_close(fmode_t mode) return; } - blkdev_put(resume_bdev, mode); /* move up */ + blkdev_put(resume_bdev, mode); } static int swsusp_header_init(void) -- cgit v1.2.3 From a3f07114e3359fb98683069ae397220e8992a24a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 5 Nov 2008 12:47:09 -0500 Subject: [PATCH] Audit: make audit=0 actually turn off audit Currently audit=0 on the kernel command line does absolutely nothing. Audit always loads and always uses its resources such as creating the kernel netlink socket. This patch causes audit=0 to actually disable audit. Audit will use no resources and starting the userspace auditd daemon will not cause the kernel audit system to activate. Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/audit.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 4414e93d875..d8646c23b42 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -61,8 +61,11 @@ #include "audit.h" -/* No auditing will take place until audit_initialized != 0. +/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED. * (Initialization happens after skb_init is called.) */ +#define AUDIT_DISABLED -1 +#define AUDIT_UNINITIALIZED 0 +#define AUDIT_INITIALIZED 1 static int audit_initialized; #define AUDIT_OFF 0 @@ -965,6 +968,9 @@ static int __init audit_init(void) { int i; + if (audit_initialized == AUDIT_DISABLED) + return 0; + printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, @@ -976,7 +982,7 @@ static int __init audit_init(void) skb_queue_head_init(&audit_skb_queue); skb_queue_head_init(&audit_skb_hold_queue); - audit_initialized = 1; + audit_initialized = AUDIT_INITIALIZED; audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; @@ -999,13 +1005,21 @@ __initcall(audit_init); static int __init audit_enable(char *str) { audit_default = !!simple_strtol(str, NULL, 0); - printk(KERN_INFO "audit: %s%s\n", - audit_default ? "enabled" : "disabled", - audit_initialized ? "" : " (after initialization)"); - if (audit_initialized) { + if (!audit_default) + audit_initialized = AUDIT_DISABLED; + + printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled"); + + if (audit_initialized == AUDIT_INITIALIZED) { audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; + } else if (audit_initialized == AUDIT_UNINITIALIZED) { + printk(" (after initialization)"); + } else { + printk(" (until reboot)"); } + printk("\n"); + return 1; } @@ -1146,7 +1160,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int reserve; unsigned long timeout_start = jiffies; - if (!audit_initialized) + if (audit_initialized != AUDIT_INITIALIZED) return NULL; if (unlikely(audit_filter_type(type))) -- cgit v1.2.3 From a64e64944f4b8ce3288519555dbaa0232414b8ac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Nov 2008 18:37:41 -0500 Subject: [PATCH] return records for fork() both to child and parent Signed-off-by: Al Viro --- kernel/auditsc.c | 17 +++++++++++++++++ kernel/fork.c | 1 + 2 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index cf5bc2f5f9c..de8468050af 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1548,6 +1548,23 @@ void audit_syscall_entry(int arch, int major, context->ppid = 0; } +void audit_finish_fork(struct task_struct *child) +{ + struct audit_context *ctx = current->audit_context; + struct audit_context *p = child->audit_context; + if (!p || !ctx || !ctx->auditable) + return; + p->arch = ctx->arch; + p->major = ctx->major; + memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); + p->ctime = ctx->ctime; + p->dummy = ctx->dummy; + p->auditable = ctx->auditable; + p->in_syscall = ctx->in_syscall; + p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); + p->ppid = current->pid; +} + /** * audit_syscall_exit - deallocate audit context after a system call * @tsk: task being audited diff --git a/kernel/fork.c b/kernel/fork.c index 2a372a0e206..8d6a7dd9282 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1398,6 +1398,7 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } + audit_finish_fork(p); tracehook_report_clone(trace, regs, clone_flags, nr, p); /* -- cgit v1.2.3 From 7f0ed77d241b60f70136f15b8eef30a3de1fa249 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 1 Dec 2008 14:16:06 -0800 Subject: [patch 1/1] audit: remove excess kernel-doc Delete excess kernel-doc notation in kernel/auditsc.c: Warning(linux-2.6.27-git10//kernel/auditsc.c:1481): Excess function parameter or struct member 'tsk' description in 'audit_syscall_entry' Warning(linux-2.6.27-git10//kernel/auditsc.c:1564): Excess function parameter or struct member 'tsk' description in 'audit_syscall_exit' Signed-off-by: Randy Dunlap Cc: Al Viro Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/auditsc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index de8468050af..0a13d689549 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1459,7 +1459,6 @@ void audit_free(struct task_struct *tsk) /** * audit_syscall_entry - fill in an audit record at syscall entry - * @tsk: task being audited * @arch: architecture type * @major: major syscall type (function) * @a1: additional syscall register 1 @@ -1567,7 +1566,6 @@ void audit_finish_fork(struct task_struct *child) /** * audit_syscall_exit - deallocate audit context after a system call - * @tsk: task being audited * @valid: success/failure flag * @return_code: syscall return value * -- cgit v1.2.3 From 48887e63d6e057543067327da6b091297f7fe645 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 6 Dec 2008 01:05:50 -0500 Subject: [PATCH] fix broken timestamps in AVC generated by kernel threads Timestamp in audit_context is valid only if ->in_syscall is set. Signed-off-by: Al Viro --- kernel/audit.c | 4 +--- kernel/auditsc.c | 5 ++++- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d8646c23b42..ce6d8ea3131 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1121,9 +1121,7 @@ unsigned int audit_serial(void) static inline void audit_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial) { - if (ctx) - auditsc_get_stamp(ctx, t, serial); - else { + if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { *t = CURRENT_TIME; *serial = audit_serial(); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 0a13d689549..2a3f0afc4d2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1957,15 +1957,18 @@ EXPORT_SYMBOL_GPL(__audit_inode_child); * * Also sets the context as auditable. */ -void auditsc_get_stamp(struct audit_context *ctx, +int auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial) { + if (!ctx->in_syscall) + return 0; if (!ctx->serial) ctx->serial = audit_serial(); t->tv_sec = ctx->ctime.tv_sec; t->tv_nsec = ctx->ctime.tv_nsec; *serial = ctx->serial; ctx->auditable = 1; + return 1; } /* global counter which is incremented every time something logs in */ -- cgit v1.2.3 From 9a2bd244e18ffbb96c8b783210fda4eded7c7e6f Mon Sep 17 00:00:00 2001 From: Brian King Date: Tue, 9 Dec 2008 08:47:00 -0600 Subject: sched: CPU remove deadlock fix Impact: fix possible deadlock in CPU hot-remove path This patch fixes a possible deadlock scenario in the CPU remove path. migration_call grabs rq->lock, then wakes up everything on rq->migration_queue with the lock held. Then one of the tasks on the migration queue ends up calling tg_shares_up which then also tries to acquire the same rq->lock. [c000000058eab2e0] c000000000502078 ._spin_lock_irqsave+0x98/0xf0 [c000000058eab370] c00000000008011c .tg_shares_up+0x10c/0x20c [c000000058eab430] c00000000007867c .walk_tg_tree+0xc4/0xfc [c000000058eab4d0] c0000000000840c8 .try_to_wake_up+0xb0/0x3c4 [c000000058eab590] c0000000000799a0 .__wake_up_common+0x6c/0xe0 [c000000058eab640] c00000000007ada4 .complete+0x54/0x80 [c000000058eab6e0] c000000000509fa8 .migration_call+0x5fc/0x6f8 [c000000058eab7c0] c000000000504074 .notifier_call_chain+0x68/0xe0 [c000000058eab860] c000000000506568 ._cpu_down+0x2b0/0x3f4 [c000000058eaba60] c000000000506750 .cpu_down+0xa4/0x108 [c000000058eabb10] c000000000507e54 .store_online+0x44/0xa8 [c000000058eabba0] c000000000396260 .sysdev_store+0x3c/0x50 [c000000058eabc10] c0000000001a39b8 .sysfs_write_file+0x124/0x18c [c000000058eabcd0] c00000000013061c .vfs_write+0xd0/0x1bc [c000000058eabd70] c0000000001308a4 .sys_write+0x68/0x114 [c000000058eabe30] c0000000000086b4 syscall_exit+0x0/0x40 Signed-off-by: Brian King Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b7480fb5c3d..e4bb1dd7b30 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6587,7 +6587,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) req = list_entry(rq->migration_queue.next, struct migration_req, list); list_del_init(&req->list); + spin_unlock_irq(&rq->lock); complete(&req->done); + spin_lock_irq(&rq->lock); } spin_unlock_irq(&rq->lock); break; -- cgit v1.2.3 From fbb5b7ae4b442f1923513dc6165a66c7a7f29073 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 9 Dec 2008 13:14:10 -0800 Subject: relayfs: fix infinite loop with splice() Running kmemtraced, which uses splice() on relayfs, causes a hard lock on x86-64 SMP. As described by Tom Zanussi: It looks like you hit the same problem as described here: commit 8191ecd1d14c6914c660dfa007154860a7908857 splice: fix infinite loop in generic_file_splice_read() relay uses the same loop but it never got noticed or fixed. Cc: Mathieu Desnoyers Tested-by: Pekka Enberg Signed-off-by: Tom Zanussi Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 32b0befdcb6..09ac2008f77 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1317,12 +1317,9 @@ static ssize_t relay_file_splice_read(struct file *in, if (ret < 0) break; else if (!ret) { - if (spliced) - break; - if (flags & SPLICE_F_NONBLOCK) { + if (flags & SPLICE_F_NONBLOCK) ret = -EAGAIN; - break; - } + break; } *ppos += ret; -- cgit v1.2.3 From 9c24624727f6d6c460e45762a408ca5f5b9b8ef2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 9 Dec 2008 13:14:27 -0800 Subject: KSYM_SYMBOL_LEN fixes Miles Lane tailing /sys files hit a BUG which Pekka Enberg has tracked to my 966c8c12dc9e77f931e2281ba25d2f0244b06949 sprint_symbol(): use less stack exposing a bug in slub's list_locations() - kallsyms_lookup() writes a 0 to namebuf[KSYM_NAME_LEN-1], but that was beyond the end of page provided. The 100 slop which list_locations() allows at end of page looks roughly enough for all the other stuff it might print after the symbol before it checks again: break out KSYM_SYMBOL_LEN earlier than before. Latencytop and ftrace and are using KSYM_NAME_LEN buffers where they need KSYM_SYMBOL_LEN buffers, and vmallocinfo a 2*KSYM_NAME_LEN buffer where it wants a KSYM_SYMBOL_LEN buffer: fix those before anyone copies them. [akpm@linux-foundation.org: ftrace.h needs module.h] Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc Miles Lane Acked-by: Pekka Enberg Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/latencytop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 5e7b45c5692..449db466bdb 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -191,7 +191,7 @@ static int lstats_show(struct seq_file *m, void *v) latency_record[i].time, latency_record[i].max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_NAME_LEN]; + char sym[KSYM_SYMBOL_LEN]; char *c; if (!latency_record[i].backtrace[q]) break; -- cgit v1.2.3 From b88ed20594db2c685555b68c52b693b75738b2f5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Dec 2008 20:48:52 +0000 Subject: fix mapping_writably_mapped() Lee Schermerhorn noticed yesterday that I broke the mapping_writably_mapped test in 2.6.7! Bad bad bug, good good find. The i_mmap_writable count must be incremented for VM_SHARED (just as i_writecount is for VM_DENYWRITE, but while holding the i_mmap_lock) when dup_mmap() copies the vma for fork: it has its own more optimal version of __vma_link_file(), and I missed this out. So the count was later going down to 0 (dangerous) when one end unmapped, then wrapping negative (inefficient) when the other end unmapped. The only impact on x86 would have been that setting a mandatory lock on a file which has at some time been opened O_RDWR and mapped MAP_SHARED (but not necessarily PROT_WRITE) across a fork, might fail with -EAGAIN when it should succeed, or succeed when it should fail. But those architectures which rely on flush_dcache_page() to flush userspace modifications back into the page before the kernel reads it, may in some cases have skipped the flush after such a fork - though any repetitive test will soon wrap the count negative, in which case it will flush_dcache_page() unnecessarily. Fix would be a two-liner, but mapping variable added, and comment moved. Reported-by: Lee Schermerhorn Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- kernel/fork.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8d6a7dd9282..495da2e9a8b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,17 +315,20 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; + struct address_space *mapping = file->f_mapping; + get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - - /* insert tmp into the share list, just after mpnt */ - spin_lock(&file->f_mapping->i_mmap_lock); + spin_lock(&mapping->i_mmap_lock); + if (tmp->vm_flags & VM_SHARED) + mapping->i_mmap_writable++; tmp->vm_truncate_count = mpnt->vm_truncate_count; - flush_dcache_mmap_lock(file->f_mapping); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); - flush_dcache_mmap_unlock(file->f_mapping); - spin_unlock(&file->f_mapping->i_mmap_lock); + flush_dcache_mmap_unlock(mapping); + spin_unlock(&mapping->i_mmap_lock); } /* -- cgit v1.2.3 From ca7e716c7833aeaeb8fedd6d004c5f5d5e14d325 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Dec 2008 15:46:01 -0800 Subject: Revert "sched_clock: prevent scd->clock from moving backwards" This reverts commit 5b7dba4ff834259a5623e03a565748704a8fe449, which caused a regression in hibernate, reported by and bisected by Fabio Comolli. This revert fixes http://bugzilla.kernel.org/show_bug.cgi?id=12155 http://bugzilla.kernel.org/show_bug.cgi?id=12149 Bisected-by: Fabio Comolli Requested-by: Rafael J. Wysocki Acked-by: Dave Kleikamp Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched_clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 81787248b60..e8ab096ddfe 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * max(scd->clock, scd->tick_gtod + TICK_NSEC)); + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); */ clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); + max_clock = scd->tick_gtod + TICK_NSEC; clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); -- cgit v1.2.3 From 307257cf475aac25db30b669987f13d90c934e3a Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Mon, 15 Dec 2008 13:54:22 -0800 Subject: cgroups: fix a race between rmdir and remount When a cgroup is removed, it's unlinked from its parent's children list, but not actually freed until the last dentry on it is released (at which point cgrp->root->number_of_cgroups is decremented). Currently rebind_subsystems checks for the top cgroup's child list being empty in order to rebind subsystems into or out of a hierarchy - this can result in the set of subsystems bound to a hierarchy being removed-but-not-freed cgroup. The simplest fix for this is to forbid remounts that change the set of subsystems on a hierarchy that has removed-but-not-freed cgroups. This bug can be reproduced via: mkdir /mnt/cg mount -t cgroup -o ns,freezer cgroup /mnt/cg mkdir /mnt/cg/foo sleep 1h < /mnt/cg/foo & rmdir /mnt/cg/foo mount -t cgroup -o remount,ns,devices,freezer cgroup /mnt/cg kill $! Though the above will cause oops in -mm only but not mainline, but the bug can cause memory leak in mainline (and even oops) Signed-off-by: Paul Menage Reviewed-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fe00b3b983a..8185a0f0959 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -702,7 +702,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, * any child cgroups exist. This is theoretically supportable * but involves complex error handling, so it's being left until * later */ - if (!list_empty(&cgrp->children)) + if (root->number_of_cgroups > 1) return -EBUSY; /* Process each subsystem */ -- cgit v1.2.3 From 3d9101e92529e1ff6014f95a69afc82f37b9b13a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 Dec 2008 22:34:13 +0100 Subject: trace: fix task state printout Impact: fix occasionally incorrect trace output The tracing code has interesting varieties of printing out task state. Unfortunalely only one of the instances is correct as it copies the code from sched.c:sched_show_task(). The others are plain wrong as they treatthe bitfield as an integer offset into the character array. Also the size check of the character array is wrong as it includes the trailing \0. Use a common state decoder inline which does the Right Thing. Signed-off-by: Thomas Gleixner Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d86e3252f30..803100518f1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1301,6 +1301,13 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + /* * The message is supposed to contain an ending newline. * If the printing stops prematurely, try to add a newline of our own. @@ -1396,12 +1403,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) trace_assign_type(field, entry); - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - - state = field->prev_state ? - __ffs(field->prev_state) + 1 : 0; - S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, @@ -1519,10 +1522,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", field->prev_pid, field->prev_prio, @@ -1621,12 +1622,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, @@ -1712,12 +1710,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); - S = field->prev_state < sizeof(state_to_char) ? - state_to_char[field->prev_state] : 'X'; - T = field->next_state < sizeof(state_to_char) ? - state_to_char[field->next_state] : 'X'; - if (entry->type == TRACE_WAKE) - S = '+'; + T = task_state_char(field->next_state); + S = entry->type == TRACE_WAKE ? '+' : + task_state_char(field->prev_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); -- cgit v1.2.3 From 6d102bc68f3dd2ae0e305b09170b1751aa67baeb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 17 Dec 2008 17:48:23 +0800 Subject: tracing/ring-buffer: remove unused ring_buffer size Impact: remove dead code struct ring_buffer.size is not set after ring_buffer is initialized or resized. it is always 0. we can use "buffer->pages * PAGE_SIZE" to get ring_buffer's size Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index eab81f918f6..bb6922a931b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -258,7 +258,6 @@ struct ring_buffer_per_cpu { }; struct ring_buffer { - unsigned long size; unsigned pages; unsigned flags; int cpus; @@ -2210,8 +2209,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return -EINVAL; /* At least make sure the two buffers are somewhat the same */ - if (buffer_a->size != buffer_b->size || - buffer_a->pages != buffer_b->pages) + if (buffer_a->pages != buffer_b->pages) return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; -- cgit v1.2.3