From 5116d8f6b977970ebefc1932c0f313163a6ec91f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Jul 2009 17:10:01 +0300 Subject: KVM: fix ack not being delivered when msi present kvm_notify_acked_irq does not check irq type, so that it sometimes interprets msi vector as irq. As a result, ack notifiers are not called, which typially hangs the guest. The fix is to track and check irq type. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 16713dc672e..3060bdc35ff 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -110,6 +110,7 @@ struct kvm_memory_slot { struct kvm_kernel_irq_routing_entry { u32 gsi; + u32 type; int (*set)(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int level); union { -- cgit v1.2.3 From 3a6593050fbd8bbcaed3a44d01c31d907315c86c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 17:34:57 +0200 Subject: perf_counter, ftrace: Fix perf_counter integration Adds possible second part to the assign argument of TP_EVENT(). TP_perf_assign( __perf_count(foo); __perf_addr(bar); ) Which, when specified make the swcounter increment with @foo instead of the usual 1, and report @bar for PERF_SAMPLE_ADDR (data address associated with the event) when this triggers a counter overflow. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 110 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1867553c61e..fec71f8dbc4 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -144,6 +144,9 @@ #undef TP_fast_assign #define TP_fast_assign(args...) args +#undef TP_perf_assign +#define TP_perf_assign(args...) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ @@ -345,6 +348,88 @@ static inline int ftrace_get_offsets_##call( \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#ifdef CONFIG_EVENT_PROFILE + +/* + * Generate the functions needed for tracepoint perf_counter support. + * + * static void ftrace_profile_(proto) + * { + * extern void perf_tpcounter_event(int, u64, u64); + * u64 __addr = 0, __count = 1; + * + * <-- here we expand the TP_perf_assign() macro + * + * perf_tpcounter_event(event_.id, __addr, __count); + * } + * + * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) + * { + * int ret = 0; + * + * if (!atomic_inc_return(&event_call->profile_count)) + * ret = register_trace_(ftrace_profile_); + * + * return ret; + * } + * + * static void ftrace_profile_disable_(struct ftrace_event_call *event_call) + * { + * if (atomic_add_negative(-1, &event->call->profile_count)) + * unregister_trace_(ftrace_profile_); + * } + * + */ + +#undef TP_fast_assign +#define TP_fast_assign(args...) + +#undef TP_perf_assign +#define TP_perf_assign(args...) args + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ + \ +static void ftrace_profile_##call(proto) \ +{ \ + extern void perf_tpcounter_event(int, u64, u64); \ + u64 __addr = 0, __count = 1; \ + { assign; } \ + perf_tpcounter_event(event_##call.id, __addr, __count); \ +} \ + \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ +{ \ + int ret = 0; \ + \ + if (!atomic_inc_return(&event_call->profile_count)) \ + ret = register_trace_##call(ftrace_profile_##call); \ + \ + return ret; \ +} \ + \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ +{ \ + if (atomic_add_negative(-1, &event_call->profile_count)) \ + unregister_trace_##call(ftrace_profile_##call); \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TP_perf_assign +#define TP_perf_assign(args...) + +#endif + /* * Stage 4 of the trace events. * @@ -447,28 +532,6 @@ static inline int ftrace_get_offsets_##call( \ #define TP_FMT(fmt, args...) fmt "\n", ##args #ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args) \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int); \ - perf_tpcounter_event(event_##call.id); \ -} \ - \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ -{ \ - int ret = 0; \ - \ - if (!atomic_inc_return(&event_call->profile_count)) \ - ret = register_trace_##call(ftrace_profile_##call); \ - \ - return ret; \ -} \ - \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ -{ \ - if (atomic_add_negative(-1, &event_call->profile_count)) \ - unregister_trace_##call(ftrace_profile_##call); \ -} #define _TRACE_PROFILE_INIT(call) \ .profile_count = ATOMIC_INIT(-1), \ @@ -476,7 +539,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ .profile_disable = ftrace_profile_disable_##call, #else -#define _TRACE_PROFILE(call, proto, args) #define _TRACE_PROFILE_INIT(call) #endif @@ -502,7 +564,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ \ static struct ftrace_event_call event_##call; \ \ @@ -586,6 +647,5 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT -- cgit v1.2.3 From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 01:25:54 +0200 Subject: perf_counter: Fix/complete ftrace event records sampling This patch implements the kernel side support for ftrace event record sampling. A new counter sampling attribute is added: PERF_SAMPLE_TP_RECORD which requests ftrace events record sampling. In this case if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint fires, we emit the tracepoint binary record to the perfcounter event buffer, as a sample. Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf record: perf record -f -F 1 -a -e workqueue:workqueue_execution perf report -D 0x21e18 [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........ . 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!...... . 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve . 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1........... . 0040: e0 b1 31 81 ff ff ff ff ....... . 0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33 The raw ftrace binary record starts at offset 0020. Translation: struct trace_entry { type = 0x2b = 43; flags = 1; preempt_count = 2; pid = 0xa = 10; tgid = 0xa = 10; } thread_comm = "events/1" thread_pid = 0xa = 10; func = 0xffffffff8131b1e0 = flush_to_ldisc() What will come next? - Userspace support ('perf trace'), 'flight data recorder' mode for perf trace, etc. - The unconditional copy from the profiling callback brings some costs however if someone wants no such sampling to occur, and needs to be fixed in the future. For that we need to have an instant access to the perf counter attribute. This is a matter of a flag to add in the struct ftrace_event. - Take care of the events recursivity! Don't ever try to record a lock event for example, it seems some locking is used in the profiling fast path and lead to a tracing recursivity. That will be fixed using raw spinlock or recursivity protection. - [...] - Profit! :-) Signed-off-by: Frederic Weisbecker Cc: Li Zefan Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Steven Rostedt Cc: Paul Mackerras Cc: Pekka Enberg Cc: Gabriel Munteanu Cc: Lai Jiangshan Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 +- include/linux/perf_counter.h | 9 ++- include/trace/ftrace.h | 130 ++++++++++++++++++++++++++++++++----------- 3 files changed, 107 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d7cd193c227..a81170de7f6 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -89,7 +89,9 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; - +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); struct ring_buffer_event * trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e604e6ef72d..a67dd5c5b6d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,8 +121,9 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_TP_RECORD = 1U << 10, - PERF_SAMPLE_MAX = 1U << 10, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; /* @@ -413,6 +414,11 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; +struct perf_tracepoint_record { + int size; + char *record; +}; + struct task_struct; /** @@ -681,6 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; + void *private; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index fec71f8dbc4..7fb16d90e7b 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -353,15 +353,7 @@ static inline int ftrace_get_offsets_##call( \ /* * Generate the functions needed for tracepoint perf_counter support. * - * static void ftrace_profile_(proto) - * { - * extern void perf_tpcounter_event(int, u64, u64); - * u64 __addr = 0, __count = 1; - * - * <-- here we expand the TP_perf_assign() macro - * - * perf_tpcounter_event(event_.id, __addr, __count); - * } + * NOTE: The insertion profile callback (ftrace_profile_) is defined later * * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) * { @@ -381,28 +373,10 @@ static inline int ftrace_get_offsets_##call( \ * */ -#undef TP_fast_assign -#define TP_fast_assign(args...) - -#undef TP_perf_assign -#define TP_perf_assign(args...) args - -#undef __perf_addr -#define __perf_addr(a) __addr = (a) - -#undef __perf_count -#define __perf_count(c) __count = (c) - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int, u64, u64); \ - u64 __addr = 0, __count = 1; \ - { assign; } \ - perf_tpcounter_event(event_##call.id, __addr, __count); \ -} \ +static void ftrace_profile_##call(proto); \ \ static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ { \ @@ -422,12 +396,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TP_perf_assign -#define TP_perf_assign(args...) - #endif /* @@ -647,5 +615,99 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * Define the insertion callback to profile events + * + * The job is very similar to ftrace_raw_event_ except that we don't + * insert in the ring buffer but in a perf counter. + * + * static void ftrace_profile_(proto) + * { + * struct ftrace_data_offsets_ __maybe_unused __data_offsets; + * struct ftrace_event_call *event_call = &event_; + * extern void perf_tpcounter_event(int, u64, u64, void *, int); + * struct ftrace_raw_##call *entry; + * u64 __addr = 0, __count = 1; + * unsigned long irq_flags; + * int __entry_size; + * int __data_size; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * __data_size = ftrace_get_offsets_(&__data_offsets, args); + * __entry_size = __data_size + sizeof(*entry); + * + * do { + * char raw_data[__entry_size]; <- allocate our sample in the stack + * struct trace_entry *ent; + * + * entry = (struct ftrace_raw_ *)raw_data; + * ent = &entry->ent; + * tracing_generic_entry_update(ent, irq_flags, pc); + * ent->type = event_call->id; + * + * <- do some jobs with dynamic arrays + * + * <- affect our values + * + * perf_tpcounter_event(event_call->id, __addr, __count, entry, + * __entry_size); <- submit them to perf counter + * } while (0); + * + * } + */ + +#ifdef CONFIG_EVENT_PROFILE + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +static void ftrace_profile_##call(proto) \ +{ \ + struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ + struct ftrace_event_call *event_call = &event_##call; \ + extern void perf_tpcounter_event(int, u64, u64, void *, int); \ + struct ftrace_raw_##call *entry; \ + u64 __addr = 0, __count = 1; \ + unsigned long irq_flags; \ + int __entry_size; \ + int __data_size; \ + int pc; \ + \ + local_save_flags(irq_flags); \ + pc = preempt_count(); \ + \ + __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ + __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + \ + do { \ + char raw_data[__entry_size]; \ + struct trace_entry *ent; \ + \ + entry = (struct ftrace_raw_##call *)raw_data; \ + ent = &entry->ent; \ + tracing_generic_entry_update(ent, irq_flags, pc); \ + ent->type = event_call->id; \ + \ + tstruct \ + \ + { assign; } \ + \ + perf_tpcounter_event(event_call->id, __addr, __count, entry,\ + __entry_size); \ + } while (0); \ + \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#endif /* CONFIG_EVENT_PROFILE */ + #undef _TRACE_PROFILE_INIT -- cgit v1.2.3 From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:37 +0200 Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling Based on Peter's comments, make tracepoint sampling generic just like all the other sampling bits are. This is a rename with no code changes: - PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW - struct perf_tracepoint_record to perf_raw_record We want the system in place that transport tracepoints raw samples events into the perf ring buffer to be generalized and usable by any type of counter. Reported-by; Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a67dd5c5b6d..2aabe43c1d0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,7 +121,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_TP_RECORD = 1U << 10, + PERF_SAMPLE_RAW = 1U << 10, PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; @@ -414,9 +414,9 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; -struct perf_tracepoint_record { - int size; - char *record; +struct perf_raw_record { + u32 size; + void *data; }; struct task_struct; @@ -687,7 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; - void *private; + struct perf_raw_record *raw; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, -- cgit v1.2.3 From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:16:52 +0200 Subject: perf_counter: Correct PERF_SAMPLE_RAW output PERF_SAMPLE_* output switches should unconditionally output the correct format, as they are the only way to unambiguously parse the PERF_EVENT_SAMPLE data. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896447.17467.74.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ include/trace/ftrace.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2aabe43c1d0..a9d823a93fe 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -369,6 +369,8 @@ enum perf_event_type { * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW * }; */ PERF_EVENT_SAMPLE = 9, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7fb16d90e7b..7167b9b97da 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -685,7 +685,8 @@ static void ftrace_profile_##call(proto) \ pc = preempt_count(); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ - __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ + sizeof(u64)); \ \ do { \ char raw_data[__entry_size]; \ -- cgit v1.2.3 From 2fc391112fb6f3424435a3aa2fda887497b5f807 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 12:33:05 +0100 Subject: locking, sched: Give waitqueue spinlocks their own lockdep classes Give waitqueue spinlocks their own lockdep classes when they are initialised from init_waitqueue_head(). This means that struct wait_queue::func functions can operate other waitqueues. This is used by CacheFiles to catch the page from a backing fs being unlocked and to wake up another thread to take a copy of it. Signed-off-by: Peter Zijlstra Signed-off-by: David Howells Tested-by: Takashi Iwai Cc: linux-cachefs@redhat.com Cc: torvalds@osdl.org Cc: akpm@linux-foundation.org LKML-Reference: <20090810113305.17284.81508.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 6788e1a4d4c..cf3c2f5dba5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,7 +77,14 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -extern void init_waitqueue_head(wait_queue_head_t *q); +extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); + +#define init_waitqueue_head(q) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_waitqueue_head((q), &__key); \ + } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ -- cgit v1.2.3 From 304703aba31a87903b8c0db8f5e6890cac2d596d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 10 Aug 2009 16:11:32 +0200 Subject: perf_counter: Subtract the buffer size field from the event record size We compute the perf raw sample size by aligning the raw ftrace event size plus the buffer size field itself. We do that instead of aligning only the perf raw sample size, so that we might economize some in some cases. But this buffer size field is not stored in the perf raw sample, we must then substract its size from the buffer once we computed the alignment unless we may get a useless u32 field in the buffer. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <20090810141129.GA5124@nowhere> Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7167b9b97da..a05524fa245 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -637,7 +637,12 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * pc = preempt_count(); * * __data_size = ftrace_get_offsets_(&__data_offsets, args); - * __entry_size = __data_size + sizeof(*entry); + * + * // Below we want to get the aligned size by taking into account + * // the u32 field that will later store the buffer size + * __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32), + * sizeof(u64)); + * __entry_size -= sizeof(u32); * * do { * char raw_data[__entry_size]; <- allocate our sample in the stack @@ -687,6 +692,7 @@ static void ftrace_profile_##call(proto) \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ sizeof(u64)); \ + __entry_size -= sizeof(u32); \ \ do { \ char raw_data[__entry_size]; \ -- cgit v1.2.3 From 1853db0e02ae4088f102b0d8e59e83dc98f93f03 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 10 Aug 2009 16:38:36 +0200 Subject: perf_counter: Zero dead bytes from ftrace raw samples size alignment After aligning the ftrace raw samples, there are dead bytes storing random data from the stack. We don't want to leak these to userspace, then zero these out. Before: 0x2de88 [0x50]: event: 9 . . ... raw event: size 80 bytes . 0000: 09 00 00 00 01 00 50 00 d0 c7 00 81 ff ff ff ff ......P........ . 0010: 68 01 00 00 68 01 00 00 2c 00 00 00 00 00 00 00 h...h...,...... . 0020: 2c 00 00 00 2b 00 01 02 68 01 00 00 68 01 00 00 ,...+...h...h.. . 0030: 6b 6f 6e 64 65 6d 61 6e 64 2f 30 00 00 00 00 00 kondemand/0.... . 0040: 68 01 00 00 40 7f 46 81 ff ff ff ff 00 10 1b 7f h...@.F........ ^ ^ ^ ^ Leak After: 0x2d318 [0x50]: event: 9 . . ... raw event: size 80 bytes . 0000: 09 00 00 00 01 00 50 00 d0 c7 00 81 ff ff ff ff ......P........ . 0010: 68 01 00 00 68 01 00 00 68 14 00 00 00 00 00 00 h...h...h...... . 0020: 2c 00 00 00 2b 00 01 02 68 01 00 00 68 01 00 00 ,...+...h...h.. . 0030: 6b 6f 6e 64 65 6d 61 6e 64 2f 30 00 00 00 00 00 kondemand/0.... . 0040: 68 01 00 00 a0 80 46 81 ff ff ff ff 00 00 00 00 h.....F........ ^ ^ ^ ^ Fixed Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1249915116-5210-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith --- include/trace/ftrace.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a05524fa245..f64fbaae781 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -648,6 +648,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * char raw_data[__entry_size]; <- allocate our sample in the stack * struct trace_entry *ent; * + * zero dead bytes from alignment to avoid stack leak to userspace: + * + * *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; * entry = (struct ftrace_raw_ *)raw_data; * ent = &entry->ent; * tracing_generic_entry_update(ent, irq_flags, pc); @@ -698,6 +701,7 @@ static void ftrace_profile_##call(proto) \ char raw_data[__entry_size]; \ struct trace_entry *ent; \ \ + *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ entry = (struct ftrace_raw_##call *)raw_data; \ ent = &entry->ent; \ tracing_generic_entry_update(ent, irq_flags, pc); \ -- cgit v1.2.3 From 1ae88b2e446261c038f2c0c3150ffae142b227a2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Aug 2009 09:12:30 -0400 Subject: NFS: Fix an O_DIRECT Oops... We can't call nfs_readdata_release()/nfs_writedata_release() without first initialising and referencing args.context. Doing so inside nfs_direct_read_schedule_segment()/nfs_direct_write_schedule_segment() causes an Oops. We should rather be calling nfs_readdata_free()/nfs_writedata_free() in those cases. Looking at the O_DIRECT code, the "struct nfs_direct_req" is already referencing the nfs_open_context for us. Since the readdata and writedata structures carry a reference to that, we can simplify things by getting rid of the extra nfs_open_context references, so that we can replace all instances of nfs_readdata_release()/nfs_writedata_release(). Reported-by: Catalin Marinas Signed-off-by: Trond Myklebust Tested-by: Catalin Marinas Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/nfs_fs.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index fdffb413b19..f6b90240dd4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -473,7 +473,6 @@ extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); -extern void nfs_writedata_release(void *); /* * Try to write back everything synchronously (but check the @@ -488,7 +487,6 @@ extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_write_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_write_data *wdata); -extern void nfs_commitdata_release(void *wdata); #else static inline int nfs_commit_inode(struct inode *inode, int how) @@ -507,6 +505,7 @@ nfs_have_writebacks(struct inode *inode) * Allocate nfs_write_data structures */ extern struct nfs_write_data *nfs_writedata_alloc(unsigned int npages); +extern void nfs_writedata_free(struct nfs_write_data *); /* * linux/fs/nfs/read.c @@ -515,7 +514,6 @@ extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); -extern void nfs_readdata_release(void *data); extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, struct page *); @@ -523,6 +521,7 @@ extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, * Allocate nfs_read_data structures */ extern struct nfs_read_data *nfs_readdata_alloc(unsigned int npages); +extern void nfs_readdata_free(struct nfs_read_data *); /* * linux/fs/nfs3proc.c -- cgit v1.2.3 From 28402971d869e26271b25301011f667d3a5640c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Aug 2009 10:13:22 +0200 Subject: perf_counter: Provide hw_perf_counter_setup_online() APIs Provide weak aliases for hw_perf_counter_setup_online(). This is used by the BTS patches (for v2.6.32), but it interacts with fixes so propagate this upstream. (it has no effect as of yet) Also export perf_counter_output() to architecture code. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a9d823a93fe..2b36afe6ae0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -694,6 +694,8 @@ struct perf_sample_data { extern int perf_counter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data); +extern void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); /* * Return 1 for a software counter, 0 for a hardware counter -- cgit v1.2.3 From 3dab77fb1bf89664bb1c9544607159dcab6f7d57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:53 +0200 Subject: perf: Rework/fix the whole read vs group stuff Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce PERF_FORMAT_GROUP to deal with group reads in a more generic way. This allows you to get group reads out of read() as well. Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.117411814@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 47 +++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b36afe6ae0..b53f7006cc4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -115,7 +115,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_TID = 1U << 1, PERF_SAMPLE_TIME = 1U << 2, PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_READ = 1U << 4, PERF_SAMPLE_CALLCHAIN = 1U << 5, PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, @@ -127,16 +127,32 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in attr.read_format to request that - * reads on the counter should return the indicated quantities, - * in increasing order of bit value, after the counter value. + * The format of the data returned by read() on a perf counter fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; */ enum perf_counter_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, - PERF_FORMAT_MAX = 1U << 3, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ @@ -343,10 +359,8 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, tid; - * u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 parent_id; } && PERF_FORMAT_ID + * + * struct read_format values; * }; */ PERF_EVENT_READ = 8, @@ -364,11 +378,22 @@ enum perf_event_type { * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 period; } && PERF_SAMPLE_PERIOD * - * { u64 nr; - * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP + * { struct read_format values; } && PERF_SAMPLE_READ * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW * }; -- cgit v1.2.3 From 9f162d2a810b4db48f7b8d7e734d0932c81ec2a1 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:18:44 +0300 Subject: sunrpc: hton -> cpu_to_be* htonl is already defined as cpu_to_be32. cpu_to_be64 has architecture specific optimized implementations. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index b99c625fddf..f94bbdc75c4 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -117,9 +117,8 @@ static inline __be32 *xdr_encode_array(__be32 *p, const void *s, unsigned int le static inline __be32 * xdr_encode_hyper(__be32 *p, __u64 val) { - *p++ = htonl(val >> 32); - *p++ = htonl(val & 0xFFFFFFFF); - return p; + *(__be64 *)p = cpu_to_be64(val); + return p + 2; } static inline __be32 * -- cgit v1.2.3 From 98866b5abe1513cdacc011874ca045d40002eccd Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 14 Aug 2009 17:18:49 +0300 Subject: sunrpc: ntoh -> be*_to_cpu ntohl is already defined as be32_to_cpu. be64_to_cpu has architecture specific optimized implementations. Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index f94bbdc75c4..7da466ba4b0 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -124,9 +124,8 @@ xdr_encode_hyper(__be32 *p, __u64 val) static inline __be32 * xdr_decode_hyper(__be32 *p, __u64 *valp) { - *valp = ((__u64) ntohl(*p++)) << 32; - *valp |= ntohl(*p++); - return p; + *valp = be64_to_cpup((__be64 *)p); + return p + 2; } /* -- cgit v1.2.3