From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 30 Mar 2009 19:07:44 +0900
Subject: percpu: use dynamic percpu allocator as the default percpu allocator

This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use
dynamic percpu allocator.  The first chunk is allocated using
embedding helper and 8k is reserved for modules.  This ensures that
the new allocator behaves almost identically to the original allocator
as long as static percpu variables are concerned, so it shouldn't
introduce much breakage.

s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing
range limit the addressing model imposes.  Unfortunately, this breaks
if the address is specified using a variable, so for now, the two
archs aren't converted.

The following architectures are affected by this change.

* sh
* arm
* cris
* mips
* sparc(32)
* blackfin
* avr32
* parisc (broken, under investigation)
* m32r
* powerpc(32)

As this change makes the dynamic allocator the default one,
CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert -
CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted
archs.  These archs implement their own setup_per_cpu_areas() and the
conversion is not trivial.

* powerpc(64)
* sparc(64)
* ia64
* alpha
* s390

Boot and batch alloc/free tests on x86_32 with debug code (x86_32
doesn't use default first chunk initialization).  Compile tested on
sparc(32), powerpc(32), arm and alpha.

Kyle McMartin reported that this change breaks parisc.  The problem is
still under investigation and he is okay with pushing this patch
forward and fixing parisc later.

[ Impact: use dynamic allocator for most archs w/o custom percpu setup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Bryan Wu <cooloney@kernel.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Grant Grundler <grundler@parisc-linux.org>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
---
 kernel/module.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 38928fcaff2..f5934954fa9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module);
 
 #ifdef CONFIG_SMP
 
-#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
 
 static void *percpu_modalloc(unsigned long size, unsigned long align,
 			     const char *name)
@@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme)
 	free_percpu(freeme);
 }
 
-#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
 
 /* Number of blocks used and allocated. */
 static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -535,7 +535,7 @@ static int percpu_modinit(void)
 }
 __initcall(percpu_modinit);
 
-#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
 
 static unsigned int find_pcpusec(Elf_Ehdr *hdr,
 				 Elf_Shdr *sechdrs,
-- 
cgit v1.2.3


From b9bf3121af348d9255f1c917830fe8c2df52efcb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 24 Jun 2009 15:13:47 +0900
Subject: percpu: use DEFINE_PER_CPU_SHARED_ALIGNED()

There are a few places where ___cacheline_aligned* is used with
DEFINE_PER_CPU().  Use DEFINE_PER_CPU_SHARED_ALIGNED() instead.

DEFINE_PER_CPU_SHARED_ALIGNED() applies alignment only on SMPs.  While
all other converted places used _in_smp variant or only get compiled
for SMP, net/rds used unconditional ____cacheline_aligned.  I don't
see any reason these data structures should be aligned on UP and thus
converted together.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Andy Grover <andy.grover@oracle.com>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7c9098d186e..34fd81d2178 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -318,12 +318,12 @@ struct task_group root_task_group;
 /* Default task group's sched entity on each cpu */
 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
 #endif /* CONFIG_RT_GROUP_SCHED */
 #else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-- 
cgit v1.2.3


From 245b2e70eabd797932adb263a65da0bab3711753 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 24 Jun 2009 15:13:48 +0900
Subject: percpu: clean up percpu variable definitions

Percpu variable definition is about to be updated such that all percpu
symbols including the static ones must be unique.  Update percpu
variable definitions accordingly.

* as,cfq: rename ioc_count uniquely

* cpufreq: rename cpu_dbs_info uniquely

* xen: move nesting_count out of xen_evtchn_do_upcall() and rename it

* mm: move ratelimits out of balance_dirty_pages_ratelimited_nr() and
  rename it

* ipv4,6: rename cookie_scratch uniquely

* x86 perf_counter: rename prev_left to pmc_prev_left, irq_entry to
  pmc_irq_entry and nmi_entry to pmc_nmi_entry

* perf_counter: rename disable_count to perf_disable_count

* ftrace: rename test_event_disable to ftrace_test_event_disable

* kmemleak: rename test_pointer to kmemleak_test_pointer

* mce: rename next_interval to mce_next_interval

[ Impact: percpu usage cleanups, no duplicate static percpu var names ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: linux-mm <linux-mm@kvack.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andi Kleen <andi@firstfloor.org>
---
 kernel/perf_counter.c       | 6 +++---
 kernel/trace/trace_events.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1a933a221ea..1fd7a2e7575 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -98,16 +98,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader,
 
 void __weak perf_counter_print_debug(void)	{ }
 
-static DEFINE_PER_CPU(int, disable_count);
+static DEFINE_PER_CPU(int, perf_disable_count);
 
 void __perf_disable(void)
 {
-	__get_cpu_var(disable_count)++;
+	__get_cpu_var(perf_disable_count)++;
 }
 
 bool __perf_enable(void)
 {
-	return !--__get_cpu_var(disable_count);
+	return !--__get_cpu_var(perf_disable_count);
 }
 
 void perf_disable(void)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index aa08be69a1b..54b1de5074b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1318,7 +1318,7 @@ static __init void event_trace_self_tests(void)
 
 #ifdef CONFIG_FUNCTION_TRACER
 
-static DEFINE_PER_CPU(atomic_t, test_event_disable);
+static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
 
 static void
 function_test_events_call(unsigned long ip, unsigned long parent_ip)
@@ -1334,7 +1334,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
 	pc = preempt_count();
 	resched = ftrace_preempt_disable();
 	cpu = raw_smp_processor_id();
-	disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+	disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
 
 	if (disabled != 1)
 		goto out;
@@ -1352,7 +1352,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
 	trace_nowake_buffer_unlock_commit(event, flags, pc);
 
  out:
-	atomic_dec(&per_cpu(test_event_disable, cpu));
+	atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
 	ftrace_preempt_enable(resched);
 }
 
-- 
cgit v1.2.3


From 134e63756d5f3d0f7604dfcca847b09d1b14fd66 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Fri, 10 Jul 2009 09:51:34 +0000
Subject: genetlink: make netns aware

This makes generic netlink network namespace aware. No
generic netlink families except for the controller family
are made namespace aware, they need to be checked one by
one and then set the family->netnsok member to true.

A new function genlmsg_multicast_netns() is introduced to
allow sending a multicast message in a given namespace,
for example when it applies to an object that lives in
that namespace, a new function genlmsg_multicast_allns()
to send a message to all network namespaces (for objects
that do not have an associated netns).

The function genlmsg_multicast() is changed to multicast
the message in just init_net, which is currently correct
for all generic netlink families since they only work in
init_net right now. Some will later want to work in all
net namespaces because they do not care about the netns
at all -- those will have to be converted to use one of
the new functions genlmsg_multicast_allns() or
genlmsg_multicast_netns() whenever they are made netns
aware in some way.

After this patch families can easily decide whether or
not they should be available in all net namespaces. Many
genl families us it for objects not related to networking
and should therefore be available in all namespaces, but
that will have to be done on a per family basis.

Note that this doesn't touch on the checkpoint/restart
problem where network namespaces could be used, genl
families and multicast groups are numbered globally and
I see no easy way of changing that, especially since it
must be possible to multicast to all network namespaces
for those families that do not care about netns.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/taskstats.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30..ea8384d3caa 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 /*
  * Send taskstats data in @skb to listener with nl_pid @pid
  */
-static int send_reply(struct sk_buff *skb, pid_t pid)
+static int send_reply(struct sk_buff *skb, struct genl_info *info)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
 	void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
 		return rc;
 	}
 
-	return genlmsg_unicast(skb, pid);
+	return genlmsg_reply(skb, info);
 }
 
 /*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
 			if (!skb_next)
 				break;
 		}
-		rc = genlmsg_unicast(skb_cur, s->pid);
+		rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
 		if (rc == -ECONNREFUSED) {
 			s->valid = 0;
 			delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 		goto err;
 	}
 
-	rc = send_reply(rep_skb, info->snd_pid);
+	rc = send_reply(rep_skb, info);
 
 err:
 	fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
 	} else
 		goto err;
 
-	return send_reply(rep_skb, info->snd_pid);
+	return send_reply(rep_skb, info);
 err:
 	nlmsg_free(rep_skb);
 	return rc;
-- 
cgit v1.2.3


From 86886e55b273f565935491816c7c96b82469d4f8 Mon Sep 17 00:00:00 2001
From: Joseph Cihula <joseph.cihula@intel.com>
Date: Tue, 30 Jun 2009 19:31:07 -0700
Subject: x86, intel_txt: Intel TXT Sx shutdown support

Support for graceful handling of sleep states (S3/S4/S5) after an Intel(R) TXT launch.

Without this patch, attempting to place the system in one of the ACPI sleep
states (S3/S4/S5) will cause the TXT hardware to treat this as an attack and
will cause a system reset, with memory locked.  Not only may the subsequent
memory scrub take some time, but the platform will be unable to enter the
requested power state.

This patch calls back into the tboot so that it may properly and securely clean
up system state and clear the secrets-in-memory flag, after which it will place
the system into the requested sleep state using ACPI information passed by the kernel.

 arch/x86/kernel/smpboot.c     |    2 ++
 drivers/acpi/acpica/hwsleep.c |    3 +++
 kernel/cpu.c                  |    7 ++++++-
 3 files changed, 11 insertions(+), 1 deletion(-)

Signed-off-by: Joseph Cihula <joseph.cihula@intel.com>
Signed-off-by: Shane Wang <shane.wang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 kernel/cpu.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ce10043e4a..ff071e022a8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,7 @@
 #include <linux/kthread.h>
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
+#include <asm/tboot.h>
 
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -376,7 +377,7 @@ static cpumask_var_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
 {
-	int cpu, first_cpu, error;
+	int cpu, first_cpu, error, num_cpus = 0;
 
 	error = stop_machine_create();
 	if (error)
@@ -391,6 +392,7 @@ int disable_nonboot_cpus(void)
 	for_each_online_cpu(cpu) {
 		if (cpu == first_cpu)
 			continue;
+		num_cpus++;
 		error = _cpu_down(cpu, 1);
 		if (!error) {
 			cpumask_set_cpu(cpu, frozen_cpus);
@@ -401,6 +403,9 @@ int disable_nonboot_cpus(void)
 			break;
 		}
 	}
+	/* ensure all CPUs have gone into wait-for-SIPI */
+	error |= tboot_wait_for_aps(num_cpus);
+
 	if (!error) {
 		BUG_ON(num_online_cpus() > 1);
 		/* Make sure the CPUs won't be enabled by someone else */
-- 
cgit v1.2.3


From 5a165657bef7c47e5ff4cd138f7758ef6278e87b Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 13 Aug 2009 05:20:45 +0000
Subject: net: skb ftracer - Add config option to enable new ftracer (v3)

skb allocation / consumption corelator - Add config option

This patch adds a Kconfig option to enable the addtition of the skb source
tracer.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>

 Kconfig |   10 ++++++++++
 1 file changed, 10 insertions(+)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/Kconfig | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 019f380fd76..f09d7635c0e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -234,6 +234,16 @@ config BOOT_TRACER
 	  You must pass in initcall_debug and ftrace=initcall to the kernel
 	  command line to enable this on bootup.
 
+config SKB_SOURCES_TRACER
+	bool "Trace skb source information"
+	select GENERIC_TRACER
+	help
+	   This tracer helps developers/sysadmins correlate skb allocation and
+	   consumption.  The idea being that some processes will primarily consume data
+	   that was allocated on certain numa nodes.  By being able to visualize which
+	   nodes the data was allocated on, a sysadmin or developer can optimize the
+	   scheduling of those processes to cut back on cross node chatter.
+
 config TRACE_BRANCH_PROFILING
 	bool
 	select GENERIC_TRACER
-- 
cgit v1.2.3


From 9ec04da7489d2c9ae01ea6e9b5fa313ccf3d35fb Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 13 Aug 2009 05:23:56 +0000
Subject: net: skb ftracer - Add actual ftrace code to kernel (v3)

skb allocation / consumption correlator

Add ftracer module to kernel to print out a list that correlates a process id,
an skb it read, and the numa nodes on wich the process was running when it was
read along with the numa node the skbuff was allocated on.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>

 Makefile            |    1
 trace.h             |   19 ++++++
 trace_skb_sources.c |  154 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 174 insertions(+)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/Makefile            |   1 +
 kernel/trace/trace.h             |  19 +++++
 kernel/trace/trace_skb_sources.c | 154 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 174 insertions(+)
 create mode 100644 kernel/trace/trace_skb_sources.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90..ee5e5b1b928 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
 ifeq ($(CONFIG_BLOCK),y)
 obj-$(CONFIG_EVENT_TRACING) += blktrace.o
 endif
+obj-$(CONFIG_SKB_SOURCES_TRACER) += trace_skb_sources.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8b9f4f6e955..8a6281b2330 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,6 +11,7 @@
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
 #include <trace/power.h>
+#include <trace/events/skb.h>
 
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -40,6 +41,7 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
+	TRACE_SKB_SOURCE,
 
 	__TRACE_LAST_TYPE,
 };
@@ -171,6 +173,21 @@ struct trace_power {
 	struct power_trace	state_data;
 };
 
+struct skb_record {
+	pid_t pid;		/* pid of the copying process */
+	int anid;		/* node where skb was allocated */
+	int cnid;		/* node to which skb was copied in userspace */
+	char ifname[IFNAMSIZ];	/* Name of the receiving interface */
+	int rx_queue;		/* The rx queue the skb was received on */
+	int ccpu;		/* Cpu the application got this frame from */
+	int len;		/* length of the data copied */
+};
+
+struct trace_skb_event {
+	struct trace_entry	ent;
+	struct skb_record	event_data;
+};
+
 enum kmemtrace_type_id {
 	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
 	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
@@ -323,6 +340,8 @@ extern void __ftrace_bad_type(void);
 			  TRACE_SYSCALL_ENTER);				\
 		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
 			  TRACE_SYSCALL_EXIT);				\
+		IF_ASSIGN(var, ent, struct trace_skb_event,		\
+			  TRACE_SKB_SOURCE);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_skb_sources.c b/kernel/trace/trace_skb_sources.c
new file mode 100644
index 00000000000..40eb071afdb
--- /dev/null
+++ b/kernel/trace/trace_skb_sources.c
@@ -0,0 +1,154 @@
+/*
+ * ring buffer based tracer for analyzing per-socket skb sources
+ *
+ * Neil Horman <nhorman@tuxdriver.com>
+ * Copyright (C) 2009
+ *
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <trace/events/skb.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(skb_copy_datagram_iovec);
+
+static struct trace_array *skb_trace;
+static int __read_mostly trace_skb_source_enabled;
+
+static void probe_skb_dequeue(const struct sk_buff *skb, int len)
+{
+	struct ring_buffer_event *event;
+	struct trace_skb_event *entry;
+	struct trace_array *tr = skb_trace;
+	struct net_device *dev;
+
+	if (!trace_skb_source_enabled)
+		return;
+
+	if (in_interrupt())
+		return;
+
+	event = trace_buffer_lock_reserve(tr, TRACE_SKB_SOURCE,
+					  sizeof(*entry), 0, 0);
+	if (!event)
+		return;
+	entry = ring_buffer_event_data(event);
+
+	entry->event_data.pid = current->pid;
+	entry->event_data.anid = page_to_nid(virt_to_page(skb->data));
+	entry->event_data.cnid = cpu_to_node(smp_processor_id());
+	entry->event_data.len = len;
+	entry->event_data.rx_queue = skb->queue_mapping;
+	entry->event_data.ccpu = smp_processor_id();
+
+	dev = dev_get_by_index(sock_net(skb->sk), skb->iif);
+	if (dev) {
+		memcpy(entry->event_data.ifname, dev->name, IFNAMSIZ);
+		dev_put(dev);
+	} else {
+		strcpy(entry->event_data.ifname, "Unknown");
+	}
+
+	trace_buffer_unlock_commit(tr, event, 0, 0);
+}
+
+static int tracing_skb_source_register(void)
+{
+	int ret;
+
+	ret = register_trace_skb_copy_datagram_iovec(probe_skb_dequeue);
+	if (ret)
+		pr_info("skb source trace: Couldn't activate dequeue tracepoint");
+
+	return ret;
+}
+
+static void start_skb_source_trace(struct trace_array *tr)
+{
+	trace_skb_source_enabled = 1;
+}
+
+static void stop_skb_source_trace(struct trace_array *tr)
+{
+	trace_skb_source_enabled = 0;
+}
+
+static void skb_source_trace_reset(struct trace_array *tr)
+{
+	trace_skb_source_enabled = 0;
+	unregister_trace_skb_copy_datagram_iovec(probe_skb_dequeue);
+}
+
+
+static int skb_source_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	skb_trace = tr;
+
+	trace_skb_source_enabled = 1;
+	tracing_skb_source_register();
+
+	for_each_cpu(cpu, cpu_possible_mask)
+		tracing_reset(tr, cpu);
+	return 0;
+}
+
+static enum print_line_t skb_source_print_line(struct trace_iterator *iter)
+{
+	int ret = 0;
+	struct trace_entry *entry = iter->ent;
+	struct trace_skb_event *event;
+	struct skb_record *record;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(event, entry);
+	record = &event->event_data;
+	if (entry->type != TRACE_SKB_SOURCE)
+		return TRACE_TYPE_UNHANDLED;
+
+	ret = trace_seq_printf(s, "	%d	%d	%d	%s	%d	%d	%d\n",
+			record->pid,
+			record->anid,
+			record->cnid,
+			record->ifname,
+			record->rx_queue,
+			record->ccpu,
+			record->len);
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+static void skb_source_print_header(struct seq_file *s)
+{
+	seq_puts(s, "#	PID	ANID	CNID	IFC	RXQ	CCPU	LEN\n");
+	seq_puts(s, "#	 |	 |	 |	 |	 |	 |	 |\n");
+}
+
+static struct tracer skb_source_tracer __read_mostly =
+{
+	.name		= "skb_sources",
+	.init		= skb_source_trace_init,
+	.start		= start_skb_source_trace,
+	.stop		= stop_skb_source_trace,
+	.reset		= skb_source_trace_reset,
+	.print_line	= skb_source_print_line,
+	.print_header	= skb_source_print_header,
+};
+
+static int init_skb_source_trace(void)
+{
+	return register_tracer(&skb_source_tracer);
+}
+device_initcall(init_skb_source_trace);
-- 
cgit v1.2.3


From 3a5209e3b26386fd174820ee6e8e27479b24869a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 17 Aug 2009 16:00:30 -0700
Subject: trace_skb: fix build when CONFIG_NET is not enabled

Fix trace_skb_sources build when CONFIG_NET is not enabled:

kernel/built-in.o: In function `probe_skb_dequeue':
trace_skb_sources.c:(.text+0xd9152): undefined reference to `init_net'
trace_skb_sources.c:(.text+0xd9188): undefined reference to `dev_get_by_index'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f09d7635c0e..9a2f19bf051 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -236,6 +236,7 @@ config BOOT_TRACER
 
 config SKB_SOURCES_TRACER
 	bool "Trace skb source information"
+	depends on NET
 	select GENERIC_TRACER
 	help
 	   This tracer helps developers/sysadmins correlate skb allocation and
-- 
cgit v1.2.3


From de481560eb0bd9d940b90311eba85711e4b1150b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 30 Apr 2009 12:32:04 -0400
Subject: kconfig: keep config.gz around even if CONFIG_IKCONFIG_PROC is not
 set

If CONFIG_IKCONFIG is set but CONFIG_IKCONFIG_PROC is not, then
gcc will optimize the config.gz out, because nobody uses it.

This patch adds "__used" to the config.gz data to keep it around so that
code like extract-ikconfig can still find it.

[ Impact: allow extract-ikconfig to find config.gz ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a691f1c..d0c84e6bf50 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -119,7 +119,7 @@ $(obj)/config_data.gz: .config FORCE
 	$(call if_changed,gzip)
 
 quiet_cmd_ikconfiggz = IKCFG   $@
-      cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+      cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 	$(call if_changed,ikconfiggz)
-- 
cgit v1.2.3


From a15098c90df1ac2b1bfe1d33dd1c47063213aa9a Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Sun, 9 Aug 2009 19:02:51 +0000
Subject: powerpc: Enable GCOV

Make it possible to enable GCOV code coverage measurement on powerpc.

Lightly tested on 64-bit, seems to work as expected.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 kernel/gcov/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 22e9dcfaa3d..654efd09f6a 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
 config GCOV_PROFILE_ALL
 	bool "Profile entire Kernel"
 	depends on GCOV_KERNEL
-	depends on S390 || X86
+	depends on S390 || X86 || (PPC && EXPERIMENTAL)
 	default n
 	---help---
 	This options activates profiling for the entire kernel.
-- 
cgit v1.2.3


From 269c861baa2fe7c114c3bc7831292758d29eb336 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 19 Aug 2009 18:05:35 -0700
Subject: generic-ipi: Allow cpus not yet online to call smp_call_function with
 irqs disabled

Because of deadlock possiblities smp_call_function() is not allowed to
be called with interrupts disabled. Add an exception for the cpu not
yet online, as no one else can send smp call function interrupt to this
cpu that is not yet online and as such deadlock condition is not possible.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 kernel/smp.c | 40 ++++++++++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d850120..2accdf6712e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -176,6 +176,11 @@ void generic_smp_call_function_interrupt(void)
 	struct call_function_data *data;
 	int cpu = get_cpu();
 
+	/*
+	 * Shouldn't receive this interrupt on a cpu that is not yet online.
+	 */
+	WARN_ON_ONCE(!cpu_online(cpu));
+
 	/*
 	 * Ensure entry is visible on call_function_queue after we have
 	 * entered the IPI. See comment in smp_call_function_many.
@@ -230,6 +235,11 @@ void generic_smp_call_function_single_interrupt(void)
 	unsigned int data_flags;
 	LIST_HEAD(list);
 
+	/*
+	 * Shouldn't receive this interrupt on a cpu that is not yet online.
+	 */
+	WARN_ON_ONCE(!cpu_online(smp_processor_id()));
+
 	spin_lock(&q->lock);
 	list_replace_init(&q->list, &list);
 	spin_unlock(&q->lock);
@@ -285,8 +295,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 	 */
 	this_cpu = get_cpu();
 
-	/* Can deadlock when called with interrupts disabled */
-	WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
+	/*
+	 * Can deadlock when called with interrupts disabled.
+	 * We allow cpu's that are not yet online though, as no one else can
+	 * send smp call function interrupt to this cpu and as such deadlocks
+	 * can't happen.
+	 */
+	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
+		     && !oops_in_progress);
 
 	if (cpu == this_cpu) {
 		local_irq_save(flags);
@@ -329,8 +345,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
 {
 	csd_lock(data);
 
-	/* Can deadlock when called with interrupts disabled */
-	WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
+	/*
+	 * Can deadlock when called with interrupts disabled.
+	 * We allow cpu's that are not yet online though, as no one else can
+	 * send smp call function interrupt to this cpu and as such deadlocks
+	 * can't happen.
+	 */
+	WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
+		     && !oops_in_progress);
 
 	generic_exec_single(cpu, data, wait);
 }
@@ -365,8 +387,14 @@ void smp_call_function_many(const struct cpumask *mask,
 	unsigned long flags;
 	int cpu, next_cpu, this_cpu = smp_processor_id();
 
-	/* Can deadlock when called with interrupts disabled */
-	WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
+	/*
+	 * Can deadlock when called with interrupts disabled.
+	 * We allow cpu's that are not yet online though, as no one else can
+	 * send smp call function interrupt to this cpu and as such deadlocks
+	 * can't happen.
+	 */
+	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
+		     && !oops_in_progress);
 
 	/* So, what's a CPU they want? Ignoring this one. */
 	cpu = cpumask_first_and(mask, cpu_online_mask);
-- 
cgit v1.2.3


From d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 19 Aug 2009 18:05:36 -0700
Subject: x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init

SDM Vol 3a section titled "MTRR considerations in MP systems" specifies
the need for synchronizing the logical cpu's while initializing/updating
MTRR.

Currently Linux kernel does the synchronization of all cpu's only when
a single MTRR register is programmed/updated. During an AP online
(during boot/cpu-online/resume)  where we initialize all the MTRR/PAT registers,
we don't follow this synchronization algorithm.

This can lead to scenarios where during a dynamic cpu online, that logical cpu
is initializing MTRR/PAT with cache disabled (cr0.cd=1) etc while other logical
HT sibling continue to run (also with cache disabled because of cr0.cd=1
on its sibling).

Starting from Westmere, VMX transitions with cr0.cd=1 don't work properly
(because of some VMX performance optimizations) and the above scenario
(with one logical cpu doing VMX activity and another logical cpu coming online)
can result in system crash.

Fix the MTRR initialization by doing rendezvous of all the cpus. During
boot and resume, we delay the MTRR/PAT init for APs till all the
logical cpu's come online and the rendezvous process at the end of AP's bringup,
will initialize the MTRR/PAT for all AP's.

For dynamic single cpu online, we synchronize all the logical cpus and
do the MTRR/PAT init on the AP that is coming online.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 kernel/cpu.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ce10043e4a..f5f9485b8c0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -413,6 +413,14 @@ int disable_nonboot_cpus(void)
 	return error;
 }
 
+void __weak arch_enable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_enable_nonboot_cpus_end(void)
+{
+}
+
 void __ref enable_nonboot_cpus(void)
 {
 	int cpu, error;
@@ -424,6 +432,9 @@ void __ref enable_nonboot_cpus(void)
 		goto out;
 
 	printk("Enabling non-boot CPUs ...\n");
+
+	arch_enable_nonboot_cpus_begin();
+
 	for_each_cpu(cpu, frozen_cpus) {
 		error = _cpu_up(cpu, 1);
 		if (!error) {
@@ -432,6 +443,9 @@ void __ref enable_nonboot_cpus(void)
 		}
 		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
 	}
+
+	arch_enable_nonboot_cpus_end();
+
 	cpumask_clear(frozen_cpus);
 out:
 	cpu_maps_update_done();
-- 
cgit v1.2.3


From 5e928f77a09a07f9dd595bb8a489965d69a83458 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 18 Aug 2009 23:38:32 +0200
Subject: PM: Introduce core framework for run-time PM of I/O devices (rev. 17)

Introduce a core framework for run-time power management of I/O
devices.  Add device run-time PM fields to 'struct dev_pm_info'
and device run-time PM callbacks to 'struct dev_pm_ops'.  Introduce
a run-time PM workqueue and define some device run-time PM helper
functions at the core level.  Document all these things.

Special thanks to Alan Stern for his help with the design and
multiple detailed reviews of the pereceding versions of this patch
and to Magnus Damm for testing feedback.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Magnus Damm <damm@igel.co.jp>
---
 kernel/power/Kconfig | 14 ++++++++++++++
 kernel/power/main.c  | 17 +++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37..91e09d3b2eb 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
 	  random kernel OOPSes or reboots that don't seem to be related to
 	  anything, try disabling/enabling this option (or disabling/enabling
 	  APM in your BIOS).
+
+config PM_RUNTIME
+	bool "Run-time PM core functionality"
+	depends on PM
+	---help---
+	  Enable functionality allowing I/O devices to be put into energy-saving
+	  (low power) states at run time (or autosuspended) after a specified
+	  period of inactivity and woken up in response to a hardware-generated
+	  wake-up event or a driver's request.
+
+	  Hardware support is generally required for this functionality to work
+	  and the bus type drivers of the buses the devices are on are
+	  responsible for the actual handling of the autosuspend requests and
+	  wake-up events.
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930c..347d2cc88cd 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
 #include <linux/kobject.h>
 #include <linux/string.h>
 #include <linux/resume-trace.h>
+#include <linux/workqueue.h>
 
 #include "power.h"
 
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
 	.attrs = g,
 };
 
+#ifdef CONFIG_PM_RUNTIME
+struct workqueue_struct *pm_wq;
+
+static int __init pm_start_workqueue(void)
+{
+	pm_wq = create_freezeable_workqueue("pm");
+
+	return pm_wq ? 0 : -ENOMEM;
+}
+#else
+static inline int pm_start_workqueue(void) { return 0; }
+#endif
+
 static int __init pm_init(void)
 {
+	int error = pm_start_workqueue();
+	if (error)
+		return error;
 	power_kobj = kobject_create_and_add("power", NULL);
 	if (!power_kobj)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 31ffe249e5426d2648d68568fa00a7b66666a5db Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 26 Aug 2009 16:32:37 -0700
Subject: net: Temporarily backout SKB sources tracer.

Steven Rostedt has suggested that Neil work with the tracing
folks, trying to use TRACE_EVENT as the mechanism for
implementation.  And if that doesn't workout we can investigate
other solutions such as that one which was tried here.

This reverts the following 2 commits:

5a165657bef7c47e5ff4cd138f7758ef6278e87b
("net: skb ftracer - Add config option to enable new ftracer (v3)")

9ec04da7489d2c9ae01ea6e9b5fa313ccf3d35fb
("net: skb ftracer - Add actual ftrace code to kernel (v3)")

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/Kconfig             |  11 ---
 kernel/trace/Makefile            |   1 -
 kernel/trace/trace.h             |  19 -----
 kernel/trace/trace_skb_sources.c | 154 ---------------------------------------
 4 files changed, 185 deletions(-)
 delete mode 100644 kernel/trace/trace_skb_sources.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9a2f19bf051..019f380fd76 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -234,17 +234,6 @@ config BOOT_TRACER
 	  You must pass in initcall_debug and ftrace=initcall to the kernel
 	  command line to enable this on bootup.
 
-config SKB_SOURCES_TRACER
-	bool "Trace skb source information"
-	depends on NET
-	select GENERIC_TRACER
-	help
-	   This tracer helps developers/sysadmins correlate skb allocation and
-	   consumption.  The idea being that some processes will primarily consume data
-	   that was allocated on certain numa nodes.  By being able to visualize which
-	   nodes the data was allocated on, a sysadmin or developer can optimize the
-	   scheduling of those processes to cut back on cross node chatter.
-
 config TRACE_BRANCH_PROFILING
 	bool
 	select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ee5e5b1b928..844164dca90 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -49,7 +49,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
 ifeq ($(CONFIG_BLOCK),y)
 obj-$(CONFIG_EVENT_TRACING) += blktrace.o
 endif
-obj-$(CONFIG_SKB_SOURCES_TRACER) += trace_skb_sources.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8a6281b2330..8b9f4f6e955 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,7 +11,6 @@
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
 #include <trace/power.h>
-#include <trace/events/skb.h>
 
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -41,7 +40,6 @@ enum trace_type {
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
 	TRACE_BLK,
-	TRACE_SKB_SOURCE,
 
 	__TRACE_LAST_TYPE,
 };
@@ -173,21 +171,6 @@ struct trace_power {
 	struct power_trace	state_data;
 };
 
-struct skb_record {
-	pid_t pid;		/* pid of the copying process */
-	int anid;		/* node where skb was allocated */
-	int cnid;		/* node to which skb was copied in userspace */
-	char ifname[IFNAMSIZ];	/* Name of the receiving interface */
-	int rx_queue;		/* The rx queue the skb was received on */
-	int ccpu;		/* Cpu the application got this frame from */
-	int len;		/* length of the data copied */
-};
-
-struct trace_skb_event {
-	struct trace_entry	ent;
-	struct skb_record	event_data;
-};
-
 enum kmemtrace_type_id {
 	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
 	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
@@ -340,8 +323,6 @@ extern void __ftrace_bad_type(void);
 			  TRACE_SYSCALL_ENTER);				\
 		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
 			  TRACE_SYSCALL_EXIT);				\
-		IF_ASSIGN(var, ent, struct trace_skb_event,		\
-			  TRACE_SKB_SOURCE);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_skb_sources.c b/kernel/trace/trace_skb_sources.c
deleted file mode 100644
index 40eb071afdb..00000000000
--- a/kernel/trace/trace_skb_sources.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * ring buffer based tracer for analyzing per-socket skb sources
- *
- * Neil Horman <nhorman@tuxdriver.com>
- * Copyright (C) 2009
- *
- *
- */
-
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <trace/events/skb.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/hardirq.h>
-#include <linux/netdevice.h>
-#include <net/sock.h>
-
-#include "trace.h"
-#include "trace_output.h"
-
-EXPORT_TRACEPOINT_SYMBOL_GPL(skb_copy_datagram_iovec);
-
-static struct trace_array *skb_trace;
-static int __read_mostly trace_skb_source_enabled;
-
-static void probe_skb_dequeue(const struct sk_buff *skb, int len)
-{
-	struct ring_buffer_event *event;
-	struct trace_skb_event *entry;
-	struct trace_array *tr = skb_trace;
-	struct net_device *dev;
-
-	if (!trace_skb_source_enabled)
-		return;
-
-	if (in_interrupt())
-		return;
-
-	event = trace_buffer_lock_reserve(tr, TRACE_SKB_SOURCE,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		return;
-	entry = ring_buffer_event_data(event);
-
-	entry->event_data.pid = current->pid;
-	entry->event_data.anid = page_to_nid(virt_to_page(skb->data));
-	entry->event_data.cnid = cpu_to_node(smp_processor_id());
-	entry->event_data.len = len;
-	entry->event_data.rx_queue = skb->queue_mapping;
-	entry->event_data.ccpu = smp_processor_id();
-
-	dev = dev_get_by_index(sock_net(skb->sk), skb->iif);
-	if (dev) {
-		memcpy(entry->event_data.ifname, dev->name, IFNAMSIZ);
-		dev_put(dev);
-	} else {
-		strcpy(entry->event_data.ifname, "Unknown");
-	}
-
-	trace_buffer_unlock_commit(tr, event, 0, 0);
-}
-
-static int tracing_skb_source_register(void)
-{
-	int ret;
-
-	ret = register_trace_skb_copy_datagram_iovec(probe_skb_dequeue);
-	if (ret)
-		pr_info("skb source trace: Couldn't activate dequeue tracepoint");
-
-	return ret;
-}
-
-static void start_skb_source_trace(struct trace_array *tr)
-{
-	trace_skb_source_enabled = 1;
-}
-
-static void stop_skb_source_trace(struct trace_array *tr)
-{
-	trace_skb_source_enabled = 0;
-}
-
-static void skb_source_trace_reset(struct trace_array *tr)
-{
-	trace_skb_source_enabled = 0;
-	unregister_trace_skb_copy_datagram_iovec(probe_skb_dequeue);
-}
-
-
-static int skb_source_trace_init(struct trace_array *tr)
-{
-	int cpu;
-	skb_trace = tr;
-
-	trace_skb_source_enabled = 1;
-	tracing_skb_source_register();
-
-	for_each_cpu(cpu, cpu_possible_mask)
-		tracing_reset(tr, cpu);
-	return 0;
-}
-
-static enum print_line_t skb_source_print_line(struct trace_iterator *iter)
-{
-	int ret = 0;
-	struct trace_entry *entry = iter->ent;
-	struct trace_skb_event *event;
-	struct skb_record *record;
-	struct trace_seq *s = &iter->seq;
-
-	trace_assign_type(event, entry);
-	record = &event->event_data;
-	if (entry->type != TRACE_SKB_SOURCE)
-		return TRACE_TYPE_UNHANDLED;
-
-	ret = trace_seq_printf(s, "	%d	%d	%d	%s	%d	%d	%d\n",
-			record->pid,
-			record->anid,
-			record->cnid,
-			record->ifname,
-			record->rx_queue,
-			record->ccpu,
-			record->len);
-
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static void skb_source_print_header(struct seq_file *s)
-{
-	seq_puts(s, "#	PID	ANID	CNID	IFC	RXQ	CCPU	LEN\n");
-	seq_puts(s, "#	 |	 |	 |	 |	 |	 |	 |\n");
-}
-
-static struct tracer skb_source_tracer __read_mostly =
-{
-	.name		= "skb_sources",
-	.init		= skb_source_trace_init,
-	.start		= start_skb_source_trace,
-	.stop		= stop_skb_source_trace,
-	.reset		= skb_source_trace_reset,
-	.print_line	= skb_source_print_line,
-	.print_header	= skb_source_print_header,
-};
-
-static int init_skb_source_trace(void)
-{
-	return register_tracer(&skb_source_tracer);
-}
-device_initcall(init_skb_source_trace);
-- 
cgit v1.2.3


From 2bc481cf433879f0e6cdd4d899fc21ee05dcea23 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 28 Aug 2009 23:41:29 -0700
Subject: pktgen: spin using hrtimer

This changes how the pktgen thread spins/waits between
packets if delay is configured. It uses a high res timer to
wait for time to arrive.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/hrtimer.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 49da79ab848..05071bf6a37 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -485,6 +485,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
 	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
 	__hrtimer_init(timer, clock_id, mode);
 }
+EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
 
 void destroy_hrtimer_on_stack(struct hrtimer *timer)
 {
@@ -1477,6 +1478,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 }
+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
 
 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
 {
-- 
cgit v1.2.3


From 69575d388603365f2afbf4166df93152df59b165 Mon Sep 17 00:00:00 2001
From: Shane Wang <shane.wang@intel.com>
Date: Tue, 1 Sep 2009 18:25:07 -0700
Subject: x86, intel_txt: clean up the impact on generic code, unbreak non-x86

Move tboot.h from asm to linux to fix the build errors of intel_txt
patch on non-X86 platforms. Remove the tboot code from generic code
init/main.c and kernel/cpu.c.

Signed-off-by: Shane Wang <shane.wang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 kernel/cpu.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index ff071e022a8..67a60076dd7 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,7 +14,6 @@
 #include <linux/kthread.h>
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
-#include <asm/tboot.h>
 
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -377,7 +376,7 @@ static cpumask_var_t frozen_cpus;
 
 int disable_nonboot_cpus(void)
 {
-	int cpu, first_cpu, error, num_cpus = 0;
+	int cpu, first_cpu, error;
 
 	error = stop_machine_create();
 	if (error)
@@ -392,7 +391,6 @@ int disable_nonboot_cpus(void)
 	for_each_online_cpu(cpu) {
 		if (cpu == first_cpu)
 			continue;
-		num_cpus++;
 		error = _cpu_down(cpu, 1);
 		if (!error) {
 			cpumask_set_cpu(cpu, frozen_cpus);
@@ -403,8 +401,6 @@ int disable_nonboot_cpus(void)
 			break;
 		}
 	}
-	/* ensure all CPUs have gone into wait-for-SIPI */
-	error |= tboot_wait_for_aps(num_cpus);
 
 	if (!error) {
 		BUG_ON(num_online_cpus() > 1);
-- 
cgit v1.2.3


From d8eeb2d3b26d25c44c10f28430e2157a2d20bd1d Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Fri, 31 Jul 2009 14:58:04 +0200
Subject: ring-buffer: consolidate interface of rb_buffer_peek()

rb_buffer_peek() operates with struct ring_buffer_per_cpu *cpu_buffer
only. Thus, instead of passing variables buffer and cpu it is better
to use cpu_buffer directly. This also reduces the risk of races since
cpu_buffer is not calculated twice.

Signed-off-by: Robert Richter <robert.richter@amd.com>
LKML-Reference: <1249045084-3028-1-git-send-email-robert.richter@amd.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 454e74e718c..8786c350b4c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2997,15 +2997,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 }
 
 static struct ring_buffer_event *
-rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
 {
-	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
 	struct buffer_page *reader;
 	int nr_loops = 0;
 
-	cpu_buffer = buffer->buffers[cpu];
-
  again:
 	/*
 	 * We repeat when a timestamp is encountered. It is possible
@@ -3049,7 +3046,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	case RINGBUF_TYPE_DATA:
 		if (ts) {
 			*ts = cpu_buffer->read_stamp + event->time_delta;
-			ring_buffer_normalize_time_stamp(buffer,
+			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
 							 cpu_buffer->cpu, ts);
 		}
 		return event;
@@ -3168,7 +3165,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	local_irq_save(flags);
 	if (dolock)
 		spin_lock(&cpu_buffer->reader_lock);
-	event = rb_buffer_peek(buffer, cpu, ts);
+	event = rb_buffer_peek(cpu_buffer, ts);
 	if (event && event->type_len == RINGBUF_TYPE_PADDING)
 		rb_advance_reader(cpu_buffer);
 	if (dolock)
@@ -3237,7 +3234,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 	if (dolock)
 		spin_lock(&cpu_buffer->reader_lock);
 
-	event = rb_buffer_peek(buffer, cpu, ts);
+	event = rb_buffer_peek(cpu_buffer, ts);
 	if (event)
 		rb_advance_reader(cpu_buffer);
 
-- 
cgit v1.2.3


From 478142c39c8c2f5f63038e5f2224e6729406e587 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 9 Sep 2009 10:36:01 -0400
Subject: tracing: do not grab lock in wakeup latency function tracing

The wakeup tracer, when enabled, has its own function tracer.
It only traces the functions on the CPU where the task it is following
is on. If a task is woken on one CPU but then migrates to another CPU
before it wakes up, the latency tracer will then start tracing functions
on the other CPU.

To find which CPU the task is on, the wakeup function tracer performs
a task_cpu(wakeup_task). But to make sure the task does not disappear
it grabs the wakeup_lock, which is also taken when the task wakes up.
By taking this lock, the function tracer does not need to worry about
the task being freed as it checks its cpu.

Jan Blunck found a problem with this approach on his 32 CPU box. When
a task is being traced by the wakeup tracer, all functions take this
lock. That means that on all 32 CPUs, each function call is taking
this one lock to see if the task is on that CPU. This lock has just
serialized all functions on all 32 CPUs. Needless to say, this caused
major issues on that box. It would even lockup.

This patch changes the wakeup latency to insert a probe on the migrate task
tracepoint. When a task changes its CPU that it will run on, the
probe will take note. Now the wakeup function tracer no longer needs
to take the lock. It only compares the current CPU with a variable that
holds the current CPU the task is on. We don't worry about races since
it is OK to add or miss a function trace.

Reported-by: Jan Blunck <jblunck@suse.de>
Tested-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_sched_wakeup.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ad69f105a7c..cf43bdb1763 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly	tracer_enabled;
 
 static struct task_struct	*wakeup_task;
 static int			wakeup_cpu;
+static int			wakeup_current_cpu;
 static unsigned			wakeup_prio = -1;
 static int			wakeup_rt;
 
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 	resched = ftrace_preempt_disable();
 
 	cpu = raw_smp_processor_id();
+	if (cpu != wakeup_current_cpu)
+		goto out_enable;
+
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 	if (unlikely(disabled != 1))
 		goto out;
 
 	local_irq_save(flags);
-	__raw_spin_lock(&wakeup_lock);
-
-	if (unlikely(!wakeup_task))
-		goto unlock;
-
-	/*
-	 * The task can't disappear because it needs to
-	 * wake up first, and we have the wakeup_lock.
-	 */
-	if (task_cpu(wakeup_task) != cpu)
-		goto unlock;
 
 	trace_function(tr, ip, parent_ip, flags, pc);
 
- unlock:
-	__raw_spin_unlock(&wakeup_lock);
 	local_irq_restore(flags);
 
  out:
 	atomic_dec(&data->disabled);
-
+ out_enable:
 	ftrace_preempt_enable(resched);
 }
 
@@ -107,6 +98,14 @@ static int report_latency(cycle_t delta)
 	return 1;
 }
 
+static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
+{
+	if (task != wakeup_task)
+		return;
+
+	wakeup_current_cpu = cpu;
+}
+
 static void notrace
 probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	struct task_struct *next)
@@ -244,6 +243,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 	__wakeup_reset(wakeup_trace);
 
 	wakeup_cpu = task_cpu(p);
+	wakeup_current_cpu = wakeup_cpu;
 	wakeup_prio = p->prio;
 
 	wakeup_task = p;
@@ -293,6 +293,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
 		goto fail_deprobe_wake_new;
 	}
 
+	ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't activate tracepoint"
+			" probe to kernel_sched_migrate_task\n");
+		return;
+	}
+
 	wakeup_reset(tr);
 
 	/*
@@ -325,6 +332,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 	unregister_trace_sched_switch(probe_wakeup_sched_switch);
 	unregister_trace_sched_wakeup_new(probe_wakeup);
 	unregister_trace_sched_wakeup(probe_wakeup);
+	unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
 }
 
 static int __wakeup_tracer_init(struct trace_array *tr)
-- 
cgit v1.2.3


From e0ab5f2daee1c7a6a387591bf37f0bad4e407112 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 10 Sep 2009 09:34:19 +0800
Subject: tracing: remove dead code

Removes unreachable code.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fa1dccb579d..536ae1d8362 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -120,14 +120,6 @@ struct print_entry {
 	char			buf[];
 };
 
-#define TRACE_OLD_SIZE		88
-
-struct trace_field_cont {
-	unsigned char		type;
-	/* Temporary till we get rid of this completely */
-	char			buf[TRACE_OLD_SIZE - 1];
-};
-
 struct trace_mmiotrace_rw {
 	struct trace_entry	ent;
 	struct mmiotrace_rw	rw;
@@ -509,20 +501,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
 
 extern cycle_t ftrace_now(int cpu);
 
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-typedef void
-(*tracer_switch_func_t)(void *private,
-			void *__rq,
-			struct task_struct *prev,
-			struct task_struct *next);
-
-struct tracer_switch_ops {
-	tracer_switch_func_t		func;
-	void				*private;
-	struct tracer_switch_ops	*next;
-};
-#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
-
 extern void trace_find_cmdline(int pid, char comm[]);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-- 
cgit v1.2.3


From bd9cfca9cb71200dd82b320bba12540dc078f4e0 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 10 Sep 2009 09:34:19 +0800
Subject: tracing: format clean ups

Fix white-space formatting.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 536ae1d8362..86a0523b8e5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -169,20 +169,20 @@ enum kmemtrace_type_id {
 
 struct kmemtrace_alloc_entry {
 	struct trace_entry	ent;
-	enum kmemtrace_type_id type_id;
-	unsigned long call_site;
-	const void *ptr;
-	size_t bytes_req;
-	size_t bytes_alloc;
-	gfp_t gfp_flags;
-	int node;
+	enum kmemtrace_type_id	type_id;
+	unsigned long		call_site;
+	const void		*ptr;
+	size_t			bytes_req;
+	size_t			bytes_alloc;
+	gfp_t			gfp_flags;
+	int			node;
 };
 
 struct kmemtrace_free_entry {
 	struct trace_entry	ent;
-	enum kmemtrace_type_id type_id;
-	unsigned long call_site;
-	const void *ptr;
+	enum kmemtrace_type_id	type_id;
+	unsigned long		call_site;
+	const void		*ptr;
 };
 
 struct syscall_trace_enter {
@@ -203,7 +203,7 @@ struct syscall_trace_exit {
  * states when a trace occurs. These are:
  *  IRQS_OFF		- interrupts were disabled
  *  IRQS_NOSUPPORT	- arch does not support irqs_disabled_flags
- *  NEED_RESCED		- reschedule is requested
+ *  NEED_RESCHED	- reschedule is requested
  *  HARDIRQ		- inside an interrupt handler
  *  SOFTIRQ		- inside a softirq handler
  */
-- 
cgit v1.2.3


From a5921c6c37d51ee2079ca3c69ea6f7b7384f5d87 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 10 Sep 2009 09:34:19 +0800
Subject: tracing: remove stats from struct tracer

Remove unused field @stats from struct tracer.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 86a0523b8e5..2163d185fe2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -382,7 +382,6 @@ struct tracer {
 	struct tracer		*next;
 	int			print_max;
 	struct tracer_flags	*flags;
-	struct tracer_stat	*stats;
 };
 
 
-- 
cgit v1.2.3


From 197e2eabc90c203d1086916b7f66694ba5fbb937 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 10 Sep 2009 09:34:19 +0800
Subject: tracing: move PRED macros to trace_events_filter.c

Move DEFINE_COMPARISON_PRED() and DEFINE_EQUALITY_PRED()
  to kernel/trace/trace_events_filter.c

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h               | 41 --------------------------------------
 kernel/trace/trace_events_filter.c | 41 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2163d185fe2..acaa68060eb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -800,47 +800,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
-#define DEFINE_COMPARISON_PRED(type)					\
-static int filter_pred_##type(struct filter_pred *pred, void *event,	\
-			      int val1, int val2)			\
-{									\
-	type *addr = (type *)(event + pred->offset);			\
-	type val = (type)pred->val;					\
-	int match = 0;							\
-									\
-	switch (pred->op) {						\
-	case OP_LT:							\
-		match = (*addr < val);					\
-		break;							\
-	case OP_LE:							\
-		match = (*addr <= val);					\
-		break;							\
-	case OP_GT:							\
-		match = (*addr > val);					\
-		break;							\
-	case OP_GE:							\
-		match = (*addr >= val);					\
-		break;							\
-	default:							\
-		break;							\
-	}								\
-									\
-	return match;							\
-}
-
-#define DEFINE_EQUALITY_PRED(size)					\
-static int filter_pred_##size(struct filter_pred *pred, void *event,	\
-			      int val1, int val2)			\
-{									\
-	u##size *addr = (u##size *)(event + pred->offset);		\
-	u##size val = (u##size)pred->val;				\
-	int match;							\
-									\
-	match = (val == *addr) ^ pred->not;				\
-									\
-	return match;							\
-}
-
 extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 93660fbbf62..23245785927 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -121,6 +121,47 @@ struct filter_parse_state {
 	} operand;
 };
 
+#define DEFINE_COMPARISON_PRED(type)					\
+static int filter_pred_##type(struct filter_pred *pred, void *event,	\
+			      int val1, int val2)			\
+{									\
+	type *addr = (type *)(event + pred->offset);			\
+	type val = (type)pred->val;					\
+	int match = 0;							\
+									\
+	switch (pred->op) {						\
+	case OP_LT:							\
+		match = (*addr < val);					\
+		break;							\
+	case OP_LE:							\
+		match = (*addr <= val);					\
+		break;							\
+	case OP_GT:							\
+		match = (*addr > val);					\
+		break;							\
+	case OP_GE:							\
+		match = (*addr >= val);					\
+		break;							\
+	default:							\
+		break;							\
+	}								\
+									\
+	return match;							\
+}
+
+#define DEFINE_EQUALITY_PRED(size)					\
+static int filter_pred_##size(struct filter_pred *pred, void *event,	\
+			      int val1, int val2)			\
+{									\
+	u##size *addr = (u##size *)(event + pred->offset);		\
+	u##size val = (u##size)pred->val;				\
+	int match;							\
+									\
+	match = (val == *addr) ^ pred->not;				\
+									\
+	return match;							\
+}
+
 DEFINE_COMPARISON_PRED(s64);
 DEFINE_COMPARISON_PRED(u64);
 DEFINE_COMPARISON_PRED(s32);
-- 
cgit v1.2.3


From 5e605b64a183a6c0e84cdb99a6f8acb1f8200437 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 5 Aug 2009 09:07:21 +0200
Subject: block: add blk-iopoll, a NAPI like approach for block devices

This borrows some code from NAPI and implements a polled completion
mode for block devices. The idea is the same as NAPI - instead of
doing the command completion when the irq occurs, schedule a dedicated
softirq in the hopes that we will complete more IO when the iopoll
handler is invoked. Devices have a budget of commands assigned, and will
stay in polled mode as long as they continue to consume their budget
from the iopoll softirq handler. If they do not, the device is set back
to interrupt completion mode.

This patch holds the core bits for blk-iopoll, device driver support
sold separately.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/sysctl.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd..0ed9fa6f322 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -92,6 +92,7 @@ extern int sysctl_nr_trim_pages;
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+extern int blk_iopoll_enabled;
 
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -990,7 +991,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "blk_iopoll",
+		.data		= &blk_iopoll_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v1.2.3


From 49ff590390a22c49e9063dcdec4cd5903127526b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 11 Sep 2009 00:30:26 -0400
Subject: tracing: add latency format to function_graph tracer

While debugging something with the function_graph tracer, I found the
need to see the preempt count of the traces. Unfortunately, since
the function graph tracer has its own output formatting, it does not
honor the latency-format option.

This patch makes the function_graph tracer honor the latency-format
option, but still keeps control of the output. But now we have the
same details that the latency-format supplies.

 # tracer: function_graph
 #
 #      _-----=> irqs-off
 #     / _----=> need-resched
 #    | / _---=> hardirq/softirq
 #    || / _--=> preempt-depth
 #    ||| /
 #    ||||
 # CPU||||  DURATION                  FUNCTION CALLS
 # |  ||||   |   |                     |   |   |   |
  3)  d..1  1.333 us    |        idle_cpu();
  3)  d.h1              |        tick_check_idle() {
  3)  d.h1  0.550 us    |          tick_check_oneshot_broadcast();
  3)  d.h1              |          tick_nohz_stop_idle() {
  3)  d.h1              |            ktime_get() {
  3)  d.h1              |              ktime_get_ts() {

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 74 +++++++++++++++++++++++++++++++++---
 1 file changed, 68 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b3749a2c313..ee791a9650c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -364,6 +364,29 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
 }
 
 
+static enum print_line_t
+print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
+{
+	int hardirq, softirq;
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+
+	if (!trace_seq_printf(s, " %c%c%c",
+			      (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+				(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
+				  'X' : '.',
+			      (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
+				'N' : '.',
+			      (hardirq && softirq) ? 'H' :
+				hardirq ? 'h' : softirq ? 's' : '.'))
+		return 0;
+
+	if (entry->preempt_count)
+		return trace_seq_printf(s, "%x", entry->preempt_count);
+	return trace_seq_puts(s, ".");
+}
+
 /* If the pid changed since the last trace, output this event */
 static enum print_line_t
 verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -521,6 +544,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
+
 	/* Proc */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
 		ret = print_graph_proc(s, pid);
@@ -758,6 +782,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
+	/* Latency format */
+	if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+		ret = print_graph_lat_fmt(s, ent);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
 	return 0;
 }
 
@@ -952,28 +983,59 @@ print_graph_function(struct trace_iterator *iter)
 	return TRACE_TYPE_HANDLED;
 }
 
+static void print_lat_header(struct seq_file *s)
+{
+	static const char spaces[] = "                "	/* 16 spaces */
+		"    "					/* 4 spaces */
+		"                 ";			/* 17 spaces */
+	int size = 0;
+
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+		size += 16;
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+		size += 4;
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+		size += 17;
+
+	seq_printf(s, "#%.*s  _-----=> irqs-off        \n", size, spaces);
+	seq_printf(s, "#%.*s / _----=> need-resched    \n", size, spaces);
+	seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
+	seq_printf(s, "#%.*s|| / _--=> preempt-depth   \n", size, spaces);
+	seq_printf(s, "#%.*s||| /                      \n", size, spaces);
+	seq_printf(s, "#%.*s||||                       \n", size, spaces);
+}
+
 static void print_graph_headers(struct seq_file *s)
 {
+	int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
+
+	if (lat)
+		print_lat_header(s);
+
 	/* 1st line */
-	seq_printf(s, "# ");
+	seq_printf(s, "#");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
 		seq_printf(s, "     TIME       ");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
-		seq_printf(s, "CPU");
+		seq_printf(s, " CPU");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
-		seq_printf(s, "  TASK/PID      ");
+		seq_printf(s, "  TASK/PID       ");
+	if (lat)
+		seq_printf(s, "||||");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
 		seq_printf(s, "  DURATION   ");
 	seq_printf(s, "               FUNCTION CALLS\n");
 
 	/* 2nd line */
-	seq_printf(s, "# ");
+	seq_printf(s, "#");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
 		seq_printf(s, "      |         ");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
-		seq_printf(s, "|  ");
+		seq_printf(s, " |  ");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
-		seq_printf(s, "  |    |        ");
+		seq_printf(s, "   |    |        ");
+	if (lat)
+		seq_printf(s, "||||");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
 		seq_printf(s, "   |   |      ");
 	seq_printf(s, "               |   |   |   |\n");
-- 
cgit v1.2.3


From 48659d31195bb76d688e99dabd816c5472fb1656 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 11 Sep 2009 11:36:23 -0400
Subject: tracing: move tgid out of generic entry and into userstack

The userstack trace required the recording of the tgid entry.
Unfortunately, it was added to the generic entry where it wasted
4 bytes of every entry and was only used by one entry.

This patch moves it out of the generic field and moves it into the
only user (userstack_entry).

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 2 +-
 kernel/trace/trace.h        | 1 +
 kernel/trace/trace_events.c | 5 +----
 kernel/trace/trace_output.c | 2 +-
 4 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c75deeefe3..1a37da2e853 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -886,7 +886,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
-	entry->tgid			= (tsk) ? tsk->tgid : 0;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1068,6 +1067,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 		return;
 	entry	= ring_buffer_event_data(event);
 
+	entry->tgid		= current->tgid;
 	memset(&entry->caller, 0, sizeof(entry->caller));
 
 	trace.nr_entries	= 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index acaa68060eb..b69697b4b84 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -101,6 +101,7 @@ struct stack_entry {
 
 struct userstack_entry {
 	struct trace_entry	ent;
+	unsigned int		tgid;
 	unsigned long		caller[FTRACE_STACK_ENTRIES];
 };
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 78b1ed23017..28d92027a93 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -86,7 +86,6 @@ int trace_define_common_fields(struct ftrace_event_call *call)
 	__common_field(unsigned char, flags);
 	__common_field(unsigned char, preempt_count);
 	__common_field(int, pid);
-	__common_field(int, tgid);
 
 	return ret;
 }
@@ -572,13 +571,11 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
-				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
 				FIELD(unsigned short, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
-				FIELD(int, pid),
-				FIELD(int, tgid));
+				FIELD(int, pid));
 }
 
 static ssize_t
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e0c2545622e..be34a6aa7e4 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 		 * since individual threads might have already quit!
 		 */
 		rcu_read_lock();
-		task = find_task_by_vpid(entry->ent.tgid);
+		task = find_task_by_vpid(entry->tgid);
 		if (task)
 			mm = get_task_mm(task);
 		rcu_read_unlock();
-- 
cgit v1.2.3


From 637e7e864103a7a68c1ce43ada27dfc25c0d113f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 11 Sep 2009 13:55:35 -0400
Subject: tracing: add lock depth to entries

This patch adds the lock depth of the big kernel lock to the generic
entry header. This way we can see the depth of the lock and help
in removing the BKL.

Example:

 #                  _------=> CPU#
 #                 / _-----=> irqs-off
 #                | / _----=> need-resched
 #                || / _---=> hardirq/softirq
 #                ||| / _--=> preempt-depth
 #                |||| /_--=> lock-depth
 #                |||||/     delay
 #  cmd     pid   |||||| time  |   caller
 #     \   /      ||||||   \   |   /
   <idle>-0       2.N..3 5902255250us+: lock_acquire: read rcu_read_lock
   <idle>-0       2.N..3 5902255253us+: lock_release: rcu_read_lock
   <idle>-0       2dN..3 5902255257us+: lock_acquire: xtime_lock
   <idle>-0       2dN..4 5902255259us : lock_acquire: clocksource_lock
   <idle>-0       2dN..4 5902255261us+: lock_release: clocksource_lock

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c                 |  9 +++++----
 kernel/trace/trace_events.c          |  5 ++++-
 kernel/trace/trace_functions_graph.c | 16 ++++++++++++----
 kernel/trace/trace_output.c          | 10 +++++++++-
 4 files changed, 30 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1a37da2e853..3b918283cf9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -886,6 +886,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
+	entry->lock_depth		= (tsk) ? tsk->lock_depth : 0;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1530,10 +1531,10 @@ static void print_lat_help_header(struct seq_file *m)
 	seq_puts(m, "#                | / _----=> need-resched    \n");
 	seq_puts(m, "#                || / _---=> hardirq/softirq \n");
 	seq_puts(m, "#                ||| / _--=> preempt-depth   \n");
-	seq_puts(m, "#                |||| /                      \n");
-	seq_puts(m, "#                |||||     delay             \n");
-	seq_puts(m, "#  cmd     pid   ||||| time  |   caller      \n");
-	seq_puts(m, "#     \\   /      |||||   \\   |   /           \n");
+	seq_puts(m, "#                |||| /_--=> lock-depth       \n");
+	seq_puts(m, "#                |||||/     delay             \n");
+	seq_puts(m, "#  cmd     pid   |||||| time  |   caller      \n");
+	seq_puts(m, "#     \\   /      ||||||   \\   |   /           \n");
 }
 
 static void print_func_help_header(struct seq_file *m)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 28d92027a93..975f324a07e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -86,6 +86,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
 	__common_field(unsigned char, flags);
 	__common_field(unsigned char, preempt_count);
 	__common_field(int, pid);
+	__common_field(int, lock_depth);
 
 	return ret;
 }
@@ -571,11 +572,13 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
 				FIELD(unsigned short, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
-				FIELD(int, pid));
+				FIELD(int, pid),
+				FIELD(int, lock_depth));
 }
 
 static ssize_t
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ee791a9650c..48af4937438 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -368,6 +368,7 @@ static enum print_line_t
 print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 {
 	int hardirq, softirq;
+	int ret;
 
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -382,6 +383,13 @@ print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 				hardirq ? 'h' : softirq ? 's' : '.'))
 		return 0;
 
+	if (entry->lock_depth < 0)
+		ret = trace_seq_putc(s, '.');
+	else
+		ret = trace_seq_printf(s, "%d", entry->lock_depth);
+	if (!ret)
+		return 0;
+
 	if (entry->preempt_count)
 		return trace_seq_printf(s, "%x", entry->preempt_count);
 	return trace_seq_puts(s, ".");
@@ -1001,8 +1009,8 @@ static void print_lat_header(struct seq_file *s)
 	seq_printf(s, "#%.*s / _----=> need-resched    \n", size, spaces);
 	seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
 	seq_printf(s, "#%.*s|| / _--=> preempt-depth   \n", size, spaces);
-	seq_printf(s, "#%.*s||| /                      \n", size, spaces);
-	seq_printf(s, "#%.*s||||                       \n", size, spaces);
+	seq_printf(s, "#%.*s||| / _-=> lock-depth      \n", size, spaces);
+	seq_printf(s, "#%.*s|||| /                     \n", size, spaces);
 }
 
 static void print_graph_headers(struct seq_file *s)
@@ -1021,7 +1029,7 @@ static void print_graph_headers(struct seq_file *s)
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
 		seq_printf(s, "  TASK/PID       ");
 	if (lat)
-		seq_printf(s, "||||");
+		seq_printf(s, "|||||");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
 		seq_printf(s, "  DURATION   ");
 	seq_printf(s, "               FUNCTION CALLS\n");
@@ -1035,7 +1043,7 @@ static void print_graph_headers(struct seq_file *s)
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
 		seq_printf(s, "   |    |        ");
 	if (lat)
-		seq_printf(s, "||||");
+		seq_printf(s, "|||||");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
 		seq_printf(s, "   |   |      ");
 	seq_printf(s, "               |   |   |   |\n");
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index be34a6aa7e4..29a370a4558 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -465,6 +465,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 {
 	int hardirq, softirq;
 	char comm[TASK_COMM_LEN];
+	int ret;
 
 	trace_find_cmdline(entry->pid, comm);
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
@@ -481,9 +482,16 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 				hardirq ? 'h' : softirq ? 's' : '.'))
 		return 0;
 
+	if (entry->lock_depth < 0)
+		ret = trace_seq_putc(s, '.');
+	else
+		ret = trace_seq_printf(s, "%d", entry->lock_depth);
+	if (!ret)
+		return 0;
+
 	if (entry->preempt_count)
 		return trace_seq_printf(s, "%x", entry->preempt_count);
-	return trace_seq_puts(s, ".");
+	return trace_seq_putc(s, '.');
 }
 
 static unsigned long preempt_mark_thresh = 100;
-- 
cgit v1.2.3


From f81c972d27c36729e65d4a815e3d7b782a540bad Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 11 Sep 2009 14:24:13 -0400
Subject: tracing: consolidate code between trace_output.c and
 trace_function_graph.c

Both trace_output.c and trace_function_graph.c do basically the same
thing to handle the printing of the latency-format. This patch moves
the code into one function that both can use.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 26 ++------------------------
 kernel/trace/trace_output.c          | 30 ++++++++++++++++++++++++------
 kernel/trace/trace_output.h          |  2 ++
 3 files changed, 28 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 48af4937438..61f166707a0 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -367,32 +367,10 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
 static enum print_line_t
 print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 {
-	int hardirq, softirq;
-	int ret;
-
-	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
-	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
-
-	if (!trace_seq_printf(s, " %c%c%c",
-			      (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
-				(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
-				  'X' : '.',
-			      (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
-				'N' : '.',
-			      (hardirq && softirq) ? 'H' :
-				hardirq ? 'h' : softirq ? 's' : '.'))
-		return 0;
-
-	if (entry->lock_depth < 0)
-		ret = trace_seq_putc(s, '.');
-	else
-		ret = trace_seq_printf(s, "%d", entry->lock_depth);
-	if (!ret)
+	if (!trace_seq_putc(s, ' '))
 		return 0;
 
-	if (entry->preempt_count)
-		return trace_seq_printf(s, "%x", entry->preempt_count);
-	return trace_seq_puts(s, ".");
+	return trace_print_lat_fmt(s, entry);
 }
 
 /* If the pid changed since the last trace, output this event */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 29a370a4558..f572f44c6e1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -460,19 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
-static int
-lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+/**
+ * trace_print_lat_fmt - print the irq, preempt and lockdep fields
+ * @s: trace seq struct to write to
+ * @entry: The trace entry field from the ring buffer
+ *
+ * Prints the generic fields of irqs off, in hard or softirq, preempt
+ * count and lock depth.
+ */
+int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 {
 	int hardirq, softirq;
-	char comm[TASK_COMM_LEN];
 	int ret;
 
-	trace_find_cmdline(entry->pid, comm);
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
 
-	if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c",
-			      comm, entry->pid, cpu,
+	if (!trace_seq_printf(s, "%c%c%c",
 			      (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
 				(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
 				  'X' : '.',
@@ -494,6 +498,20 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 	return trace_seq_putc(s, '.');
 }
 
+static int
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+	char comm[TASK_COMM_LEN];
+
+	trace_find_cmdline(entry->pid, comm);
+
+	if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
+			      comm, entry->pid, cpu))
+		return 0;
+
+	return trace_print_lat_fmt(s, entry);
+}
+
 static unsigned long preempt_mark_thresh = 100;
 
 static int
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c3..9d91c72ba38 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
 					 int flags);
+extern int
+trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
 
 /* used by module unregistering */
 extern int __unregister_ftrace_event(struct trace_event *event);
-- 
cgit v1.2.3


From b63f39ea50330f836e301ddda21c6a93dcf0d6a3 Mon Sep 17 00:00:00 2001
From: "jolsa@redhat.com" <jolsa@redhat.com>
Date: Fri, 11 Sep 2009 17:29:27 +0200
Subject: tracing: create generic trace parser

Create a "trace_parser" that can parse the user space input for
separate words.

struct trace_parser is the descriptor.

Generic "trace_get_user" function that can be a helper to read multiple
words passed in by user space.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1252682969-3366-2-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h |  35 +++++++++++++++++
 2 files changed, 141 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3b918283cf9..45c3f0352d7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -339,6 +339,112 @@ static struct {
 
 int trace_clock_id;
 
+/*
+ * trace_parser_get_init - gets the buffer for trace parser
+ */
+int trace_parser_get_init(struct trace_parser *parser, int size)
+{
+	memset(parser, 0, sizeof(*parser));
+
+	parser->buffer = kmalloc(size, GFP_KERNEL);
+	if (!parser->buffer)
+		return 1;
+
+	parser->size = size;
+	return 0;
+}
+
+/*
+ * trace_parser_put - frees the buffer for trace parser
+ */
+void trace_parser_put(struct trace_parser *parser)
+{
+	kfree(parser->buffer);
+}
+
+/*
+ * trace_get_user - reads the user input string separated by  space
+ * (matched by isspace(ch))
+ *
+ * For each string found the 'struct trace_parser' is updated,
+ * and the function returns.
+ *
+ * Returns number of bytes read.
+ *
+ * See kernel/trace/trace.h for 'struct trace_parser' details.
+ */
+int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
+	size_t cnt, loff_t *ppos)
+{
+	char ch;
+	size_t read = 0;
+	ssize_t ret;
+
+	if (!*ppos)
+		trace_parser_clear(parser);
+
+	ret = get_user(ch, ubuf++);
+	if (ret)
+		goto out;
+
+	read++;
+	cnt--;
+
+	/*
+	 * The parser is not finished with the last write,
+	 * continue reading the user input without skipping spaces.
+	 */
+	if (!parser->cont) {
+		/* skip white space */
+		while (cnt && isspace(ch)) {
+			ret = get_user(ch, ubuf++);
+			if (ret)
+				goto out;
+			read++;
+			cnt--;
+		}
+
+		/* only spaces were written */
+		if (isspace(ch)) {
+			*ppos += read;
+			ret = read;
+			goto out;
+		}
+
+		parser->idx = 0;
+	}
+
+	/* read the non-space input */
+	while (cnt && !isspace(ch)) {
+		if (parser->idx < parser->size)
+			parser->buffer[parser->idx++] = ch;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out;
+		read++;
+		cnt--;
+	}
+
+	/* We either got finished input or we have to wait for another call. */
+	if (isspace(ch)) {
+		parser->buffer[parser->idx] = 0;
+		parser->cont = false;
+	} else {
+		parser->cont = true;
+		parser->buffer[parser->idx++] = ch;
+	}
+
+	*ppos += read;
+	ret = read;
+
+out:
+	return ret;
+}
+
 ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 {
 	int len;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b69697b4b84..28247cecd95 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -615,6 +615,41 @@ static inline int ftrace_trace_task(struct task_struct *task)
 }
 #endif
 
+/*
+ * struct trace_parser - servers for reading the user input separated by spaces
+ * @cont: set if the input is not complete - no final space char was found
+ * @buffer: holds the parsed user input
+ * @idx: user input lenght
+ * @size: buffer size
+ */
+struct trace_parser {
+	bool		cont;
+	char		*buffer;
+	unsigned	idx;
+	unsigned	size;
+};
+
+static inline bool trace_parser_loaded(struct trace_parser *parser)
+{
+	return (parser->idx != 0);
+}
+
+static inline bool trace_parser_cont(struct trace_parser *parser)
+{
+	return parser->cont;
+}
+
+static inline void trace_parser_clear(struct trace_parser *parser)
+{
+	parser->cont = false;
+	parser->idx = 0;
+}
+
+extern int trace_parser_get_init(struct trace_parser *parser, int size);
+extern void trace_parser_put(struct trace_parser *parser);
+extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
+	size_t cnt, loff_t *ppos);
+
 /*
  * trace_iterator_flags is an enumeration that defines bit
  * positions into trace_flags that controls the output.
-- 
cgit v1.2.3


From 489663644c35d50a20f58d468a7cbc705e6a29ce Mon Sep 17 00:00:00 2001
From: "jolsa@redhat.com" <jolsa@redhat.com>
Date: Fri, 11 Sep 2009 17:29:28 +0200
Subject: tracing: trace parser support for set_event

Convert the parsing of the file 'set_event' to use the generic
trace_praser 'trace_get_user' function.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1252682969-3366-3-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 60 ++++++++++-----------------------------------
 1 file changed, 13 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 975f324a07e..f46d14cefde 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -230,11 +230,9 @@ static ssize_t
 ftrace_event_write(struct file *file, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
+	struct trace_parser parser;
 	size_t read = 0;
-	int i, set = 1;
 	ssize_t ret;
-	char *buf;
-	char ch;
 
 	if (!cnt || cnt < 0)
 		return 0;
@@ -243,60 +241,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 	if (ret < 0)
 		return ret;
 
-	ret = get_user(ch, ubuf++);
-	if (ret)
-		return ret;
-	read++;
-	cnt--;
-
-	/* skip white space */
-	while (cnt && isspace(ch)) {
-		ret = get_user(ch, ubuf++);
-		if (ret)
-			return ret;
-		read++;
-		cnt--;
-	}
-
-	/* Only white space found? */
-	if (isspace(ch)) {
-		file->f_pos += read;
-		ret = read;
-		return ret;
-	}
-
-	buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
-	if (!buf)
+	if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
 		return -ENOMEM;
 
-	if (cnt > EVENT_BUF_SIZE)
-		cnt = EVENT_BUF_SIZE;
+	read = trace_get_user(&parser, ubuf, cnt, ppos);
+
+	if (trace_parser_loaded((&parser))) {
+		int set = 1;
 
-	i = 0;
-	while (cnt && !isspace(ch)) {
-		if (!i && ch == '!')
+		if (*parser.buffer == '!')
 			set = 0;
-		else
-			buf[i++] = ch;
 
-		ret = get_user(ch, ubuf++);
+		parser.buffer[parser.idx] = 0;
+
+		ret = ftrace_set_clr_event(parser.buffer + !set, set);
 		if (ret)
-			goto out_free;
-		read++;
-		cnt--;
+			goto out_put;
 	}
-	buf[i] = 0;
-
-	file->f_pos += read;
-
-	ret = ftrace_set_clr_event(buf, set);
-	if (ret)
-		goto out_free;
 
 	ret = read;
 
- out_free:
-	kfree(buf);
+ out_put:
+	trace_parser_put(&parser);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 689fd8b65d669b96d612ccc37d6fb87bf7ed6907 Mon Sep 17 00:00:00 2001
From: "jolsa@redhat.com" <jolsa@redhat.com>
Date: Fri, 11 Sep 2009 17:29:29 +0200
Subject: tracing: trace parser support for function and graph

Convert the writing to 'set_graph_function', 'set_ftrace_filter'
and 'set_ftrace_notrace' to use the generic trace_parser
'trace_get_user' function.

Removed FTRACE_ITER_CONT flag, since it's not needed after this change.

Minor fix in set_graph_function display - g_show function.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1252682969-3366-4-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 150 ++++++++++++++------------------------------------
 1 file changed, 40 insertions(+), 110 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8c804e24f96..8b23d567008 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1323,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
 
 enum {
 	FTRACE_ITER_FILTER	= (1 << 0),
-	FTRACE_ITER_CONT	= (1 << 1),
-	FTRACE_ITER_NOTRACE	= (1 << 2),
-	FTRACE_ITER_FAILURES	= (1 << 3),
-	FTRACE_ITER_PRINTALL	= (1 << 4),
-	FTRACE_ITER_HASH	= (1 << 5),
+	FTRACE_ITER_NOTRACE	= (1 << 1),
+	FTRACE_ITER_FAILURES	= (1 << 2),
+	FTRACE_ITER_PRINTALL	= (1 << 3),
+	FTRACE_ITER_HASH	= (1 << 4),
 };
 
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1337,8 +1336,7 @@ struct ftrace_iterator {
 	int			hidx;
 	int			idx;
 	unsigned		flags;
-	unsigned char		buffer[FTRACE_BUFF_MAX+1];
-	unsigned		buffer_idx;
+	struct trace_parser	parser;
 };
 
 static void *
@@ -1604,6 +1602,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
 	if (!iter)
 		return -ENOMEM;
 
+	if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
+		kfree(iter);
+		return -ENOMEM;
+	}
+
 	mutex_lock(&ftrace_regex_lock);
 	if ((file->f_mode & FMODE_WRITE) &&
 	    (file->f_flags & O_TRUNC))
@@ -2196,9 +2199,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos, int enable)
 {
 	struct ftrace_iterator *iter;
-	char ch;
-	size_t read = 0;
-	ssize_t ret;
+	struct trace_parser *parser;
+	ssize_t ret, read;
 
 	if (!cnt || cnt < 0)
 		return 0;
@@ -2211,72 +2213,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 	} else
 		iter = file->private_data;
 
-	if (!*ppos) {
-		iter->flags &= ~FTRACE_ITER_CONT;
-		iter->buffer_idx = 0;
-	}
-
-	ret = get_user(ch, ubuf++);
-	if (ret)
-		goto out;
-	read++;
-	cnt--;
-
-	/*
-	 * If the parser haven't finished with the last write,
-	 * continue reading the user input without skipping spaces.
-	 */
-	if (!(iter->flags & FTRACE_ITER_CONT)) {
-		/* skip white space */
-		while (cnt && isspace(ch)) {
-			ret = get_user(ch, ubuf++);
-			if (ret)
-				goto out;
-			read++;
-			cnt--;
-		}
+	parser = &iter->parser;
+	read = trace_get_user(parser, ubuf, cnt, ppos);
 
-		/* only spaces were written */
-		if (isspace(ch)) {
-			*ppos += read;
-			ret = read;
-			goto out;
-		}
-
-		iter->buffer_idx = 0;
-	}
-
-	while (cnt && !isspace(ch)) {
-		if (iter->buffer_idx < FTRACE_BUFF_MAX)
-			iter->buffer[iter->buffer_idx++] = ch;
-		else {
-			ret = -EINVAL;
-			goto out;
-		}
-		ret = get_user(ch, ubuf++);
+	if (trace_parser_loaded(parser) &&
+	    !trace_parser_cont(parser)) {
+		ret = ftrace_process_regex(parser->buffer,
+					   parser->idx, enable);
 		if (ret)
 			goto out;
-		read++;
-		cnt--;
-	}
 
-	if (isspace(ch)) {
-		iter->buffer[iter->buffer_idx] = 0;
-		ret = ftrace_process_regex(iter->buffer,
-					   iter->buffer_idx, enable);
-		if (ret)
-			goto out;
-		iter->buffer_idx = 0;
-	} else {
-		iter->flags |= FTRACE_ITER_CONT;
-		iter->buffer[iter->buffer_idx++] = ch;
+		trace_parser_clear(parser);
 	}
 
-	*ppos += read;
 	ret = read;
- out:
-	mutex_unlock(&ftrace_regex_lock);
 
+	mutex_unlock(&ftrace_regex_lock);
+out:
 	return ret;
 }
 
@@ -2381,6 +2334,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 {
 	struct seq_file *m = (struct seq_file *)file->private_data;
 	struct ftrace_iterator *iter;
+	struct trace_parser *parser;
 
 	mutex_lock(&ftrace_regex_lock);
 	if (file->f_mode & FMODE_READ) {
@@ -2390,9 +2344,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 	} else
 		iter = file->private_data;
 
-	if (iter->buffer_idx) {
-		iter->buffer[iter->buffer_idx] = 0;
-		ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
+	parser = &iter->parser;
+	if (trace_parser_loaded(parser)) {
+		parser->buffer[parser->idx] = 0;
+		ftrace_match_records(parser->buffer, parser->idx, enable);
 	}
 
 	mutex_lock(&ftrace_lock);
@@ -2400,7 +2355,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
 	mutex_unlock(&ftrace_lock);
 
+	trace_parser_put(parser);
 	kfree(iter);
+
 	mutex_unlock(&ftrace_regex_lock);
 	return 0;
 }
@@ -2499,7 +2456,7 @@ static int g_show(struct seq_file *m, void *v)
 		return 0;
 	}
 
-	seq_printf(m, "%pf\n", v);
+	seq_printf(m, "%pf\n", (void *)*ptr);
 
 	return 0;
 }
@@ -2602,12 +2559,10 @@ static ssize_t
 ftrace_graph_write(struct file *file, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos)
 {
-	unsigned char buffer[FTRACE_BUFF_MAX+1];
+	struct trace_parser parser;
 	unsigned long *array;
 	size_t read = 0;
 	ssize_t ret;
-	int index = 0;
-	char ch;
 
 	if (!cnt || cnt < 0)
 		return 0;
@@ -2625,51 +2580,26 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
 	} else
 		array = file->private_data;
 
-	ret = get_user(ch, ubuf++);
-	if (ret)
+	if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
+		ret = -ENOMEM;
 		goto out;
-	read++;
-	cnt--;
-
-	/* skip white space */
-	while (cnt && isspace(ch)) {
-		ret = get_user(ch, ubuf++);
-		if (ret)
-			goto out;
-		read++;
-		cnt--;
 	}
 
-	if (isspace(ch)) {
-		*ppos += read;
-		ret = read;
-		goto out;
-	}
+	read = trace_get_user(&parser, ubuf, cnt, ppos);
 
-	while (cnt && !isspace(ch)) {
-		if (index < FTRACE_BUFF_MAX)
-			buffer[index++] = ch;
-		else {
-			ret = -EINVAL;
-			goto out;
-		}
-		ret = get_user(ch, ubuf++);
+	if (trace_parser_loaded((&parser))) {
+		parser.buffer[parser.idx] = 0;
+
+		/* we allow only one expression at a time */
+		ret = ftrace_set_func(array, &ftrace_graph_count,
+					parser.buffer);
 		if (ret)
 			goto out;
-		read++;
-		cnt--;
 	}
-	buffer[index] = 0;
-
-	/* we allow only one expression at a time */
-	ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
-	if (ret)
-		goto out;
-
-	file->f_pos += read;
 
 	ret = read;
  out:
+	trace_parser_put(&parser);
 	mutex_unlock(&graph_lock);
 
 	return ret;
-- 
cgit v1.2.3


From 41dfba4367109b92d92ec6e059be6950497d932f Mon Sep 17 00:00:00 2001
From: Carsten Emde <Carsten.Emde@osadl.org>
Date: Sun, 13 Sep 2009 01:41:31 +0200
Subject: tracing: remove unused local variables in tracer probe functions

When the nsecs_to_usecs() conversion in probe_wakeup_sched_switch() and
check_critical_timing() was moved to a later stage in order to avoid
unnecessary computing, it was overlooked to remove the original
variables, assignments and comments..

Signed-off-by: Carsten Emde <C.Emde@osadl.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_irqsoff.c      | 12 +-----------
 kernel/trace/trace_sched_wakeup.c | 10 ----------
 2 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5555b75a0d1..06f8ea9e4b9 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
 		      unsigned long parent_ip,
 		      int cpu)
 {
-	unsigned long latency, t0, t1;
 	cycle_t T0, T1, delta;
 	unsigned long flags;
 	int pc;
 
-	/*
-	 * usecs conversion is slow so we try to delay the conversion
-	 * as long as possible:
-	 */
 	T0 = data->preempt_timestamp;
 	T1 = ftrace_now(cpu);
 	delta = T1-T0;
@@ -157,17 +152,12 @@ check_critical_timing(struct trace_array *tr,
 
 	trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
 
-	latency = nsecs_to_usecs(delta);
-
 	if (data->critical_sequence != max_sequence)
 		goto out_unlock;
 
-	tracing_max_latency = delta;
-	t0 = nsecs_to_usecs(T0);
-	t1 = nsecs_to_usecs(T1);
-
 	data->critical_end = parent_ip;
 
+	tracing_max_latency = delta;
 	update_max_tr_single(tr, current, cpu);
 
 	max_sequence++;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index cf43bdb1763..6e1529bc617 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -110,7 +110,6 @@ static void notrace
 probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	struct task_struct *next)
 {
-	unsigned long latency = 0, t0 = 0, t1 = 0;
 	struct trace_array_cpu *data;
 	cycle_t T0, T1, delta;
 	unsigned long flags;
@@ -156,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
-	/*
-	 * usecs conversion is slow so we try to delay the conversion
-	 * as long as possible:
-	 */
 	T0 = data->preempt_timestamp;
 	T1 = ftrace_now(cpu);
 	delta = T1-T0;
@@ -167,12 +162,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (!report_latency(delta))
 		goto out_unlock;
 
-	latency = nsecs_to_usecs(delta);
-
 	tracing_max_latency = delta;
-	t0 = nsecs_to_usecs(T0);
-	t1 = nsecs_to_usecs(T1);
-
 	update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
 
 out_unlock:
-- 
cgit v1.2.3


From b5130b1e7d3717d03ab1916b198bf0d49fa0a619 Mon Sep 17 00:00:00 2001
From: Carsten Emde <Carsten.Emde@osadl.org>
Date: Sun, 13 Sep 2009 01:43:07 +0200
Subject: tracing: do not update tracing_max_latency when tracer is stopped

The state of the function pair tracing_stop()/tracing_start() is
correctly considered when tracer data are updated. However, the global
and externally accessible variable tracing_max_latency is always updated
- even when tracing is stopped.

The update should only occur, if tracing was not stopped.

Signed-off-by: Carsten Emde <C.Emde@osadl.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c              | 5 +++++
 kernel/trace/trace.h              | 1 +
 kernel/trace/trace_irqsoff.c      | 6 ++++--
 kernel/trace/trace_sched_wakeup.c | 6 ++++--
 4 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45c3f0352d7..ef82a7fabf3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -825,6 +825,11 @@ static void trace_init_cmdlines(void)
 	cmdline_idx = 0;
 }
 
+int is_tracing_stopped(void)
+{
+	return trace_stop_count;
+}
+
 /**
  * ftrace_off_permanent - disable all ftrace code permanently
  *
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 28247cecd95..4ad4e1ddcb9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -461,6 +461,7 @@ void tracing_stop_sched_switch_record(void);
 void tracing_start_sched_switch_record(void);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
+int is_tracing_stopped(void);
 
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 06f8ea9e4b9..3aa7eaa2114 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -157,8 +157,10 @@ check_critical_timing(struct trace_array *tr,
 
 	data->critical_end = parent_ip;
 
-	tracing_max_latency = delta;
-	update_max_tr_single(tr, current, cpu);
+	if (likely(!is_tracing_stopped())) {
+		tracing_max_latency = delta;
+		update_max_tr_single(tr, current, cpu);
+	}
 
 	max_sequence++;
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 6e1529bc617..26185d72767 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -162,8 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (!report_latency(delta))
 		goto out_unlock;
 
-	tracing_max_latency = delta;
-	update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
+	if (likely(!is_tracing_stopped())) {
+		tracing_max_latency = delta;
+		update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
+	}
 
 out_unlock:
 	__wakeup_reset(wakeup_trace);
-- 
cgit v1.2.3


From 558e6547e4b8a2b13608a24a9d3679802f91c4c7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 24 Aug 2009 12:19:47 +0800
Subject: tracing/profile: fix profile_disable vs module_unload

If the correspoding module is unloaded before ftrace_profile_disable()
is called, event->profile_disable() won't be called, which can
cause oops:

  # insmod trace-events-sample.ko
  # perf record -f -a -e sample:foo_bar sleep 3 &
  # sleep 1
  # rmmod trace_events_sample
  # insmod trace-events-sample.ko
  OOPS!

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A9214E3.2070807@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_event_profile.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 11ba5bb4ed0..55a25c933d1 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -5,6 +5,7 @@
  *
  */
 
+#include <linux/module.h>
 #include "trace.h"
 
 int ftrace_profile_enable(int event_id)
@@ -14,7 +15,8 @@ int ftrace_profile_enable(int event_id)
 
 	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
-		if (event->id == event_id && event->profile_enable) {
+		if (event->id == event_id && event->profile_enable &&
+		    try_module_get(event->mod)) {
 			ret = event->profile_enable(event);
 			break;
 		}
@@ -32,6 +34,7 @@ void ftrace_profile_disable(int event_id)
 	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id) {
 			event->profile_disable(event);
+			module_put(event->mod);
 			break;
 		}
 	}
-- 
cgit v1.2.3


From 0a1c49db8d91c538f104f8d70e560c6fdd589bd4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 12 Sep 2009 19:17:15 -0400
Subject: tracing: use macros to create internal ftrace entry ring buffer
 structures

The entries used by ftrace internal code (plugins) currently have their
formats manually exported to userspace. That is, the format files in
debugfs/tracing/events/ftrace/*/format are currently created by hand.
This is a maintenance nightmare, and can easily become out of sync
with what is actually shown.

This patch uses the methodology of the TRACE_EVENT macros to build
the structures so that their formats can be automated and this
will keep the structures in sync with what users can see.

This patch only changes the way the structures are created. Further
patches will build off of this to automate the format files.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h         | 160 ++++----------------
 kernel/trace/trace_entries.h | 342 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 369 insertions(+), 133 deletions(-)
 create mode 100644 kernel/trace/trace_entries.h

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4ad4e1ddcb9..d308195d40a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -42,150 +42,45 @@ enum trace_type {
 	__TRACE_LAST_TYPE,
 };
 
-/*
- * Function trace entry - function address and parent function addres:
- */
-struct ftrace_entry {
-	struct trace_entry	ent;
-	unsigned long		ip;
-	unsigned long		parent_ip;
-};
-
-/* Function call entry */
-struct ftrace_graph_ent_entry {
-	struct trace_entry		ent;
-	struct ftrace_graph_ent		graph_ent;
+enum kmemtrace_type_id {
+	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
+	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
+	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
 };
 
-/* Function return entry */
-struct ftrace_graph_ret_entry {
-	struct trace_entry		ent;
-	struct ftrace_graph_ret		ret;
-};
 extern struct tracer boot_tracer;
 
-/*
- * Context switch trace entry - which task (and prio) we switched from/to:
- */
-struct ctx_switch_entry {
-	struct trace_entry	ent;
-	unsigned int		prev_pid;
-	unsigned char		prev_prio;
-	unsigned char		prev_state;
-	unsigned int		next_pid;
-	unsigned char		next_prio;
-	unsigned char		next_state;
-	unsigned int		next_cpu;
-};
-
-/*
- * Special (free-form) trace entry:
- */
-struct special_entry {
-	struct trace_entry	ent;
-	unsigned long		arg1;
-	unsigned long		arg2;
-	unsigned long		arg3;
-};
-
-/*
- * Stack-trace entry:
- */
+#undef __field
+#define __field(type, item)		type	item;
 
-#define FTRACE_STACK_ENTRIES	8
+#undef __array
+#define __array(type, item, size)	type	item[size];
 
-struct stack_entry {
-	struct trace_entry	ent;
-	unsigned long		caller[FTRACE_STACK_ENTRIES];
-};
+#undef __dynamic_array
+#define __dynamic_array(type, item)	type	item[];
 
-struct userstack_entry {
-	struct trace_entry	ent;
-	unsigned int		tgid;
-	unsigned long		caller[FTRACE_STACK_ENTRIES];
-};
-
-/*
- * trace_printk entry:
- */
-struct bprint_entry {
-	struct trace_entry	ent;
-	unsigned long		ip;
-	const char		*fmt;
-	u32			buf[];
-};
-
-struct print_entry {
-	struct trace_entry	ent;
-	unsigned long		ip;
-	char			buf[];
-};
+#undef F_STRUCT
+#define F_STRUCT(args...)		args
 
-struct trace_mmiotrace_rw {
-	struct trace_entry	ent;
-	struct mmiotrace_rw	rw;
-};
-
-struct trace_mmiotrace_map {
-	struct trace_entry	ent;
-	struct mmiotrace_map	map;
-};
-
-struct trace_boot_call {
-	struct trace_entry	ent;
-	struct boot_trace_call boot_call;
-};
-
-struct trace_boot_ret {
-	struct trace_entry	ent;
-	struct boot_trace_ret boot_ret;
-};
-
-#define TRACE_FUNC_SIZE 30
-#define TRACE_FILE_SIZE 20
-struct trace_branch {
-	struct trace_entry	ent;
-	unsigned	        line;
-	char			func[TRACE_FUNC_SIZE+1];
-	char			file[TRACE_FILE_SIZE+1];
-	char			correct;
-};
-
-struct hw_branch_entry {
-	struct trace_entry	ent;
-	u64			from;
-	u64			to;
-};
-
-struct trace_power {
-	struct trace_entry	ent;
-	struct power_trace	state_data;
-};
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\
+	struct struct_name {					\
+		struct trace_entry	ent;			\
+		tstruct						\
+	}
 
-enum kmemtrace_type_id {
-	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
-	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
-	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
-};
+#undef TP_ARGS
+#define TP_ARGS(args...)	args
 
-struct kmemtrace_alloc_entry {
-	struct trace_entry	ent;
-	enum kmemtrace_type_id	type_id;
-	unsigned long		call_site;
-	const void		*ptr;
-	size_t			bytes_req;
-	size_t			bytes_alloc;
-	gfp_t			gfp_flags;
-	int			node;
-};
+#undef FTRACE_ENTRY_DUP
+#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
 
-struct kmemtrace_free_entry {
-	struct trace_entry	ent;
-	enum kmemtrace_type_id	type_id;
-	unsigned long		call_site;
-	const void		*ptr;
-};
+#include "trace_entries.h"
 
+/*
+ * syscalls are special, and need special handling, this is why
+ * they are not included in trace_entries.h
+ */
 struct syscall_trace_enter {
 	struct trace_entry	ent;
 	int			nr;
@@ -198,7 +93,6 @@ struct syscall_trace_exit {
 	unsigned long		ret;
 };
 
-
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 00000000000..82c51fdca03
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,342 @@
+/*
+ * This file defines the trace event structures that go into the ring
+ * buffer directly. They are created via macros so that changes for them
+ * appear in the format file. Using macros will automate this process.
+ *
+ * The macro used to create a ftrace data structure is:
+ *
+ * FTRACE_ENTRY( name, struct_name, id, structure, print )
+ *
+ * @name: the name used the event name, as well as the name of
+ *   the directory that holds the format file.
+ *
+ * @struct_name: the name of the structure that is created.
+ *
+ * @id: The event identifier that is used to detect what event
+ *    this is from the ring buffer.
+ *
+ * @structure: the structure layout
+ *
+ *  - __field(	type,	item	)
+ *	  This is equivalent to declaring
+ *		type	item;
+ *	  in the structure.
+ *  - __array(	type,	item,	size	)
+ *	  This is equivalent to declaring
+ *		type	item[size];
+ *	  in the structure.
+ *
+ * @print: the print format shown to users in the format file.
+ */
+
+/*
+ * Function trace entry - function address and parent function addres:
+ */
+FTRACE_ENTRY(function, ftrace_entry,
+
+	TRACE_FN,
+
+	F_STRUCT(
+		__field(	unsigned long,	ip		)
+		__field(	unsigned long,	parent_ip	)
+	),
+
+	F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
+);
+
+/* Function call entry */
+FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
+
+	TRACE_GRAPH_ENT,
+
+	F_STRUCT(
+		__field(	struct ftrace_graph_ent,	graph_ent	)
+	),
+
+	F_printk("--> %lx (%d)", __entry->graph_ent.func, __entry->depth)
+);
+
+/* Function return entry */
+FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
+
+	TRACE_GRAPH_RET,
+
+	F_STRUCT(
+		__field(	struct ftrace_graph_ret,	ret	)
+	),
+
+	F_printk("<-- %lx (%d) (start: %llx  end: %llx) over: %d",
+		 __entry->func, __entry->depth,
+		 __entry->calltime, __entry->rettim,
+		 __entrty->depth)
+);
+
+/*
+ * Context switch trace entry - which task (and prio) we switched from/to:
+ *
+ * This is used for both wakeup and context switches. We only want
+ * to create one structure, but we need two outputs for it.
+ */
+#define FTRACE_CTX_FIELDS					\
+	__field(	unsigned int,	prev_pid	)	\
+	__field(	unsigned char,	prev_prio	)	\
+	__field(	unsigned char,	prev_state	)	\
+	__field(	unsigned int,	next_pid	)	\
+	__field(	unsigned char,	next_prio	)	\
+	__field(	unsigned char,	next_state	)	\
+	__field(	unsigned int,	next_cpu	)
+
+#if 0
+FTRACE_ENTRY_STRUCT_ONLY(ctx_switch_entry,
+
+	F_STRUCT(
+		FTRACE_CTX_FIELDS
+	)
+);
+#endif
+
+FTRACE_ENTRY(context_switch, ctx_switch_entry,
+
+	TRACE_CTX,
+
+	F_STRUCT(
+		FTRACE_CTX_FIELDS
+	),
+
+	F_printk(b"%u:%u:%u  ==> %u:%u:%u [%03u]",
+		 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
+		 __entry->next_pid, __entry->next_prio, __entry->next_state,
+		 __entry->next_cpu
+		)
+);
+
+/*
+ * FTRACE_ENTRY_DUP only creates the format file, it will not
+ *  create another structure.
+ */
+FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
+
+	TRACE_WAKE,
+
+	F_STRUCT(
+		FTRACE_CTX_FIELDS
+	),
+
+	F_printk("%u:%u:%u  ==+ %u:%u:%u [%03u]",
+		 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
+		 __entry->next_pid, __entry->next_prio, __entry->next_state,
+		 __entry->next_cpu
+		)
+);
+
+/*
+ * Special (free-form) trace entry:
+ */
+FTRACE_ENTRY(special, special_entry,
+
+	TRACE_SPECIAL,
+
+	F_STRUCT(
+		__field(	unsigned long,	arg1	)
+		__field(	unsigned long,	arg2	)
+		__field(	unsigned long,	arg3	)
+	),
+
+	F_printk("(%08lx) (%08lx) (%08lx)",
+		 __entry->arg1, __entry->arg2, __entry->arg3)
+);
+
+/*
+ * Stack-trace entry:
+ */
+
+#define FTRACE_STACK_ENTRIES	8
+
+FTRACE_ENTRY(kernel_stack, stack_entry,
+
+	TRACE_STACK,
+
+	F_STRUCT(
+		__array(	unsigned long,	caller, FTRACE_STACK_ENTRIES	)
+	),
+
+	F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
+		 __entry->caller[0], __entry->caller[1], __entry->caller[2],
+		 __entry->caller[3], __entry->caller[4], __entry->caller[5],
+		 __entry->caller[6], __entry->caller[7])
+);
+
+FTRACE_ENTRY(user_stack, userstack_entry,
+
+	TRACE_USER_STACK,
+
+	F_STRUCT(
+		__field(	unsigned int,	tgid	)
+		__array(	unsigned long,	caller, FTRACE_STACK_ENTRIES	)
+	),
+
+	F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
+		 __entry->caller[0], __entry->caller[1], __entry->caller[2],
+		 __entry->caller[3], __entry->caller[4], __entry->caller[5],
+		 __entry->caller[6], __entry->caller[7])
+);
+
+/*
+ * trace_printk entry:
+ */
+FTRACE_ENTRY(bprint, bprint_entry,
+
+	TRACE_BPRINT,
+
+	F_STRUCT(
+		__field(	unsigned long,	ip	)
+		__field(	const char *,	fmt	)
+		__dynamic_array(	u32,	buf	)
+	),
+
+	F_printk("%08lx fmt:%p",
+		 __entry->ip, __entry->fmt)
+);
+
+FTRACE_ENTRY(print, print_entry,
+
+	TRACE_PRINT,
+
+	F_STRUCT(
+		__field(	unsigned long,	ip	)
+		__dynamic_array(	char,	buf	)
+	),
+
+	F_printk("%08lx %s",
+		 __entry->ip, __entry->buf)
+);
+
+FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
+
+	TRACE_MMIO_RW,
+
+	F_STRUCT(
+		__field(	struct mmiotrace_rw,	rw	)
+	),
+
+	F_printk("%lx %lx %lx %d %lx %lx",
+		 __entry->phs, __entry->value, __entry->pc,
+		 __entry->map_id, __entry->opcode, __entry->width)
+);
+
+FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
+
+	TRACE_MMIO_MAP,
+
+	F_STRUCT(
+		__field(	struct mmiotrace_map,	map	)
+	),
+
+	F_printk("%lx %lx %lx %d %lx",
+		 __entry->phs, __entry->virt, __entry->len,
+		 __entry->map_id, __entry->opcode)
+);
+
+FTRACE_ENTRY(boot_call, trace_boot_call,
+
+	TRACE_BOOT_CALL,
+
+	F_STRUCT(
+		__field(	struct boot_trace_call,	boot_call	)
+	),
+
+	F_printk("%d  %s", __entry->caller, __entry->func)
+);
+
+FTRACE_ENTRY(boot_ret, trace_boot_ret,
+
+	TRACE_BOOT_RET,
+
+	F_STRUCT(
+		__field(	struct boot_trace_ret,	boot_ret	)
+	),
+
+	F_printk("%s %d %lx",
+		 __entry->func, __entry->result, __entry->duration)
+);
+
+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+
+FTRACE_ENTRY(branch, trace_branch,
+
+	TRACE_BRANCH,
+
+	F_STRUCT(
+		__field(	unsigned int,	line				)
+		__array(	char,		func,	TRACE_FUNC_SIZE+1	)
+		__array(	char,		file,	TRACE_FILE_SIZE+1	)
+		__field(	char,		correct				)
+	),
+
+	F_printk("%u:%s:%s (%u)",
+		 __entry->line,
+		 __entry->func, __entry->file, __entry->correct)
+);
+
+FTRACE_ENTRY(hw_branch, hw_branch_entry,
+
+	TRACE_HW_BRANCHES,
+
+	F_STRUCT(
+		__field(	u64,	from	)
+		__field(	u64,	to	)
+	),
+
+	F_printk("from: %llx to: %llx", __entry->from, __entry->to)
+);
+
+FTRACE_ENTRY(power, trace_power,
+
+	TRACE_POWER,
+
+	F_STRUCT(
+		__field(	struct power_trace,	state_data	)
+	),
+
+	F_printk("%llx->%llx type:%u state:%u",
+		 __entry->stamp, __entry->end,
+		 __entry->type, __entry->state)
+);
+
+FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
+
+	TRACE_KMEM_ALLOC,
+
+	F_STRUCT(
+		__field(	enum kmemtrace_type_id,	type_id		)
+		__field(	unsigned long,		call_site	)
+		__field(	const void *,		ptr		)
+		__field(	size_t,			bytes_req	)
+		__field(	size_t,			bytes_alloc	)
+		__field(	gfp_t,			gfp_flags	)
+		__field(	int,			node		)
+	),
+
+	F_printk("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
+		 " flags:%x node:%d",
+		 __entry->type_id, __entry->call_site, __entry->ptr,
+		 __entry->bytes_req, __entry->bytes_alloc,
+		 __entry->gfp_flags, __entry->node)
+);
+
+FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
+
+	TRACE_KMEM_FREE,
+
+	F_STRUCT(
+		__field(	enum kmemtrace_type_id,	type_id		)
+		__field(	unsigned long,		call_site	)
+		__field(	const void *,		ptr		)
+	),
+
+	F_printk("type:%u call_site:%lx ptr:%p",
+		 __entry->type_id, __entry->call_site, __entry->ptr)
+);
-- 
cgit v1.2.3


From d73150943cf47b6cabcb4f4e52dd25975e820ae2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 12 Sep 2009 19:22:23 -0400
Subject: tracing: show details of structures within the ftrace structures

Some of the internal ftrace structures use structures within. The
output of a field saying it is just a structure is useless for a format
file. A binary reader of the ring buffer needs to know more about
how the fields are broken up.

This patch adds to the ftrace structure macros new fields to
describe the structures inside a structure.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h         |  9 +++++++
 kernel/trace/trace_entries.h | 64 +++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 66 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d308195d40a..b0d287d49a6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -53,9 +53,18 @@ extern struct tracer boot_tracer;
 #undef __field
 #define __field(type, item)		type	item;
 
+#undef __field_struct
+#define __field_struct(type, item)	__field(type, item)
+
+#undef __field_desc
+#define __field_desc(type, container, item)
+
 #undef __array
 #define __array(type, item, size)	type	item[size];
 
+#undef __array_desc
+#define __array_desc(type, container, item, size)
+
 #undef __dynamic_array
 #define __dynamic_array(type, item)	type	item[];
 
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 82c51fdca03..c866d34e014 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -26,6 +26,29 @@
  *		type	item[size];
  *	  in the structure.
  *
+ *   * for structures within structures, the format of the internal
+ *	structure is layed out. This allows the internal structure
+ *	to be deciphered for the format file. Although these macros
+ *	may become out of sync with the internal structure, they
+ *	will create a compile error if it happens. Since the
+ *	internel structures are just tracing helpers, this is not
+ *	an issue.
+ *
+ *	When an internal structure is used, it should use:
+ *
+ *	__field_struct(	type,	item	)
+ *
+ *	instead of __field. This will prevent it from being shown in
+ *	the output file. The fields in the structure should use.
+ *
+ *	__field_desc(	type,	container,	item		)
+ *	__array_desc(	type,	container,	item,	len	)
+ *
+ *	type, item and len are the same as __field and __array, but
+ *	container is added. This is the name of the item in
+ *	__field_struct that this is describing.
+ *
+ *
  * @print: the print format shown to users in the format file.
  */
 
@@ -50,7 +73,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
 	TRACE_GRAPH_ENT,
 
 	F_STRUCT(
-		__field(	struct ftrace_graph_ent,	graph_ent	)
+		__field_struct(	struct ftrace_graph_ent,	graph_ent	)
+		__field_desc(	unsigned long,	graph_ent,	func		)
+		__field_desc(	int,		graph_ent,	depth		)
 	),
 
 	F_printk("--> %lx (%d)", __entry->graph_ent.func, __entry->depth)
@@ -62,7 +87,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
 	TRACE_GRAPH_RET,
 
 	F_STRUCT(
-		__field(	struct ftrace_graph_ret,	ret	)
+		__field_struct(	struct ftrace_graph_ret,	ret	)
+		__field_desc(	unsigned long,	ret,		func	)
+		__field_desc(	unsigned long long, ret,	calltime)
+		__field_desc(	unsigned long long, ret,	rettime	)
+		__field_desc(	unsigned long,	ret,		overrun	)
+		__field_desc(	int,		ret,		depth	)
 	),
 
 	F_printk("<-- %lx (%d) (start: %llx  end: %llx) over: %d",
@@ -218,7 +248,13 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
 	TRACE_MMIO_RW,
 
 	F_STRUCT(
-		__field(	struct mmiotrace_rw,	rw	)
+		__field_struct(	struct mmiotrace_rw,	rw	)
+		__field_desc(	resource_size_t, rw,	phys	)
+		__field_desc(	unsigned long,	rw,	value	)
+		__field_desc(	unsigned long,	rw,	pc	)
+		__field_desc(	int, 		rw,	map_id	)
+		__field_desc(	unsigned char,	rw,	opcode	)
+		__field_desc(	unsigned char,	rw,	width	)
 	),
 
 	F_printk("%lx %lx %lx %d %lx %lx",
@@ -231,7 +267,12 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
 	TRACE_MMIO_MAP,
 
 	F_STRUCT(
-		__field(	struct mmiotrace_map,	map	)
+		__field_struct(	struct mmiotrace_map,	map	)
+		__field_desc(	resource_size_t, map,	phys	)
+		__field_desc(	unsigned long,	map,	virt	)
+		__field_desc(	unsigned long,	map,	len	)
+		__field_desc(	int, 		map,	map_id	)
+		__field_desc(	unsigned char,	map,	opcode	)
 	),
 
 	F_printk("%lx %lx %lx %d %lx",
@@ -244,7 +285,9 @@ FTRACE_ENTRY(boot_call, trace_boot_call,
 	TRACE_BOOT_CALL,
 
 	F_STRUCT(
-		__field(	struct boot_trace_call,	boot_call	)
+		__field_struct(	struct boot_trace_call,	boot_call	)
+		__field_desc(	pid_t,	boot_call,	caller		)
+		__array_desc(	char,	boot_call,	func,	KSYM_SYMBOL_LEN)
 	),
 
 	F_printk("%d  %s", __entry->caller, __entry->func)
@@ -255,7 +298,10 @@ FTRACE_ENTRY(boot_ret, trace_boot_ret,
 	TRACE_BOOT_RET,
 
 	F_STRUCT(
-		__field(	struct boot_trace_ret,	boot_ret	)
+		__field_struct(	struct boot_trace_ret,	boot_ret	)
+		__array_desc(	char,	boot_ret,	func,	KSYM_SYMBOL_LEN)
+		__field_desc(	int,	boot_ret,	result		)
+		__field_desc(	unsigned long, boot_ret, duration	)
 	),
 
 	F_printk("%s %d %lx",
@@ -298,7 +344,11 @@ FTRACE_ENTRY(power, trace_power,
 	TRACE_POWER,
 
 	F_STRUCT(
-		__field(	struct power_trace,	state_data	)
+		__field_struct(	struct power_trace,	state_data	)
+		__field_desc(	s64,	state_data,	stamp		)
+		__field_desc(	s64,	state_data,	end		)
+		__field_desc(	int,	state_data,	type		)
+		__field_desc(	int,	state_data,	state		)
 	),
 
 	F_printk("%llx->%llx type:%u state:%u",
-- 
cgit v1.2.3


From 4e5292ea1ac0c2939e815e6c44fad3d8696ea281 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 12 Sep 2009 19:26:21 -0400
Subject: tracing: use the new trace_entries.h to create format files

This patch changes the way the format files in

  debugfs/tracing/events/ftrace/*/format

are created. It uses the new trace_entries.h file to automate the
creation of the format files to ensure that they are always in sync
with the actual structures. This is the same methodology used to
create the format files for the TRACE_EVENT macro.

This also updates the filter creation that was built on the creation
of the format files.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h        |  12 ++-
 kernel/trace/trace_events.c |   1 +
 kernel/trace/trace_export.c | 241 +++++++++++++++++++++++---------------------
 3 files changed, 136 insertions(+), 118 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b0d287d49a6..86bcff94791 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,6 +7,7 @@
 #include <linux/clocksource.h>
 #include <linux/ring_buffer.h>
 #include <linux/mmiotrace.h>
+#include <linux/tracepoint.h>
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
@@ -746,11 +747,12 @@ extern struct list_head ftrace_events;
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(call, struct_name, id, tstruct, print)		\
 	extern struct ftrace_event_call event_##call;
-#undef TRACE_EVENT_FORMAT_NOFILTER
-#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
-#include "trace_event_types.h"
+#undef FTRACE_ENTRY_DUP
+#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print)		\
+	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
+#include "trace_entries.h"
 
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f46d14cefde..adbed124c3e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -21,6 +21,7 @@
 
 #include "trace_output.h"
 
+#undef TRACE_SYSTEM
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
 DEFINE_MUTEX(event_mutex);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index df1bf6e48bb..4cb29d84d73 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,82 +15,163 @@
 
 #include "trace_output.h"
 
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM	ftrace
 
-#undef TRACE_STRUCT
-#define TRACE_STRUCT(args...) args
+/* not needed for this file */
+#undef __field_struct
+#define __field_struct(type, item)
 
-extern void __bad_type_size(void);
+#undef __field
+#define __field(type, item)						\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%zu;\tsize:%zu;\n",		\
+			       offsetof(typeof(field), item),		\
+			       sizeof(field.item));			\
+	if (!ret)							\
+		return 0;
 
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)					\
-	if (sizeof(type) != sizeof(field.item))				\
-		__bad_type_size();					\
+#undef __field_desc
+#define __field_desc(type, container, item)				\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
+			       "offset:%zu;\tsize:%zu;\n",		\
+			       offsetof(typeof(field), container.item),	\
+			       sizeof(field.container.item));		\
 	if (!ret)							\
 		return 0;
 
+#undef __array
+#define __array(type, item, len)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
+			       "offset:%zu;\tsize:%zu;\n",		\
+			       offsetof(typeof(field), item),	\
+			       sizeof(field.item));		\
+	if (!ret)							\
+		return 0;
 
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)			\
-	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
+#undef __array_desc
+#define __array_desc(type, container, item, len)			\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
+			       "offset:%zu;\tsize:%zu;\n",		\
+			       offsetof(typeof(field), container.item),	\
+			       sizeof(field.container.item));		\
 	if (!ret)							\
 		return 0;
 
-#undef TRACE_FIELD_ZERO_CHAR
-#define TRACE_FIELD_ZERO_CHAR(item)					\
-	ret = trace_seq_printf(s, "\tfield:char " #item ";\t"		\
-			       "offset:%u;\tsize:0;\n",			\
-			       (unsigned int)offsetof(typeof(field), item)); \
+#undef __dynamic_array
+#define __dynamic_array(type, item)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%zu;\tsize:0;\n",		\
+			       offsetof(typeof(field), item));		\
 	if (!ret)							\
 		return 0;
 
-#undef TRACE_FIELD_SIGN
-#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
-	TRACE_FIELD(type, item, assign)
+#undef F_printk
+#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
 
-#undef TP_RAW_FMT
-#define TP_RAW_FMT(args...) args
+#undef __entry
+#define __entry REC
 
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)		\
 static int								\
-ftrace_format_##call(struct ftrace_event_call *unused,			\
-		      struct trace_seq *s)				\
+ftrace_format_##name(struct ftrace_event_call *unused,			\
+		     struct trace_seq *s)				\
 {									\
-	struct args field;						\
-	int ret;							\
+	struct struct_name field __attribute__((unused));		\
+	int ret = 0;							\
 									\
 	tstruct;							\
 									\
-	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+	trace_seq_printf(s, "\nprint fmt: " print);			\
 									\
 	return ret;							\
 }
 
-#undef TRACE_EVENT_FORMAT_NOFILTER
-#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
-				    tpfmt)				\
-static int								\
-ftrace_format_##call(struct ftrace_event_call *unused,			\
-		      struct trace_seq *s)				\
+#undef FTRACE_ENTRY_DUP
+#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print)	\
+	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
+
+#include "trace_entries.h"
+
+
+#undef __field
+#define __field(type, item)						\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item),			\
+				 is_signed_type(type), FILTER_OTHER);	\
+	if (ret)							\
+		return ret;
+
+#undef __field_desc
+#define __field_desc(type, container, item)	\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field),		\
+					  container.item),		\
+				 sizeof(field.container.item),		\
+				 is_signed_type(type), FILTER_OTHER);	\
+	if (ret)							\
+		return ret;
+
+#undef __array
+#define __array(type, item, len)					\
+	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item), 0, FILTER_OTHER);	\
+	if (ret)							\
+		return ret;
+
+#undef __array_desc
+#define __array_desc(type, container, item, len)			\
+	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field),		\
+					  container.item),		\
+				 sizeof(field.container.item), 0,	\
+				 FILTER_OTHER);				\
+	if (ret)							\
+		return ret;
+
+#undef __dynamic_array
+#define __dynamic_array(type, item)
+
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)		\
+int									\
+ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
 {									\
-	struct args field;						\
+	struct struct_name field;					\
 	int ret;							\
 									\
-	tstruct;							\
+	ret = trace_define_common_fields(event_call);			\
+	if (ret)							\
+		return ret;						\
 									\
-	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+	tstruct;							\
 									\
 	return ret;							\
 }
 
-#include "trace_event_types.h"
+#include "trace_entries.h"
+
+
+#undef __field
+#define __field(type, item)
+
+#undef __field_desc
+#define __field_desc(type, container, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __array_desc
+#define __array_desc(type, container, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item)
+
 
 #undef TRACE_ZERO_CHAR
 #define TRACE_ZERO_CHAR(arg)
@@ -117,16 +198,15 @@ ftrace_format_##call(struct ftrace_event_call *unused,			\
 #define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)	\
 	cmd;
 
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-int ftrace_define_fields_##call(struct ftrace_event_call *event_call);	\
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(call, struct_name, type, tstruct, print)		\
 static int ftrace_raw_init_event_##call(void);				\
 									\
 struct ftrace_event_call __used						\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name			= #call,				\
-	.id			= proto,				\
+	.id			= type,					\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
 	.show_format		= ftrace_format_##call,			\
@@ -138,69 +218,4 @@ static int ftrace_raw_init_event_##call(void)				\
 	return 0;							\
 }									\
 
-#undef TRACE_EVENT_FORMAT_NOFILTER
-#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
-				    tpfmt)				\
-									\
-struct ftrace_event_call __used						\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name			= #call,				\
-	.id			= proto,				\
-	.system			= __stringify(TRACE_SYSTEM),		\
-	.show_format		= ftrace_format_##call,			\
-};
-
-#include "trace_event_types.h"
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)					\
-	ret = trace_define_field(event_call, #type, #item,		\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item),			\
-				 is_signed_type(type), FILTER_OTHER);	\
-	if (ret)							\
-		return ret;
-
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type, item, len, cmd)			\
-	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item), 0, FILTER_OTHER);	\
-	if (ret)							\
-		return ret;
-
-#undef TRACE_FIELD_SIGN
-#define TRACE_FIELD_SIGN(type, item, assign, is_signed)			\
-	ret = trace_define_field(event_call, #type, #item,		\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item), is_signed,		\
-				 FILTER_OTHER);				\
-	if (ret)							\
-		return ret;
-
-#undef TRACE_FIELD_ZERO_CHAR
-#define TRACE_FIELD_ZERO_CHAR(item)
-
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-int									\
-ftrace_define_fields_##call(struct ftrace_event_call *event_call)	\
-{									\
-	struct args field;						\
-	int ret;							\
-									\
-	ret = trace_define_common_fields(event_call);			\
-	if (ret)							\
-		return ret;						\
-									\
-	tstruct;							\
-									\
-	return ret;							\
-}
-
-#undef TRACE_EVENT_FORMAT_NOFILTER
-#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
-				    tpfmt)
-
-#include "trace_event_types.h"
+#include "trace_entries.h"
-- 
cgit v1.2.3


From 51df5fcbc1296a84cf1c093c6cb56d40ca3e697e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 12 Sep 2009 20:29:22 -0400
Subject: tracing: remove trace_event_types.h

The macros in trace_entries.h have made the code in trace_event_types.h
obsolete. The file is no longer used, so this patch removes it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_event_types.h | 178 ---------------------------------------
 1 file changed, 178 deletions(-)
 delete mode 100644 kernel/trace/trace_event_types.h

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index 6db005e1248..00000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,178 +0,0 @@
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM	ftrace
-
-/*
- * We cheat and use the proto type field as the ID
- * and args as the entry type (minus 'struct')
- */
-TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned long, ip, ip)
-		TRACE_FIELD(unsigned long, parent_ip, parent_ip)
-	),
-	TP_RAW_FMT(" %lx <-- %lx")
-);
-
-TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
-		   ftrace_graph_ent_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned long, graph_ent.func, func)
-		TRACE_FIELD(int, graph_ent.depth, depth)
-	),
-	TP_RAW_FMT("--> %lx (%d)")
-);
-
-TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
-		   ftrace_graph_ret_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned long, ret.func, func)
-		TRACE_FIELD(unsigned long long, ret.calltime, calltime)
-		TRACE_FIELD(unsigned long long, ret.rettime, rettime)
-		TRACE_FIELD(unsigned long, ret.overrun, overrun)
-		TRACE_FIELD(int, ret.depth, depth)
-	),
-	TP_RAW_FMT("<-- %lx (%d)")
-);
-
-TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned int, prev_pid, prev_pid)
-		TRACE_FIELD(unsigned char, prev_prio, prev_prio)
-		TRACE_FIELD(unsigned char, prev_state, prev_state)
-		TRACE_FIELD(unsigned int, next_pid, next_pid)
-		TRACE_FIELD(unsigned char, next_prio, next_prio)
-		TRACE_FIELD(unsigned char, next_state, next_state)
-		TRACE_FIELD(unsigned int, next_cpu, next_cpu)
-	),
-	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
-);
-
-TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned int, prev_pid, prev_pid)
-		TRACE_FIELD(unsigned char, prev_prio, prev_prio)
-		TRACE_FIELD(unsigned char, prev_state, prev_state)
-		TRACE_FIELD(unsigned int, next_pid, next_pid)
-		TRACE_FIELD(unsigned char, next_prio, next_prio)
-		TRACE_FIELD(unsigned char, next_state, next_state)
-		TRACE_FIELD(unsigned int, next_cpu, next_cpu)
-	),
-	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
-);
-
-TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned long, arg1, arg1)
-		TRACE_FIELD(unsigned long, arg2, arg2)
-		TRACE_FIELD(unsigned long, arg3, arg3)
-	),
-	TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
-);
-
-/*
- * Stack-trace entry:
- */
-
-/* #define FTRACE_STACK_ENTRIES   8 */
-
-TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned long, caller[0], stack0)
-		TRACE_FIELD(unsigned long, caller[1], stack1)
-		TRACE_FIELD(unsigned long, caller[2], stack2)
-		TRACE_FIELD(unsigned long, caller[3], stack3)
-		TRACE_FIELD(unsigned long, caller[4], stack4)
-		TRACE_FIELD(unsigned long, caller[5], stack5)
-		TRACE_FIELD(unsigned long, caller[6], stack6)
-		TRACE_FIELD(unsigned long, caller[7], stack7)
-	),
-	TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
-		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
-);
-
-TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned long, caller[0], stack0)
-		TRACE_FIELD(unsigned long, caller[1], stack1)
-		TRACE_FIELD(unsigned long, caller[2], stack2)
-		TRACE_FIELD(unsigned long, caller[3], stack3)
-		TRACE_FIELD(unsigned long, caller[4], stack4)
-		TRACE_FIELD(unsigned long, caller[5], stack5)
-		TRACE_FIELD(unsigned long, caller[6], stack6)
-		TRACE_FIELD(unsigned long, caller[7], stack7)
-	),
-	TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
-		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
-);
-
-TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned long, ip, ip)
-		TRACE_FIELD(char *, fmt, fmt)
-		TRACE_FIELD_ZERO_CHAR(buf)
-	),
-	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
-);
-
-TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned long, ip, ip)
-		TRACE_FIELD_ZERO_CHAR(buf)
-	),
-	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
-);
-
-TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(unsigned int, line, line)
-		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
-				    TRACE_FUNC_SIZE+1, func)
-		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
-				    TRACE_FUNC_SIZE+1, file)
-		TRACE_FIELD(char, correct, correct)
-	),
-	TP_RAW_FMT("%u:%s:%s (%u)")
-);
-
-TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(u64, from, from)
-		TRACE_FIELD(u64, to, to)
-	),
-	TP_RAW_FMT("from: %llx to: %llx")
-);
-
-TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
-		TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
-		TRACE_FIELD(int, state_data.type, type)
-		TRACE_FIELD(int, state_data.state, state)
-	),
-	TP_RAW_FMT("%llx->%llx type:%u state:%u")
-);
-
-TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
-		TRACE_FIELD(unsigned long, call_site, call_site)
-		TRACE_FIELD(const void *, ptr, ptr)
-		TRACE_FIELD(size_t, bytes_req, bytes_req)
-		TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
-		TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
-		TRACE_FIELD(int, node, node)
-	),
-	TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
-		 " flags:%x node:%d")
-);
-
-TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
-	TRACE_STRUCT(
-		TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
-		TRACE_FIELD(unsigned long, call_site, call_site)
-		TRACE_FIELD(const void *, ptr, ptr)
-	),
-	TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
-);
-
-#undef TRACE_SYSTEM
-- 
cgit v1.2.3


From 60ba77022712c7cda0eda286154bae160446b24a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 12 Sep 2009 23:34:04 -0400
Subject: tracing: add filter event logic to special, mmiotrace and boot
 tracers

Now that the pluging tracers use macros to create the structures and
automate the exporting of their formats to the format files, they also
automatically get a filter file.

This patch adds the code to implement the filter logic in the trace
recordings.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c           |  5 ++++-
 kernel/trace/trace_boot.c      |  8 ++++++--
 kernel/trace/trace_mmiotrace.c | 10 ++++++++--
 3 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ef82a7fabf3..fd52a19dd17 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1206,6 +1206,7 @@ ftrace_trace_special(void *__tr,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
 		     int pc)
 {
+	struct ftrace_event_call *call = &event_special;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = __tr;
 	struct ring_buffer *buffer = tr->buffer;
@@ -1219,7 +1220,9 @@ ftrace_trace_special(void *__tr,
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
-	trace_buffer_unlock_commit(buffer, event, 0, pc);
+
+	if (!filter_check_discard(call, entry, buffer, event))
+		trace_buffer_unlock_commit(buffer, event, 0, pc);
 }
 
 void
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 19bfc75d467..c21d5f3956a 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly =
 
 void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 {
+	struct ftrace_event_call *call = &event_boot_call;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
 	struct trace_boot_call *entry;
@@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->boot_call = *bt;
-	trace_buffer_unlock_commit(buffer, event, 0, 0);
+	if (!filter_check_discard(call, entry, buffer, event))
+		trace_buffer_unlock_commit(buffer, event, 0, 0);
  out:
 	preempt_enable();
 }
 
 void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
 {
+	struct ftrace_event_call *call = &event_boot_ret;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
 	struct trace_boot_ret *entry;
@@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->boot_ret = *bt;
-	trace_buffer_unlock_commit(buffer, event, 0, 0);
+	if (!filter_check_discard(call, entry, buffer, event))
+		trace_buffer_unlock_commit(buffer, event, 0, 0);
  out:
 	preempt_enable();
 }
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index c4c9bbda53d..0acd834659e 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct mmiotrace_rw *rw)
 {
+	struct ftrace_event_call *call = &event_mmiotrace_rw;
 	struct ring_buffer *buffer = tr->buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
@@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 	}
 	entry	= ring_buffer_event_data(event);
 	entry->rw			= *rw;
-	trace_buffer_unlock_commit(buffer, event, 0, pc);
+
+	if (!filter_check_discard(call, entry, buffer, event))
+		trace_buffer_unlock_commit(buffer, event, 0, pc);
 }
 
 void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct mmiotrace_map *map)
 {
+	struct ftrace_event_call *call = &event_mmiotrace_map;
 	struct ring_buffer *buffer = tr->buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
@@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 	}
 	entry	= ring_buffer_event_data(event);
 	entry->map			= *map;
-	trace_buffer_unlock_commit(buffer, event, 0, pc);
+
+	if (!filter_check_discard(call, entry, buffer, event))
+		trace_buffer_unlock_commit(buffer, event, 0, pc);
 }
 
 void mmio_trace_mapping(struct mmiotrace_map *map)
-- 
cgit v1.2.3


From 08a408161749d2406f94f4e3d47cfdbc826ad1cc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 14 Sep 2009 09:31:35 -0400
Subject: ring-buffer: typecast cmpxchg to fix PowerPC warning

The cmpxchg used by PowerPC does the following:

  ({									 \
     __typeof__(*(ptr)) _o_ = (o);					 \
     __typeof__(*(ptr)) _n_ = (n);					 \
     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,		 \
				    (unsigned long)_n_, sizeof(*(ptr))); \
  })

This does a type check of *ptr to both o and n.

Unfortunately, the code in ring-buffer.c assigns longs to pointers
and pointers to longs and causes a warning on PowerPC:

ring_buffer.c: In function 'rb_head_page_set':
ring_buffer.c:704: warning: initialization makes pointer from integer without a cast
ring_buffer.c:704: warning: initialization makes pointer from integer without a cast
ring_buffer.c: In function 'rb_head_page_replace':
ring_buffer.c:797: warning: initialization makes integer from pointer without a cast

This patch adds the typecasts inside cmpxchg to annotate that a long is
being cast to a pointer and a pointer is being casted to a long and this
removes the PowerPC warnings.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8786c350b4c..6eef38923b0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -701,8 +701,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
 
 	val &= ~RB_FLAG_MASK;
 
-	ret = (unsigned long)cmpxchg(&list->next,
-				     val | old_flag, val | new_flag);
+	ret = cmpxchg((unsigned long *)&list->next,
+		      val | old_flag, val | new_flag);
 
 	/* check if the reader took the page */
 	if ((ret & ~RB_FLAG_MASK) != val)
@@ -794,7 +794,7 @@ static int rb_head_page_replace(struct buffer_page *old,
 	val = *ptr & ~RB_FLAG_MASK;
 	val |= RB_PAGE_HEAD;
 
-	ret = cmpxchg(ptr, val, &new->list);
+	ret = cmpxchg(ptr, val, (unsigned long)&new->list);
 
 	return ret == val;
 }
-- 
cgit v1.2.3


From ec827c7ece8901044e6b3f92aeea489be9e1bcf7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 14 Sep 2009 10:50:23 -0400
Subject: tracing: add static to generated TRACE_EVENT functions

Some of the generated functions used in the TRACE_EVENT macros are
not declared static, but they are not global.

Discovered by sparse.

Reported-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index adbed124c3e..0fa8f9faa61 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1154,7 +1154,7 @@ static int trace_module_notify(struct notifier_block *self,
 }
 #endif /* CONFIG_MODULES */
 
-struct notifier_block trace_module_nb = {
+static struct notifier_block trace_module_nb = {
 	.notifier_call = trace_module_notify,
 	.priority = 0,
 };
-- 
cgit v1.2.3


From c16de8fd7a608aba8708dd056cf6e4d9462e800a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 14 Sep 2009 15:51:39 +0800
Subject: tracing: fix F_printk() typos

I found some typos in F_printk(), so I wrote compile-time check
for it, and triggered some compile errors and warnings.

I've fixed them on x86_32, but I have no x86_64 in my hand, so there
may still be some compile warnings on 64bits.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AADF60B.5070407@cn.fujitsu.com>

[ tested on x86_64, and works fine ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_entries.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c866d34e014..0c100958c64 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -78,7 +78,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
 		__field_desc(	int,		graph_ent,	depth		)
 	),
 
-	F_printk("--> %lx (%d)", __entry->graph_ent.func, __entry->depth)
+	F_printk("--> %lx (%d)", __entry->func, __entry->depth)
 );
 
 /* Function return entry */
@@ -97,8 +97,8 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
 
 	F_printk("<-- %lx (%d) (start: %llx  end: %llx) over: %d",
 		 __entry->func, __entry->depth,
-		 __entry->calltime, __entry->rettim,
-		 __entrty->depth)
+		 __entry->calltime, __entry->rettime,
+		 __entry->depth)
 );
 
 /*
@@ -133,7 +133,7 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
 		FTRACE_CTX_FIELDS
 	),
 
-	F_printk(b"%u:%u:%u  ==> %u:%u:%u [%03u]",
+	F_printk("%u:%u:%u  ==> %u:%u:%u [%03u]",
 		 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
 		 __entry->next_pid, __entry->next_prio, __entry->next_state,
 		 __entry->next_cpu
@@ -257,8 +257,8 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
 		__field_desc(	unsigned char,	rw,	width	)
 	),
 
-	F_printk("%lx %lx %lx %d %lx %lx",
-		 __entry->phs, __entry->value, __entry->pc,
+	F_printk("%lx %lx %lx %d %x %x",
+		 (unsigned long)__entry->phys, __entry->value, __entry->pc,
 		 __entry->map_id, __entry->opcode, __entry->width)
 );
 
@@ -275,8 +275,8 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
 		__field_desc(	unsigned char,	map,	opcode	)
 	),
 
-	F_printk("%lx %lx %lx %d %lx",
-		 __entry->phs, __entry->virt, __entry->len,
+	F_printk("%lx %lx %lx %d %x",
+		 (unsigned long)__entry->phys, __entry->virt, __entry->len,
 		 __entry->map_id, __entry->opcode)
 );
 
@@ -370,7 +370,7 @@ FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
 		__field(	int,			node		)
 	),
 
-	F_printk("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
+	F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
 		 " flags:%x node:%d",
 		 __entry->type_id, __entry->call_site, __entry->ptr,
 		 __entry->bytes_req, __entry->bytes_alloc,
-- 
cgit v1.2.3


From 05ffa2d02066c2cb169c02d5417308ee8ecab638 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 14 Sep 2009 15:54:52 +0800
Subject: ftrace: add compile-time check on F_printk()

Make sure F_printk() has corrent format and args, and make sure
changes in F_STRUCT() won't break F_printk().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AADF6CC.1060809@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_export.c | 45 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 41 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4cb29d84d73..2435d55947e 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -22,6 +22,47 @@
 #undef __field_struct
 #define __field_struct(type, item)
 
+#undef __field
+#define __field(type, item)				type item;
+
+#undef __field_desc
+#define __field_desc(type, container, item)		type item;
+
+#undef __array
+#define __array(type, item, size)			type item[size];
+
+#undef __array_desc
+#define __array_desc(type, container, item, size)	type item[size];
+
+#undef __dynamic_array
+#define __dynamic_array(type, item)			type item[];
+
+#undef F_STRUCT
+#define F_STRUCT(args...)				args
+
+#undef F_printk
+#define F_printk(fmt, args...) fmt, args
+
+#undef FTRACE_ENTRY
+#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\
+struct ____ftrace_##name {					\
+	tstruct							\
+};								\
+static void __used ____ftrace_check_##name(void)		\
+{								\
+	struct ____ftrace_##name *__entry = NULL;		\
+								\
+	/* force cmpile-time check on F_printk() */		\
+	printk(print);						\
+}
+
+#undef FTRACE_ENTRY_DUP
+#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print)	\
+	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
+
+#include "trace_entries.h"
+
+
 #undef __field
 #define __field(type, item)						\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
@@ -88,10 +129,6 @@ ftrace_format_##name(struct ftrace_event_call *unused,			\
 	return ret;							\
 }
 
-#undef FTRACE_ENTRY_DUP
-#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print)	\
-	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
-
 #include "trace_entries.h"
 
 
-- 
cgit v1.2.3


From 20a58a77231c5f5f61470932503b889303e8d4f3 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 14 Sep 2009 15:55:18 +0800
Subject: tracing: remove some unused macros

- remove FTRACE_ENTRY_STRUCT_ONLY()
- remove TRACE_XXX() macros

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AADF6E6.3080606@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_entries.h |  9 ---------
 kernel/trace/trace_export.c  | 26 --------------------------
 2 files changed, 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 0c100958c64..a431748ddd6 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -116,15 +116,6 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
 	__field(	unsigned char,	next_state	)	\
 	__field(	unsigned int,	next_cpu	)
 
-#if 0
-FTRACE_ENTRY_STRUCT_ONLY(ctx_switch_entry,
-
-	F_STRUCT(
-		FTRACE_CTX_FIELDS
-	)
-);
-#endif
-
 FTRACE_ENTRY(context_switch, ctx_switch_entry,
 
 	TRACE_CTX,
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 2435d55947e..9753fcc61bc 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -209,32 +209,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\
 #undef __dynamic_array
 #define __dynamic_array(type, item)
 
-
-#undef TRACE_ZERO_CHAR
-#define TRACE_ZERO_CHAR(arg)
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)\
-	entry->item = assign;
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)\
-	entry->item = assign;
-
-#undef TRACE_FIELD_SIGN
-#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
-	TRACE_FIELD(type, item, assign)
-
-#undef TP_CMD
-#define TP_CMD(cmd...)	cmd
-
-#undef TRACE_ENTRY
-#define TRACE_ENTRY	entry
-
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)	\
-	cmd;
-
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(call, struct_name, type, tstruct, print)		\
 static int ftrace_raw_init_event_##call(void);				\
-- 
cgit v1.2.3


From 1f5a6b45416694ff8c0d04625f1a438a0e380add Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 14 Sep 2009 11:58:24 -0400
Subject: tracing: make testing syscall events a separate configuration

Parag noticed that the number of event tests has increased tremendously:

grep "Testing event" dmesg.31rc9 |wc -l
100

grep "Testing event" dmesg.31git |wc -l
1172

This is due to the testing of every syscall event when ftrace self
test is enabled. This adds a bit more time to kernel boot up and can
affect development by slowing down the time it takes between reboots.

This option makes the testing of the syscall events into a separate
config, to still be able to test most of ftrace internals at boot up
but not have to wait for all the syscall events to be tested.

The syscall event testing only tests the enabling and disabling of
the trace point, since the syscalls are not executed. What really needs
to be done is to somehow have a userspace tool test the syscall tracepoints
as well.

Reported-by: Parag Warudkar <parag.lkml@gmail.com>
LKML-Reference: <f7848160909130815l3e768a30n3b28808bbe5c254b@mail.gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig        | 12 ++++++++++++
 kernel/trace/trace_events.c | 12 ++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1ea0d1234f4..aa002cef924 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -469,6 +469,18 @@ config FTRACE_STARTUP_TEST
 	  functioning properly. It will do tests on all the configured
 	  tracers of ftrace.
 
+config EVENT_TRACE_TEST_SYSCALLS
+	bool "Run selftest on syscall events"
+	depends on FTRACE_STARTUP_TEST
+	help
+	 This option will also enable testing every syscall event.
+	 It only enables the event and disables it and runs various loads
+	 with the event enabled. This adds a bit more time for kernel boot
+	 up since it runs this on every system call defined.
+
+	 TBD - enable a way to actually call the syscalls as we test their
+	       events
+
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
 	depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0fa8f9faa61..787f0fb0994 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1326,6 +1326,18 @@ static __init void event_trace_self_tests(void)
 		if (!call->regfunc)
 			continue;
 
+/*
+ * Testing syscall events here is pretty useless, but
+ * we still do it if configured. But this is time consuming.
+ * What we really need is a user thread to perform the
+ * syscalls as we test.
+ */
+#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
+		if (call->system &&
+		    strcmp(call->system, "syscalls") == 0)
+			continue;
+#endif
+
 		pr_info("Testing event %s: ", call->name);
 
 		/*
-- 
cgit v1.2.3


From e681c9dd62fe8fcc5bba28a3ca3f7dc8be940206 Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
Date: Wed, 8 Jul 2009 13:23:32 +0200
Subject: PM: Fix typo in label name s/Platofrm_finish/Platform_finish/

Although the same label name is used somewhere else in the file, this
particular label was consistently typoed in all of its uses.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 81d2e746489..ec8202512b0 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -460,11 +460,11 @@ int hibernation_platform_enter(void)
 
 	error = hibernation_ops->prepare();
 	if (error)
-		goto Platofrm_finish;
+		goto Platform_finish;
 
 	error = disable_nonboot_cpus();
 	if (error)
-		goto Platofrm_finish;
+		goto Platform_finish;
 
 	local_irq_disable();
 	sysdev_suspend(PMSG_HIBERNATE);
@@ -476,7 +476,7 @@ int hibernation_platform_enter(void)
 	 * We don't need to reenable the nonboot CPUs or resume consoles, since
 	 * the system is going to be halted anyway.
 	 */
- Platofrm_finish:
+ Platform_finish:
 	hibernation_ops->finish();
 
 	dpm_suspend_noirq(PMSG_RESTORE);
-- 
cgit v1.2.3


From 4bb334353ebd821bc8eeabeb019eaac33c7307df Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 8 Jul 2009 13:23:51 +0200
Subject: PM/Hibernate: Rework shrinking of memory

Rework swsusp_shrink_memory() so that it calls shrink_all_memory()
just once to make some room for the image and then allocates memory
to apply more pressure to the memory management subsystem, if
necessary.

Unfortunately, we don't seem to be able to drop shrink_all_memory()
entirely just yet, because that would lead to huge performance
regressions in some test cases.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
---
 kernel/power/snapshot.c | 205 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 158 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 523a451b45d..a3a175fa0a4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1066,69 +1066,180 @@ void swsusp_free(void)
 	buffer = NULL;
 }
 
+/* Helper functions used for the shrinking of memory. */
+
+#define GFP_IMAGE	(GFP_KERNEL | __GFP_NOWARN)
+
 /**
- *	swsusp_shrink_memory -  Try to free as much memory as needed
- *
- *	... but do not OOM-kill anyone
+ * preallocate_image_pages - Allocate a number of pages for hibernation image
+ * @nr_pages: Number of page frames to allocate.
+ * @mask: GFP flags to use for the allocation.
  *
- *	Notice: all userland should be stopped before it is called, or
- *	livelock is possible.
+ * Return value: Number of page frames actually allocated
  */
+static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
+{
+	unsigned long nr_alloc = 0;
+
+	while (nr_pages > 0) {
+		if (!alloc_image_page(mask))
+			break;
+		nr_pages--;
+		nr_alloc++;
+	}
 
-#define SHRINK_BITE	10000
-static inline unsigned long __shrink_memory(long tmp)
+	return nr_alloc;
+}
+
+static unsigned long preallocate_image_memory(unsigned long nr_pages)
+{
+	return preallocate_image_pages(nr_pages, GFP_IMAGE);
+}
+
+#ifdef CONFIG_HIGHMEM
+static unsigned long preallocate_image_highmem(unsigned long nr_pages)
 {
-	if (tmp > SHRINK_BITE)
-		tmp = SHRINK_BITE;
-	return shrink_all_memory(tmp);
+	return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
 }
 
+/**
+ *  __fraction - Compute (an approximation of) x * (multiplier / base)
+ */
+static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
+{
+	x *= multiplier;
+	do_div(x, base);
+	return (unsigned long)x;
+}
+
+static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
+						unsigned long highmem,
+						unsigned long total)
+{
+	unsigned long alloc = __fraction(nr_pages, highmem, total);
+
+	return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
+}
+#else /* CONFIG_HIGHMEM */
+static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
+{
+	return 0;
+}
+
+static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
+						unsigned long highmem,
+						unsigned long total)
+{
+	return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ * swsusp_shrink_memory -  Make the kernel release as much memory as needed
+ *
+ * To create a hibernation image it is necessary to make a copy of every page
+ * frame in use.  We also need a number of page frames to be free during
+ * hibernation for allocations made while saving the image and for device
+ * drivers, in case they need to allocate memory from their hibernation
+ * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES,
+ * respectively, both of which are rough estimates).  To make this happen, we
+ * compute the total number of available page frames and allocate at least
+ *
+ * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES
+ *
+ * of them, which corresponds to the maximum size of a hibernation image.
+ *
+ * If image_size is set below the number following from the above formula,
+ * the preallocation of memory is continued until the total number of saveable
+ * pages in the system is below the requested image size or it is impossible to
+ * allocate more memory, whichever happens first.
+ */
 int swsusp_shrink_memory(void)
 {
-	long tmp;
 	struct zone *zone;
-	unsigned long pages = 0;
-	unsigned int i = 0;
-	char *p = "-\\|/";
+	unsigned long saveable, size, max_size, count, highmem, pages = 0;
+	unsigned long alloc, pages_highmem;
 	struct timeval start, stop;
+	int error = 0;
 
-	printk(KERN_INFO "PM: Shrinking memory...  ");
+	printk(KERN_INFO "PM: Shrinking memory... ");
 	do_gettimeofday(&start);
-	do {
-		long size, highmem_size;
-
-		highmem_size = count_highmem_pages();
-		size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
-		tmp = size;
-		size += highmem_size;
-		for_each_populated_zone(zone) {
-			tmp += snapshot_additional_pages(zone);
-			if (is_highmem(zone)) {
-				highmem_size -=
-					zone_page_state(zone, NR_FREE_PAGES);
-			} else {
-				tmp -= zone_page_state(zone, NR_FREE_PAGES);
-				tmp += zone->lowmem_reserve[ZONE_NORMAL];
-			}
-		}
 
-		if (highmem_size < 0)
-			highmem_size = 0;
+	/* Count the number of saveable data pages. */
+	highmem = count_highmem_pages();
+	saveable = count_data_pages();
 
-		tmp += highmem_size;
-		if (tmp > 0) {
-			tmp = __shrink_memory(tmp);
-			if (!tmp)
-				return -ENOMEM;
-			pages += tmp;
-		} else if (size > image_size / PAGE_SIZE) {
-			tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
-			pages += tmp;
-		}
-		printk("\b%c", p[i++%4]);
-	} while (tmp > 0);
+	/*
+	 * Compute the total number of page frames we can use (count) and the
+	 * number of pages needed for image metadata (size).
+	 */
+	count = saveable;
+	saveable += highmem;
+	size = 0;
+	for_each_populated_zone(zone) {
+		size += snapshot_additional_pages(zone);
+		if (is_highmem(zone))
+			highmem += zone_page_state(zone, NR_FREE_PAGES);
+		else
+			count += zone_page_state(zone, NR_FREE_PAGES);
+	}
+	count += highmem;
+	count -= totalreserve_pages;
+
+	/* Compute the maximum number of saveable pages to leave in memory. */
+	max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
+	size = DIV_ROUND_UP(image_size, PAGE_SIZE);
+	if (size > max_size)
+		size = max_size;
+	/*
+	 * If the maximum is not less than the current number of saveable pages
+	 * in memory, we don't need to do anything more.
+	 */
+	if (size >= saveable)
+		goto out;
+
+	/*
+	 * Let the memory management subsystem know that we're going to need a
+	 * large number of page frames to allocate and make it free some memory.
+	 * NOTE: If this is not done, performance will be hurt badly in some
+	 * test cases.
+	 */
+	shrink_all_memory(saveable - size);
+
+	/*
+	 * The number of saveable pages in memory was too high, so apply some
+	 * pressure to decrease it.  First, make room for the largest possible
+	 * image and fail if that doesn't work.  Next, try to decrease the size
+	 * of the image as much as indicated by image_size using allocations
+	 * from highmem and non-highmem zones separately.
+	 */
+	pages_highmem = preallocate_image_highmem(highmem / 2);
+	alloc = (count - max_size) - pages_highmem;
+	pages = preallocate_image_memory(alloc);
+	if (pages < alloc) {
+		error = -ENOMEM;
+		goto free_out;
+	}
+	size = max_size - size;
+	alloc = size;
+	size = preallocate_highmem_fraction(size, highmem, count);
+	pages_highmem += size;
+	alloc -= size;
+	pages += preallocate_image_memory(alloc);
+	pages += pages_highmem;
+
+ free_out:
+	/* Release all of the preallocated page frames. */
+	swsusp_free();
+
+	if (error) {
+		printk(KERN_CONT "\n");
+		return error;
+	}
+
+ out:
 	do_gettimeofday(&stop);
-	printk("\bdone (%lu pages freed)\n", pages);
+	printk(KERN_CONT "done (preallocated %lu free pages)\n", pages);
 	swsusp_show_speed(&start, &stop, pages, "Freed");
 
 	return 0;
-- 
cgit v1.2.3


From 64a473cb74a88cb4991edf985d55a266e65292e1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 8 Jul 2009 13:24:05 +0200
Subject: PM/Hibernate: Do not release preallocated memory unnecessarily (rev.
 2)

Since the hibernation code is now going to use allocations of memory
to make enough room for the image, it can also use the page frames
allocated at this stage as image page frames.  The low-level
hibernation code needs to be rearranged for this purpose, but it
allows us to avoid freeing a great number of pages and allocating
these same pages once again later, so it generally is worth doing.

[rev. 2: Take highmem into account correctly.]

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c |  15 +++-
 kernel/power/power.h     |   2 +-
 kernel/power/snapshot.c  | 202 +++++++++++++++++++++++++++++++----------------
 3 files changed, 147 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index ec8202512b0..04b3a83d686 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode)
 	if (error)
 		return error;
 
-	/* Free memory before shutting down devices. */
-	error = swsusp_shrink_memory();
+	/* Preallocate image memory before shutting down devices. */
+	error = hibernate_preallocate_memory();
 	if (error)
 		goto Close;
 
@@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode)
 	/* Control returns here after successful restore */
 
  Resume_devices:
+	/* We may need to release the preallocated image pages here. */
+	if (error || !in_suspend)
+		swsusp_free();
+
 	dpm_resume_end(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 	resume_console();
@@ -578,7 +582,10 @@ int hibernate(void)
 		goto Thaw;
 
 	error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
-	if (in_suspend && !error) {
+	if (error)
+		goto Thaw;
+
+	if (in_suspend) {
 		unsigned int flags = 0;
 
 		if (hibernation_mode == HIBERNATION_PLATFORM)
@@ -590,8 +597,8 @@ int hibernate(void)
 			power_down();
 	} else {
 		pr_debug("PM: Image restored successfully.\n");
-		swsusp_free();
 	}
+
  Thaw:
 	thaw_processes();
  Finish:
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 26d5a26f82e..46c5a26630a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
 
 extern int create_basic_memory_bitmaps(void);
 extern void free_basic_memory_bitmaps(void);
-extern int swsusp_shrink_memory(void);
+extern int hibernate_preallocate_memory(void);
 
 /**
  *	Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a3a175fa0a4..2b1a7bc24c9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 static unsigned int nr_copy_pages;
 /* Number of pages needed for saving the original pfns of the image pages */
 static unsigned int nr_meta_pages;
+/*
+ * Numbers of normal and highmem page frames allocated for hibernation image
+ * before suspending devices.
+ */
+unsigned int alloc_normal, alloc_highmem;
+/*
+ * Memory bitmap used for marking saveable pages (during hibernation) or
+ * hibernation image pages (during restore)
+ */
+static struct memory_bitmap orig_bm;
+/*
+ * Memory bitmap used during hibernation for marking allocated page frames that
+ * will contain copies of saveable pages.  During restore it is initially used
+ * for marking hibernation image pages, but then the set bits from it are
+ * duplicated in @orig_bm and it is released.  On highmem systems it is next
+ * used for marking "safe" highmem pages, but it has to be reinitialized for
+ * this purpose.
+ */
+static struct memory_bitmap copy_bm;
 
 /**
  *	swsusp_free - free pages allocated for the suspend.
@@ -1064,6 +1083,8 @@ void swsusp_free(void)
 	nr_meta_pages = 0;
 	restore_pblist = NULL;
 	buffer = NULL;
+	alloc_normal = 0;
+	alloc_highmem = 0;
 }
 
 /* Helper functions used for the shrinking of memory. */
@@ -1082,8 +1103,16 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
 	unsigned long nr_alloc = 0;
 
 	while (nr_pages > 0) {
-		if (!alloc_image_page(mask))
+		struct page *page;
+
+		page = alloc_image_page(mask);
+		if (!page)
 			break;
+		memory_bm_set_bit(&copy_bm, page_to_pfn(page));
+		if (PageHighMem(page))
+			alloc_highmem++;
+		else
+			alloc_normal++;
 		nr_pages--;
 		nr_alloc++;
 	}
@@ -1135,7 +1164,47 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
 #endif /* CONFIG_HIGHMEM */
 
 /**
- * swsusp_shrink_memory -  Make the kernel release as much memory as needed
+ * free_unnecessary_pages - Release preallocated pages not needed for the image
+ */
+static void free_unnecessary_pages(void)
+{
+	unsigned long save_highmem, to_free_normal, to_free_highmem;
+
+	to_free_normal = alloc_normal - count_data_pages();
+	save_highmem = count_highmem_pages();
+	if (alloc_highmem > save_highmem) {
+		to_free_highmem = alloc_highmem - save_highmem;
+	} else {
+		to_free_highmem = 0;
+		to_free_normal -= save_highmem - alloc_highmem;
+	}
+
+	memory_bm_position_reset(&copy_bm);
+
+	while (to_free_normal > 0 && to_free_highmem > 0) {
+		unsigned long pfn = memory_bm_next_pfn(&copy_bm);
+		struct page *page = pfn_to_page(pfn);
+
+		if (PageHighMem(page)) {
+			if (!to_free_highmem)
+				continue;
+			to_free_highmem--;
+			alloc_highmem--;
+		} else {
+			if (!to_free_normal)
+				continue;
+			to_free_normal--;
+			alloc_normal--;
+		}
+		memory_bm_clear_bit(&copy_bm, pfn);
+		swsusp_unset_page_forbidden(page);
+		swsusp_unset_page_free(page);
+		__free_page(page);
+	}
+}
+
+/**
+ * hibernate_preallocate_memory - Preallocate memory for hibernation image
  *
  * To create a hibernation image it is necessary to make a copy of every page
  * frame in use.  We also need a number of page frames to be free during
@@ -1154,19 +1223,30 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
  * pages in the system is below the requested image size or it is impossible to
  * allocate more memory, whichever happens first.
  */
-int swsusp_shrink_memory(void)
+int hibernate_preallocate_memory(void)
 {
 	struct zone *zone;
 	unsigned long saveable, size, max_size, count, highmem, pages = 0;
-	unsigned long alloc, pages_highmem;
+	unsigned long alloc, save_highmem, pages_highmem;
 	struct timeval start, stop;
-	int error = 0;
+	int error;
 
-	printk(KERN_INFO "PM: Shrinking memory... ");
+	printk(KERN_INFO "PM: Preallocating image memory... ");
 	do_gettimeofday(&start);
 
+	error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
+	if (error)
+		goto err_out;
+
+	error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
+	if (error)
+		goto err_out;
+
+	alloc_normal = 0;
+	alloc_highmem = 0;
+
 	/* Count the number of saveable data pages. */
-	highmem = count_highmem_pages();
+	save_highmem = count_highmem_pages();
 	saveable = count_data_pages();
 
 	/*
@@ -1174,7 +1254,8 @@ int swsusp_shrink_memory(void)
 	 * number of pages needed for image metadata (size).
 	 */
 	count = saveable;
-	saveable += highmem;
+	saveable += save_highmem;
+	highmem = save_highmem;
 	size = 0;
 	for_each_populated_zone(zone) {
 		size += snapshot_additional_pages(zone);
@@ -1193,10 +1274,13 @@ int swsusp_shrink_memory(void)
 		size = max_size;
 	/*
 	 * If the maximum is not less than the current number of saveable pages
-	 * in memory, we don't need to do anything more.
+	 * in memory, allocate page frames for the image and we're done.
 	 */
-	if (size >= saveable)
+	if (size >= saveable) {
+		pages = preallocate_image_highmem(save_highmem);
+		pages += preallocate_image_memory(saveable - pages);
 		goto out;
+	}
 
 	/*
 	 * Let the memory management subsystem know that we're going to need a
@@ -1216,10 +1300,8 @@ int swsusp_shrink_memory(void)
 	pages_highmem = preallocate_image_highmem(highmem / 2);
 	alloc = (count - max_size) - pages_highmem;
 	pages = preallocate_image_memory(alloc);
-	if (pages < alloc) {
-		error = -ENOMEM;
-		goto free_out;
-	}
+	if (pages < alloc)
+		goto err_out;
 	size = max_size - size;
 	alloc = size;
 	size = preallocate_highmem_fraction(size, highmem, count);
@@ -1228,21 +1310,24 @@ int swsusp_shrink_memory(void)
 	pages += preallocate_image_memory(alloc);
 	pages += pages_highmem;
 
- free_out:
-	/* Release all of the preallocated page frames. */
-	swsusp_free();
-
-	if (error) {
-		printk(KERN_CONT "\n");
-		return error;
-	}
+	/*
+	 * We only need as many page frames for the image as there are saveable
+	 * pages in memory, but we have allocated more.  Release the excessive
+	 * ones now.
+	 */
+	free_unnecessary_pages();
 
  out:
 	do_gettimeofday(&stop);
-	printk(KERN_CONT "done (preallocated %lu free pages)\n", pages);
-	swsusp_show_speed(&start, &stop, pages, "Freed");
+	printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+	swsusp_show_speed(&start, &stop, pages, "Allocated");
 
 	return 0;
+
+ err_out:
+	printk(KERN_CONT "\n");
+	swsusp_free();
+	return -ENOMEM;
 }
 
 #ifdef CONFIG_HIGHMEM
@@ -1253,7 +1338,7 @@ int swsusp_shrink_memory(void)
 
 static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
 {
-	unsigned int free_highmem = count_free_highmem_pages();
+	unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
 
 	if (free_highmem >= nr_highmem)
 		nr_highmem = 0;
@@ -1275,19 +1360,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
 static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 {
 	struct zone *zone;
-	unsigned int free = 0, meta = 0;
+	unsigned int free = alloc_normal;
 
-	for_each_zone(zone) {
-		meta += snapshot_additional_pages(zone);
+	for_each_zone(zone)
 		if (!is_highmem(zone))
 			free += zone_page_state(zone, NR_FREE_PAGES);
-	}
 
 	nr_pages += count_pages_for_highmem(nr_highmem);
-	pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n",
-		nr_pages, PAGES_FOR_IO, meta, free);
+	pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
+		nr_pages, PAGES_FOR_IO, free);
 
-	return free > nr_pages + PAGES_FOR_IO + meta;
+	return free > nr_pages + PAGES_FOR_IO;
 }
 
 #ifdef CONFIG_HIGHMEM
@@ -1309,7 +1392,7 @@ static inline int get_highmem_buffer(int safe_needed)
  */
 
 static inline unsigned int
-alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
+alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
 {
 	unsigned int to_alloc = count_free_highmem_pages();
 
@@ -1329,7 +1412,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
 static inline int get_highmem_buffer(int safe_needed) { return 0; }
 
 static inline unsigned int
-alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
+alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
 #endif /* CONFIG_HIGHMEM */
 
 /**
@@ -1348,51 +1431,36 @@ static int
 swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 		unsigned int nr_pages, unsigned int nr_highmem)
 {
-	int error;
-
-	error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
-	if (error)
-		goto Free;
-
-	error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
-	if (error)
-		goto Free;
+	int error = 0;
 
 	if (nr_highmem > 0) {
 		error = get_highmem_buffer(PG_ANY);
 		if (error)
-			goto Free;
-
-		nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
+			goto err_out;
+		if (nr_highmem > alloc_highmem) {
+			nr_highmem -= alloc_highmem;
+			nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
+		}
 	}
-	while (nr_pages-- > 0) {
-		struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
-
-		if (!page)
-			goto Free;
+	if (nr_pages > alloc_normal) {
+		nr_pages -= alloc_normal;
+		while (nr_pages-- > 0) {
+			struct page *page;
 
-		memory_bm_set_bit(copy_bm, page_to_pfn(page));
+			page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+			if (!page)
+				goto err_out;
+			memory_bm_set_bit(copy_bm, page_to_pfn(page));
+		}
 	}
+
 	return 0;
 
- Free:
+ err_out:
 	swsusp_free();
-	return -ENOMEM;
+	return error;
 }
 
-/* Memory bitmap used for marking saveable pages (during suspend) or the
- * suspend image pages (during resume)
- */
-static struct memory_bitmap orig_bm;
-/* Memory bitmap used on suspend for marking allocated pages that will contain
- * the copies of saveable pages.  During resume it is initially used for
- * marking the suspend image pages, but then its set bits are duplicated in
- * @orig_bm and it is released.  Next, on systems with high memory, it may be
- * used for marking "safe" highmem pages, but it has to be reinitialized for
- * this purpose.
- */
-static struct memory_bitmap copy_bm;
-
 asmlinkage int swsusp_save(void)
 {
 	unsigned int nr_pages, nr_highmem;
-- 
cgit v1.2.3


From ef4aede3f10d82adef1fb044b565ba5f08f851e0 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 8 Jul 2009 13:24:12 +0200
Subject: PM/Hibernate: Do not try to allocate too much memory too hard (rev.
 2)

We want to avoid attempting to free too much memory too hard during
hibernation, so estimate the minimum size of the image to use as the
lower limit for preallocating memory.

The approach here is based on the (experimental) observation that we
can't free more page frames than the sum of:

* global_page_state(NR_SLAB_RECLAIMABLE)
* global_page_state(NR_ACTIVE_ANON)
* global_page_state(NR_INACTIVE_ANON)
* global_page_state(NR_ACTIVE_FILE)
* global_page_state(NR_INACTIVE_FILE)

minus

* global_page_state(NR_FILE_MAPPED)

Namely, if this number is subtracted from the number of saveable
pages in the system, we get a good estimate of the minimum reasonable
size of a hibernation image.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
---
 kernel/power/snapshot.c | 43 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 2b1a7bc24c9..0a06b114dd3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1203,6 +1203,36 @@ static void free_unnecessary_pages(void)
 	}
 }
 
+/**
+ * minimum_image_size - Estimate the minimum acceptable size of an image
+ * @saveable: Number of saveable pages in the system.
+ *
+ * We want to avoid attempting to free too much memory too hard, so estimate the
+ * minimum acceptable size of a hibernation image to use as the lower limit for
+ * preallocating memory.
+ *
+ * We assume that the minimum image size should be proportional to
+ *
+ * [number of saveable pages] - [number of pages that can be freed in theory]
+ *
+ * where the second term is the sum of (1) reclaimable slab pages, (2) active
+ * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages,
+ * minus mapped file pages.
+ */
+static unsigned long minimum_image_size(unsigned long saveable)
+{
+	unsigned long size;
+
+	size = global_page_state(NR_SLAB_RECLAIMABLE)
+		+ global_page_state(NR_ACTIVE_ANON)
+		+ global_page_state(NR_INACTIVE_ANON)
+		+ global_page_state(NR_ACTIVE_FILE)
+		+ global_page_state(NR_INACTIVE_FILE)
+		- global_page_state(NR_FILE_MAPPED);
+
+	return saveable <= size ? 0 : saveable - size;
+}
+
 /**
  * hibernate_preallocate_memory - Preallocate memory for hibernation image
  *
@@ -1220,8 +1250,8 @@ static void free_unnecessary_pages(void)
  *
  * If image_size is set below the number following from the above formula,
  * the preallocation of memory is continued until the total number of saveable
- * pages in the system is below the requested image size or it is impossible to
- * allocate more memory, whichever happens first.
+ * pages in the system is below the requested image size or the minimum
+ * acceptable image size returned by minimum_image_size(), whichever is greater.
  */
 int hibernate_preallocate_memory(void)
 {
@@ -1282,6 +1312,11 @@ int hibernate_preallocate_memory(void)
 		goto out;
 	}
 
+	/* Estimate the minimum size of the image. */
+	pages = minimum_image_size(saveable);
+	if (size < pages)
+		size = min_t(unsigned long, pages, max_size);
+
 	/*
 	 * Let the memory management subsystem know that we're going to need a
 	 * large number of page frames to allocate and make it free some memory.
@@ -1294,8 +1329,8 @@ int hibernate_preallocate_memory(void)
 	 * The number of saveable pages in memory was too high, so apply some
 	 * pressure to decrease it.  First, make room for the largest possible
 	 * image and fail if that doesn't work.  Next, try to decrease the size
-	 * of the image as much as indicated by image_size using allocations
-	 * from highmem and non-highmem zones separately.
+	 * of the image as much as indicated by 'size' using allocations from
+	 * highmem and non-highmem zones separately.
 	 */
 	pages_highmem = preallocate_image_highmem(highmem / 2);
 	alloc = (count - max_size) - pages_highmem;
-- 
cgit v1.2.3


From 98e73dc5d2dadfcb95305ad71ac9239f4e361870 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Date: Wed, 22 Jul 2009 00:36:56 +0200
Subject: PM / Hibernate / Memory hotplug: Always use for_each_populated_zone()

Use for_each_populated_zone() instead of for_each_zone() in hibernation
code. This fixes a bug on s390, where we allow both config options
HIBERNATION and MEMORY_HOTPLUG, so that we also have a ZONE_MOVABLE
here. We only allow hibernation if no memory hotplug operation was
performed, so in fact both features can only be used exclusively, but
this way we don't need 2 differently configured (distribution) kernels.

If we have an unpopulated ZONE_MOVABLE, we allow hibernation but run
into a BUG_ON() in memory_bm_test/set/clear_bit() because hibernation
code iterates through all zones, not only the populated zones, in
several places. For example, swsusp_free() does for_each_zone() and
then checks for pfn_valid(), which is true even if the zone is not
populated, resulting in a BUG_ON() later because the pfn cannot be
found in the memory bitmap.

Replacing all occurences of for_each_zone() in hibernation code with
for_each_populated_zone() would fix this issue.

[rjw: Rebased on top of linux-next hibernation patches.]

Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/snapshot.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0a06b114dd3..bf06658f205 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void)
 	struct zone *zone;
 	unsigned int n = 0;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		unsigned long pfn, max_zone_pfn;
 
 		if (!is_highmem(zone))
@@ -916,7 +916,7 @@ static unsigned int count_data_pages(void)
 	unsigned long pfn, max_zone_pfn;
 	unsigned int n = 0;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		if (is_highmem(zone))
 			continue;
 
@@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
 	struct zone *zone;
 	unsigned long pfn;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		unsigned long max_zone_pfn;
 
 		mark_free_pages(zone);
@@ -1065,7 +1065,7 @@ void swsusp_free(void)
 	struct zone *zone;
 	unsigned long pfn, max_zone_pfn;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 			if (pfn_valid(pfn)) {
@@ -1397,7 +1397,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 	struct zone *zone;
 	unsigned int free = alloc_normal;
 
-	for_each_zone(zone)
+	for_each_populated_zone(zone)
 		if (!is_highmem(zone))
 			free += zone_page_state(zone, NR_FREE_PAGES);
 
@@ -1688,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
 	unsigned long pfn, max_zone_pfn;
 
 	/* Clear page flags */
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 			if (pfn_valid(pfn))
-- 
cgit v1.2.3


From 8de0307326be94148436082a9abf365da8e3c66d Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 22 Jul 2009 19:56:10 +0200
Subject: PM: Trivial fixes

Fix the definition of BM_BITS_PER_BLOCK and kerneldoc
description of create_bm_block_list().

[rjw: Added changelog.]

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/snapshot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index bf06658f205..97955b0e44f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 
 #define BM_END_OF_MAP	(~0UL)
 
-#define BM_BITS_PER_BLOCK	(PAGE_SIZE << 3)
+#define BM_BITS_PER_BLOCK	(PAGE_SIZE * BITS_PER_BYTE)
 
 struct bm_block {
 	struct list_head hook;	/* hook into a list of bitmap blocks */
@@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
 
 /**
  *	create_bm_block_list - create a list of block bitmap objects
- *	@nr_blocks - number of blocks to allocate
+ *	@pages - number of pages to track
  *	@list - list to put the allocated blocks into
  *	@ca - chain allocator to be used for allocating memory
  */
-- 
cgit v1.2.3


From 4a5d6ba1914d1bf1fcfb5e15834c29d84a879219 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 14 Sep 2009 12:45:39 +0100
Subject: CRED: Allow put_cred() to cope with a NULL groups list

put_cred() will oops if given a NULL groups list, but that is now possible with
the existence of cred_alloc_blank(), as used in keyctl_session_to_parent().

Added in commit:

	commit ee18d64c1f632043a02e6f5ba5e045bb26a5465f
	Author: David Howells <dhowells@redhat.com>
	Date:   Wed Sep 2 09:14:21 2009 +0100
	KEYS: Add a keyctl to install a process's session keyring on its parent [try #6]

Reported-by: Marc Dionne <marc.c.dionne@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/cred.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 006fcab009d..d7f7a01082e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -147,7 +147,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
 	key_put(cred->thread_keyring);
 	key_put(cred->request_key_auth);
 	release_tgcred(cred);
-	put_group_info(cred->group_info);
+	if (cred->group_info)
+		put_group_info(cred->group_info);
 	free_uid(cred->user);
 	kmem_cache_free(cred_jar, cred);
 }
-- 
cgit v1.2.3


From 353f6dd2dec992ddd34620a94b051b0f76227379 Mon Sep 17 00:00:00 2001
From: Anirban Sinha <asinha@zeugmasystems.com>
Date: Mon, 14 Sep 2009 11:13:37 -0700
Subject: cleanup console_print()

console_print() is an old legacy interface mostly unused in the entire
kernel tree. It's best to clean up its existing use and let developers
use their own implementation of it as they feel fit.

Signed-off-by: Anirban Sinha <asinha@zeugmasystems.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index e10d193a833..602033acd6c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1075,12 +1075,6 @@ void __sched console_conditional_schedule(void)
 }
 EXPORT_SYMBOL(console_conditional_schedule);
 
-void console_print(const char *s)
-{
-	printk(KERN_EMERG "%s", s);
-}
-EXPORT_SYMBOL(console_print);
-
 void console_unblank(void)
 {
 	struct console *c;
-- 
cgit v1.2.3


From 555f386c98cc93890f48fdea098936755270304b Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Mon, 14 Sep 2009 20:10:15 -0400
Subject: ftrace: document function and function graph implementation

While implementing function tracer and function tracer graph support,
I found the exact arch implementation details to be a bit lacking
(and my x86 foo ain't great).  So after pounding out support for
the Blackfin arch, start documenting the requirements/details.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
LKML-Reference: <1252973415-21264-1-git-send-email-vapier@gentoo.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index aa002cef924..e7163460440 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,12 +11,18 @@ config NOP_TRACER
 
 config HAVE_FTRACE_NMI_ENTER
 	bool
+	help
+	  See Documentation/trace/ftrace-implementation.txt
 
 config HAVE_FUNCTION_TRACER
 	bool
+	help
+	  See Documentation/trace/ftrace-implementation.txt
 
 config HAVE_FUNCTION_GRAPH_TRACER
 	bool
+	help
+	  See Documentation/trace/ftrace-implementation.txt
 
 config HAVE_FUNCTION_GRAPH_FP_TEST
 	bool
@@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
 config HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	bool
 	help
-	 This gets selected when the arch tests the function_trace_stop
-	 variable at the mcount call site. Otherwise, this variable
-	 is tested by the called function.
+	  See Documentation/trace/ftrace-implementation.txt
 
 config HAVE_DYNAMIC_FTRACE
 	bool
+	help
+	  See Documentation/trace/ftrace-implementation.txt
 
 config HAVE_FTRACE_MCOUNT_RECORD
 	bool
+	help
+	  See Documentation/trace/ftrace-implementation.txt
 
 config HAVE_HW_BRANCH_TRACER
 	bool
 
 config HAVE_SYSCALL_TRACEPOINTS
 	bool
+	help
+	  See Documentation/trace/ftrace-implementation.txt
 
 config TRACER_MAX_TRACE
 	bool
-- 
cgit v1.2.3


From b3e62e35058fc744ac794611f4e79bcd1c5a4b83 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 15 Sep 2009 14:44:36 +0800
Subject: perf_counter: Fix buffer overflow in perf_copy_attr()

If we pass a big size data over perf_counter_open() syscall,
the kernel will copy this data to a small buffer, it will
cause kernel crash.

This bug makes the kernel unsafe and non-root local user can
trigger it.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Paul Mackerras <paulus@samba.org>
Cc: <stable@kernel.org>
LKML-Reference: <4AAF37D4.5010706@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d7cbc579fc8..a67a1dc3cfa 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4171,6 +4171,7 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
 			if (val)
 				goto err_size;
 		}
+		size = sizeof(*attr);
 	}
 
 	ret = copy_from_user(attr, uattr, size);
-- 
cgit v1.2.3


From b78bb868c54bebbf8d8786a3f8320700d6d2b864 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 15 Sep 2009 14:23:18 +0200
Subject: sched: Fix double_rq_lock() compile warning

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e27a53685ed..17e4391ec2d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -119,8 +119,6 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
-static void double_rq_lock(struct rq *rq1, struct rq *rq2);
-
 static inline int rt_policy(int policy)
 {
 	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -1695,6 +1693,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #ifdef CONFIG_PREEMPT
 
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
 /*
  * fair double_lock_balance: Safely acquires both rq->locks in a fair
  * way at the expense of forcing extra atomic operations in all
-- 
cgit v1.2.3


From e6b1b2c9c0461c4e0971ed905ce3cda6512ee82a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 11 Sep 2009 11:59:22 +0200
Subject: sched: Split WAKEUP_OVERLAP

It consists of two conditions, split them out in separate toggles
so we can test them independently.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 7 ++++---
 kernel/sched_features.h | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index aa7f8412101..cea5b82242e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1526,9 +1526,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	if (sched_feat(WAKEUP_OVERLAP) && (sync ||
-			(se->avg_overlap < sysctl_sched_migration_cost &&
-			 pse->avg_overlap < sysctl_sched_migration_cost))) {
+	if ((sched_feat(WAKEUP_SYNC) && sync) ||
+	    (sched_feat(WAKEUP_OVERLAP) &&
+	     (se->avg_overlap < sysctl_sched_migration_cost &&
+	      pse->avg_overlap < sysctl_sched_migration_cost))) {
 		resched_task(curr);
 		return;
 	}
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index e2dc63a5815..07c8250b404 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,6 +12,7 @@ SCHED_FEAT(ASYM_GRAN, 1)
 SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
+SCHED_FEAT(WAKEUP_SYNC, 0)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
 SCHED_FEAT(LAST_BUDDY, 1)
 SCHED_FEAT(OWNER_SPIN, 1)
-- 
cgit v1.2.3


From 3cb63d527f76e25dbccce4f577f21aecfa2abac7 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 11 Sep 2009 12:01:17 +0200
Subject: sched: Complete buddy switches

Add a NEXT_BUDDY feature flag to aid in debugging.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 3 ++-
 kernel/sched_features.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cea5b82242e..4f6356e70ad 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1501,7 +1501,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	 */
 	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
 		set_last_buddy(se);
-	set_next_buddy(pse);
+	if (sched_feat(NEXT_BUDDY))
+		set_next_buddy(pse);
 
 	/*
 	 * We can come here with TIF_NEED_RESCHED already set from new task
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 07c8250b404..6174c123399 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -14,5 +14,6 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
 SCHED_FEAT(WAKEUP_SYNC, 0)
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
+SCHED_FEAT(NEXT_BUDDY, 1)
 SCHED_FEAT(LAST_BUDDY, 1)
 SCHED_FEAT(OWNER_SPIN, 1)
-- 
cgit v1.2.3


From e26af0e8b2c9916f1e8a12ddaf52057bbe8ff600 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 11 Sep 2009 12:31:23 +0200
Subject: sched: Add come comments to the sched features

Add text...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_features.h | 93 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 85 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 6174c123399..891ea0f72b4 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,19 +1,96 @@
+/*
+ * Disregards a certain amount of sleep time (sched_latency_ns) and
+ * considers the task to be running during that period. This gives it
+ * a service deficit on wakeup, allowing it to run sooner.
+ */
 SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
+
+/*
+ * By not normalizing the sleep time, heavy tasks get an effective
+ * longer period, and lighter task an effective shorter period they
+ * are considered running.
+ */
 SCHED_FEAT(NORMALIZED_SLEEPER, 0)
-SCHED_FEAT(ADAPTIVE_GRAN, 1)
-SCHED_FEAT(WAKEUP_PREEMPT, 1)
+
+/*
+ * Place new tasks ahead so that they do not starve already running
+ * tasks
+ */
 SCHED_FEAT(START_DEBIT, 1)
+
+/*
+ * Should wakeups try to preempt running tasks.
+ */
+SCHED_FEAT(WAKEUP_PREEMPT, 1)
+
+/*
+ * Compute wakeup_gran based on task behaviour, clipped to
+ *  [0, sched_wakeup_gran_ns]
+ */
+SCHED_FEAT(ADAPTIVE_GRAN, 1)
+
+/*
+ * When converting the wakeup granularity to virtual time, do it such
+ * that heavier tasks preempting a lighter task have an edge.
+ */
+SCHED_FEAT(ASYM_GRAN, 1)
+
+/*
+ * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
+ */
+SCHED_FEAT(WAKEUP_SYNC, 0)
+
+/*
+ * Wakeup preempt based on task behaviour. Tasks that do not overlap
+ * don't get preempted.
+ */
+SCHED_FEAT(WAKEUP_OVERLAP, 0)
+
+/*
+ * Use the SYNC wakeup hint, pipes and the likes use this to indicate
+ * the remote end is likely to consume the data we just wrote, and
+ * therefore has cache benefit from being placed on the same cpu, see
+ * also AFFINE_WAKEUPS.
+ */
+SCHED_FEAT(SYNC_WAKEUPS, 1)
+
+/*
+ * Based on load and program behaviour, see if it makes sense to place
+ * a newly woken task on the same cpu as the task that woke it --
+ * improve cache locality. Typically used with SYNC wakeups as
+ * generated by pipes and the like, see also SYNC_WAKEUPS.
+ */
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
+
+/*
+ * Prefer to schedule the task we woke last (assuming it failed
+ * wakeup-preemption), since its likely going to consume data we
+ * touched, increases cache locality.
+ */
+SCHED_FEAT(NEXT_BUDDY, 1)
+
+/*
+ * Prefer to schedule the task that ran last (when we did
+ * wake-preempt) as that likely will touch the same data, increases
+ * cache locality.
+ */
+SCHED_FEAT(LAST_BUDDY, 1)
+
+/*
+ * Consider buddies to be cache hot, decreases the likelyness of a
+ * cache buddy being migrated away, increases cache locality.
+ */
 SCHED_FEAT(CACHE_HOT_BUDDY, 1)
-SCHED_FEAT(SYNC_WAKEUPS, 1)
+
 SCHED_FEAT(HRTICK, 0)
 SCHED_FEAT(DOUBLE_TICK, 0)
-SCHED_FEAT(ASYM_GRAN, 1)
 SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
-SCHED_FEAT(WAKEUP_SYNC, 0)
-SCHED_FEAT(WAKEUP_OVERLAP, 0)
-SCHED_FEAT(NEXT_BUDDY, 1)
-SCHED_FEAT(LAST_BUDDY, 1)
+
+/*
+ * Spin-wait on mutex acquisition when the mutex owner is running on
+ * another cpu -- assumes that when the owner is running, it will soon
+ * release the lock. Decreases scheduling overhead.
+ */
 SCHED_FEAT(OWNER_SPIN, 1)
-- 
cgit v1.2.3


From f5f08f39ee4c5fd0a757d25d9e04d696676b3df7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 10 Sep 2009 13:35:28 +0200
Subject: sched: Move code around

In preparation to other code movement, move weighted_cpuload(),
source_load() and target_load() before the class includes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 81 ++++++++++++++++++++++++++++------------------------------
 1 file changed, 39 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 17e4391ec2d..b56d1505d05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1507,8 +1507,45 @@ static int tg_nop(struct task_group *tg, void *data)
 #endif
 
 #ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+	return cpu_rq(cpu)->load.weight;
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static unsigned long source_load(int cpu, int type)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long total = weighted_cpuload(cpu);
+
+	if (type == 0 || !sched_feat(LB_BIAS))
+		return total;
+
+	return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ */
+static unsigned long target_load(int cpu, int type)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long total = weighted_cpuload(cpu);
+
+	if (type == 0 || !sched_feat(LB_BIAS))
+		return total;
+
+	return max(rq->cpu_load[type-1], total);
+}
+
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1959,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 }
 
 #ifdef CONFIG_SMP
-
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
-	return cpu_rq(cpu)->load.weight;
-}
-
 /*
  * Is this task likely cache-hot:
  */
@@ -2240,39 +2270,6 @@ void kick_process(struct task_struct *p)
 }
 EXPORT_SYMBOL_GPL(kick_process);
 
-/*
- * Return a low guess at the load of a migration-source cpu weighted
- * according to the scheduling class and "nice" value.
- *
- * We want to under-estimate the load of migration sources, to
- * balance conservatively.
- */
-static unsigned long source_load(int cpu, int type)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
-
-	if (type == 0 || !sched_feat(LB_BIAS))
-		return total;
-
-	return min(rq->cpu_load[type-1], total);
-}
-
-/*
- * Return a high guess at the load of a migration-target cpu weighted
- * according to the scheduling class and "nice" value.
- */
-static unsigned long target_load(int cpu, int type)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long total = weighted_cpuload(cpu);
-
-	if (type == 0 || !sched_feat(LB_BIAS))
-		return total;
-
-	return max(rq->cpu_load[type-1], total);
-}
-
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
-- 
cgit v1.2.3


From aaee1203ca52b9db799433c33c9bffc33cdf8909 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 10 Sep 2009 13:36:25 +0200
Subject: sched: Move sched_balance_self() into sched_fair.c

Move the sched_balance_self() code into sched_fair.c

This facilitates the merger of sched_balance_self() and
sched_fair::select_task_rq().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 146 ----------------------------------------------------
 kernel/sched_fair.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+), 146 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b56d1505d05..60400a22401 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2269,152 +2269,6 @@ void kick_process(struct task_struct *p)
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- */
-static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
-{
-	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
-	unsigned long min_load = ULONG_MAX, this_load = 0;
-	int load_idx = sd->forkexec_idx;
-	int imbalance = 100 + (sd->imbalance_pct-100)/2;
-
-	do {
-		unsigned long load, avg_load;
-		int local_group;
-		int i;
-
-		/* Skip over this group if it has no CPUs allowed */
-		if (!cpumask_intersects(sched_group_cpus(group),
-					&p->cpus_allowed))
-			continue;
-
-		local_group = cpumask_test_cpu(this_cpu,
-					       sched_group_cpus(group));
-
-		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
-
-		for_each_cpu(i, sched_group_cpus(group)) {
-			/* Bias balancing toward cpus of our domain */
-			if (local_group)
-				load = source_load(i, load_idx);
-			else
-				load = target_load(i, load_idx);
-
-			avg_load += load;
-		}
-
-		/* Adjust by relative CPU power of the group */
-		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
-
-		if (local_group) {
-			this_load = avg_load;
-			this = group;
-		} else if (avg_load < min_load) {
-			min_load = avg_load;
-			idlest = group;
-		}
-	} while (group = group->next, group != sd->groups);
-
-	if (!idlest || 100*this_load < imbalance*min_load)
-		return NULL;
-	return idlest;
-}
-
-/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
- */
-static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
-{
-	unsigned long load, min_load = ULONG_MAX;
-	int idlest = -1;
-	int i;
-
-	/* Traverse only the allowed CPUs */
-	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
-		load = weighted_cpuload(i);
-
-		if (load < min_load || (load == min_load && i == this_cpu)) {
-			min_load = load;
-			idlest = i;
-		}
-	}
-
-	return idlest;
-}
-
-/*
- * sched_balance_self: balance the current task (running on cpu) in domains
- * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
- * SD_BALANCE_EXEC.
- *
- * Balance, ie. select the least loaded group.
- *
- * Returns the target CPU number, or the same CPU if no balancing is needed.
- *
- * preempt must be disabled.
- */
-static int sched_balance_self(int cpu, int flag)
-{
-	struct task_struct *t = current;
-	struct sched_domain *tmp, *sd = NULL;
-
-	for_each_domain(cpu, tmp) {
-		/*
-		 * If power savings logic is enabled for a domain, stop there.
-		 */
-		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
-			break;
-		if (tmp->flags & flag)
-			sd = tmp;
-	}
-
-	if (sd)
-		update_shares(sd);
-
-	while (sd) {
-		struct sched_group *group;
-		int new_cpu, weight;
-
-		if (!(sd->flags & flag)) {
-			sd = sd->child;
-			continue;
-		}
-
-		group = find_idlest_group(sd, t, cpu);
-		if (!group) {
-			sd = sd->child;
-			continue;
-		}
-
-		new_cpu = find_idlest_cpu(group, t, cpu);
-		if (new_cpu == -1 || new_cpu == cpu) {
-			/* Now try balancing at a lower domain level of cpu */
-			sd = sd->child;
-			continue;
-		}
-
-		/* Now try balancing at a lower domain level of new_cpu */
-		cpu = new_cpu;
-		weight = cpumask_weight(sched_domain_span(sd));
-		sd = NULL;
-		for_each_domain(cpu, tmp) {
-			if (weight <= cpumask_weight(sched_domain_span(tmp)))
-				break;
-			if (tmp->flags & flag)
-				sd = tmp;
-		}
-		/* while loop will break here if sd == NULL */
-	}
-
-	return cpu;
-}
-
 #endif /* CONFIG_SMP */
 
 /**
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4f6356e70ad..a82d71d3afe 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1360,6 +1360,151 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 out:
 	return wake_idle(new_cpu, p);
 }
+
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+	unsigned long min_load = ULONG_MAX, this_load = 0;
+	int load_idx = sd->forkexec_idx;
+	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+	do {
+		unsigned long load, avg_load;
+		int local_group;
+		int i;
+
+		/* Skip over this group if it has no CPUs allowed */
+		if (!cpumask_intersects(sched_group_cpus(group),
+					&p->cpus_allowed))
+			continue;
+
+		local_group = cpumask_test_cpu(this_cpu,
+					       sched_group_cpus(group));
+
+		/* Tally up the load of all CPUs in the group */
+		avg_load = 0;
+
+		for_each_cpu(i, sched_group_cpus(group)) {
+			/* Bias balancing toward cpus of our domain */
+			if (local_group)
+				load = source_load(i, load_idx);
+			else
+				load = target_load(i, load_idx);
+
+			avg_load += load;
+		}
+
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+		if (local_group) {
+			this_load = avg_load;
+			this = group;
+		} else if (avg_load < min_load) {
+			min_load = avg_load;
+			idlest = group;
+		}
+	} while (group = group->next, group != sd->groups);
+
+	if (!idlest || 100*this_load < imbalance*min_load)
+		return NULL;
+	return idlest;
+}
+
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+{
+	unsigned long load, min_load = ULONG_MAX;
+	int idlest = -1;
+	int i;
+
+	/* Traverse only the allowed CPUs */
+	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+		load = weighted_cpuload(i);
+
+		if (load < min_load || (load == min_load && i == this_cpu)) {
+			min_load = load;
+			idlest = i;
+		}
+	}
+
+	return idlest;
+}
+
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+	struct task_struct *t = current;
+	struct sched_domain *tmp, *sd = NULL;
+
+	for_each_domain(cpu, tmp) {
+		/*
+		 * If power savings logic is enabled for a domain, stop there.
+		 */
+		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+			break;
+		if (tmp->flags & flag)
+			sd = tmp;
+	}
+
+	if (sd)
+		update_shares(sd);
+
+	while (sd) {
+		struct sched_group *group;
+		int new_cpu, weight;
+
+		if (!(sd->flags & flag)) {
+			sd = sd->child;
+			continue;
+		}
+
+		group = find_idlest_group(sd, t, cpu);
+		if (!group) {
+			sd = sd->child;
+			continue;
+		}
+
+		new_cpu = find_idlest_cpu(group, t, cpu);
+		if (new_cpu == -1 || new_cpu == cpu) {
+			/* Now try balancing at a lower domain level of cpu */
+			sd = sd->child;
+			continue;
+		}
+
+		/* Now try balancing at a lower domain level of new_cpu */
+		cpu = new_cpu;
+		weight = cpumask_weight(sched_domain_span(sd));
+		sd = NULL;
+		for_each_domain(cpu, tmp) {
+			if (weight <= cpumask_weight(sched_domain_span(tmp)))
+				break;
+			if (tmp->flags & flag)
+				sd = tmp;
+		}
+		/* while loop will break here if sd == NULL */
+	}
+
+	return cpu;
+}
 #endif /* CONFIG_SMP */
 
 /*
-- 
cgit v1.2.3


From 5f3edc1b1ead6d9bd45a85c551f44eff8fe76b9f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 10 Sep 2009 13:42:00 +0200
Subject: sched: Hook sched_balance_self() into sched_class::select_task_rq()

Rather ugly patch to fully place the sched_balance_self() code
inside the fair class.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 14 +++++++-------
 kernel/sched_fair.c     |  7 ++++++-
 kernel/sched_idletask.c |  2 +-
 kernel/sched_rt.c       |  5 ++++-
 4 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 60400a22401..32b7a81230c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2350,7 +2350,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-	cpu = p->sched_class->select_task_rq(p, sync);
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync);
 	if (cpu != orig_cpu) {
 		set_task_cpu(p, cpu);
 		task_rq_unlock(rq, &flags);
@@ -2525,11 +2525,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
 
 	__sched_fork(p);
 
-#ifdef CONFIG_SMP
-	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
-#endif
-	set_task_cpu(p, cpu);
-
 	/*
 	 * Make sure we do not leak PI boosting priority to the child.
 	 */
@@ -2560,6 +2555,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
+#ifdef CONFIG_SMP
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
+#endif
+	set_task_cpu(p, cpu);
+
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -3114,7 +3114,7 @@ out:
 void sched_exec(void)
 {
 	int new_cpu, this_cpu = get_cpu();
-	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
+	new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
 	put_cpu();
 	if (new_cpu != this_cpu)
 		sched_migrate_task(current, new_cpu);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a82d71d3afe..f2eb5b93471 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1300,7 +1300,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	return 0;
 }
 
-static int select_task_rq_fair(struct task_struct *p, int sync)
+static int sched_balance_self(int cpu, int flag);
+
+static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 {
 	struct sched_domain *sd, *this_sd = NULL;
 	int prev_cpu, this_cpu, new_cpu;
@@ -1314,6 +1316,9 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	this_rq		= cpu_rq(this_cpu);
 	new_cpu		= prev_cpu;
 
+	if (flag != SD_BALANCE_WAKE)
+		return sched_balance_self(this_cpu, flag);
+
 	/*
 	 * 'this_sd' is the first domain that both
 	 * this_cpu and prev_cpu are present in:
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cb..99b2f033760 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sync)
+static int select_task_rq_idle(struct task_struct *p, int flag, int sync)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2eb4bd6a526..438380810ac 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int sync)
+static int select_task_rq_rt(struct task_struct *p, int flag, int sync)
 {
 	struct rq *rq = task_rq(p);
 
+	if (flag != SD_BALANCE_WAKE)
+		return smp_processor_id();
+
 	/*
 	 * If the current task is an RT task, then
 	 * try to see if we can wake this RT task up on another
-- 
cgit v1.2.3


From e9c8431185d6c406887190519f6dbdd112641686 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 15 Sep 2009 14:43:03 +0200
Subject: sched: Add TASK_WAKING

We're going to want to drop rq->lock in try_to_wake_up() for a
longer period of time, however we also want to deal with concurrent
waking of the same task, which is currently handled by holding
rq->lock.

So introduce a new TASK state, namely TASK_WAKING, which indicates
someone is already waking the task (other wakers will fail p->state
& state).

We also keep preemption disabled over the whole ttwu().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 32b7a81230c..fc6fda881d2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2310,7 +2310,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 {
 	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
-	long old_state;
 	struct rq *rq;
 
 	if (!sched_feat(SYNC_WAKEUPS))
@@ -2332,11 +2331,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	}
 #endif
 
+	this_cpu = get_cpu();
+
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
 	update_rq_clock(rq);
-	old_state = p->state;
-	if (!(old_state & state))
+	if (!(p->state & state))
 		goto out;
 
 	if (p->se.on_rq)
@@ -2344,27 +2344,25 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 	cpu = task_cpu(p);
 	orig_cpu = cpu;
-	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
+	/*
+	 * In order to handle concurrent wakeups and release the rq->lock
+	 * we put the task in TASK_WAKING state.
+	 */
+	p->state = TASK_WAKING;
+	task_rq_unlock(rq, &flags);
+
 	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync);
-	if (cpu != orig_cpu) {
+	if (cpu != orig_cpu)
 		set_task_cpu(p, cpu);
-		task_rq_unlock(rq, &flags);
-		/* might preempt at this point */
-		rq = task_rq_lock(p, &flags);
-		old_state = p->state;
-		if (!(old_state & state))
-			goto out;
-		if (p->se.on_rq)
-			goto out_running;
 
-		this_cpu = smp_processor_id();
-		cpu = task_cpu(p);
-	}
+	rq = task_rq_lock(p, &flags);
+	WARN_ON(p->state != TASK_WAKING);
+	cpu = task_cpu(p);
 
 #ifdef CONFIG_SCHEDSTATS
 	schedstat_inc(rq, ttwu_count);
@@ -2422,6 +2420,7 @@ out_running:
 #endif
 out:
 	task_rq_unlock(rq, &flags);
+	put_cpu();
 
 	return success;
 }
-- 
cgit v1.2.3


From c88d5910890ad35af283344417891344604f0438 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 10 Sep 2009 13:50:02 +0200
Subject: sched: Merge select_task_rq_fair() and sched_balance_self()

The problem with wake_idle() is that is doesn't respect things like
cpu_power, which means it doesn't deal well with SMT nor the recent
RT interaction.

To cure this, it needs to do what sched_balance_self() does, which
leads to the possibility of merging select_task_rq_fair() and
sched_balance_self().

Modify sched_balance_self() to:

  - update_shares() when walking up the domain tree,
    (it only called it for the top domain, but it should
     have done this anyway), which allows us to remove
    this ugly bit from try_to_wake_up().

  - do wake_affine() on the smallest domain that contains
    both this (the waking) and the prev (the wakee) cpu for
    WAKE invocations.

Then use the top-down balance steps it had to replace wake_idle().

This leads to the dissapearance of SD_WAKE_BALANCE and
SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE.

SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective.

Touch all topology bits to replace the old with new SD flags --
platforms might need re-tuning, enabling SD_BALANCE_WAKE
conditionally on a NUMA distance seems like a good additional
feature, magny-core and small nehalem systems would want this
enabled, systems with slow interconnects would not.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  41 +--------
 kernel/sched_fair.c | 233 ++++++++++++++--------------------------------------
 2 files changed, 66 insertions(+), 208 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index fc6fda881d2..6c819f338b1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -512,14 +512,6 @@ struct root_domain {
 #ifdef CONFIG_SMP
 	struct cpupri cpupri;
 #endif
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	/*
-	 * Preferred wake up cpu nominated by sched_mc balance that will be
-	 * used when most cpus are idle in the system indicating overall very
-	 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
-	 */
-	unsigned int sched_mc_preferred_wakeup_cpu;
-#endif
 };
 
 /*
@@ -2315,22 +2307,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
-#ifdef CONFIG_SMP
-	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
-		struct sched_domain *sd;
-
-		this_cpu = raw_smp_processor_id();
-		cpu = task_cpu(p);
-
-		for_each_domain(this_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				update_shares(sd);
-				break;
-			}
-		}
-	}
-#endif
-
 	this_cpu = get_cpu();
 
 	smp_wmb();
@@ -3533,11 +3509,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 	*imbalance = sds->min_load_per_task;
 	sds->busiest = sds->group_min;
 
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-			group_first_cpu(sds->group_leader);
-	}
-
 	return 1;
 
 }
@@ -7850,9 +7821,7 @@ static int sd_degenerate(struct sched_domain *sd)
 	}
 
 	/* Following flags don't use groups */
-	if (sd->flags & (SD_WAKE_IDLE |
-			 SD_WAKE_AFFINE |
-			 SD_WAKE_BALANCE))
+	if (sd->flags & (SD_WAKE_AFFINE))
 		return 0;
 
 	return 1;
@@ -7869,10 +7838,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
 		return 0;
 
-	/* Does parent contain flags not in child? */
-	/* WAKE_BALANCE is a subset of WAKE_AFFINE */
-	if (cflags & SD_WAKE_AFFINE)
-		pflags &= ~SD_WAKE_BALANCE;
 	/* Flags needing groups don't count if only 1 group in parent */
 	if (parent->groups == parent->groups->next) {
 		pflags &= ~(SD_LOAD_BALANCE |
@@ -8558,10 +8523,10 @@ static void set_domain_attribute(struct sched_domain *sd,
 		request = attr->relax_domain_level;
 	if (request < sd->level) {
 		/* turn off idle balance on this domain */
-		sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE);
+		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	} else {
 		/* turn on idle balance on this domain */
-		sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE);
+		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2eb5b93471..09d19f77eb3 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1062,83 +1062,6 @@ static void yield_task_fair(struct rq *rq)
 	se->vruntime = rightmost->vruntime + 1;
 }
 
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available.  The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (rq->rd->online)
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-
-#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
-
-static int wake_idle(int cpu, struct task_struct *p)
-{
-	struct sched_domain *sd;
-	int i;
-	unsigned int chosen_wakeup_cpu;
-	int this_cpu;
-	struct rq *task_rq = task_rq(p);
-
-	/*
-	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
-	 * are idle and this is not a kernel thread and this task's affinity
-	 * allows it to be moved to preferred cpu, then just move!
-	 */
-
-	this_cpu = smp_processor_id();
-	chosen_wakeup_cpu =
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
-		idle_cpu(cpu) && idle_cpu(this_cpu) &&
-		p->mm && !(p->flags & PF_KTHREAD) &&
-		cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
-		return chosen_wakeup_cpu;
-
-	/*
-	 * If it is idle, then it is the best cpu to run this task.
-	 *
-	 * This cpu is also the best, if it has more than one task already.
-	 * Siblings must be also busy(in most cases) as they didn't already
-	 * pickup the extra load from this cpu and hence we need not check
-	 * sibling runqueue info. This will avoid the checks and cache miss
-	 * penalities associated with that.
-	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
-		return cpu;
-
-	for_each_domain(cpu, sd) {
-		if ((sd->flags & SD_WAKE_IDLE)
-		    || ((sd->flags & SD_WAKE_IDLE_FAR)
-			&& !task_hot(p, task_rq->clock, sd))) {
-			for_each_cpu_and(i, sched_domain_span(sd),
-					 &p->cpus_allowed) {
-				if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
-					if (i != task_cpu(p)) {
-						schedstat_inc(p,
-						       se.nr_wakeups_idle);
-					}
-					return i;
-				}
-			}
-		} else {
-			break;
-		}
-	}
-	return cpu;
-}
-#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-	return cpu;
-}
-#endif
-
 #ifdef CONFIG_SMP
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,21 +1148,22 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 #endif
 
-static int
-wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
-	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
-	    int idx, unsigned long load, unsigned long this_load,
-	    unsigned int imbalance)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-	struct task_struct *curr = this_rq->curr;
-	struct task_group *tg;
-	unsigned long tl = this_load;
+	struct task_struct *curr = current;
+	unsigned long this_load, load;
+	int idx, this_cpu, prev_cpu;
 	unsigned long tl_per_task;
+	unsigned int imbalance;
+	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
 
-	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
-		return 0;
+	idx	  = sd->wake_idx;
+	this_cpu  = smp_processor_id();
+	prev_cpu  = task_cpu(p);
+	load	  = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
 
 	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
 			p->se.avg_overlap > sysctl_sched_migration_cost))
@@ -1254,24 +1178,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 		tg = task_group(current);
 		weight = current->se.load.weight;
 
-		tl += effective_load(tg, this_cpu, -weight, -weight);
+		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 
 	tg = task_group(p);
 	weight = p->se.load.weight;
 
+	imbalance = 100 + (sd->imbalance_pct - 100) / 2;
+
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
-	 * due to the sync cause above having dropped tl to 0, we'll always have
-	 * an imbalance, but there's really nothing you can do about that, so
-	 * that's good too.
+	 * due to the sync cause above having dropped this_load to 0, we'll
+	 * always have an imbalance, but there's really nothing you can do
+	 * about that, so that's good too.
 	 *
 	 * Otherwise check if either cpus are near enough in load to allow this
 	 * task to be woken on this_cpu.
 	 */
-	balanced = !tl ||
-		100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+	balanced = !this_load ||
+		100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
 		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
@@ -1285,14 +1211,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
-			tl_per_task)) {
+	if (balanced ||
+	    (this_load <= load &&
+	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
 		 * there is no bad imbalance.
 		 */
-		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(sd, ttwu_move_affine);
 		schedstat_inc(p, se.nr_wakeups_affine);
 
 		return 1;
@@ -1300,72 +1227,6 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	return 0;
 }
 
-static int sched_balance_self(int cpu, int flag);
-
-static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
-{
-	struct sched_domain *sd, *this_sd = NULL;
-	int prev_cpu, this_cpu, new_cpu;
-	unsigned long load, this_load;
-	struct rq *this_rq;
-	unsigned int imbalance;
-	int idx;
-
-	prev_cpu	= task_cpu(p);
-	this_cpu	= smp_processor_id();
-	this_rq		= cpu_rq(this_cpu);
-	new_cpu		= prev_cpu;
-
-	if (flag != SD_BALANCE_WAKE)
-		return sched_balance_self(this_cpu, flag);
-
-	/*
-	 * 'this_sd' is the first domain that both
-	 * this_cpu and prev_cpu are present in:
-	 */
-	for_each_domain(this_cpu, sd) {
-		if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
-			this_sd = sd;
-			break;
-		}
-	}
-
-	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
-		goto out;
-
-	/*
-	 * Check for affine wakeup and passive balancing possibilities.
-	 */
-	if (!this_sd)
-		goto out;
-
-	idx = this_sd->wake_idx;
-
-	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-	load = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
-
-	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
-				     load, this_load, imbalance))
-		return this_cpu;
-
-	/*
-	 * Start passive balancing when half the imbalance_pct
-	 * limit is reached.
-	 */
-	if (this_sd->flags & SD_WAKE_BALANCE) {
-		if (imbalance*this_load <= 100*load) {
-			schedstat_inc(this_sd, ttwu_move_balance);
-			schedstat_inc(p, se.nr_wakeups_passive);
-			return this_cpu;
-		}
-	}
-
-out:
-	return wake_idle(new_cpu, p);
-}
-
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -1455,10 +1316,20 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  *
  * preempt must be disabled.
  */
-static int sched_balance_self(int cpu, int flag)
+static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 {
 	struct task_struct *t = current;
 	struct sched_domain *tmp, *sd = NULL;
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int new_cpu = cpu;
+	int want_affine = 0;
+
+	if (flag & SD_BALANCE_WAKE) {
+		if (sched_feat(AFFINE_WAKEUPS))
+			want_affine = 1;
+		new_cpu = prev_cpu;
+	}
 
 	for_each_domain(cpu, tmp) {
 		/*
@@ -1466,16 +1337,38 @@ static int sched_balance_self(int cpu, int flag)
 		 */
 		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
 			break;
-		if (tmp->flags & flag)
-			sd = tmp;
-	}
 
-	if (sd)
-		update_shares(sd);
+		switch (flag) {
+		case SD_BALANCE_WAKE:
+			if (!sched_feat(LB_WAKEUP_UPDATE))
+				break;
+		case SD_BALANCE_FORK:
+		case SD_BALANCE_EXEC:
+			if (root_task_group_empty())
+				break;
+			update_shares(tmp);
+		default:
+			break;
+		}
+
+		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+
+			if (wake_affine(tmp, p, sync))
+				return cpu;
+
+			want_affine = 0;
+		}
+
+		if (!(tmp->flags & flag))
+			continue;
+
+		sd = tmp;
+	}
 
 	while (sd) {
 		struct sched_group *group;
-		int new_cpu, weight;
+		int weight;
 
 		if (!(sd->flags & flag)) {
 			sd = sd->child;
@@ -1508,7 +1401,7 @@ static int sched_balance_self(int cpu, int flag)
 		/* while loop will break here if sd == NULL */
 	}
 
-	return cpu;
+	return new_cpu;
 }
 #endif /* CONFIG_SMP */
 
-- 
cgit v1.2.3


From ae154be1f34a674e6cbb43ccf6e442f56acd7a70 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 10 Sep 2009 14:40:57 +0200
Subject: sched: Weaken SD_POWERSAVINGS_BALANCE

One of the problems of power-saving balancing is that under certain
scenarios it is too slow and allows tons of real work to pile up.

Avoid this by ignoring the powersave stuff when there's real work
to be done.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 40 ++++++++++++++++++++--------------------
 kernel/sched_fair.c | 21 ++++++++++++++++++---
 2 files changed, 38 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6c819f338b1..f0ccb8b926c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1538,6 +1538,26 @@ static unsigned long target_load(int cpu, int type)
 	return max(rq->cpu_load[type-1], total);
 }
 
+static struct sched_group *group_of(int cpu)
+{
+	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+
+	if (!sd)
+		return NULL;
+
+	return sd->groups;
+}
+
+static unsigned long power_of(int cpu)
+{
+	struct sched_group *group = group_of(cpu);
+
+	if (!group)
+		return SCHED_LOAD_SCALE;
+
+	return group->cpu_power;
+}
+
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 static unsigned long cpu_avg_load_per_task(int cpu)
@@ -3982,26 +4002,6 @@ ret:
 	return NULL;
 }
 
-static struct sched_group *group_of(int cpu)
-{
-	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
-
-	if (!sd)
-		return NULL;
-
-	return sd->groups;
-}
-
-static unsigned long power_of(int cpu)
-{
-	struct sched_group *group = group_of(cpu);
-
-	if (!group)
-		return SCHED_LOAD_SCALE;
-
-	return group->cpu_power;
-}
-
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 09d19f77eb3..eaa00014b49 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1333,10 +1333,25 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 
 	for_each_domain(cpu, tmp) {
 		/*
-		 * If power savings logic is enabled for a domain, stop there.
+		 * If power savings logic is enabled for a domain, see if we
+		 * are not overloaded, if so, don't balance wider.
 		 */
-		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
-			break;
+		if (tmp->flags & SD_POWERSAVINGS_BALANCE) {
+			unsigned long power = 0;
+			unsigned long nr_running = 0;
+			unsigned long capacity;
+			int i;
+
+			for_each_cpu(i, sched_domain_span(tmp)) {
+				power += power_of(i);
+				nr_running += cpu_rq(i)->cfs.nr_running;
+			}
+
+			capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+
+			if (nr_running/2 < capacity)
+				break;
+		}
 
 		switch (flag) {
 		case SD_BALANCE_WAKE:
-- 
cgit v1.2.3


From 83f54960c11a14942ab00b54c51e91906b9d8235 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 10 Sep 2009 18:18:47 +0200
Subject: sched: for_each_domain() vs RCU

for_each_domain() uses RCU to serialize the sched_domains, except
it doesn't actually use rcu_read_lock() and instead relies on
disabling preemption -> FAIL.

XXX: audit other sched_domain code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index eaa00014b49..43dc6d1d9e8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1331,6 +1331,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 		new_cpu = prev_cpu;
 	}
 
+	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If power savings logic is enabled for a domain, see if we
@@ -1369,8 +1370,10 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
 
-			if (wake_affine(tmp, p, sync))
-				return cpu;
+			if (wake_affine(tmp, p, sync)) {
+				new_cpu = cpu;
+				goto out;
+			}
 
 			want_affine = 0;
 		}
@@ -1416,6 +1419,8 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 		/* while loop will break here if sd == NULL */
 	}
 
+out:
+	rcu_read_unlock();
 	return new_cpu;
 }
 #endif /* CONFIG_SMP */
-- 
cgit v1.2.3


From d7c33c4930f569caf6b2ece597432853c4151a45 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 11 Sep 2009 12:45:38 +0200
Subject: sched: Fix task affinity for select_task_rq_fair

While merging select_task_rq_fair() and sched_balance_self() I made
a mistake that leads to testing the wrong task affinty.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 43dc6d1d9e8..8b3eddbcf9a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1318,7 +1318,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  */
 static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 {
-	struct task_struct *t = current;
 	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
@@ -1393,13 +1392,13 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 			continue;
 		}
 
-		group = find_idlest_group(sd, t, cpu);
+		group = find_idlest_group(sd, p, cpu);
 		if (!group) {
 			sd = sd->child;
 			continue;
 		}
 
-		new_cpu = find_idlest_cpu(group, t, cpu);
+		new_cpu = find_idlest_cpu(group, p, cpu);
 		if (new_cpu == -1 || new_cpu == cpu) {
 			/* Now try balancing at a lower domain level of cpu */
 			sd = sd->child;
-- 
cgit v1.2.3


From 78e7ed53c9f42f04f9401ada6f7047db60781676 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 3 Sep 2009 13:16:51 +0200
Subject: sched: Tweak wake_idx

When merging select_task_rq_fair() and sched_balance_self() we lost
the use of wake_idx, restore that and set them to 0 to make wake
balancing more aggressive.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8b3eddbcf9a..19593568031 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1232,12 +1232,27 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  * domain.
  */
 static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+		  int this_cpu, int flag)
 {
 	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
-	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+	int load_idx = 0;
+
+	switch (flag) {
+	case SD_BALANCE_FORK:
+	case SD_BALANCE_EXEC:
+		load_idx = sd->forkexec_idx;
+		break;
+
+	case SD_BALANCE_WAKE:
+		load_idx = sd->wake_idx;
+		break;
+
+	default:
+		break;
+	}
 
 	do {
 		unsigned long load, avg_load;
@@ -1392,7 +1407,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 			continue;
 		}
 
-		group = find_idlest_group(sd, p, cpu);
+		group = find_idlest_group(sd, p, cpu, flag);
 		if (!group) {
 			sd = sd->child;
 			continue;
-- 
cgit v1.2.3


From 0ec9fab3d186d9cbb00c0f694d4a260d07c198d9 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 15 Sep 2009 15:07:03 +0200
Subject: sched: Improve latencies and throughput

Make the idle balancer more agressive, to improve a
x264 encoding workload provided by Jason Garrett-Glaser:

 NEXT_BUDDY NO_LB_BIAS
 encoded 600 frames, 252.82 fps, 22096.60 kb/s
 encoded 600 frames, 250.69 fps, 22096.60 kb/s
 encoded 600 frames, 245.76 fps, 22096.60 kb/s

 NO_NEXT_BUDDY LB_BIAS
 encoded 600 frames, 344.44 fps, 22096.60 kb/s
 encoded 600 frames, 346.66 fps, 22096.60 kb/s
 encoded 600 frames, 352.59 fps, 22096.60 kb/s

 NO_NEXT_BUDDY NO_LB_BIAS
 encoded 600 frames, 425.75 fps, 22096.60 kb/s
 encoded 600 frames, 425.45 fps, 22096.60 kb/s
 encoded 600 frames, 422.49 fps, 22096.60 kb/s

Peter pointed out that this is better done via newidle_idx,
not via LB_BIAS, newidle balancing should look for where
there is load _now_, not where there was load 2 ticks ago.

Worst-case latencies are improved as well as no buddies
means less vruntime spread. (as per prior lkml discussions)

This change improves kbuild-peak parallelism as well.

Reported-by: Jason Garrett-Glaser <darkshikari@gmail.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1253011667.9128.16.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_features.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 891ea0f72b4..e98c2e8de1d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -67,7 +67,7 @@ SCHED_FEAT(AFFINE_WAKEUPS, 1)
  * wakeup-preemption), since its likely going to consume data we
  * touched, increases cache locality.
  */
-SCHED_FEAT(NEXT_BUDDY, 1)
+SCHED_FEAT(NEXT_BUDDY, 0)
 
 /*
  * Prefer to schedule the task that ran last (when we did
-- 
cgit v1.2.3


From d6a59aa3a2b1ca8411884c833a313b33b5f76e20 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 2 Sep 2009 13:28:02 +0200
Subject: sched: Provide arch_scale_freq_power

Provide an ach specific hook for cpufreq based scaling of
cpu_power.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
[ego@in.ibm.com: spotting bugs]
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f0ccb8b926c..c210321adcb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3552,7 +3552,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return SCHED_LOAD_SCALE;
+}
+
+unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return default_scale_freq_power(sd, cpu);
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
 {
 	unsigned long weight = cpumask_weight(sched_domain_span(sd));
 	unsigned long smt_gain = sd->smt_gain;
@@ -3562,6 +3573,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
 	return smt_gain;
 }
 
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+	return default_scale_smt_power(sd, cpu);
+}
+
 unsigned long scale_rt_power(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -3586,7 +3602,8 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 	unsigned long power = SCHED_LOAD_SCALE;
 	struct sched_group *sdg = sd->groups;
 
-	/* here we could scale based on cpufreq */
+	power *= arch_scale_freq_power(sd, cpu);
+	power >>= SCHED_LOAD_SHIFT;
 
 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
 		power *= arch_scale_smt_power(sd, cpu);
-- 
cgit v1.2.3


From 8e6598af3f35629c37249a610cf13e73f70db279 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 3 Sep 2009 13:20:03 +0200
Subject: sched: Feature to disable APERF/MPERF cpu_power

I suspect a feed-back loop between cpuidle and the aperf/mperf
cpu_power bits, where when we have idle C-states lower the ratio,
which leads to lower cpu_power and then less load, which generates
more idle time, etc..

Put in a knob to disable it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 12 ++++++++++--
 kernel/sched_features.h |  5 +++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c210321adcb..e8e603bf876 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3602,11 +3602,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
 	unsigned long power = SCHED_LOAD_SCALE;
 	struct sched_group *sdg = sd->groups;
 
-	power *= arch_scale_freq_power(sd, cpu);
+	if (sched_feat(ARCH_POWER))
+		power *= arch_scale_freq_power(sd, cpu);
+	else
+		power *= default_scale_freq_power(sd, cpu);
+
 	power >>= SCHED_LOAD_SHIFT;
 
 	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
-		power *= arch_scale_smt_power(sd, cpu);
+		if (sched_feat(ARCH_POWER))
+			power *= arch_scale_smt_power(sd, cpu);
+		else
+			power *= default_scale_smt_power(sd, cpu);
+
 		power >>= SCHED_LOAD_SHIFT;
 	}
 
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index e98c2e8de1d..294e10edd3c 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -82,6 +82,11 @@ SCHED_FEAT(LAST_BUDDY, 1)
  */
 SCHED_FEAT(CACHE_HOT_BUDDY, 1)
 
+/*
+ * Use arch dependent cpu power functions
+ */
+SCHED_FEAT(ARCH_POWER, 0)
+
 SCHED_FEAT(HRTICK, 0)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(LB_BIAS, 1)
-- 
cgit v1.2.3


From 0763a660a84220cc3900fd32abdd7ad109e2278d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 14 Sep 2009 19:37:39 +0200
Subject: sched: Rename select_task_rq() argument

In order to be able to rename the sync argument, we need to rename
the current flag argument.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 14 +++++++-------
 kernel/sched_idletask.c |  2 +-
 kernel/sched_rt.c       |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 19593568031..b554e63c521 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1331,7 +1331,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  *
  * preempt must be disabled.
  */
-static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
+static int select_task_rq_fair(struct task_struct *p, int sd_flag, int sync)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
@@ -1339,7 +1339,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 	int new_cpu = cpu;
 	int want_affine = 0;
 
-	if (flag & SD_BALANCE_WAKE) {
+	if (sd_flag & SD_BALANCE_WAKE) {
 		if (sched_feat(AFFINE_WAKEUPS))
 			want_affine = 1;
 		new_cpu = prev_cpu;
@@ -1368,7 +1368,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 				break;
 		}
 
-		switch (flag) {
+		switch (sd_flag) {
 		case SD_BALANCE_WAKE:
 			if (!sched_feat(LB_WAKEUP_UPDATE))
 				break;
@@ -1392,7 +1392,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 			want_affine = 0;
 		}
 
-		if (!(tmp->flags & flag))
+		if (!(tmp->flags & sd_flag))
 			continue;
 
 		sd = tmp;
@@ -1402,12 +1402,12 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 		struct sched_group *group;
 		int weight;
 
-		if (!(sd->flags & flag)) {
+		if (!(sd->flags & sd_flag)) {
 			sd = sd->child;
 			continue;
 		}
 
-		group = find_idlest_group(sd, p, cpu, flag);
+		group = find_idlest_group(sd, p, cpu, sd_flag);
 		if (!group) {
 			sd = sd->child;
 			continue;
@@ -1427,7 +1427,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync)
 		for_each_domain(cpu, tmp) {
 			if (weight <= cpumask_weight(sched_domain_span(tmp)))
 				break;
-			if (tmp->flags & flag)
+			if (tmp->flags & sd_flag)
 				sd = tmp;
 		}
 		/* while loop will break here if sd == NULL */
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 99b2f033760..9ff7697e5dc 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int flag, int sync)
+static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 438380810ac..97c53f3f51a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,11 +938,11 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int flag, int sync)
+static int select_task_rq_rt(struct task_struct *p, int sd_flag, int sync)
 {
 	struct rq *rq = task_rq(p);
 
-	if (flag != SD_BALANCE_WAKE)
+	if (sd_flag != SD_BALANCE_WAKE)
 		return smp_processor_id();
 
 	/*
-- 
cgit v1.2.3


From 7d47872146398dbede13223299fe1cb368ebc781 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 14 Sep 2009 19:55:44 +0200
Subject: sched: Rename sync arguments

In order to extend the functions to have more than 1 flag (sync),
rename the argument to flags, and explicitly define a WF_ space for
individual flags.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 30 ++++++++++++++++--------------
 kernel/sched_fair.c     |  6 ++++--
 kernel/sched_idletask.c |  4 ++--
 kernel/sched_rt.c       |  4 ++--
 4 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e8e603bf876..4da335cec8e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -636,9 +636,10 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
+static inline
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
+	rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -2318,14 +2319,15 @@ void task_oncpu_function_call(struct task_struct *p,
  *
  * returns failure only if the task is already active.
  */
-static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
+static int try_to_wake_up(struct task_struct *p, unsigned int state,
+			  int wake_flags)
 {
 	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
 	struct rq *rq;
 
 	if (!sched_feat(SYNC_WAKEUPS))
-		sync = 0;
+		wake_flags &= ~WF_SYNC;
 
 	this_cpu = get_cpu();
 
@@ -2352,7 +2354,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	p->state = TASK_WAKING;
 	task_rq_unlock(rq, &flags);
 
-	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync);
+	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
 	if (cpu != orig_cpu)
 		set_task_cpu(p, cpu);
 
@@ -2378,7 +2380,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 out_activate:
 #endif /* CONFIG_SMP */
 	schedstat_inc(p, se.nr_wakeups);
-	if (sync)
+	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.nr_wakeups_sync);
 	if (orig_cpu != cpu)
 		schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2407,7 +2409,7 @@ out_activate:
 
 out_running:
 	trace_sched_wakeup(rq, p, success);
-	check_preempt_curr(rq, p, sync);
+	check_preempt_curr(rq, p, wake_flags);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -5562,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
 
 #endif /* CONFIG_PREEMPT */
 
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+int default_wake_function(wait_queue_t *curr, unsigned mode, int flags,
 			  void *key)
 {
-	return try_to_wake_up(curr->private, mode, sync);
+	return try_to_wake_up(curr->private, mode, flags);
 }
 EXPORT_SYMBOL(default_wake_function);
 
@@ -5579,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function);
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-			int nr_exclusive, int sync, void *key)
+			int nr_exclusive, int flags, void *key)
 {
 	wait_queue_t *curr, *next;
 
 	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
 		unsigned flags = curr->flags;
 
-		if (curr->func(curr, mode, sync, key) &&
+		if (curr->func(curr, mode, flags, key) &&
 				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
 	}
@@ -5647,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
 {
 	unsigned long flags;
-	int sync = 1;
+	int wake_flags = WF_SYNC;
 
 	if (unlikely(!q))
 		return;
 
 	if (unlikely(!nr_exclusive))
-		sync = 0;
+		wake_flags = 0;
 
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive, sync, key);
+	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b554e63c521..007958e3c93 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1331,13 +1331,14 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  *
  * preempt must be disabled.
  */
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int sync)
+static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	int new_cpu = cpu;
 	int want_affine = 0;
+	int sync = flags & WF_SYNC;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		if (sched_feat(AFFINE_WAKEUPS))
@@ -1548,11 +1549,12 @@ static void set_next_buddy(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	int sync = flags & WF_SYNC;
 
 	update_curr(cfs_rq);
 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9ff7697e5dc..a8b448af004 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync)
+static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync)
 /*
  * Idle tasks are unconditionally rescheduled:
  */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
 {
 	resched_task(rq->idle);
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 97c53f3f51a..13de7126a6a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,7 +938,7 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int sync)
+static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 {
 	struct rq *rq = task_rq(p);
 
@@ -1002,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (p->prio < rq->curr->prio) {
 		resched_task(rq->curr);
-- 
cgit v1.2.3


From a7558e01056f5191ff2ecff53b075dcb9e484188 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 14 Sep 2009 20:02:34 +0200
Subject: sched: Add WF_FORK

Avoid the cache buddies from biasing the time distribution away
from fork()ers. Normally the next buddy will be the preferred
scheduling target, but this makes fork()s prefer to run the new
child, whereas we prefer to run the parent, since that will
generate more work.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 2 +-
 kernel/sched_fair.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 4da335cec8e..0d4c4fea331 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2602,7 +2602,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		inc_nr_running(rq);
 	}
 	trace_sched_wakeup_new(rq, p, 1);
-	check_preempt_curr(rq, p, 0);
+	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 007958e3c93..6766959c7f4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1580,7 +1580,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags
 	 */
 	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
 		set_last_buddy(se);
-	if (sched_feat(NEXT_BUDDY))
+	if (sched_feat(NEXT_BUDDY) && !(flags & WF_FORK))
 		set_next_buddy(pse);
 
 	/*
-- 
cgit v1.2.3


From 6ca6cca31ddc7cc8b1dc38b12d7593d2667defe8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 15 Sep 2009 12:24:22 -0400
Subject: tracing: optimize global_trace_clock cachelines

The prev_trace_clock_time is only read or written to when the
trace_clock_lock is taken. For better perfomance, they
should share the same cache line.

Reported-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_clock.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f..20c5f92e28a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -66,10 +66,14 @@ u64 notrace trace_clock(void)
  * Used by plugins that need globally coherent timestamps.
  */
 
-static u64 prev_trace_clock_time;
-
-static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp =
-	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+/* keep prev_time and lock in the same cacheline. */
+static struct {
+	u64 prev_time;
+	raw_spinlock_t lock;
+} trace_clock_struct ____cacheline_aligned_in_smp =
+	{
+		.lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
+	};
 
 u64 notrace trace_clock_global(void)
 {
@@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void)
 	if (unlikely(in_nmi()))
 		goto out;
 
-	__raw_spin_lock(&trace_clock_lock);
+	__raw_spin_lock(&trace_clock_struct.lock);
 
 	/*
 	 * TODO: if this happens often then maybe we should reset
-	 * my_scd->clock to prev_trace_clock_time+1, to make sure
+	 * my_scd->clock to prev_time+1, to make sure
 	 * we start ticking with the local clock from now on?
 	 */
-	if ((s64)(now - prev_trace_clock_time) < 0)
-		now = prev_trace_clock_time + 1;
+	if ((s64)(now - trace_clock_struct.prev_time) < 0)
+		now = trace_clock_struct.prev_time + 1;
 
-	prev_trace_clock_time = now;
+	trace_clock_struct.prev_time = now;
 
-	__raw_spin_unlock(&trace_clock_lock);
+	__raw_spin_unlock(&trace_clock_struct.lock);
 
  out:
 	raw_local_irq_restore(flags);
-- 
cgit v1.2.3


From a56af87648054089d89874b52e3fc23ed4f274ad Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sun, 12 Jul 2009 21:44:55 +0800
Subject: driver-core: move dma-coherent.c from kernel to driver/base

Placing dma-coherent.c in driver/base is better than in kernel,
since it contains code to do per-device coherent dma memory
handling.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/Makefile       |   1 -
 kernel/dma-coherent.c | 176 --------------------------------------------------
 2 files changed, 177 deletions(-)
 delete mode 100644 kernel/dma-coherent.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 961379caf66..3d9c7e27e3f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -90,7 +90,6 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f2..00000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Coherent per-device memory handling.
- * Borrowed from i386
- */
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-
-struct dma_coherent_mem {
-	void		*virt_base;
-	u32		device_base;
-	int		size;
-	int		flags;
-	unsigned long	*bitmap;
-};
-
-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
-				dma_addr_t device_addr, size_t size, int flags)
-{
-	void __iomem *mem_base = NULL;
-	int pages = size >> PAGE_SHIFT;
-	int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
-
-	if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
-		goto out;
-	if (!size)
-		goto out;
-	if (dev->dma_mem)
-		goto out;
-
-	/* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
-
-	mem_base = ioremap(bus_addr, size);
-	if (!mem_base)
-		goto out;
-
-	dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
-	if (!dev->dma_mem)
-		goto out;
-	dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
-	if (!dev->dma_mem->bitmap)
-		goto free1_out;
-
-	dev->dma_mem->virt_base = mem_base;
-	dev->dma_mem->device_base = device_addr;
-	dev->dma_mem->size = pages;
-	dev->dma_mem->flags = flags;
-
-	if (flags & DMA_MEMORY_MAP)
-		return DMA_MEMORY_MAP;
-
-	return DMA_MEMORY_IO;
-
- free1_out:
-	kfree(dev->dma_mem);
- out:
-	if (mem_base)
-		iounmap(mem_base);
-	return 0;
-}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
-	struct dma_coherent_mem *mem = dev->dma_mem;
-
-	if (!mem)
-		return;
-	dev->dma_mem = NULL;
-	iounmap(mem->virt_base);
-	kfree(mem->bitmap);
-	kfree(mem);
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
-
-void *dma_mark_declared_memory_occupied(struct device *dev,
-					dma_addr_t device_addr, size_t size)
-{
-	struct dma_coherent_mem *mem = dev->dma_mem;
-	int pos, err;
-
-	size += device_addr & ~PAGE_MASK;
-
-	if (!mem)
-		return ERR_PTR(-EINVAL);
-
-	pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
-	err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
-	if (err != 0)
-		return ERR_PTR(err);
-	return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
-/**
- * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
- *
- * @dev:	device from which we allocate memory
- * @size:	size of requested memory area
- * @dma_handle:	This will be filled with the correct dma handle
- * @ret:	This pointer will be filled with the virtual address
- *		to allocated area.
- *
- * This function should be only called from per-arch dma_alloc_coherent()
- * to support allocation from per-device coherent memory pools.
- *
- * Returns 0 if dma_alloc_coherent should continue with allocating from
- * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
- */
-int dma_alloc_from_coherent(struct device *dev, ssize_t size,
-				       dma_addr_t *dma_handle, void **ret)
-{
-	struct dma_coherent_mem *mem;
-	int order = get_order(size);
-	int pageno;
-
-	if (!dev)
-		return 0;
-	mem = dev->dma_mem;
-	if (!mem)
-		return 0;
-
-	*ret = NULL;
-
-	if (unlikely(size > (mem->size << PAGE_SHIFT)))
-		goto err;
-
-	pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
-	if (unlikely(pageno < 0))
-		goto err;
-
-	/*
-	 * Memory was found in the per-device area.
-	 */
-	*dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
-	*ret = mem->virt_base + (pageno << PAGE_SHIFT);
-	memset(*ret, 0, size);
-
-	return 1;
-
-err:
-	/*
-	 * In the case where the allocation can not be satisfied from the
-	 * per-device area, try to fall back to generic memory if the
-	 * constraints allow it.
-	 */
-	return mem->flags & DMA_MEMORY_EXCLUSIVE;
-}
-EXPORT_SYMBOL(dma_alloc_from_coherent);
-
-/**
- * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
- * @dev:	device from which the memory was allocated
- * @order:	the order of pages allocated
- * @vaddr:	virtual address of allocated pages
- *
- * This checks whether the memory was allocated from the per-device
- * coherent memory pool and if so, releases that memory.
- *
- * Returns 1 if we correctly released the memory, or 0 if
- * dma_release_coherent() should proceed with releasing memory from
- * generic pools.
- */
-int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
-{
-	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-
-	if (mem && vaddr >= mem->virt_base && vaddr <
-		   (mem->virt_base + (mem->size << PAGE_SHIFT))) {
-		int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-
-		bitmap_release_region(mem->bitmap, page, order);
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(dma_release_from_coherent);
-- 
cgit v1.2.3


From 63859d4fe4c97b737e7adbfe60acb1c2b2e668cb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 15 Sep 2009 19:14:42 +0200
Subject: sched: Fix sync wakeups again

The sync argument rename to introduce WF_* broke stuff by missing a
local alias for an argument in __wake_up_common, fix it by using
the more descriptive wake_flags name.

This restores WF_SYNC propagation, which fixes wake_affine()
behaviour, which fixes pipe-test.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0d4c4fea331..af04ede6dd2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5564,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
 
 #endif /* CONFIG_PREEMPT */
 
-int default_wake_function(wait_queue_t *curr, unsigned mode, int flags,
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
 			  void *key)
 {
-	return try_to_wake_up(curr->private, mode, flags);
+	return try_to_wake_up(curr->private, mode, wake_flags);
 }
 EXPORT_SYMBOL(default_wake_function);
 
@@ -5581,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function);
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-			int nr_exclusive, int flags, void *key)
+			int nr_exclusive, int wake_flags, void *key)
 {
 	wait_queue_t *curr, *next;
 
 	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
 		unsigned flags = curr->flags;
 
-		if (curr->func(curr, mode, flags, key) &&
+		if (curr->func(curr, mode, wake_flags, key) &&
 				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
 	}
-- 
cgit v1.2.3


From e69b0f1b41c0e57bb1e29100b5810a5914efcb45 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 15 Sep 2009 19:38:52 +0200
Subject: sched: Add a few SYNC hint knobs to play with

Currently we use overlap to weaken the SYNC hint, but allow it to
set the hint as well.

 echo NO_SYNC_WAKEUP > /debug/sched_features
 echo SYNC_MORE > /debug/sched_features

preserves pipe-test behaviour without using the WF_SYNC hint.

Worth playing with on more workloads...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 14 +++++++++++---
 kernel/sched_features.h | 10 ++++++++++
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6766959c7f4..280892e9d85 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1165,9 +1165,17 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	load	  = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 
-	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-			p->se.avg_overlap > sysctl_sched_migration_cost))
-		sync = 0;
+	if (sync) {
+	       if (sched_feat(SYNC_LESS) &&
+		   (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+		    p->se.avg_overlap > sysctl_sched_migration_cost))
+		       sync = 0;
+	} else {
+		if (sched_feat(SYNC_MORE) &&
+		    (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+		     p->se.avg_overlap < sysctl_sched_migration_cost))
+			sync = 1;
+	}
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 294e10edd3c..70115c69c7a 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -62,6 +62,16 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
  */
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
 
+/*
+ * Weaken SYNC hint based on overlap
+ */
+SCHED_FEAT(SYNC_LESS, 1)
+
+/*
+ * Add SYNC hint based on overlap
+ */
+SCHED_FEAT(SYNC_MORE, 0)
+
 /*
  * Prefer to schedule the task we woke last (assuming it failed
  * wakeup-preemption), since its likely going to consume data we
-- 
cgit v1.2.3


From cb684b5bcd6a79ae7e2360c6b158c4aa7b0a293a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 15 Sep 2009 21:53:11 +0200
Subject: block: fix linkage problem with blk_iopoll and !CONFIG_BLOCK

 kernel/built-in.o:(.data+0x17b0): undefined reference to `blk_iopoll_enabled'

Since the extern declaration makes the compile work, but the actual
symbol is missing when block/blk-iopoll.o isn't linked in.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/sysctl.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6bb59f70740..1a631ba684a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -91,7 +91,9 @@ extern int sysctl_nr_trim_pages;
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+#ifdef CONFIG_BLOCK
 extern int blk_iopoll_enabled;
+#endif
 
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -998,6 +1000,7 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_BLOCK
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "blk_iopoll",
@@ -1006,6 +1009,7 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v1.2.3


From 59abf02644c45f1591e1374ee7bb45dc757fcb88 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 16 Sep 2009 08:28:30 +0200
Subject: sched: Add SD_PREFER_LOCAL

And turn it on for NUMA and MC domains. This improves
locality in balancing decisions by keeping up to
capacity amount of tasks local before looking for idle
CPUs. (and twice the capacity if SD_POWERSAVINGS_BALANCE
is set.)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 280892e9d85..a37f311f436 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1360,7 +1360,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 		 * If power savings logic is enabled for a domain, see if we
 		 * are not overloaded, if so, don't balance wider.
 		 */
-		if (tmp->flags & SD_POWERSAVINGS_BALANCE) {
+		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
 			unsigned long power = 0;
 			unsigned long nr_running = 0;
 			unsigned long capacity;
@@ -1373,7 +1373,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 
 			capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
 
-			if (nr_running/2 < capacity)
+			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+				nr_running /= 2;
+
+			if (nr_running < capacity)
 				break;
 		}
 
-- 
cgit v1.2.3


From 51e0304ce6e55a6e59658558916b4f74da085ff0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 16 Sep 2009 08:54:45 +0200
Subject: sched: Implement a gentler fair-sleepers feature

Add back FAIR_SLEEPERS and GENTLE_FAIR_SLEEPERS.

FAIR_SLEEPERS is the old logic: credit sleepers with their sleep time.

GENTLE_FAIR_SLEEPERS dampens this a bit: 50% of their sleep time gets
credited.

The hope here is to still give the benefits of fair-sleepers logic
(quick wakeups, etc.) while not allow them to have 100% of their
sleep time as if they were running.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 9 ++++++++-
 kernel/sched_features.h | 9 ++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a37f311f436..acf16a8d934 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -711,7 +711,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+		if (sched_feat(FAIR_SLEEPERS)) {
 			unsigned long thresh = sysctl_sched_latency;
 
 			/*
@@ -725,6 +725,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 					 task_of(se)->policy != SCHED_IDLE))
 				thresh = calc_delta_fair(thresh, se);
 
+			/*
+			 * Halve their sleep time's effect, to allow
+			 * for a gentler effect of sleepers:
+			 */
+			if (sched_feat(GENTLE_FAIR_SLEEPERS))
+				thresh >>= 1;
+
 			vruntime -= thresh;
 		}
 	}
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 70115c69c7a..fd375675f83 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -3,7 +3,14 @@
  * considers the task to be running during that period. This gives it
  * a service deficit on wakeup, allowing it to run sooner.
  */
-SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
+SCHED_FEAT(FAIR_SLEEPERS, 1)
+
+/*
+ * Only give sleepers 50% of their service deficit. This allows
+ * them to run sooner, but does not allow tons of sleepers to
+ * rip the spread apart.
+ */
+SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
 
 /*
  * By not normalizing the sleep time, heavy tasks get an effective
-- 
cgit v1.2.3


From b36461da2a0389149d7f88f3cbc05a30d1db9faa Mon Sep 17 00:00:00 2001
From: Atsushi Tsuji <a-tsuji@bk.jp.nec.com>
Date: Tue, 15 Sep 2009 19:06:30 +0900
Subject: tracing: Fix minor bugs for __unregister_ftrace_function_probe

Fix the condition of strcmp for "*".
Also fix NULL pointer dereference when glob is NULL.

Signed-off-by: Atsushi Tsuji <a-tsuji@bk.jp.nec.com>
LKML-Reference: <4AAF6726.5090905@bk.jp.nec.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8b23d567008..f7ab7fc162c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2062,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 	int i, len = 0;
 	char *search;
 
-	if (glob && (strcmp(glob, "*") || !strlen(glob)))
+	if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
 		glob = NULL;
-	else {
+	else if (glob) {
 		int not;
 
 		type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
-- 
cgit v1.2.3


From 3b6408942206f940dd538e980e9904e48f4b64f8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 16 Sep 2009 13:44:33 +0200
Subject: sched: Optimize cgroup vs wakeup a bit

We don't need to call update_shares() for each domain we iterate,
just got the largets one.

However, we should call it before wake_affine() as well, so that
that can use up-to-date values too.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          |  7 -------
 kernel/sched_fair.c     | 23 +++++++++--------------
 kernel/sched_features.h |  2 +-
 3 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index af04ede6dd2..5049d959bb2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -376,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 
 #else
 
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
-	return 1;
-}
-#endif
-
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 static inline struct task_group *task_group(struct task_struct *p)
 {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index acf16a8d934..722d392b0da 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1348,7 +1348,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  */
 static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 {
-	struct sched_domain *tmp, *sd = NULL;
+	struct sched_domain *tmp, *shares = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	int new_cpu = cpu;
@@ -1387,22 +1387,14 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 				break;
 		}
 
-		switch (sd_flag) {
-		case SD_BALANCE_WAKE:
-			if (!sched_feat(LB_WAKEUP_UPDATE))
-				break;
-		case SD_BALANCE_FORK:
-		case SD_BALANCE_EXEC:
-			if (root_task_group_empty())
-				break;
-			update_shares(tmp);
-		default:
-			break;
-		}
-
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
 
+			if (sched_feat(LB_SHARES_UPDATE)) {
+				update_shares(tmp);
+				shares = tmp;
+			}
+
 			if (wake_affine(tmp, p, sync)) {
 				new_cpu = cpu;
 				goto out;
@@ -1417,6 +1409,9 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 		sd = tmp;
 	}
 
+	if (sd && sd != shares && sched_feat(LB_SHARES_UPDATE))
+		update_shares(sd);
+
 	while (sd) {
 		struct sched_group *group;
 		int weight;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index fd375675f83..d5059fd761d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -107,7 +107,7 @@ SCHED_FEAT(ARCH_POWER, 0)
 SCHED_FEAT(HRTICK, 0)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(LB_BIAS, 1)
-SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
+SCHED_FEAT(LB_SHARES_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
 
 /*
-- 
cgit v1.2.3


From 5158f4e4428c6b8d52796b3b460e95796123a114 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 16 Sep 2009 13:46:59 +0200
Subject: sched: Clean up the load_idx selection in select_task_rq_fair

Clean up the code a little.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 722d392b0da..aeff40e7ec1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1248,26 +1248,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-		  int this_cpu, int flag)
+		  int this_cpu, int load_idx)
 {
 	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long min_load = ULONG_MAX, this_load = 0;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
-	int load_idx = 0;
-
-	switch (flag) {
-	case SD_BALANCE_FORK:
-	case SD_BALANCE_EXEC:
-		load_idx = sd->forkexec_idx;
-		break;
-
-	case SD_BALANCE_WAKE:
-		load_idx = sd->wake_idx;
-		break;
-
-	default:
-		break;
-	}
 
 	do {
 		unsigned long load, avg_load;
@@ -1346,14 +1331,14 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  *
  * preempt must be disabled.
  */
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
+static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 {
 	struct sched_domain *tmp, *shares = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	int new_cpu = cpu;
 	int want_affine = 0;
-	int sync = flags & WF_SYNC;
+	int sync = wake_flags & WF_SYNC;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
 		if (sched_feat(AFFINE_WAKEUPS))
@@ -1413,6 +1398,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 		update_shares(sd);
 
 	while (sd) {
+		int load_idx = sd->forkexec_idx;
 		struct sched_group *group;
 		int weight;
 
@@ -1421,7 +1407,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags)
 			continue;
 		}
 
-		group = find_idlest_group(sd, p, cpu, sd_flag);
+		if (sd_flag & SD_BALANCE_WAKE)
+			load_idx = sd->wake_idx;
+
+		group = find_idlest_group(sd, p, cpu, load_idx);
 		if (!group) {
 			sd = sd->child;
 			continue;
-- 
cgit v1.2.3


From 5a9b86f647a56862cdc0a1362bfb015ae921af7f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 16 Sep 2009 13:47:58 +0200
Subject: sched: Rename flags to wake_flags

For consistencies sake, rename the argument (again).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index aeff40e7ec1..c741cd9d38d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1551,12 +1551,12 @@ static void set_next_buddy(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-	int sync = flags & WF_SYNC;
+	int sync = wake_flags & WF_SYNC;
 
 	update_curr(cfs_rq);
 
@@ -1582,7 +1582,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags
 	 */
 	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
 		set_last_buddy(se);
-	if (sched_feat(NEXT_BUDDY) && !(flags & WF_FORK))
+	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
 		set_next_buddy(pse);
 
 	/*
-- 
cgit v1.2.3


From eb24073bc1fe3e569a855cf38d529fb650c35524 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 16 Sep 2009 21:09:13 +0200
Subject: sched: Fix TASK_WAKING & loadaverage breakage

Fix this:

top - 21:54:00 up  2:59,  1 user,  load average: 432512.33, 426421.74, 417432.74

Which happens because we now set TASK_WAKING before activate_task().

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5049d959bb2..969dfaef246 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2343,7 +2343,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	/*
 	 * In order to handle concurrent wakeups and release the rq->lock
 	 * we put the task in TASK_WAKING state.
+	 *
+	 * First fix up the nr_uninterruptible count:
 	 */
+	if (task_contributes_to_load(p))
+		rq->nr_uninterruptible--;
 	p->state = TASK_WAKING;
 	task_rq_unlock(rq, &flags);
 
-- 
cgit v1.2.3


From ad4b78bbcbab66998b05d422ac6106b645796e54 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Sep 2009 12:31:31 +0200
Subject: sched: Add new wakeup preemption mode: WAKEUP_RUNNING

Create a new wakeup preemption mode, preempt towards tasks that run
shorter on avg. It sets next buddy to be sure we actually run the task
we preempted for.

Test results:

 root@twins:~# while :; do :; done &
 [1] 6537
 root@twins:~# while :; do :; done &
 [2] 6538
 root@twins:~# while :; do :; done &
 [3] 6539
 root@twins:~# while :; do :; done &
 [4] 6540

 root@twins:/home/peter# ./latt -c4 sleep 4
 Entries: 48 (clients=4)

 Averages:
 ------------------------------
        Max          4750 usec
        Avg           497 usec
        Stdev         737 usec

 root@twins:/home/peter# echo WAKEUP_RUNNING > /debug/sched_features

 root@twins:/home/peter# ./latt -c4 sleep 4
 Entries: 48 (clients=4)

 Averages:
 ------------------------------
        Max            14 usec
        Avg             5 usec
        Stdev           3 usec

Disabled by default - needs more testing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <new-submission>
---
 kernel/sched.c          | 17 ++++++++++-------
 kernel/sched_debug.c    |  1 +
 kernel/sched_fair.c     | 14 +++++++++++---
 kernel/sched_features.h |  5 +++++
 4 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 969dfaef246..3bb4ea2ee6f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2458,6 +2458,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.avg_overlap		= 0;
 	p->se.start_runtime		= 0;
 	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;
+	p->se.avg_running		= 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start			= 0;
@@ -5310,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
+static void put_prev_task(struct rq *rq, struct task_struct *p)
 {
-	if (prev->state == TASK_RUNNING) {
-		u64 runtime = prev->se.sum_exec_runtime;
+	u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
 
-		runtime -= prev->se.prev_sum_exec_runtime;
-		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+	update_avg(&p->se.avg_running, runtime);
 
+	if (p->state == TASK_RUNNING) {
 		/*
 		 * In order to avoid avg_overlap growing stale when we are
 		 * indeed overlapping and hence not getting put to sleep, grow
@@ -5327,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
 		 * correlates to the amount of cache footprint a task can
 		 * build up.
 		 */
-		update_avg(&prev->se.avg_overlap, runtime);
+		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+		update_avg(&p->se.avg_overlap, runtime);
+	} else {
+		update_avg(&p->se.avg_running, 0);
 	}
-	prev->sched_class->put_prev_task(rq, prev);
+	p->sched_class->put_prev_task(rq, p);
 }
 
 /*
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ddbd089126..efb84409bc4 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.sum_exec_runtime);
 	PN(se.avg_overlap);
 	PN(se.avg_wakeup);
+	PN(se.avg_running);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c741cd9d38d..3e6f78c6687 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1605,9 +1605,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 	}
 
-	if (!sched_feat(WAKEUP_PREEMPT))
-		return;
-
 	if ((sched_feat(WAKEUP_SYNC) && sync) ||
 	    (sched_feat(WAKEUP_OVERLAP) &&
 	     (se->avg_overlap < sysctl_sched_migration_cost &&
@@ -1616,6 +1613,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 	}
 
+	if (sched_feat(WAKEUP_RUNNING)) {
+		if (pse->avg_running < se->avg_running) {
+			set_next_buddy(pse);
+			resched_task(curr);
+			return;
+		}
+	}
+
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
+
 	find_matching_se(&se, &pse);
 
 	BUG_ON(!pse);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d..0d94083582c 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -53,6 +53,11 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
  */
 SCHED_FEAT(WAKEUP_OVERLAP, 0)
 
+/*
+ * Wakeup preemption towards tasks that run short
+ */
+SCHED_FEAT(WAKEUP_RUNNING, 0)
+
 /*
  * Use the SYNC wakeup hint, pipes and the likes use this to indicate
  * the remote end is likely to consume the data we just wrote, and
-- 
cgit v1.2.3


From de69a80be32445b0a71e8e3b757e584d7beb90f7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 17 Sep 2009 09:01:20 +0200
Subject: sched: Stop buddies from hogging the system

Clear buddies more agressively.

The (theoretical, haven't actually observed any of this) problem is
that when we do not select either buddy in pick_next_entity()
because they are too far ahead of the left-most task, we do not
clear the buddies.

This means that as soon as we service the left-most task, these
same buddies will be tried again on the next schedule. Now if the
left-most task was a pure hog, it wouldn't have done any wakeups
and it wouldn't have set buddies of its own. That leads to the old
buddies dominating, which would lead to bad latencies.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3e6f78c6687..ffee827fa22 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -764,10 +764,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (cfs_rq->last == se)
+	if (!se || cfs_rq->last == se)
 		cfs_rq->last = NULL;
 
-	if (cfs_rq->next == se)
+	if (!se || cfs_rq->next == se)
 		cfs_rq->next = NULL;
 }
 
@@ -1646,8 +1646,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 		/*
 		 * If se was a buddy, clear it so that it will have to earn
 		 * the favour again.
+		 *
+		 * If se was not a buddy, clear the buddies because neither
+		 * was elegible to run, let them earn it again.
+		 *
+		 * IOW. unconditionally clear buddies.
 		 */
-		__clear_buddies(cfs_rq, se);
+		__clear_buddies(cfs_rq, NULL);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
-- 
cgit v1.2.3


From 29cd8bae396583a2ee9a3340db8c5102acf9f6fd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 17 Sep 2009 09:01:14 +0200
Subject: sched: Fix SD_POWERSAVING_BALANCE|SD_PREFER_LOCAL vs SD_WAKE_AFFINE

The SD_POWERSAVING_BALANCE|SD_PREFER_LOCAL code can break out of
the domain iteration early, making us miss the SD_WAKE_AFFINE bits.

Fix this by continuing iteration until there is no need for a
larger domain.

This also cleans up the cgroup stuff a bit, but not having two
update_shares() invocations.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 42 +++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ffee827fa22..10d218ab69f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1333,11 +1333,12 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  */
 static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
 {
-	struct sched_domain *tmp, *shares = NULL, *sd = NULL;
+	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	int new_cpu = cpu;
 	int want_affine = 0;
+	int want_sd = 1;
 	int sync = wake_flags & WF_SYNC;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
@@ -1369,33 +1370,44 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 				nr_running /= 2;
 
 			if (nr_running < capacity)
-				break;
+				want_sd = 0;
 		}
 
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
 
-			if (sched_feat(LB_SHARES_UPDATE)) {
-				update_shares(tmp);
-				shares = tmp;
-			}
-
-			if (wake_affine(tmp, p, sync)) {
-				new_cpu = cpu;
-				goto out;
-			}
-
+			affine_sd = tmp;
 			want_affine = 0;
 		}
 
+		if (!want_sd && !want_affine)
+			break;
+
 		if (!(tmp->flags & sd_flag))
 			continue;
 
-		sd = tmp;
+		if (want_sd)
+			sd = tmp;
+	}
+
+	if (sched_feat(LB_SHARES_UPDATE)) {
+		/*
+		 * Pick the largest domain to update shares over
+		 */
+		tmp = sd;
+		if (affine_sd && (!tmp ||
+				  cpumask_weight(sched_domain_span(affine_sd)) >
+				  cpumask_weight(sched_domain_span(sd))))
+			tmp = affine_sd;
+
+		if (tmp)
+			update_shares(tmp);
 	}
 
-	if (sd && sd != shares && sched_feat(LB_SHARES_UPDATE))
-		update_shares(sd);
+	if (affine_sd && wake_affine(affine_sd, p, sync)) {
+		new_cpu = cpu;
+		goto out;
+	}
 
 	while (sd) {
 		int load_idx = sd->forkexec_idx;
-- 
cgit v1.2.3


From b375a11a239e9e1cac40c7f3ff28b343d9f7ac51 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 17 Sep 2009 00:05:58 -0400
Subject: tracing: switch function prints from %pf to %ps

For direct function pointers (like what mcount provides) PowerPC64
requires the use of %ps, otherwise nothing is printed.

This patch converts all prints of functions retrieved through mcount
to use the %ps format from the %pf.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c                | 6 +++---
 kernel/trace/trace_functions.c       | 2 +-
 kernel/trace/trace_functions_graph.c | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f7ab7fc162c..cc615f84751 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1405,7 +1405,7 @@ static int t_hash_show(struct seq_file *m, void *v)
 	if (rec->ops->print)
 		return rec->ops->print(m, rec->ip, rec->ops, rec->data);
 
-	seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func);
+	seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
 
 	if (rec->data)
 		seq_printf(m, ":%p", rec->data);
@@ -1515,7 +1515,7 @@ static int t_show(struct seq_file *m, void *v)
 	if (!rec)
 		return 0;
 
-	seq_printf(m, "%pf\n", (void *)rec->ip);
+	seq_printf(m, "%ps\n", (void *)rec->ip);
 
 	return 0;
 }
@@ -2456,7 +2456,7 @@ static int g_show(struct seq_file *m, void *v)
 		return 0;
 	}
 
-	seq_printf(m, "%pf\n", (void *)*ptr);
+	seq_printf(m, "%ps\n", (void *)*ptr);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5b01b94518f..b3f3776b0cd 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 {
 	long count = (long)data;
 
-	seq_printf(m, "%pf:", (void *)ip);
+	seq_printf(m, "%ps:", (void *)ip);
 
 	if (ops == &traceon_probe_ops)
 		seq_printf(m, "traceon");
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 61f166707a0..45e6c01b2e4 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
 	if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
 		ftrace_graph_stop();
 		WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
-		     "  from func %pF return to %lx\n",
+		     "  from func %ps return to %lx\n",
 		     current->ret_stack[index].fp,
 		     frame_pointer,
 		     (void *)current->ret_stack[index].func,
@@ -669,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	ret = trace_seq_printf(s, "%pf();\n", (void *)call->func);
+	ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -712,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func);
+	ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-- 
cgit v1.2.3


From 5dd4de587fd9c25cb32a7a0fe9feec3647509b6f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 17 Sep 2009 17:38:32 +0800
Subject: softirq: add BLOCK_IOPOLL to softirq_to_name

With BLOCK_IOPOLL_SOFTIRQ added, softirq_to_name[] and
show_softirq_name() needs to be updated.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AB20398.8070209@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/softirq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7db25067cd2..f8749e5216e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
 static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
 char *softirq_to_name[NR_SOFTIRQS] = {
-	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
+	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
 	"TASKLET", "SCHED", "HRTIMER",	"RCU"
 };
 
-- 
cgit v1.2.3