From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use dynamic percpu allocator. The first chunk is allocated using embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as long as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing range limit the addressing model imposes. Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tests on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- kernel/module.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2..f5934954fa9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -535,7 +535,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, -- cgit v1.2.3 From b9bf3121af348d9255f1c917830fe8c2df52efcb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:47 +0900 Subject: percpu: use DEFINE_PER_CPU_SHARED_ALIGNED() There are a few places where ___cacheline_aligned* is used with DEFINE_PER_CPU(). Use DEFINE_PER_CPU_SHARED_ALIGNED() instead. DEFINE_PER_CPU_SHARED_ALIGNED() applies alignment only on SMPs. While all other converted places used _in_smp variant or only get compiled for SMP, net/rds used unconditional ____cacheline_aligned. I don't see any reason these data structures should be aligned on UP and thus converted together. Signed-off-by: Tejun Heo Cc: Mike Frysinger Cc: Tony Luck Cc: Andy Grover --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e..34fd81d2178 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -318,12 +318,12 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -- cgit v1.2.3 From 245b2e70eabd797932adb263a65da0bab3711753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:48 +0900 Subject: percpu: clean up percpu variable definitions Percpu variable definition is about to be updated such that all percpu symbols including the static ones must be unique. Update percpu variable definitions accordingly. * as,cfq: rename ioc_count uniquely * cpufreq: rename cpu_dbs_info uniquely * xen: move nesting_count out of xen_evtchn_do_upcall() and rename it * mm: move ratelimits out of balance_dirty_pages_ratelimited_nr() and rename it * ipv4,6: rename cookie_scratch uniquely * x86 perf_counter: rename prev_left to pmc_prev_left, irq_entry to pmc_irq_entry and nmi_entry to pmc_nmi_entry * perf_counter: rename disable_count to perf_disable_count * ftrace: rename test_event_disable to ftrace_test_event_disable * kmemleak: rename test_pointer to kmemleak_test_pointer * mce: rename next_interval to mce_next_interval [ Impact: percpu usage cleanups, no duplicate static percpu var names ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ivan Kokshaysky Cc: Jens Axboe Cc: Dave Jones Cc: Jeremy Fitzhardinge Cc: linux-mm Cc: David S. Miller Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Li Zefan Cc: Catalin Marinas Cc: Andi Kleen --- kernel/perf_counter.c | 6 +++--- kernel/trace/trace_events.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1a933a221ea..1fd7a2e7575 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -98,16 +98,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } -static DEFINE_PER_CPU(int, disable_count); +static DEFINE_PER_CPU(int, perf_disable_count); void __perf_disable(void) { - __get_cpu_var(disable_count)++; + __get_cpu_var(perf_disable_count)++; } bool __perf_enable(void) { - return !--__get_cpu_var(disable_count); + return !--__get_cpu_var(perf_disable_count); } void perf_disable(void) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b..54b1de5074b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1318,7 +1318,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1334,7 +1334,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1352,7 +1352,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); ftrace_preempt_enable(resched); } -- cgit v1.2.3 From 134e63756d5f3d0f7604dfcca847b09d1b14fd66 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 10 Jul 2009 09:51:34 +0000 Subject: genetlink: make netns aware This makes generic netlink network namespace aware. No generic netlink families except for the controller family are made namespace aware, they need to be checked one by one and then set the family->netnsok member to true. A new function genlmsg_multicast_netns() is introduced to allow sending a multicast message in a given namespace, for example when it applies to an object that lives in that namespace, a new function genlmsg_multicast_allns() to send a message to all network namespaces (for objects that do not have an associated netns). The function genlmsg_multicast() is changed to multicast the message in just init_net, which is currently correct for all generic netlink families since they only work in init_net right now. Some will later want to work in all net namespaces because they do not care about the netns at all -- those will have to be converted to use one of the new functions genlmsg_multicast_allns() or genlmsg_multicast_netns() whenever they are made netns aware in some way. After this patch families can easily decide whether or not they should be available in all net namespaces. Many genl families us it for objects not related to networking and should therefore be available in all namespaces, but that will have to be done on a per family basis. Note that this doesn't touch on the checkpoint/restart problem where network namespaces could be used, genl families and multicast groups are numbered globally and I see no easy way of changing that, especially since it must be possible to multicast to all network namespaces for those families that do not care about netns. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- kernel/taskstats.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 888adbcca30..ea8384d3caa 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, /* * Send taskstats data in @skb to listener with nl_pid @pid */ -static int send_reply(struct sk_buff *skb, pid_t pid) +static int send_reply(struct sk_buff *skb, struct genl_info *info) { struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); void *reply = genlmsg_data(genlhdr); @@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid) return rc; } - return genlmsg_unicast(skb, pid); + return genlmsg_reply(skb, info); } /* @@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb, if (!skb_next) break; } - rc = genlmsg_unicast(skb_cur, s->pid); + rc = genlmsg_unicast(&init_net, skb_cur, s->pid); if (rc == -ECONNREFUSED) { s->valid = 0; delcount++; @@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) goto err; } - rc = send_reply(rep_skb, info->snd_pid); + rc = send_reply(rep_skb, info); err: fput_light(file, fput_needed); @@ -487,7 +487,7 @@ free_return_rc: } else goto err; - return send_reply(rep_skb, info->snd_pid); + return send_reply(rep_skb, info); err: nlmsg_free(rep_skb); return rc; -- cgit v1.2.3 From 86886e55b273f565935491816c7c96b82469d4f8 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:07 -0700 Subject: x86, intel_txt: Intel TXT Sx shutdown support Support for graceful handling of sleep states (S3/S4/S5) after an Intel(R) TXT launch. Without this patch, attempting to place the system in one of the ACPI sleep states (S3/S4/S5) will cause the TXT hardware to treat this as an attack and will cause a system reset, with memory locked. Not only may the subsequent memory scrub take some time, but the platform will be unable to enter the requested power state. This patch calls back into the tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag, after which it will place the system into the requested sleep state using ACPI information passed by the kernel. arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- kernel/cpu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..ff071e022a8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -376,7 +377,7 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error, num_cpus = 0; error = stop_machine_create(); if (error) @@ -391,6 +392,7 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + num_cpus++; error = _cpu_down(cpu, 1); if (!error) { cpumask_set_cpu(cpu, frozen_cpus); @@ -401,6 +403,9 @@ int disable_nonboot_cpus(void) break; } } + /* ensure all CPUs have gone into wait-for-SIPI */ + error |= tboot_wait_for_aps(num_cpus); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ -- cgit v1.2.3 From 5a165657bef7c47e5ff4cd138f7758ef6278e87b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 13 Aug 2009 05:20:45 +0000 Subject: net: skb ftracer - Add config option to enable new ftracer (v3) skb allocation / consumption corelator - Add config option This patch adds a Kconfig option to enable the addtition of the skb source tracer. Signed-off-by: Neil Horman Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) Signed-off-by: David S. Miller --- kernel/trace/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 019f380fd76..f09d7635c0e 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -234,6 +234,16 @@ config BOOT_TRACER You must pass in initcall_debug and ftrace=initcall to the kernel command line to enable this on bootup. +config SKB_SOURCES_TRACER + bool "Trace skb source information" + select GENERIC_TRACER + help + This tracer helps developers/sysadmins correlate skb allocation and + consumption. The idea being that some processes will primarily consume data + that was allocated on certain numa nodes. By being able to visualize which + nodes the data was allocated on, a sysadmin or developer can optimize the + scheduling of those processes to cut back on cross node chatter. + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER -- cgit v1.2.3 From 9ec04da7489d2c9ae01ea6e9b5fa313ccf3d35fb Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 13 Aug 2009 05:23:56 +0000 Subject: net: skb ftracer - Add actual ftrace code to kernel (v3) skb allocation / consumption correlator Add ftracer module to kernel to print out a list that correlates a process id, an skb it read, and the numa nodes on wich the process was running when it was read along with the numa node the skbuff was allocated on. Signed-off-by: Neil Horman Makefile | 1 trace.h | 19 ++++++ trace_skb_sources.c | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+) Signed-off-by: David S. Miller --- kernel/trace/Makefile | 1 + kernel/trace/trace.h | 19 +++++ kernel/trace/trace_skb_sources.c | 154 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 174 insertions(+) create mode 100644 kernel/trace/trace_skb_sources.c (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 844164dca90..ee5e5b1b928 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o endif +obj-$(CONFIG_SKB_SOURCES_TRACER) += trace_skb_sources.o obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b9f4f6e955..8a6281b2330 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ enum trace_type { TRACE_KMEM_FREE, TRACE_POWER, TRACE_BLK, + TRACE_SKB_SOURCE, __TRACE_LAST_TYPE, }; @@ -171,6 +173,21 @@ struct trace_power { struct power_trace state_data; }; +struct skb_record { + pid_t pid; /* pid of the copying process */ + int anid; /* node where skb was allocated */ + int cnid; /* node to which skb was copied in userspace */ + char ifname[IFNAMSIZ]; /* Name of the receiving interface */ + int rx_queue; /* The rx queue the skb was received on */ + int ccpu; /* Cpu the application got this frame from */ + int len; /* length of the data copied */ +}; + +struct trace_skb_event { + struct trace_entry ent; + struct skb_record event_data; +}; + enum kmemtrace_type_id { KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ @@ -323,6 +340,8 @@ extern void __ftrace_bad_type(void); TRACE_SYSCALL_ENTER); \ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ TRACE_SYSCALL_EXIT); \ + IF_ASSIGN(var, ent, struct trace_skb_event, \ + TRACE_SKB_SOURCE); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_skb_sources.c b/kernel/trace/trace_skb_sources.c new file mode 100644 index 00000000000..40eb071afdb --- /dev/null +++ b/kernel/trace/trace_skb_sources.c @@ -0,0 +1,154 @@ +/* + * ring buffer based tracer for analyzing per-socket skb sources + * + * Neil Horman + * Copyright (C) 2009 + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(skb_copy_datagram_iovec); + +static struct trace_array *skb_trace; +static int __read_mostly trace_skb_source_enabled; + +static void probe_skb_dequeue(const struct sk_buff *skb, int len) +{ + struct ring_buffer_event *event; + struct trace_skb_event *entry; + struct trace_array *tr = skb_trace; + struct net_device *dev; + + if (!trace_skb_source_enabled) + return; + + if (in_interrupt()) + return; + + event = trace_buffer_lock_reserve(tr, TRACE_SKB_SOURCE, + sizeof(*entry), 0, 0); + if (!event) + return; + entry = ring_buffer_event_data(event); + + entry->event_data.pid = current->pid; + entry->event_data.anid = page_to_nid(virt_to_page(skb->data)); + entry->event_data.cnid = cpu_to_node(smp_processor_id()); + entry->event_data.len = len; + entry->event_data.rx_queue = skb->queue_mapping; + entry->event_data.ccpu = smp_processor_id(); + + dev = dev_get_by_index(sock_net(skb->sk), skb->iif); + if (dev) { + memcpy(entry->event_data.ifname, dev->name, IFNAMSIZ); + dev_put(dev); + } else { + strcpy(entry->event_data.ifname, "Unknown"); + } + + trace_buffer_unlock_commit(tr, event, 0, 0); +} + +static int tracing_skb_source_register(void) +{ + int ret; + + ret = register_trace_skb_copy_datagram_iovec(probe_skb_dequeue); + if (ret) + pr_info("skb source trace: Couldn't activate dequeue tracepoint"); + + return ret; +} + +static void start_skb_source_trace(struct trace_array *tr) +{ + trace_skb_source_enabled = 1; +} + +static void stop_skb_source_trace(struct trace_array *tr) +{ + trace_skb_source_enabled = 0; +} + +static void skb_source_trace_reset(struct trace_array *tr) +{ + trace_skb_source_enabled = 0; + unregister_trace_skb_copy_datagram_iovec(probe_skb_dequeue); +} + + +static int skb_source_trace_init(struct trace_array *tr) +{ + int cpu; + skb_trace = tr; + + trace_skb_source_enabled = 1; + tracing_skb_source_register(); + + for_each_cpu(cpu, cpu_possible_mask) + tracing_reset(tr, cpu); + return 0; +} + +static enum print_line_t skb_source_print_line(struct trace_iterator *iter) +{ + int ret = 0; + struct trace_entry *entry = iter->ent; + struct trace_skb_event *event; + struct skb_record *record; + struct trace_seq *s = &iter->seq; + + trace_assign_type(event, entry); + record = &event->event_data; + if (entry->type != TRACE_SKB_SOURCE) + return TRACE_TYPE_UNHANDLED; + + ret = trace_seq_printf(s, " %d %d %d %s %d %d %d\n", + record->pid, + record->anid, + record->cnid, + record->ifname, + record->rx_queue, + record->ccpu, + record->len); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static void skb_source_print_header(struct seq_file *s) +{ + seq_puts(s, "# PID ANID CNID IFC RXQ CCPU LEN\n"); + seq_puts(s, "# | | | | | | |\n"); +} + +static struct tracer skb_source_tracer __read_mostly = +{ + .name = "skb_sources", + .init = skb_source_trace_init, + .start = start_skb_source_trace, + .stop = stop_skb_source_trace, + .reset = skb_source_trace_reset, + .print_line = skb_source_print_line, + .print_header = skb_source_print_header, +}; + +static int init_skb_source_trace(void) +{ + return register_tracer(&skb_source_tracer); +} +device_initcall(init_skb_source_trace); -- cgit v1.2.3 From 3a5209e3b26386fd174820ee6e8e27479b24869a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 17 Aug 2009 16:00:30 -0700 Subject: trace_skb: fix build when CONFIG_NET is not enabled Fix trace_skb_sources build when CONFIG_NET is not enabled: kernel/built-in.o: In function `probe_skb_dequeue': trace_skb_sources.c:(.text+0xd9152): undefined reference to `init_net' trace_skb_sources.c:(.text+0xd9188): undefined reference to `dev_get_by_index' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f09d7635c0e..9a2f19bf051 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -236,6 +236,7 @@ config BOOT_TRACER config SKB_SOURCES_TRACER bool "Trace skb source information" + depends on NET select GENERIC_TRACER help This tracer helps developers/sysadmins correlate skb allocation and -- cgit v1.2.3 From de481560eb0bd9d940b90311eba85711e4b1150b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Apr 2009 12:32:04 -0400 Subject: kconfig: keep config.gz around even if CONFIG_IKCONFIG_PROC is not set If CONFIG_IKCONFIG is set but CONFIG_IKCONFIG_PROC is not, then gcc will optimize the config.gz out, because nobody uses it. This patch adds "__used" to the config.gz data to keep it around so that code like extract-ikconfig can still find it. [ Impact: allow extract-ikconfig to find config.gz ] Signed-off-by: Steven Rostedt --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2093a691f1c..d0c84e6bf50 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -119,7 +119,7 @@ $(obj)/config_data.gz: .config FORCE $(call if_changed,gzip) quiet_cmd_ikconfiggz = IKCFG $@ - cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ + cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call if_changed,ikconfiggz) -- cgit v1.2.3 From a15098c90df1ac2b1bfe1d33dd1c47063213aa9a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 9 Aug 2009 19:02:51 +0000 Subject: powerpc: Enable GCOV Make it possible to enable GCOV code coverage measurement on powerpc. Lightly tested on 64-bit, seems to work as expected. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 22e9dcfaa3d..654efd09f6a 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 + depends on S390 || X86 || (PPC && EXPERIMENTAL) default n ---help--- This options activates profiling for the entire kernel. -- cgit v1.2.3 From 269c861baa2fe7c114c3bc7831292758d29eb336 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:35 -0700 Subject: generic-ipi: Allow cpus not yet online to call smp_call_function with irqs disabled Because of deadlock possiblities smp_call_function() is not allowed to be called with interrupts disabled. Add an exception for the cpu not yet online, as no one else can send smp call function interrupt to this cpu that is not yet online and as such deadlock condition is not possible. Signed-off-by: Suresh Siddha Acked-by: Nick Piggin Signed-off-by: H. Peter Anvin --- kernel/smp.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ad63d850120..2accdf6712e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -176,6 +176,11 @@ void generic_smp_call_function_interrupt(void) struct call_function_data *data; int cpu = get_cpu(); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(cpu)); + /* * Ensure entry is visible on call_function_queue after we have * entered the IPI. See comment in smp_call_function_many. @@ -230,6 +235,11 @@ void generic_smp_call_function_single_interrupt(void) unsigned int data_flags; LIST_HEAD(list); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(smp_processor_id())); + spin_lock(&q->lock); list_replace_init(&q->list, &list); spin_unlock(&q->lock); @@ -285,8 +295,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, */ this_cpu = get_cpu(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -329,8 +345,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, { csd_lock(data); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() + && !oops_in_progress); generic_exec_single(cpu, data, wait); } @@ -365,8 +387,14 @@ void smp_call_function_many(const struct cpumask *mask, unsigned long flags; int cpu, next_cpu, this_cpu = smp_processor_id(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); -- cgit v1.2.3 From d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:36 -0700 Subject: x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init SDM Vol 3a section titled "MTRR considerations in MP systems" specifies the need for synchronizing the logical cpu's while initializing/updating MTRR. Currently Linux kernel does the synchronization of all cpu's only when a single MTRR register is programmed/updated. During an AP online (during boot/cpu-online/resume) where we initialize all the MTRR/PAT registers, we don't follow this synchronization algorithm. This can lead to scenarios where during a dynamic cpu online, that logical cpu is initializing MTRR/PAT with cache disabled (cr0.cd=1) etc while other logical HT sibling continue to run (also with cache disabled because of cr0.cd=1 on its sibling). Starting from Westmere, VMX transitions with cr0.cd=1 don't work properly (because of some VMX performance optimizations) and the above scenario (with one logical cpu doing VMX activity and another logical cpu coming online) can result in system crash. Fix the MTRR initialization by doing rendezvous of all the cpus. During boot and resume, we delay the MTRR/PAT init for APs till all the logical cpu's come online and the rendezvous process at the end of AP's bringup, will initialize the MTRR/PAT for all AP's. For dynamic single cpu online, we synchronize all the logical cpus and do the MTRR/PAT init on the AP that is coming online. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- kernel/cpu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..f5f9485b8c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -413,6 +413,14 @@ int disable_nonboot_cpus(void) return error; } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} + void __ref enable_nonboot_cpus(void) { int cpu, error; @@ -424,6 +432,9 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); + + arch_enable_nonboot_cpus_begin(); + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { @@ -432,6 +443,9 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } + + arch_enable_nonboot_cpus_end(); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); -- cgit v1.2.3 From 5e928f77a09a07f9dd595bb8a489965d69a83458 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Aug 2009 23:38:32 +0200 Subject: PM: Introduce core framework for run-time PM of I/O devices (rev. 17) Introduce a core framework for run-time power management of I/O devices. Add device run-time PM fields to 'struct dev_pm_info' and device run-time PM callbacks to 'struct dev_pm_ops'. Introduce a run-time PM workqueue and define some device run-time PM helper functions at the core level. Document all these things. Special thanks to Alan Stern for his help with the design and multiple detailed reviews of the pereceding versions of this patch and to Magnus Damm for testing feedback. Signed-off-by: Rafael J. Wysocki Acked-by: Magnus Damm --- kernel/power/Kconfig | 14 ++++++++++++++ kernel/power/main.c | 17 +++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 72067cbdb37..91e09d3b2eb 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -208,3 +208,17 @@ config APM_EMULATION random kernel OOPSes or reboots that don't seem to be related to anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on PM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. diff --git a/kernel/power/main.c b/kernel/power/main.c index f710e36930c..347d2cc88cd 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "power.h" @@ -217,8 +218,24 @@ static struct attribute_group attr_group = { .attrs = g, }; +#ifdef CONFIG_PM_RUNTIME +struct workqueue_struct *pm_wq; + +static int __init pm_start_workqueue(void) +{ + pm_wq = create_freezeable_workqueue("pm"); + + return pm_wq ? 0 : -ENOMEM; +} +#else +static inline int pm_start_workqueue(void) { return 0; } +#endif + static int __init pm_init(void) { + int error = pm_start_workqueue(); + if (error) + return error; power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; -- cgit v1.2.3 From 31ffe249e5426d2648d68568fa00a7b66666a5db Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 26 Aug 2009 16:32:37 -0700 Subject: net: Temporarily backout SKB sources tracer. Steven Rostedt has suggested that Neil work with the tracing folks, trying to use TRACE_EVENT as the mechanism for implementation. And if that doesn't workout we can investigate other solutions such as that one which was tried here. This reverts the following 2 commits: 5a165657bef7c47e5ff4cd138f7758ef6278e87b ("net: skb ftracer - Add config option to enable new ftracer (v3)") 9ec04da7489d2c9ae01ea6e9b5fa313ccf3d35fb ("net: skb ftracer - Add actual ftrace code to kernel (v3)") Signed-off-by: David S. Miller --- kernel/trace/Kconfig | 11 --- kernel/trace/Makefile | 1 - kernel/trace/trace.h | 19 ----- kernel/trace/trace_skb_sources.c | 154 --------------------------------------- 4 files changed, 185 deletions(-) delete mode 100644 kernel/trace/trace_skb_sources.c (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9a2f19bf051..019f380fd76 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -234,17 +234,6 @@ config BOOT_TRACER You must pass in initcall_debug and ftrace=initcall to the kernel command line to enable this on bootup. -config SKB_SOURCES_TRACER - bool "Trace skb source information" - depends on NET - select GENERIC_TRACER - help - This tracer helps developers/sysadmins correlate skb allocation and - consumption. The idea being that some processes will primarily consume data - that was allocated on certain numa nodes. By being able to visualize which - nodes the data was allocated on, a sysadmin or developer can optimize the - scheduling of those processes to cut back on cross node chatter. - config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ee5e5b1b928..844164dca90 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -49,7 +49,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o endif -obj-$(CONFIG_SKB_SOURCES_TRACER) += trace_skb_sources.o obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8a6281b2330..8b9f4f6e955 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -41,7 +40,6 @@ enum trace_type { TRACE_KMEM_FREE, TRACE_POWER, TRACE_BLK, - TRACE_SKB_SOURCE, __TRACE_LAST_TYPE, }; @@ -173,21 +171,6 @@ struct trace_power { struct power_trace state_data; }; -struct skb_record { - pid_t pid; /* pid of the copying process */ - int anid; /* node where skb was allocated */ - int cnid; /* node to which skb was copied in userspace */ - char ifname[IFNAMSIZ]; /* Name of the receiving interface */ - int rx_queue; /* The rx queue the skb was received on */ - int ccpu; /* Cpu the application got this frame from */ - int len; /* length of the data copied */ -}; - -struct trace_skb_event { - struct trace_entry ent; - struct skb_record event_data; -}; - enum kmemtrace_type_id { KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ @@ -340,8 +323,6 @@ extern void __ftrace_bad_type(void); TRACE_SYSCALL_ENTER); \ IF_ASSIGN(var, ent, struct syscall_trace_exit, \ TRACE_SYSCALL_EXIT); \ - IF_ASSIGN(var, ent, struct trace_skb_event, \ - TRACE_SKB_SOURCE); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_skb_sources.c b/kernel/trace/trace_skb_sources.c deleted file mode 100644 index 40eb071afdb..00000000000 --- a/kernel/trace/trace_skb_sources.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * ring buffer based tracer for analyzing per-socket skb sources - * - * Neil Horman - * Copyright (C) 2009 - * - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -EXPORT_TRACEPOINT_SYMBOL_GPL(skb_copy_datagram_iovec); - -static struct trace_array *skb_trace; -static int __read_mostly trace_skb_source_enabled; - -static void probe_skb_dequeue(const struct sk_buff *skb, int len) -{ - struct ring_buffer_event *event; - struct trace_skb_event *entry; - struct trace_array *tr = skb_trace; - struct net_device *dev; - - if (!trace_skb_source_enabled) - return; - - if (in_interrupt()) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_SKB_SOURCE, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - - entry->event_data.pid = current->pid; - entry->event_data.anid = page_to_nid(virt_to_page(skb->data)); - entry->event_data.cnid = cpu_to_node(smp_processor_id()); - entry->event_data.len = len; - entry->event_data.rx_queue = skb->queue_mapping; - entry->event_data.ccpu = smp_processor_id(); - - dev = dev_get_by_index(sock_net(skb->sk), skb->iif); - if (dev) { - memcpy(entry->event_data.ifname, dev->name, IFNAMSIZ); - dev_put(dev); - } else { - strcpy(entry->event_data.ifname, "Unknown"); - } - - trace_buffer_unlock_commit(tr, event, 0, 0); -} - -static int tracing_skb_source_register(void) -{ - int ret; - - ret = register_trace_skb_copy_datagram_iovec(probe_skb_dequeue); - if (ret) - pr_info("skb source trace: Couldn't activate dequeue tracepoint"); - - return ret; -} - -static void start_skb_source_trace(struct trace_array *tr) -{ - trace_skb_source_enabled = 1; -} - -static void stop_skb_source_trace(struct trace_array *tr) -{ - trace_skb_source_enabled = 0; -} - -static void skb_source_trace_reset(struct trace_array *tr) -{ - trace_skb_source_enabled = 0; - unregister_trace_skb_copy_datagram_iovec(probe_skb_dequeue); -} - - -static int skb_source_trace_init(struct trace_array *tr) -{ - int cpu; - skb_trace = tr; - - trace_skb_source_enabled = 1; - tracing_skb_source_register(); - - for_each_cpu(cpu, cpu_possible_mask) - tracing_reset(tr, cpu); - return 0; -} - -static enum print_line_t skb_source_print_line(struct trace_iterator *iter) -{ - int ret = 0; - struct trace_entry *entry = iter->ent; - struct trace_skb_event *event; - struct skb_record *record; - struct trace_seq *s = &iter->seq; - - trace_assign_type(event, entry); - record = &event->event_data; - if (entry->type != TRACE_SKB_SOURCE) - return TRACE_TYPE_UNHANDLED; - - ret = trace_seq_printf(s, " %d %d %d %s %d %d %d\n", - record->pid, - record->anid, - record->cnid, - record->ifname, - record->rx_queue, - record->ccpu, - record->len); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static void skb_source_print_header(struct seq_file *s) -{ - seq_puts(s, "# PID ANID CNID IFC RXQ CCPU LEN\n"); - seq_puts(s, "# | | | | | | |\n"); -} - -static struct tracer skb_source_tracer __read_mostly = -{ - .name = "skb_sources", - .init = skb_source_trace_init, - .start = start_skb_source_trace, - .stop = stop_skb_source_trace, - .reset = skb_source_trace_reset, - .print_line = skb_source_print_line, - .print_header = skb_source_print_header, -}; - -static int init_skb_source_trace(void) -{ - return register_tracer(&skb_source_tracer); -} -device_initcall(init_skb_source_trace); -- cgit v1.2.3 From 2bc481cf433879f0e6cdd4d899fc21ee05dcea23 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 28 Aug 2009 23:41:29 -0700 Subject: pktgen: spin using hrtimer This changes how the pktgen thread spins/waits between packets if delay is configured. It uses a high res timer to wait for time to arrive. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- kernel/hrtimer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 49da79ab848..05071bf6a37 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -485,6 +485,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, debug_object_init_on_stack(timer, &hrtimer_debug_descr); __hrtimer_init(timer, clock_id, mode); } +EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); void destroy_hrtimer_on_stack(struct hrtimer *timer) { @@ -1477,6 +1478,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; } +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { -- cgit v1.2.3 From 69575d388603365f2afbf4166df93152df59b165 Mon Sep 17 00:00:00 2001 From: Shane Wang Date: Tue, 1 Sep 2009 18:25:07 -0700 Subject: x86, intel_txt: clean up the impact on generic code, unbreak non-x86 Move tboot.h from asm to linux to fix the build errors of intel_txt patch on non-X86 platforms. Remove the tboot code from generic code init/main.c and kernel/cpu.c. Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- kernel/cpu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index ff071e022a8..67a60076dd7 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,7 +14,6 @@ #include #include #include -#include #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -377,7 +376,7 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error, num_cpus = 0; + int cpu, first_cpu, error; error = stop_machine_create(); if (error) @@ -392,7 +391,6 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; - num_cpus++; error = _cpu_down(cpu, 1); if (!error) { cpumask_set_cpu(cpu, frozen_cpus); @@ -403,8 +401,6 @@ int disable_nonboot_cpus(void) break; } } - /* ensure all CPUs have gone into wait-for-SIPI */ - error |= tboot_wait_for_aps(num_cpus); if (!error) { BUG_ON(num_online_cpus() > 1); -- cgit v1.2.3 From d8eeb2d3b26d25c44c10f28430e2157a2d20bd1d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 31 Jul 2009 14:58:04 +0200 Subject: ring-buffer: consolidate interface of rb_buffer_peek() rb_buffer_peek() operates with struct ring_buffer_per_cpu *cpu_buffer only. Thus, instead of passing variables buffer and cpu it is better to use cpu_buffer directly. This also reduces the risk of races since cpu_buffer is not calculated twice. Signed-off-by: Robert Richter LKML-Reference: <1249045084-3028-1-git-send-email-robert.richter@amd.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 454e74e718c..8786c350b4c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2997,15 +2997,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) } static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; struct buffer_page *reader; int nr_loops = 0; - cpu_buffer = buffer->buffers[cpu]; - again: /* * We repeat when a timestamp is encountered. It is possible @@ -3049,7 +3046,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(buffer, + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } return event; @@ -3168,7 +3165,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) local_irq_save(flags); if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + event = rb_buffer_peek(cpu_buffer, ts); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) @@ -3237,7 +3234,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) if (dolock) spin_lock(&cpu_buffer->reader_lock); - event = rb_buffer_peek(buffer, cpu, ts); + event = rb_buffer_peek(cpu_buffer, ts); if (event) rb_advance_reader(cpu_buffer); -- cgit v1.2.3 From 478142c39c8c2f5f63038e5f2224e6729406e587 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Sep 2009 10:36:01 -0400 Subject: tracing: do not grab lock in wakeup latency function tracing The wakeup tracer, when enabled, has its own function tracer. It only traces the functions on the CPU where the task it is following is on. If a task is woken on one CPU but then migrates to another CPU before it wakes up, the latency tracer will then start tracing functions on the other CPU. To find which CPU the task is on, the wakeup function tracer performs a task_cpu(wakeup_task). But to make sure the task does not disappear it grabs the wakeup_lock, which is also taken when the task wakes up. By taking this lock, the function tracer does not need to worry about the task being freed as it checks its cpu. Jan Blunck found a problem with this approach on his 32 CPU box. When a task is being traced by the wakeup tracer, all functions take this lock. That means that on all 32 CPUs, each function call is taking this one lock to see if the task is on that CPU. This lock has just serialized all functions on all 32 CPUs. Needless to say, this caused major issues on that box. It would even lockup. This patch changes the wakeup latency to insert a probe on the migrate task tracepoint. When a task changes its CPU that it will run on, the probe will take note. Now the wakeup function tracer no longer needs to take the lock. It only compares the current CPU with a variable that holds the current CPU the task is on. We don't worry about races since it is OK to add or miss a function trace. Reported-by: Jan Blunck Tested-by: Jan Blunck Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ad69f105a7c..cf43bdb1763 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled; static struct task_struct *wakeup_task; static int wakeup_cpu; +static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; @@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (unlikely(disabled != 1)) goto out; local_irq_save(flags); - __raw_spin_lock(&wakeup_lock); - - if (unlikely(!wakeup_task)) - goto unlock; - - /* - * The task can't disappear because it needs to - * wake up first, and we have the wakeup_lock. - */ - if (task_cpu(wakeup_task) != cpu) - goto unlock; trace_function(tr, ip, parent_ip, flags, pc); - unlock: - __raw_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: atomic_dec(&data->disabled); - + out_enable: ftrace_preempt_enable(resched); } @@ -107,6 +98,14 @@ static int report_latency(cycle_t delta) return 1; } +static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) +{ + if (task != wakeup_task) + return; + + wakeup_current_cpu = cpu; +} + static void notrace probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) @@ -244,6 +243,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) __wakeup_reset(wakeup_trace); wakeup_cpu = task_cpu(p); + wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; wakeup_task = p; @@ -293,6 +293,13 @@ static void start_wakeup_tracer(struct trace_array *tr) goto fail_deprobe_wake_new; } + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_migrate_task\n"); + return; + } + wakeup_reset(tr); /* @@ -325,6 +332,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); unregister_trace_sched_wakeup(probe_wakeup); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); } static int __wakeup_tracer_init(struct trace_array *tr) -- cgit v1.2.3 From e0ab5f2daee1c7a6a387591bf37f0bad4e407112 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 10 Sep 2009 09:34:19 +0800 Subject: tracing: remove dead code Removes unreachable code. Signed-off-by: Li Zefan LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fa1dccb579d..536ae1d8362 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -120,14 +120,6 @@ struct print_entry { char buf[]; }; -#define TRACE_OLD_SIZE 88 - -struct trace_field_cont { - unsigned char type; - /* Temporary till we get rid of this completely */ - char buf[TRACE_OLD_SIZE - 1]; -}; - struct trace_mmiotrace_rw { struct trace_entry ent; struct mmiotrace_rw rw; @@ -509,20 +501,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -typedef void -(*tracer_switch_func_t)(void *private, - void *__rq, - struct task_struct *prev, - struct task_struct *next); - -struct tracer_switch_ops { - tracer_switch_func_t func; - void *private; - struct tracer_switch_ops *next; -}; -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE -- cgit v1.2.3 From bd9cfca9cb71200dd82b320bba12540dc078f4e0 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 10 Sep 2009 09:34:19 +0800 Subject: tracing: format clean ups Fix white-space formatting. Signed-off-by: Li Zefan LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 536ae1d8362..86a0523b8e5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -169,20 +169,20 @@ enum kmemtrace_type_id { struct kmemtrace_alloc_entry { struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; - size_t bytes_req; - size_t bytes_alloc; - gfp_t gfp_flags; - int node; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; + size_t bytes_req; + size_t bytes_alloc; + gfp_t gfp_flags; + int node; }; struct kmemtrace_free_entry { struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; }; struct syscall_trace_enter { @@ -203,7 +203,7 @@ struct syscall_trace_exit { * states when a trace occurs. These are: * IRQS_OFF - interrupts were disabled * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags - * NEED_RESCED - reschedule is requested + * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler */ -- cgit v1.2.3 From a5921c6c37d51ee2079ca3c69ea6f7b7384f5d87 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 10 Sep 2009 09:34:19 +0800 Subject: tracing: remove stats from struct tracer Remove unused field @stats from struct tracer. Signed-off-by: Li Zefan LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 86a0523b8e5..2163d185fe2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -382,7 +382,6 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; - struct tracer_stat *stats; }; -- cgit v1.2.3 From 197e2eabc90c203d1086916b7f66694ba5fbb937 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 10 Sep 2009 09:34:19 +0800 Subject: tracing: move PRED macros to trace_events_filter.c Move DEFINE_COMPARISON_PRED() and DEFINE_EQUALITY_PRED() to kernel/trace/trace_events_filter.c Signed-off-by: Li Zefan LKML-Reference: <4AA8579B.4020706@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 41 -------------------------------------- kernel/trace/trace_events_filter.c | 41 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2163d185fe2..acaa68060eb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -800,47 +800,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - default: \ - break; \ - } \ - \ - return match; \ -} - -#define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ -{ \ - u##size *addr = (u##size *)(event + pred->offset); \ - u##size val = (u##size)pred->val; \ - int match; \ - \ - match = (val == *addr) ^ pred->not; \ - \ - return match; \ -} - extern struct mutex event_mutex; extern struct list_head ftrace_events; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 93660fbbf62..23245785927 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -121,6 +121,47 @@ struct filter_parse_state { } operand; }; +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + DEFINE_COMPARISON_PRED(s64); DEFINE_COMPARISON_PRED(u64); DEFINE_COMPARISON_PRED(s32); -- cgit v1.2.3 From 5e605b64a183a6c0e84cdb99a6f8acb1f8200437 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Aug 2009 09:07:21 +0200 Subject: block: add blk-iopoll, a NAPI like approach for block devices This borrows some code from NAPI and implements a polled completion mode for block devices. The idea is the same as NAPI - instead of doing the command completion when the irq occurs, schedule a dedicated softirq in the hopes that we will complete more IO when the iopoll handler is invoked. Devices have a budget of commands assigned, and will stay in polled mode as long as they continue to consume their budget from the iopoll softirq handler. If they do not, the device is set back to interrupt completion mode. This patch holds the core bits for blk-iopoll, device driver support sold separately. Signed-off-by: Jens Axboe --- kernel/sysctl.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 58be76017fd..0ed9fa6f322 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -92,6 +92,7 @@ extern int sysctl_nr_trim_pages; #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +extern int blk_iopoll_enabled; /* Constants used for minimum and maximum */ #ifdef CONFIG_DETECT_SOFTLOCKUP @@ -990,7 +991,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif - + { + .ctl_name = CTL_UNNUMBERED, + .procname = "blk_iopoll", + .data = &blk_iopoll_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3 From 49ff590390a22c49e9063dcdec4cd5903127526b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Sep 2009 00:30:26 -0400 Subject: tracing: add latency format to function_graph tracer While debugging something with the function_graph tracer, I found the need to see the preempt count of the traces. Unfortunately, since the function graph tracer has its own output formatting, it does not honor the latency-format option. This patch makes the function_graph tracer honor the latency-format option, but still keeps control of the output. But now we have the same details that the latency-format supplies. # tracer: function_graph # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / # |||| # CPU|||| DURATION FUNCTION CALLS # | |||| | | | | | | 3) d..1 1.333 us | idle_cpu(); 3) d.h1 | tick_check_idle() { 3) d.h1 0.550 us | tick_check_oneshot_broadcast(); 3) d.h1 | tick_nohz_stop_idle() { 3) d.h1 | ktime_get() { 3) d.h1 | ktime_get_ts() { Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 74 +++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b3749a2c313..ee791a9650c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -364,6 +364,29 @@ print_graph_proc(struct trace_seq *s, pid_t pid) } +static enum print_line_t +print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + int hardirq, softirq; + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + + if (!trace_seq_printf(s, " %c%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? + 'X' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? + 'N' : '.', + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : softirq ? 's' : '.')) + return 0; + + if (entry->preempt_count) + return trace_seq_printf(s, "%x", entry->preempt_count); + return trace_seq_puts(s, "."); +} + /* If the pid changed since the last trace, output this event */ static enum print_line_t verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) @@ -521,6 +544,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } + /* Proc */ if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { ret = print_graph_proc(s, pid); @@ -758,6 +782,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + ret = print_graph_lat_fmt(s, ent); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + return 0; } @@ -952,28 +983,59 @@ print_graph_function(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static void print_lat_header(struct seq_file *s) +{ + static const char spaces[] = " " /* 16 spaces */ + " " /* 4 spaces */ + " "; /* 17 spaces */ + int size = 0; + + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + size += 16; + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) + size += 4; + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) + size += 17; + + seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); + seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); + seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); + seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); + seq_printf(s, "#%.*s||| / \n", size, spaces); + seq_printf(s, "#%.*s|||| \n", size, spaces); +} + static void print_graph_headers(struct seq_file *s) { + int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + + if (lat) + print_lat_header(s); + /* 1st line */ - seq_printf(s, "# "); + seq_printf(s, "#"); if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " TIME "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "CPU"); + seq_printf(s, " CPU"); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " TASK/PID "); + seq_printf(s, " TASK/PID "); + if (lat) + seq_printf(s, "||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ - seq_printf(s, "# "); + seq_printf(s, "#"); if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "| "); + seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " | | "); + seq_printf(s, " | | "); + if (lat) + seq_printf(s, "||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); -- cgit v1.2.3 From 48659d31195bb76d688e99dabd816c5472fb1656 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Sep 2009 11:36:23 -0400 Subject: tracing: move tgid out of generic entry and into userstack The userstack trace required the recording of the tgid entry. Unfortunately, it was added to the generic entry where it wasted 4 bytes of every entry and was only used by one entry. This patch moves it out of the generic field and moves it into the only user (userstack_entry). Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 5 +---- kernel/trace/trace_output.c | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c75deeefe3..1a37da2e853 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -886,7 +886,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->tgid = (tsk) ? tsk->tgid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1068,6 +1067,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) return; entry = ring_buffer_event_data(event); + entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); trace.nr_entries = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index acaa68060eb..b69697b4b84 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -101,6 +101,7 @@ struct stack_entry { struct userstack_entry { struct trace_entry ent; + unsigned int tgid; unsigned long caller[FTRACE_STACK_ENTRIES]; }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 78b1ed23017..28d92027a93 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -86,7 +86,6 @@ int trace_define_common_fields(struct ftrace_event_call *call) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, tgid); return ret; } @@ -572,13 +571,11 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", FIELD(unsigned short, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, tgid)); + FIELD(int, pid)); } static ssize_t diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index e0c2545622e..be34a6aa7e4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, * since individual threads might have already quit! */ rcu_read_lock(); - task = find_task_by_vpid(entry->ent.tgid); + task = find_task_by_vpid(entry->tgid); if (task) mm = get_task_mm(task); rcu_read_unlock(); -- cgit v1.2.3 From 637e7e864103a7a68c1ce43ada27dfc25c0d113f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Sep 2009 13:55:35 -0400 Subject: tracing: add lock depth to entries This patch adds the lock depth of the big kernel lock to the generic entry header. This way we can see the depth of the lock and help in removing the BKL. Example: # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| /_--=> lock-depth # |||||/ delay # cmd pid |||||| time | caller # \ / |||||| \ | / -0 2.N..3 5902255250us+: lock_acquire: read rcu_read_lock -0 2.N..3 5902255253us+: lock_release: rcu_read_lock -0 2dN..3 5902255257us+: lock_acquire: xtime_lock -0 2dN..4 5902255259us : lock_acquire: clocksource_lock -0 2dN..4 5902255261us+: lock_release: clocksource_lock Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 +++++---- kernel/trace/trace_events.c | 5 ++++- kernel/trace/trace_functions_graph.c | 16 ++++++++++++---- kernel/trace/trace_output.c | 10 +++++++++- 4 files changed, 30 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1a37da2e853..3b918283cf9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -886,6 +886,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1530,10 +1531,10 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / \n"); - seq_puts(m, "# ||||| delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# |||| /_--=> lock-depth \n"); + seq_puts(m, "# |||||/ delay \n"); + seq_puts(m, "# cmd pid |||||| time | caller \n"); + seq_puts(m, "# \\ / |||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 28d92027a93..975f324a07e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -86,6 +86,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(int, lock_depth); return ret; } @@ -571,11 +572,13 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", FIELD(unsigned short, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), - FIELD(int, pid)); + FIELD(int, pid), + FIELD(int, lock_depth)); } static ssize_t diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ee791a9650c..48af4937438 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -368,6 +368,7 @@ static enum print_line_t print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { int hardirq, softirq; + int ret; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -382,6 +383,13 @@ print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) hardirq ? 'h' : softirq ? 's' : '.')) return 0; + if (entry->lock_depth < 0) + ret = trace_seq_putc(s, '.'); + else + ret = trace_seq_printf(s, "%d", entry->lock_depth); + if (!ret) + return 0; + if (entry->preempt_count) return trace_seq_printf(s, "%x", entry->preempt_count); return trace_seq_puts(s, "."); @@ -1001,8 +1009,8 @@ static void print_lat_header(struct seq_file *s) seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); - seq_printf(s, "#%.*s||| / \n", size, spaces); - seq_printf(s, "#%.*s|||| \n", size, spaces); + seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); + seq_printf(s, "#%.*s|||| / \n", size, spaces); } static void print_graph_headers(struct seq_file *s) @@ -1021,7 +1029,7 @@ static void print_graph_headers(struct seq_file *s) if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " TASK/PID "); if (lat) - seq_printf(s, "||||"); + seq_printf(s, "|||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " DURATION "); seq_printf(s, " FUNCTION CALLS\n"); @@ -1035,7 +1043,7 @@ static void print_graph_headers(struct seq_file *s) if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) seq_printf(s, " | | "); if (lat) - seq_printf(s, "||||"); + seq_printf(s, "|||||"); if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) seq_printf(s, " | | "); seq_printf(s, " | | | |\n"); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index be34a6aa7e4..29a370a4558 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -465,6 +465,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { int hardirq, softirq; char comm[TASK_COMM_LEN]; + int ret; trace_find_cmdline(entry->pid, comm); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; @@ -481,9 +482,16 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) hardirq ? 'h' : softirq ? 's' : '.')) return 0; + if (entry->lock_depth < 0) + ret = trace_seq_putc(s, '.'); + else + ret = trace_seq_printf(s, "%d", entry->lock_depth); + if (!ret) + return 0; + if (entry->preempt_count) return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_puts(s, "."); + return trace_seq_putc(s, '.'); } static unsigned long preempt_mark_thresh = 100; -- cgit v1.2.3 From f81c972d27c36729e65d4a815e3d7b782a540bad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Sep 2009 14:24:13 -0400 Subject: tracing: consolidate code between trace_output.c and trace_function_graph.c Both trace_output.c and trace_function_graph.c do basically the same thing to handle the printing of the latency-format. This patch moves the code into one function that both can use. Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 26 ++------------------------ kernel/trace/trace_output.c | 30 ++++++++++++++++++++++++------ kernel/trace/trace_output.h | 2 ++ 3 files changed, 28 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 48af4937438..61f166707a0 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -367,32 +367,10 @@ print_graph_proc(struct trace_seq *s, pid_t pid) static enum print_line_t print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { - int hardirq, softirq; - int ret; - - hardirq = entry->flags & TRACE_FLAG_HARDIRQ; - softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - - if (!trace_seq_printf(s, " %c%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? - 'X' : '.', - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? - 'N' : '.', - (hardirq && softirq) ? 'H' : - hardirq ? 'h' : softirq ? 's' : '.')) - return 0; - - if (entry->lock_depth < 0) - ret = trace_seq_putc(s, '.'); - else - ret = trace_seq_printf(s, "%d", entry->lock_depth); - if (!ret) + if (!trace_seq_putc(s, ' ')) return 0; - if (entry->preempt_count) - return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_puts(s, "."); + return trace_print_lat_fmt(s, entry); } /* If the pid changed since the last trace, output this event */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 29a370a4558..f572f44c6e1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -460,19 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } -static int -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +/** + * trace_print_lat_fmt - print the irq, preempt and lockdep fields + * @s: trace seq struct to write to + * @entry: The trace entry field from the ring buffer + * + * Prints the generic fields of irqs off, in hard or softirq, preempt + * count and lock depth. + */ +int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { int hardirq, softirq; - char comm[TASK_COMM_LEN]; int ret; - trace_find_cmdline(entry->pid, comm); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", - comm, entry->pid, cpu, + if (!trace_seq_printf(s, "%c%c%c", (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', @@ -494,6 +498,20 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) return trace_seq_putc(s, '.'); } +static int +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + if (!trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu)) + return 0; + + return trace_print_lat_fmt(s, entry); +} + static unsigned long preempt_mark_thresh = 100; static int diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index d38bec4a9c3..9d91c72ba38 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +extern int +trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ extern int __unregister_ftrace_event(struct trace_event *event); -- cgit v1.2.3 From b63f39ea50330f836e301ddda21c6a93dcf0d6a3 Mon Sep 17 00:00:00 2001 From: "jolsa@redhat.com" Date: Fri, 11 Sep 2009 17:29:27 +0200 Subject: tracing: create generic trace parser Create a "trace_parser" that can parse the user space input for separate words. struct trace_parser is the descriptor. Generic "trace_get_user" function that can be a helper to read multiple words passed in by user space. Signed-off-by: Jiri Olsa LKML-Reference: <1252682969-3366-2-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 35 +++++++++++++++++ 2 files changed, 141 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3b918283cf9..45c3f0352d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -339,6 +339,112 @@ static struct { int trace_clock_id; +/* + * trace_parser_get_init - gets the buffer for trace parser + */ +int trace_parser_get_init(struct trace_parser *parser, int size) +{ + memset(parser, 0, sizeof(*parser)); + + parser->buffer = kmalloc(size, GFP_KERNEL); + if (!parser->buffer) + return 1; + + parser->size = size; + return 0; +} + +/* + * trace_parser_put - frees the buffer for trace parser + */ +void trace_parser_put(struct trace_parser *parser) +{ + kfree(parser->buffer); +} + +/* + * trace_get_user - reads the user input string separated by space + * (matched by isspace(ch)) + * + * For each string found the 'struct trace_parser' is updated, + * and the function returns. + * + * Returns number of bytes read. + * + * See kernel/trace/trace.h for 'struct trace_parser' details. + */ +int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char ch; + size_t read = 0; + ssize_t ret; + + if (!*ppos) + trace_parser_clear(parser); + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + + read++; + cnt--; + + /* + * The parser is not finished with the last write, + * continue reading the user input without skipping spaces. + */ + if (!parser->cont) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* only spaces were written */ + if (isspace(ch)) { + *ppos += read; + ret = read; + goto out; + } + + parser->idx = 0; + } + + /* read the non-space input */ + while (cnt && !isspace(ch)) { + if (parser->idx < parser->size) + parser->buffer[parser->idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + /* We either got finished input or we have to wait for another call. */ + if (isspace(ch)) { + parser->buffer[parser->idx] = 0; + parser->cont = false; + } else { + parser->cont = true; + parser->buffer[parser->idx++] = ch; + } + + *ppos += read; + ret = read; + +out: + return ret; +} + ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) { int len; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b69697b4b84..28247cecd95 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -615,6 +615,41 @@ static inline int ftrace_trace_task(struct task_struct *task) } #endif +/* + * struct trace_parser - servers for reading the user input separated by spaces + * @cont: set if the input is not complete - no final space char was found + * @buffer: holds the parsed user input + * @idx: user input lenght + * @size: buffer size + */ +struct trace_parser { + bool cont; + char *buffer; + unsigned idx; + unsigned size; +}; + +static inline bool trace_parser_loaded(struct trace_parser *parser) +{ + return (parser->idx != 0); +} + +static inline bool trace_parser_cont(struct trace_parser *parser) +{ + return parser->cont; +} + +static inline void trace_parser_clear(struct trace_parser *parser) +{ + parser->cont = false; + parser->idx = 0; +} + +extern int trace_parser_get_init(struct trace_parser *parser, int size); +extern void trace_parser_put(struct trace_parser *parser); +extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos); + /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. -- cgit v1.2.3 From 489663644c35d50a20f58d468a7cbc705e6a29ce Mon Sep 17 00:00:00 2001 From: "jolsa@redhat.com" Date: Fri, 11 Sep 2009 17:29:28 +0200 Subject: tracing: trace parser support for set_event Convert the parsing of the file 'set_event' to use the generic trace_praser 'trace_get_user' function. Signed-off-by: Jiri Olsa LKML-Reference: <1252682969-3366-3-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 60 ++++++++++----------------------------------- 1 file changed, 13 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 975f324a07e..f46d14cefde 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -230,11 +230,9 @@ static ssize_t ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_parser parser; size_t read = 0; - int i, set = 1; ssize_t ret; - char *buf; - char ch; if (!cnt || cnt < 0) return 0; @@ -243,60 +241,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (ret < 0) return ret; - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - return ret; - read++; - cnt--; - } - - /* Only white space found? */ - if (isspace(ch)) { - file->f_pos += read; - ret = read; - return ret; - } - - buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); - if (!buf) + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) return -ENOMEM; - if (cnt > EVENT_BUF_SIZE) - cnt = EVENT_BUF_SIZE; + read = trace_get_user(&parser, ubuf, cnt, ppos); + + if (trace_parser_loaded((&parser))) { + int set = 1; - i = 0; - while (cnt && !isspace(ch)) { - if (!i && ch == '!') + if (*parser.buffer == '!') set = 0; - else - buf[i++] = ch; - ret = get_user(ch, ubuf++); + parser.buffer[parser.idx] = 0; + + ret = ftrace_set_clr_event(parser.buffer + !set, set); if (ret) - goto out_free; - read++; - cnt--; + goto out_put; } - buf[i] = 0; - - file->f_pos += read; - - ret = ftrace_set_clr_event(buf, set); - if (ret) - goto out_free; ret = read; - out_free: - kfree(buf); + out_put: + trace_parser_put(&parser); return ret; } -- cgit v1.2.3 From 689fd8b65d669b96d612ccc37d6fb87bf7ed6907 Mon Sep 17 00:00:00 2001 From: "jolsa@redhat.com" Date: Fri, 11 Sep 2009 17:29:29 +0200 Subject: tracing: trace parser support for function and graph Convert the writing to 'set_graph_function', 'set_ftrace_filter' and 'set_ftrace_notrace' to use the generic trace_parser 'trace_get_user' function. Removed FTRACE_ITER_CONT flag, since it's not needed after this change. Minor fix in set_graph_function display - g_show function. Signed-off-by: Jiri Olsa LKML-Reference: <1252682969-3366-4-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 150 ++++++++++++++------------------------------------ 1 file changed, 40 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8c804e24f96..8b23d567008 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1323,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) enum { FTRACE_ITER_FILTER = (1 << 0), - FTRACE_ITER_CONT = (1 << 1), - FTRACE_ITER_NOTRACE = (1 << 2), - FTRACE_ITER_FAILURES = (1 << 3), - FTRACE_ITER_PRINTALL = (1 << 4), - FTRACE_ITER_HASH = (1 << 5), + FTRACE_ITER_NOTRACE = (1 << 1), + FTRACE_ITER_FAILURES = (1 << 2), + FTRACE_ITER_PRINTALL = (1 << 3), + FTRACE_ITER_HASH = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1337,8 +1336,7 @@ struct ftrace_iterator { int hidx; int idx; unsigned flags; - unsigned char buffer[FTRACE_BUFF_MAX+1]; - unsigned buffer_idx; + struct trace_parser parser; }; static void * @@ -1604,6 +1602,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!iter) return -ENOMEM; + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { + kfree(iter); + return -ENOMEM; + } + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -2196,9 +2199,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos, int enable) { struct ftrace_iterator *iter; - char ch; - size_t read = 0; - ssize_t ret; + struct trace_parser *parser; + ssize_t ret, read; if (!cnt || cnt < 0) return 0; @@ -2211,72 +2213,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, } else iter = file->private_data; - if (!*ppos) { - iter->flags &= ~FTRACE_ITER_CONT; - iter->buffer_idx = 0; - } - - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - - /* - * If the parser haven't finished with the last write, - * continue reading the user input without skipping spaces. - */ - if (!(iter->flags & FTRACE_ITER_CONT)) { - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; - } + parser = &iter->parser; + read = trace_get_user(parser, ubuf, cnt, ppos); - /* only spaces were written */ - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } - - iter->buffer_idx = 0; - } - - while (cnt && !isspace(ch)) { - if (iter->buffer_idx < FTRACE_BUFF_MAX) - iter->buffer[iter->buffer_idx++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); + if (trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { + ret = ftrace_process_regex(parser->buffer, + parser->idx, enable); if (ret) goto out; - read++; - cnt--; - } - if (isspace(ch)) { - iter->buffer[iter->buffer_idx] = 0; - ret = ftrace_process_regex(iter->buffer, - iter->buffer_idx, enable); - if (ret) - goto out; - iter->buffer_idx = 0; - } else { - iter->flags |= FTRACE_ITER_CONT; - iter->buffer[iter->buffer_idx++] = ch; + trace_parser_clear(parser); } - *ppos += read; ret = read; - out: - mutex_unlock(&ftrace_regex_lock); + mutex_unlock(&ftrace_regex_lock); +out: return ret; } @@ -2381,6 +2334,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; + struct trace_parser *parser; mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { @@ -2390,9 +2344,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) } else iter = file->private_data; - if (iter->buffer_idx) { - iter->buffer[iter->buffer_idx] = 0; - ftrace_match_records(iter->buffer, iter->buffer_idx, enable); + parser = &iter->parser; + if (trace_parser_loaded(parser)) { + parser->buffer[parser->idx] = 0; + ftrace_match_records(parser->buffer, parser->idx, enable); } mutex_lock(&ftrace_lock); @@ -2400,7 +2355,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_lock); + trace_parser_put(parser); kfree(iter); + mutex_unlock(&ftrace_regex_lock); return 0; } @@ -2499,7 +2456,7 @@ static int g_show(struct seq_file *m, void *v) return 0; } - seq_printf(m, "%pf\n", v); + seq_printf(m, "%pf\n", (void *)*ptr); return 0; } @@ -2602,12 +2559,10 @@ static ssize_t ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned char buffer[FTRACE_BUFF_MAX+1]; + struct trace_parser parser; unsigned long *array; size_t read = 0; ssize_t ret; - int index = 0; - char ch; if (!cnt || cnt < 0) return 0; @@ -2625,51 +2580,26 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, } else array = file->private_data; - ret = get_user(ch, ubuf++); - if (ret) + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { + ret = -ENOMEM; goto out; - read++; - cnt--; - - /* skip white space */ - while (cnt && isspace(ch)) { - ret = get_user(ch, ubuf++); - if (ret) - goto out; - read++; - cnt--; } - if (isspace(ch)) { - *ppos += read; - ret = read; - goto out; - } + read = trace_get_user(&parser, ubuf, cnt, ppos); - while (cnt && !isspace(ch)) { - if (index < FTRACE_BUFF_MAX) - buffer[index++] = ch; - else { - ret = -EINVAL; - goto out; - } - ret = get_user(ch, ubuf++); + if (trace_parser_loaded((&parser))) { + parser.buffer[parser.idx] = 0; + + /* we allow only one expression at a time */ + ret = ftrace_set_func(array, &ftrace_graph_count, + parser.buffer); if (ret) goto out; - read++; - cnt--; } - buffer[index] = 0; - - /* we allow only one expression at a time */ - ret = ftrace_set_func(array, &ftrace_graph_count, buffer); - if (ret) - goto out; - - file->f_pos += read; ret = read; out: + trace_parser_put(&parser); mutex_unlock(&graph_lock); return ret; -- cgit v1.2.3 From 41dfba4367109b92d92ec6e059be6950497d932f Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Sun, 13 Sep 2009 01:41:31 +0200 Subject: tracing: remove unused local variables in tracer probe functions When the nsecs_to_usecs() conversion in probe_wakeup_sched_switch() and check_critical_timing() was moved to a later stage in order to avoid unnecessary computing, it was overlooked to remove the original variables, assignments and comments.. Signed-off-by: Carsten Emde Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 12 +----------- kernel/trace/trace_sched_wakeup.c | 10 ---------- 2 files changed, 1 insertion(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5555b75a0d1..06f8ea9e4b9 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr, unsigned long parent_ip, int cpu) { - unsigned long latency, t0, t1; cycle_t T0, T1, delta; unsigned long flags; int pc; - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -157,17 +152,12 @@ check_critical_timing(struct trace_array *tr, trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); - latency = nsecs_to_usecs(delta); - if (data->critical_sequence != max_sequence) goto out_unlock; - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - data->critical_end = parent_ip; + tracing_max_latency = delta; update_max_tr_single(tr, current, cpu); max_sequence++; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index cf43bdb1763..6e1529bc617 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -110,7 +110,6 @@ static void notrace probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - unsigned long latency = 0, t0 = 0, t1 = 0; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; @@ -156,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); - /* - * usecs conversion is slow so we try to delay the conversion - * as long as possible: - */ T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); delta = T1-T0; @@ -167,12 +162,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (!report_latency(delta)) goto out_unlock; - latency = nsecs_to_usecs(delta); - tracing_max_latency = delta; - t0 = nsecs_to_usecs(T0); - t1 = nsecs_to_usecs(T1); - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); out_unlock: -- cgit v1.2.3 From b5130b1e7d3717d03ab1916b198bf0d49fa0a619 Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Sun, 13 Sep 2009 01:43:07 +0200 Subject: tracing: do not update tracing_max_latency when tracer is stopped The state of the function pair tracing_stop()/tracing_start() is correctly considered when tracer data are updated. However, the global and externally accessible variable tracing_max_latency is always updated - even when tracing is stopped. The update should only occur, if tracing was not stopped. Signed-off-by: Carsten Emde Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 +++++ kernel/trace/trace.h | 1 + kernel/trace/trace_irqsoff.c | 6 ++++-- kernel/trace/trace_sched_wakeup.c | 6 ++++-- 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 45c3f0352d7..ef82a7fabf3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -825,6 +825,11 @@ static void trace_init_cmdlines(void) cmdline_idx = 0; } +int is_tracing_stopped(void) +{ + return trace_stop_count; +} + /** * ftrace_off_permanent - disable all ftrace code permanently * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 28247cecd95..4ad4e1ddcb9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -461,6 +461,7 @@ void tracing_stop_sched_switch_record(void); void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); +int is_tracing_stopped(void); extern unsigned long nsecs_to_usecs(unsigned long nsecs); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 06f8ea9e4b9..3aa7eaa2114 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -157,8 +157,10 @@ check_critical_timing(struct trace_array *tr, data->critical_end = parent_ip; - tracing_max_latency = delta; - update_max_tr_single(tr, current, cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr_single(tr, current, cpu); + } max_sequence++; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 6e1529bc617..26185d72767 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -162,8 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (!report_latency(delta)) goto out_unlock; - tracing_max_latency = delta; - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + } out_unlock: __wakeup_reset(wakeup_trace); -- cgit v1.2.3 From 558e6547e4b8a2b13608a24a9d3679802f91c4c7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 24 Aug 2009 12:19:47 +0800 Subject: tracing/profile: fix profile_disable vs module_unload If the correspoding module is unloaded before ftrace_profile_disable() is called, event->profile_disable() won't be called, which can cause oops: # insmod trace-events-sample.ko # perf record -f -a -e sample:foo_bar sleep 3 & # sleep 1 # rmmod trace_events_sample # insmod trace-events-sample.ko OOPS! Signed-off-by: Li Zefan LKML-Reference: <4A9214E3.2070807@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_profile.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 11ba5bb4ed0..55a25c933d1 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -5,6 +5,7 @@ * */ +#include #include "trace.h" int ftrace_profile_enable(int event_id) @@ -14,7 +15,8 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->profile_enable) { + if (event->id == event_id && event->profile_enable && + try_module_get(event->mod)) { ret = event->profile_enable(event); break; } @@ -32,6 +34,7 @@ void ftrace_profile_disable(int event_id) list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) { event->profile_disable(event); + module_put(event->mod); break; } } -- cgit v1.2.3 From 0a1c49db8d91c538f104f8d70e560c6fdd589bd4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 12 Sep 2009 19:17:15 -0400 Subject: tracing: use macros to create internal ftrace entry ring buffer structures The entries used by ftrace internal code (plugins) currently have their formats manually exported to userspace. That is, the format files in debugfs/tracing/events/ftrace/*/format are currently created by hand. This is a maintenance nightmare, and can easily become out of sync with what is actually shown. This patch uses the methodology of the TRACE_EVENT macros to build the structures so that their formats can be automated and this will keep the structures in sync with what users can see. This patch only changes the way the structures are created. Further patches will build off of this to automate the format files. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 160 ++++---------------- kernel/trace/trace_entries.h | 342 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 369 insertions(+), 133 deletions(-) create mode 100644 kernel/trace/trace_entries.h (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4ad4e1ddcb9..d308195d40a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -42,150 +42,45 @@ enum trace_type { __TRACE_LAST_TYPE, }; -/* - * Function trace entry - function address and parent function addres: - */ -struct ftrace_entry { - struct trace_entry ent; - unsigned long ip; - unsigned long parent_ip; -}; - -/* Function call entry */ -struct ftrace_graph_ent_entry { - struct trace_entry ent; - struct ftrace_graph_ent graph_ent; +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ }; -/* Function return entry */ -struct ftrace_graph_ret_entry { - struct trace_entry ent; - struct ftrace_graph_ret ret; -}; extern struct tracer boot_tracer; -/* - * Context switch trace entry - which task (and prio) we switched from/to: - */ -struct ctx_switch_entry { - struct trace_entry ent; - unsigned int prev_pid; - unsigned char prev_prio; - unsigned char prev_state; - unsigned int next_pid; - unsigned char next_prio; - unsigned char next_state; - unsigned int next_cpu; -}; - -/* - * Special (free-form) trace entry: - */ -struct special_entry { - struct trace_entry ent; - unsigned long arg1; - unsigned long arg2; - unsigned long arg3; -}; - -/* - * Stack-trace entry: - */ +#undef __field +#define __field(type, item) type item; -#define FTRACE_STACK_ENTRIES 8 +#undef __array +#define __array(type, item, size) type item[size]; -struct stack_entry { - struct trace_entry ent; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; -struct userstack_entry { - struct trace_entry ent; - unsigned int tgid; - unsigned long caller[FTRACE_STACK_ENTRIES]; -}; - -/* - * trace_printk entry: - */ -struct bprint_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; -}; - -struct print_entry { - struct trace_entry ent; - unsigned long ip; - char buf[]; -}; +#undef F_STRUCT +#define F_STRUCT(args...) args -struct trace_mmiotrace_rw { - struct trace_entry ent; - struct mmiotrace_rw rw; -}; - -struct trace_mmiotrace_map { - struct trace_entry ent; - struct mmiotrace_map map; -}; - -struct trace_boot_call { - struct trace_entry ent; - struct boot_trace_call boot_call; -}; - -struct trace_boot_ret { - struct trace_entry ent; - struct boot_trace_ret boot_ret; -}; - -#define TRACE_FUNC_SIZE 30 -#define TRACE_FILE_SIZE 20 -struct trace_branch { - struct trace_entry ent; - unsigned line; - char func[TRACE_FUNC_SIZE+1]; - char file[TRACE_FILE_SIZE+1]; - char correct; -}; - -struct hw_branch_entry { - struct trace_entry ent; - u64 from; - u64 to; -}; - -struct trace_power { - struct trace_entry ent; - struct power_trace state_data; -}; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ + } -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; +#undef TP_ARGS +#define TP_ARGS(args...) args -struct kmemtrace_alloc_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; - size_t bytes_req; - size_t bytes_alloc; - gfp_t gfp_flags; - int node; -}; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) -struct kmemtrace_free_entry { - struct trace_entry ent; - enum kmemtrace_type_id type_id; - unsigned long call_site; - const void *ptr; -}; +#include "trace_entries.h" +/* + * syscalls are special, and need special handling, this is why + * they are not included in trace_entries.h + */ struct syscall_trace_enter { struct trace_entry ent; int nr; @@ -198,7 +93,6 @@ struct syscall_trace_exit { unsigned long ret; }; - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h new file mode 100644 index 00000000000..82c51fdca03 --- /dev/null +++ b/kernel/trace/trace_entries.h @@ -0,0 +1,342 @@ +/* + * This file defines the trace event structures that go into the ring + * buffer directly. They are created via macros so that changes for them + * appear in the format file. Using macros will automate this process. + * + * The macro used to create a ftrace data structure is: + * + * FTRACE_ENTRY( name, struct_name, id, structure, print ) + * + * @name: the name used the event name, as well as the name of + * the directory that holds the format file. + * + * @struct_name: the name of the structure that is created. + * + * @id: The event identifier that is used to detect what event + * this is from the ring buffer. + * + * @structure: the structure layout + * + * - __field( type, item ) + * This is equivalent to declaring + * type item; + * in the structure. + * - __array( type, item, size ) + * This is equivalent to declaring + * type item[size]; + * in the structure. + * + * @print: the print format shown to users in the format file. + */ + +/* + * Function trace entry - function address and parent function addres: + */ +FTRACE_ENTRY(function, ftrace_entry, + + TRACE_FN, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + ), + + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) +); + +/* Function call entry */ +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, + + TRACE_GRAPH_ENT, + + F_STRUCT( + __field( struct ftrace_graph_ent, graph_ent ) + ), + + F_printk("--> %lx (%d)", __entry->graph_ent.func, __entry->depth) +); + +/* Function return entry */ +FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field( struct ftrace_graph_ret, ret ) + ), + + F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", + __entry->func, __entry->depth, + __entry->calltime, __entry->rettim, + __entrty->depth) +); + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + * + * This is used for both wakeup and context switches. We only want + * to create one structure, but we need two outputs for it. + */ +#define FTRACE_CTX_FIELDS \ + __field( unsigned int, prev_pid ) \ + __field( unsigned char, prev_prio ) \ + __field( unsigned char, prev_state ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned char, next_prio ) \ + __field( unsigned char, next_state ) \ + __field( unsigned int, next_cpu ) + +#if 0 +FTRACE_ENTRY_STRUCT_ONLY(ctx_switch_entry, + + F_STRUCT( + FTRACE_CTX_FIELDS + ) +); +#endif + +FTRACE_ENTRY(context_switch, ctx_switch_entry, + + TRACE_CTX, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk(b"%u:%u:%u ==> %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * FTRACE_ENTRY_DUP only creates the format file, it will not + * create another structure. + */ +FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, + + TRACE_WAKE, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * Special (free-form) trace entry: + */ +FTRACE_ENTRY(special, special_entry, + + TRACE_SPECIAL, + + F_STRUCT( + __field( unsigned long, arg1 ) + __field( unsigned long, arg2 ) + __field( unsigned long, arg3 ) + ), + + F_printk("(%08lx) (%08lx) (%08lx)", + __entry->arg1, __entry->arg2, __entry->arg3) +); + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +FTRACE_ENTRY(kernel_stack, stack_entry, + + TRACE_STACK, + + F_STRUCT( + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +FTRACE_ENTRY(user_stack, userstack_entry, + + TRACE_USER_STACK, + + F_STRUCT( + __field( unsigned int, tgid ) + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +/* + * trace_printk entry: + */ +FTRACE_ENTRY(bprint, bprint_entry, + + TRACE_BPRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, fmt ) + __dynamic_array( u32, buf ) + ), + + F_printk("%08lx fmt:%p", + __entry->ip, __entry->fmt) +); + +FTRACE_ENTRY(print, print_entry, + + TRACE_PRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __dynamic_array( char, buf ) + ), + + F_printk("%08lx %s", + __entry->ip, __entry->buf) +); + +FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, + + TRACE_MMIO_RW, + + F_STRUCT( + __field( struct mmiotrace_rw, rw ) + ), + + F_printk("%lx %lx %lx %d %lx %lx", + __entry->phs, __entry->value, __entry->pc, + __entry->map_id, __entry->opcode, __entry->width) +); + +FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, + + TRACE_MMIO_MAP, + + F_STRUCT( + __field( struct mmiotrace_map, map ) + ), + + F_printk("%lx %lx %lx %d %lx", + __entry->phs, __entry->virt, __entry->len, + __entry->map_id, __entry->opcode) +); + +FTRACE_ENTRY(boot_call, trace_boot_call, + + TRACE_BOOT_CALL, + + F_STRUCT( + __field( struct boot_trace_call, boot_call ) + ), + + F_printk("%d %s", __entry->caller, __entry->func) +); + +FTRACE_ENTRY(boot_ret, trace_boot_ret, + + TRACE_BOOT_RET, + + F_STRUCT( + __field( struct boot_trace_ret, boot_ret ) + ), + + F_printk("%s %d %lx", + __entry->func, __entry->result, __entry->duration) +); + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 + +FTRACE_ENTRY(branch, trace_branch, + + TRACE_BRANCH, + + F_STRUCT( + __field( unsigned int, line ) + __array( char, func, TRACE_FUNC_SIZE+1 ) + __array( char, file, TRACE_FILE_SIZE+1 ) + __field( char, correct ) + ), + + F_printk("%u:%s:%s (%u)", + __entry->line, + __entry->func, __entry->file, __entry->correct) +); + +FTRACE_ENTRY(hw_branch, hw_branch_entry, + + TRACE_HW_BRANCHES, + + F_STRUCT( + __field( u64, from ) + __field( u64, to ) + ), + + F_printk("from: %llx to: %llx", __entry->from, __entry->to) +); + +FTRACE_ENTRY(power, trace_power, + + TRACE_POWER, + + F_STRUCT( + __field( struct power_trace, state_data ) + ), + + F_printk("%llx->%llx type:%u state:%u", + __entry->stamp, __entry->end, + __entry->type, __entry->state) +); + +FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, + + TRACE_KMEM_ALLOC, + + F_STRUCT( + __field( enum kmemtrace_type_id, type_id ) + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + F_printk("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" + " flags:%x node:%d", + __entry->type_id, __entry->call_site, __entry->ptr, + __entry->bytes_req, __entry->bytes_alloc, + __entry->gfp_flags, __entry->node) +); + +FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, + + TRACE_KMEM_FREE, + + F_STRUCT( + __field( enum kmemtrace_type_id, type_id ) + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + F_printk("type:%u call_site:%lx ptr:%p", + __entry->type_id, __entry->call_site, __entry->ptr) +); -- cgit v1.2.3 From d73150943cf47b6cabcb4f4e52dd25975e820ae2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 12 Sep 2009 19:22:23 -0400 Subject: tracing: show details of structures within the ftrace structures Some of the internal ftrace structures use structures within. The output of a field saying it is just a structure is useless for a format file. A binary reader of the ring buffer needs to know more about how the fields are broken up. This patch adds to the ftrace structure macros new fields to describe the structures inside a structure. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +++++++ kernel/trace/trace_entries.h | 64 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 66 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d308195d40a..b0d287d49a6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -53,9 +53,18 @@ extern struct tracer boot_tracer; #undef __field #define __field(type, item) type item; +#undef __field_struct +#define __field_struct(type, item) __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + #undef __array #define __array(type, item, size) type item[size]; +#undef __array_desc +#define __array_desc(type, container, item, size) + #undef __dynamic_array #define __dynamic_array(type, item) type item[]; diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 82c51fdca03..c866d34e014 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -26,6 +26,29 @@ * type item[size]; * in the structure. * + * * for structures within structures, the format of the internal + * structure is layed out. This allows the internal structure + * to be deciphered for the format file. Although these macros + * may become out of sync with the internal structure, they + * will create a compile error if it happens. Since the + * internel structures are just tracing helpers, this is not + * an issue. + * + * When an internal structure is used, it should use: + * + * __field_struct( type, item ) + * + * instead of __field. This will prevent it from being shown in + * the output file. The fields in the structure should use. + * + * __field_desc( type, container, item ) + * __array_desc( type, container, item, len ) + * + * type, item and len are the same as __field and __array, but + * container is added. This is the name of the item in + * __field_struct that this is describing. + * + * * @print: the print format shown to users in the format file. */ @@ -50,7 +73,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, F_STRUCT( - __field( struct ftrace_graph_ent, graph_ent ) + __field_struct( struct ftrace_graph_ent, graph_ent ) + __field_desc( unsigned long, graph_ent, func ) + __field_desc( int, graph_ent, depth ) ), F_printk("--> %lx (%d)", __entry->graph_ent.func, __entry->depth) @@ -62,7 +87,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, TRACE_GRAPH_RET, F_STRUCT( - __field( struct ftrace_graph_ret, ret ) + __field_struct( struct ftrace_graph_ret, ret ) + __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long long, ret, calltime) + __field_desc( unsigned long long, ret, rettime ) + __field_desc( unsigned long, ret, overrun ) + __field_desc( int, ret, depth ) ), F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", @@ -218,7 +248,13 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, TRACE_MMIO_RW, F_STRUCT( - __field( struct mmiotrace_rw, rw ) + __field_struct( struct mmiotrace_rw, rw ) + __field_desc( resource_size_t, rw, phys ) + __field_desc( unsigned long, rw, value ) + __field_desc( unsigned long, rw, pc ) + __field_desc( int, rw, map_id ) + __field_desc( unsigned char, rw, opcode ) + __field_desc( unsigned char, rw, width ) ), F_printk("%lx %lx %lx %d %lx %lx", @@ -231,7 +267,12 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, TRACE_MMIO_MAP, F_STRUCT( - __field( struct mmiotrace_map, map ) + __field_struct( struct mmiotrace_map, map ) + __field_desc( resource_size_t, map, phys ) + __field_desc( unsigned long, map, virt ) + __field_desc( unsigned long, map, len ) + __field_desc( int, map, map_id ) + __field_desc( unsigned char, map, opcode ) ), F_printk("%lx %lx %lx %d %lx", @@ -244,7 +285,9 @@ FTRACE_ENTRY(boot_call, trace_boot_call, TRACE_BOOT_CALL, F_STRUCT( - __field( struct boot_trace_call, boot_call ) + __field_struct( struct boot_trace_call, boot_call ) + __field_desc( pid_t, boot_call, caller ) + __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) ), F_printk("%d %s", __entry->caller, __entry->func) @@ -255,7 +298,10 @@ FTRACE_ENTRY(boot_ret, trace_boot_ret, TRACE_BOOT_RET, F_STRUCT( - __field( struct boot_trace_ret, boot_ret ) + __field_struct( struct boot_trace_ret, boot_ret ) + __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) + __field_desc( int, boot_ret, result ) + __field_desc( unsigned long, boot_ret, duration ) ), F_printk("%s %d %lx", @@ -298,7 +344,11 @@ FTRACE_ENTRY(power, trace_power, TRACE_POWER, F_STRUCT( - __field( struct power_trace, state_data ) + __field_struct( struct power_trace, state_data ) + __field_desc( s64, state_data, stamp ) + __field_desc( s64, state_data, end ) + __field_desc( int, state_data, type ) + __field_desc( int, state_data, state ) ), F_printk("%llx->%llx type:%u state:%u", -- cgit v1.2.3 From 4e5292ea1ac0c2939e815e6c44fad3d8696ea281 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 12 Sep 2009 19:26:21 -0400 Subject: tracing: use the new trace_entries.h to create format files This patch changes the way the format files in debugfs/tracing/events/ftrace/*/format are created. It uses the new trace_entries.h file to automate the creation of the format files to ensure that they are always in sync with the actual structures. This is the same methodology used to create the format files for the TRACE_EVENT macro. This also updates the filter creation that was built on the creation of the format files. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 12 ++- kernel/trace/trace_events.c | 1 + kernel/trace/trace_export.c | 241 +++++++++++++++++++++++--------------------- 3 files changed, 136 insertions(+), 118 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b0d287d49a6..86bcff94791 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -746,11 +747,12 @@ extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct ftrace_event_call event_##call; -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) -#include "trace_event_types.h" +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#include "trace_entries.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f46d14cefde..adbed124c3e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -21,6 +21,7 @@ #include "trace_output.h" +#undef TRACE_SYSTEM #define TRACE_SYSTEM "TRACE_SYSTEM" DEFINE_MUTEX(event_mutex); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index df1bf6e48bb..4cb29d84d73 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,82 +15,163 @@ #include "trace_output.h" +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +/* not needed for this file */ +#undef __field_struct +#define __field_struct(type, item) -extern void __bad_type_size(void); +#undef __field +#define __field(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - if (sizeof(type) != sizeof(field.item)) \ - __bad_type_size(); \ +#undef __field_desc +#define __field_desc(type, container, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), container.item), \ + sizeof(field.container.item)); \ if (!ret) \ return 0; +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ +#undef __array_desc +#define __array_desc(type, container, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%zu;\tsize:%zu;\n", \ + offsetof(typeof(field), container.item), \ + sizeof(field.container.item)); \ if (!ret) \ return 0; -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) \ - ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ - "offset:%u;\tsize:0;\n", \ - (unsigned int)offsetof(typeof(field), item)); \ +#undef __dynamic_array +#define __dynamic_array(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%zu;\tsize:0;\n", \ + offsetof(typeof(field), item)); \ if (!ret) \ return 0; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) +#undef F_printk +#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) args +#undef __entry +#define __entry REC -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ +ftrace_format_##name(struct ftrace_event_call *unused, \ + struct trace_seq *s) \ { \ - struct args field; \ - int ret; \ + struct struct_name field __attribute__((unused)); \ + int ret = 0; \ \ tstruct; \ \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + trace_seq_printf(s, "\nprint fmt: " print); \ \ return ret; \ } -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ -static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), 0, FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __array_desc +#define __array_desc(type, container, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), 0, \ + FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __dynamic_array +#define __dynamic_array(type, item) + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +int \ +ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ - struct args field; \ + struct struct_name field; \ int ret; \ \ - tstruct; \ + ret = trace_define_common_fields(event_call); \ + if (ret) \ + return ret; \ \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + tstruct; \ \ return ret; \ } -#include "trace_event_types.h" +#include "trace_entries.h" + + +#undef __field +#define __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __array +#define __array(type, item, len) + +#undef __array_desc +#define __array_desc(type, container, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item) + #undef TRACE_ZERO_CHAR #define TRACE_ZERO_CHAR(arg) @@ -117,16 +198,15 @@ ftrace_format_##call(struct ftrace_event_call *unused, \ #define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ cmd; -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ static int ftrace_raw_init_event_##call(void); \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ - .id = proto, \ + .id = type, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event_##call, \ .show_format = ftrace_format_##call, \ @@ -138,69 +218,4 @@ static int ftrace_raw_init_event_##call(void) \ return 0; \ } \ -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) \ - \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .id = proto, \ - .system = __stringify(TRACE_SYSTEM), \ - .show_format = ftrace_format_##call, \ -}; - -#include "trace_event_types.h" - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ - if (ret) \ - return ret; - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), 0, FILTER_OTHER); \ - if (ret) \ - return ret; - -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed, \ - FILTER_OTHER); \ - if (ret) \ - return ret; - -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int \ -ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ -{ \ - struct args field; \ - int ret; \ - \ - ret = trace_define_common_fields(event_call); \ - if (ret) \ - return ret; \ - \ - tstruct; \ - \ - return ret; \ -} - -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ - tpfmt) - -#include "trace_event_types.h" +#include "trace_entries.h" -- cgit v1.2.3 From 51df5fcbc1296a84cf1c093c6cb56d40ca3e697e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 12 Sep 2009 20:29:22 -0400 Subject: tracing: remove trace_event_types.h The macros in trace_entries.h have made the code in trace_event_types.h obsolete. The file is no longer used, so this patch removes it. Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_types.h | 178 --------------------------------------- 1 file changed, 178 deletions(-) delete mode 100644 kernel/trace/trace_event_types.h (limited to 'kernel') diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h deleted file mode 100644 index 6db005e1248..00000000000 --- a/kernel/trace/trace_event_types.h +++ /dev/null @@ -1,178 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM ftrace - -/* - * We cheat and use the proto type field as the ID - * and args as the entry type (minus 'struct') - */ -TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned long, parent_ip, parent_ip) - ), - TP_RAW_FMT(" %lx <-- %lx") -); - -TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, - ftrace_graph_ent_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, graph_ent.func, func) - TRACE_FIELD(int, graph_ent.depth, depth) - ), - TP_RAW_FMT("--> %lx (%d)") -); - -TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, - ftrace_graph_ret_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ret.func, func) - TRACE_FIELD(unsigned long long, ret.calltime, calltime) - TRACE_FIELD(unsigned long long, ret.rettime, rettime) - TRACE_FIELD(unsigned long, ret.overrun, overrun) - TRACE_FIELD(int, ret.depth, depth) - ), - TP_RAW_FMT("<-- %lx (%d)") -); - -TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, prev_pid, prev_pid) - TRACE_FIELD(unsigned char, prev_prio, prev_prio) - TRACE_FIELD(unsigned char, prev_state, prev_state) - TRACE_FIELD(unsigned int, next_pid, next_pid) - TRACE_FIELD(unsigned char, next_prio, next_prio) - TRACE_FIELD(unsigned char, next_state, next_state) - TRACE_FIELD(unsigned int, next_cpu, next_cpu) - ), - TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, arg1, arg1) - TRACE_FIELD(unsigned long, arg2, arg2) - TRACE_FIELD(unsigned long, arg3, arg3) - ), - TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") -); - -/* - * Stack-trace entry: - */ - -/* #define FTRACE_STACK_ENTRIES 8 */ - -TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, caller[0], stack0) - TRACE_FIELD(unsigned long, caller[1], stack1) - TRACE_FIELD(unsigned long, caller[2], stack2) - TRACE_FIELD(unsigned long, caller[3], stack3) - TRACE_FIELD(unsigned long, caller[4], stack4) - TRACE_FIELD(unsigned long, caller[5], stack5) - TRACE_FIELD(unsigned long, caller[6], stack6) - TRACE_FIELD(unsigned long, caller[7], stack7) - ), - TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(char *, fmt, fmt) - TRACE_FIELD_ZERO_CHAR(buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD_ZERO_CHAR(buf) - ), - TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, - TRACE_STRUCT( - TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, - TRACE_FUNC_SIZE+1, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, - TRACE_FUNC_SIZE+1, file) - TRACE_FIELD(char, correct, correct) - ), - TP_RAW_FMT("%u:%s:%s (%u)") -); - -TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(u64, from, from) - TRACE_FIELD(u64, to, to) - ), - TP_RAW_FMT("from: %llx to: %llx") -); - -TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, - TRACE_STRUCT( - TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) - TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) - TRACE_FIELD(int, state_data.type, type) - TRACE_FIELD(int, state_data.state, state) - ), - TP_RAW_FMT("%llx->%llx type:%u state:%u") -); - -TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - TRACE_FIELD(size_t, bytes_req, bytes_req) - TRACE_FIELD(size_t, bytes_alloc, bytes_alloc) - TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) - TRACE_FIELD(int, node, node) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" - " flags:%x node:%d") -); - -TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, - TRACE_STRUCT( - TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) - TRACE_FIELD(unsigned long, call_site, call_site) - TRACE_FIELD(const void *, ptr, ptr) - ), - TP_RAW_FMT("type:%u call_site:%lx ptr:%p") -); - -#undef TRACE_SYSTEM -- cgit v1.2.3 From 60ba77022712c7cda0eda286154bae160446b24a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 12 Sep 2009 23:34:04 -0400 Subject: tracing: add filter event logic to special, mmiotrace and boot tracers Now that the pluging tracers use macros to create the structures and automate the exporting of their formats to the format files, they also automatically get a filter file. This patch adds the code to implement the filter logic in the trace recordings. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++++- kernel/trace/trace_boot.c | 8 ++++++-- kernel/trace/trace_mmiotrace.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ef82a7fabf3..fd52a19dd17 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1206,6 +1206,7 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { + struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct ring_buffer *buffer = tr->buffer; @@ -1219,7 +1220,9 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 19bfc75d467..c21d5f3956a 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly = void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { + struct ftrace_event_call *call = &event_boot_call; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_boot_call *entry; @@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) goto out; entry = ring_buffer_event_data(event); entry->boot_call = *bt; - trace_buffer_unlock_commit(buffer, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { + struct ftrace_event_call *call = &event_boot_ret; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_boot_ret *entry; @@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) goto out; entry = ring_buffer_event_data(event); entry->boot_ret = *bt; - trace_buffer_unlock_commit(buffer, event, 0, 0); + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index c4c9bbda53d..0acd834659e 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_rw *rw) { + struct ftrace_event_call *call = &event_mmiotrace_rw; struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; @@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->rw = *rw; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, struct mmiotrace_map *map) { + struct ftrace_event_call *call = &event_mmiotrace_map; struct ring_buffer *buffer = tr->buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; @@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr, } entry = ring_buffer_event_data(event); entry->map = *map; - trace_buffer_unlock_commit(buffer, event, 0, pc); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) -- cgit v1.2.3 From 08a408161749d2406f94f4e3d47cfdbc826ad1cc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 14 Sep 2009 09:31:35 -0400 Subject: ring-buffer: typecast cmpxchg to fix PowerPC warning The cmpxchg used by PowerPC does the following: ({ \ __typeof__(*(ptr)) _o_ = (o); \ __typeof__(*(ptr)) _n_ = (n); \ (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ (unsigned long)_n_, sizeof(*(ptr))); \ }) This does a type check of *ptr to both o and n. Unfortunately, the code in ring-buffer.c assigns longs to pointers and pointers to longs and causes a warning on PowerPC: ring_buffer.c: In function 'rb_head_page_set': ring_buffer.c:704: warning: initialization makes pointer from integer without a cast ring_buffer.c:704: warning: initialization makes pointer from integer without a cast ring_buffer.c: In function 'rb_head_page_replace': ring_buffer.c:797: warning: initialization makes integer from pointer without a cast This patch adds the typecasts inside cmpxchg to annotate that a long is being cast to a pointer and a pointer is being casted to a long and this removes the PowerPC warnings. Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8786c350b4c..6eef38923b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -701,8 +701,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, val &= ~RB_FLAG_MASK; - ret = (unsigned long)cmpxchg(&list->next, - val | old_flag, val | new_flag); + ret = cmpxchg((unsigned long *)&list->next, + val | old_flag, val | new_flag); /* check if the reader took the page */ if ((ret & ~RB_FLAG_MASK) != val) @@ -794,7 +794,7 @@ static int rb_head_page_replace(struct buffer_page *old, val = *ptr & ~RB_FLAG_MASK; val |= RB_PAGE_HEAD; - ret = cmpxchg(ptr, val, &new->list); + ret = cmpxchg(ptr, val, (unsigned long)&new->list); return ret == val; } -- cgit v1.2.3 From ec827c7ece8901044e6b3f92aeea489be9e1bcf7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 14 Sep 2009 10:50:23 -0400 Subject: tracing: add static to generated TRACE_EVENT functions Some of the generated functions used in the TRACE_EVENT macros are not declared static, but they are not global. Discovered by sparse. Reported-by: Jaswinder Singh Rajput Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index adbed124c3e..0fa8f9faa61 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1154,7 +1154,7 @@ static int trace_module_notify(struct notifier_block *self, } #endif /* CONFIG_MODULES */ -struct notifier_block trace_module_nb = { +static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, }; -- cgit v1.2.3 From c16de8fd7a608aba8708dd056cf6e4d9462e800a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 14 Sep 2009 15:51:39 +0800 Subject: tracing: fix F_printk() typos I found some typos in F_printk(), so I wrote compile-time check for it, and triggered some compile errors and warnings. I've fixed them on x86_32, but I have no x86_64 in my hand, so there may still be some compile warnings on 64bits. Signed-off-by: Li Zefan LKML-Reference: <4AADF60B.5070407@cn.fujitsu.com> [ tested on x86_64, and works fine ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_entries.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index c866d34e014..0c100958c64 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -78,7 +78,7 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->graph_ent.func, __entry->depth) + F_printk("--> %lx (%d)", __entry->func, __entry->depth) ); /* Function return entry */ @@ -97,8 +97,8 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", __entry->func, __entry->depth, - __entry->calltime, __entry->rettim, - __entrty->depth) + __entry->calltime, __entry->rettime, + __entry->depth) ); /* @@ -133,7 +133,7 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, FTRACE_CTX_FIELDS ), - F_printk(b"%u:%u:%u ==> %u:%u:%u [%03u]", + F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, __entry->next_cpu @@ -257,8 +257,8 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, __field_desc( unsigned char, rw, width ) ), - F_printk("%lx %lx %lx %d %lx %lx", - __entry->phs, __entry->value, __entry->pc, + F_printk("%lx %lx %lx %d %x %x", + (unsigned long)__entry->phys, __entry->value, __entry->pc, __entry->map_id, __entry->opcode, __entry->width) ); @@ -275,8 +275,8 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __field_desc( unsigned char, map, opcode ) ), - F_printk("%lx %lx %lx %d %lx", - __entry->phs, __entry->virt, __entry->len, + F_printk("%lx %lx %lx %d %x", + (unsigned long)__entry->phys, __entry->virt, __entry->len, __entry->map_id, __entry->opcode) ); @@ -370,7 +370,7 @@ FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, __field( int, node ) ), - F_printk("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" + F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" " flags:%x node:%d", __entry->type_id, __entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, -- cgit v1.2.3 From 05ffa2d02066c2cb169c02d5417308ee8ecab638 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 14 Sep 2009 15:54:52 +0800 Subject: ftrace: add compile-time check on F_printk() Make sure F_printk() has corrent format and args, and make sure changes in F_STRUCT() won't break F_printk(). Signed-off-by: Li Zefan LKML-Reference: <4AADF6CC.1060809@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 45 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4cb29d84d73..2435d55947e 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -22,6 +22,47 @@ #undef __field_struct #define __field_struct(type, item) +#undef __field +#define __field(type, item) type item; + +#undef __field_desc +#define __field_desc(type, container, item) type item; + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size) type item[size]; + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef F_printk +#define F_printk(fmt, args...) fmt, args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __used ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force cmpile-time check on F_printk() */ \ + printk(print); \ +} + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + + #undef __field #define __field(type, item) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ @@ -88,10 +129,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ return ret; \ } -#undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) - #include "trace_entries.h" -- cgit v1.2.3 From 20a58a77231c5f5f61470932503b889303e8d4f3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 14 Sep 2009 15:55:18 +0800 Subject: tracing: remove some unused macros - remove FTRACE_ENTRY_STRUCT_ONLY() - remove TRACE_XXX() macros Signed-off-by: Li Zefan LKML-Reference: <4AADF6E6.3080606@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_entries.h | 9 --------- kernel/trace/trace_export.c | 26 -------------------------- 2 files changed, 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 0c100958c64..a431748ddd6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -116,15 +116,6 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, __field( unsigned char, next_state ) \ __field( unsigned int, next_cpu ) -#if 0 -FTRACE_ENTRY_STRUCT_ONLY(ctx_switch_entry, - - F_STRUCT( - FTRACE_CTX_FIELDS - ) -); -#endif - FTRACE_ENTRY(context_switch, ctx_switch_entry, TRACE_CTX, diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 2435d55947e..9753fcc61bc 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -209,32 +209,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef __dynamic_array #define __dynamic_array(type, item) - -#undef TRACE_ZERO_CHAR -#define TRACE_ZERO_CHAR(arg) - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ - TRACE_FIELD(type, item, assign) - -#undef TP_CMD -#define TP_CMD(cmd...) cmd - -#undef TRACE_ENTRY -#define TRACE_ENTRY entry - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ - cmd; - #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ static int ftrace_raw_init_event_##call(void); \ -- cgit v1.2.3 From 1f5a6b45416694ff8c0d04625f1a438a0e380add Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 14 Sep 2009 11:58:24 -0400 Subject: tracing: make testing syscall events a separate configuration Parag noticed that the number of event tests has increased tremendously: grep "Testing event" dmesg.31rc9 |wc -l 100 grep "Testing event" dmesg.31git |wc -l 1172 This is due to the testing of every syscall event when ftrace self test is enabled. This adds a bit more time to kernel boot up and can affect development by slowing down the time it takes between reboots. This option makes the testing of the syscall events into a separate config, to still be able to test most of ftrace internals at boot up but not have to wait for all the syscall events to be tested. The syscall event testing only tests the enabling and disabling of the trace point, since the syscalls are not executed. What really needs to be done is to somehow have a userspace tool test the syscall tracepoints as well. Reported-by: Parag Warudkar LKML-Reference: Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 12 ++++++++++++ kernel/trace/trace_events.c | 12 ++++++++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1ea0d1234f4..aa002cef924 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -469,6 +469,18 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on FTRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. + + TBD - enable a way to actually call the syscalls as we test their + events + config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0fa8f9faa61..787f0fb0994 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1326,6 +1326,18 @@ static __init void event_trace_self_tests(void) if (!call->regfunc) continue; +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. + */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS + if (call->system && + strcmp(call->system, "syscalls") == 0) + continue; +#endif + pr_info("Testing event %s: ", call->name); /* -- cgit v1.2.3 From e681c9dd62fe8fcc5bba28a3ca3f7dc8be940206 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 8 Jul 2009 13:23:32 +0200 Subject: PM: Fix typo in label name s/Platofrm_finish/Platform_finish/ Although the same label name is used somewhere else in the file, this particular label was consistently typoed in all of its uses. Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 81d2e746489..ec8202512b0 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -460,11 +460,11 @@ int hibernation_platform_enter(void) error = hibernation_ops->prepare(); if (error) - goto Platofrm_finish; + goto Platform_finish; error = disable_nonboot_cpus(); if (error) - goto Platofrm_finish; + goto Platform_finish; local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); @@ -476,7 +476,7 @@ int hibernation_platform_enter(void) * We don't need to reenable the nonboot CPUs or resume consoles, since * the system is going to be halted anyway. */ - Platofrm_finish: + Platform_finish: hibernation_ops->finish(); dpm_suspend_noirq(PMSG_RESTORE); -- cgit v1.2.3 From 4bb334353ebd821bc8eeabeb019eaac33c7307df Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 8 Jul 2009 13:23:51 +0200 Subject: PM/Hibernate: Rework shrinking of memory Rework swsusp_shrink_memory() so that it calls shrink_all_memory() just once to make some room for the image and then allocates memory to apply more pressure to the memory management subsystem, if necessary. Unfortunately, we don't seem to be able to drop shrink_all_memory() entirely just yet, because that would lead to huge performance regressions in some test cases. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek --- kernel/power/snapshot.c | 205 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 158 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 523a451b45d..a3a175fa0a4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1066,69 +1066,180 @@ void swsusp_free(void) buffer = NULL; } +/* Helper functions used for the shrinking of memory. */ + +#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) + /** - * swsusp_shrink_memory - Try to free as much memory as needed - * - * ... but do not OOM-kill anyone + * preallocate_image_pages - Allocate a number of pages for hibernation image + * @nr_pages: Number of page frames to allocate. + * @mask: GFP flags to use for the allocation. * - * Notice: all userland should be stopped before it is called, or - * livelock is possible. + * Return value: Number of page frames actually allocated */ +static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) +{ + unsigned long nr_alloc = 0; + + while (nr_pages > 0) { + if (!alloc_image_page(mask)) + break; + nr_pages--; + nr_alloc++; + } -#define SHRINK_BITE 10000 -static inline unsigned long __shrink_memory(long tmp) + return nr_alloc; +} + +static unsigned long preallocate_image_memory(unsigned long nr_pages) +{ + return preallocate_image_pages(nr_pages, GFP_IMAGE); +} + +#ifdef CONFIG_HIGHMEM +static unsigned long preallocate_image_highmem(unsigned long nr_pages) { - if (tmp > SHRINK_BITE) - tmp = SHRINK_BITE; - return shrink_all_memory(tmp); + return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM); } +/** + * __fraction - Compute (an approximation of) x * (multiplier / base) + */ +static unsigned long __fraction(u64 x, u64 multiplier, u64 base) +{ + x *= multiplier; + do_div(x, base); + return (unsigned long)x; +} + +static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + unsigned long alloc = __fraction(nr_pages, highmem, total); + + return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM); +} +#else /* CONFIG_HIGHMEM */ +static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return 0; +} + +static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + return 0; +} +#endif /* CONFIG_HIGHMEM */ + +/** + * swsusp_shrink_memory - Make the kernel release as much memory as needed + * + * To create a hibernation image it is necessary to make a copy of every page + * frame in use. We also need a number of page frames to be free during + * hibernation for allocations made while saving the image and for device + * drivers, in case they need to allocate memory from their hibernation + * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, + * respectively, both of which are rough estimates). To make this happen, we + * compute the total number of available page frames and allocate at least + * + * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES + * + * of them, which corresponds to the maximum size of a hibernation image. + * + * If image_size is set below the number following from the above formula, + * the preallocation of memory is continued until the total number of saveable + * pages in the system is below the requested image size or it is impossible to + * allocate more memory, whichever happens first. + */ int swsusp_shrink_memory(void) { - long tmp; struct zone *zone; - unsigned long pages = 0; - unsigned int i = 0; - char *p = "-\\|/"; + unsigned long saveable, size, max_size, count, highmem, pages = 0; + unsigned long alloc, pages_highmem; struct timeval start, stop; + int error = 0; - printk(KERN_INFO "PM: Shrinking memory... "); + printk(KERN_INFO "PM: Shrinking memory... "); do_gettimeofday(&start); - do { - long size, highmem_size; - - highmem_size = count_highmem_pages(); - size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; - tmp = size; - size += highmem_size; - for_each_populated_zone(zone) { - tmp += snapshot_additional_pages(zone); - if (is_highmem(zone)) { - highmem_size -= - zone_page_state(zone, NR_FREE_PAGES); - } else { - tmp -= zone_page_state(zone, NR_FREE_PAGES); - tmp += zone->lowmem_reserve[ZONE_NORMAL]; - } - } - if (highmem_size < 0) - highmem_size = 0; + /* Count the number of saveable data pages. */ + highmem = count_highmem_pages(); + saveable = count_data_pages(); - tmp += highmem_size; - if (tmp > 0) { - tmp = __shrink_memory(tmp); - if (!tmp) - return -ENOMEM; - pages += tmp; - } else if (size > image_size / PAGE_SIZE) { - tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); - pages += tmp; - } - printk("\b%c", p[i++%4]); - } while (tmp > 0); + /* + * Compute the total number of page frames we can use (count) and the + * number of pages needed for image metadata (size). + */ + count = saveable; + saveable += highmem; + size = 0; + for_each_populated_zone(zone) { + size += snapshot_additional_pages(zone); + if (is_highmem(zone)) + highmem += zone_page_state(zone, NR_FREE_PAGES); + else + count += zone_page_state(zone, NR_FREE_PAGES); + } + count += highmem; + count -= totalreserve_pages; + + /* Compute the maximum number of saveable pages to leave in memory. */ + max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; + size = DIV_ROUND_UP(image_size, PAGE_SIZE); + if (size > max_size) + size = max_size; + /* + * If the maximum is not less than the current number of saveable pages + * in memory, we don't need to do anything more. + */ + if (size >= saveable) + goto out; + + /* + * Let the memory management subsystem know that we're going to need a + * large number of page frames to allocate and make it free some memory. + * NOTE: If this is not done, performance will be hurt badly in some + * test cases. + */ + shrink_all_memory(saveable - size); + + /* + * The number of saveable pages in memory was too high, so apply some + * pressure to decrease it. First, make room for the largest possible + * image and fail if that doesn't work. Next, try to decrease the size + * of the image as much as indicated by image_size using allocations + * from highmem and non-highmem zones separately. + */ + pages_highmem = preallocate_image_highmem(highmem / 2); + alloc = (count - max_size) - pages_highmem; + pages = preallocate_image_memory(alloc); + if (pages < alloc) { + error = -ENOMEM; + goto free_out; + } + size = max_size - size; + alloc = size; + size = preallocate_highmem_fraction(size, highmem, count); + pages_highmem += size; + alloc -= size; + pages += preallocate_image_memory(alloc); + pages += pages_highmem; + + free_out: + /* Release all of the preallocated page frames. */ + swsusp_free(); + + if (error) { + printk(KERN_CONT "\n"); + return error; + } + + out: do_gettimeofday(&stop); - printk("\bdone (%lu pages freed)\n", pages); + printk(KERN_CONT "done (preallocated %lu free pages)\n", pages); swsusp_show_speed(&start, &stop, pages, "Freed"); return 0; -- cgit v1.2.3 From 64a473cb74a88cb4991edf985d55a266e65292e1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 8 Jul 2009 13:24:05 +0200 Subject: PM/Hibernate: Do not release preallocated memory unnecessarily (rev. 2) Since the hibernation code is now going to use allocations of memory to make enough room for the image, it can also use the page frames allocated at this stage as image page frames. The low-level hibernation code needs to be rearranged for this purpose, but it allows us to avoid freeing a great number of pages and allocating these same pages once again later, so it generally is worth doing. [rev. 2: Take highmem into account correctly.] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 15 +++- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 202 +++++++++++++++++++++++++++++++---------------- 3 files changed, 147 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index ec8202512b0..04b3a83d686 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode) if (error) return error; - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); + /* Preallocate image memory before shutting down devices. */ + error = hibernate_preallocate_memory(); if (error) goto Close; @@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode) /* Control returns here after successful restore */ Resume_devices: + /* We may need to release the preallocated image pages here. */ + if (error || !in_suspend) + swsusp_free(); + dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); resume_console(); @@ -578,7 +582,10 @@ int hibernate(void) goto Thaw; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (in_suspend && !error) { + if (error) + goto Thaw; + + if (in_suspend) { unsigned int flags = 0; if (hibernation_mode == HIBERNATION_PLATFORM) @@ -590,8 +597,8 @@ int hibernate(void) power_down(); } else { pr_debug("PM: Image restored successfully.\n"); - swsusp_free(); } + Thaw: thaw_processes(); Finish: diff --git a/kernel/power/power.h b/kernel/power/power.h index 26d5a26f82e..46c5a26630a 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void); extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); -extern int swsusp_shrink_memory(void); +extern int hibernate_preallocate_memory(void); /** * Auxiliary structure used for reading the snapshot image data and diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a3a175fa0a4..2b1a7bc24c9 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) static unsigned int nr_copy_pages; /* Number of pages needed for saving the original pfns of the image pages */ static unsigned int nr_meta_pages; +/* + * Numbers of normal and highmem page frames allocated for hibernation image + * before suspending devices. + */ +unsigned int alloc_normal, alloc_highmem; +/* + * Memory bitmap used for marking saveable pages (during hibernation) or + * hibernation image pages (during restore) + */ +static struct memory_bitmap orig_bm; +/* + * Memory bitmap used during hibernation for marking allocated page frames that + * will contain copies of saveable pages. During restore it is initially used + * for marking hibernation image pages, but then the set bits from it are + * duplicated in @orig_bm and it is released. On highmem systems it is next + * used for marking "safe" highmem pages, but it has to be reinitialized for + * this purpose. + */ +static struct memory_bitmap copy_bm; /** * swsusp_free - free pages allocated for the suspend. @@ -1064,6 +1083,8 @@ void swsusp_free(void) nr_meta_pages = 0; restore_pblist = NULL; buffer = NULL; + alloc_normal = 0; + alloc_highmem = 0; } /* Helper functions used for the shrinking of memory. */ @@ -1082,8 +1103,16 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) unsigned long nr_alloc = 0; while (nr_pages > 0) { - if (!alloc_image_page(mask)) + struct page *page; + + page = alloc_image_page(mask); + if (!page) break; + memory_bm_set_bit(©_bm, page_to_pfn(page)); + if (PageHighMem(page)) + alloc_highmem++; + else + alloc_normal++; nr_pages--; nr_alloc++; } @@ -1135,7 +1164,47 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, #endif /* CONFIG_HIGHMEM */ /** - * swsusp_shrink_memory - Make the kernel release as much memory as needed + * free_unnecessary_pages - Release preallocated pages not needed for the image + */ +static void free_unnecessary_pages(void) +{ + unsigned long save_highmem, to_free_normal, to_free_highmem; + + to_free_normal = alloc_normal - count_data_pages(); + save_highmem = count_highmem_pages(); + if (alloc_highmem > save_highmem) { + to_free_highmem = alloc_highmem - save_highmem; + } else { + to_free_highmem = 0; + to_free_normal -= save_highmem - alloc_highmem; + } + + memory_bm_position_reset(©_bm); + + while (to_free_normal > 0 && to_free_highmem > 0) { + unsigned long pfn = memory_bm_next_pfn(©_bm); + struct page *page = pfn_to_page(pfn); + + if (PageHighMem(page)) { + if (!to_free_highmem) + continue; + to_free_highmem--; + alloc_highmem--; + } else { + if (!to_free_normal) + continue; + to_free_normal--; + alloc_normal--; + } + memory_bm_clear_bit(©_bm, pfn); + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); + } +} + +/** + * hibernate_preallocate_memory - Preallocate memory for hibernation image * * To create a hibernation image it is necessary to make a copy of every page * frame in use. We also need a number of page frames to be free during @@ -1154,19 +1223,30 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, * pages in the system is below the requested image size or it is impossible to * allocate more memory, whichever happens first. */ -int swsusp_shrink_memory(void) +int hibernate_preallocate_memory(void) { struct zone *zone; unsigned long saveable, size, max_size, count, highmem, pages = 0; - unsigned long alloc, pages_highmem; + unsigned long alloc, save_highmem, pages_highmem; struct timeval start, stop; - int error = 0; + int error; - printk(KERN_INFO "PM: Shrinking memory... "); + printk(KERN_INFO "PM: Preallocating image memory... "); do_gettimeofday(&start); + error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); + if (error) + goto err_out; + + error = memory_bm_create(©_bm, GFP_IMAGE, PG_ANY); + if (error) + goto err_out; + + alloc_normal = 0; + alloc_highmem = 0; + /* Count the number of saveable data pages. */ - highmem = count_highmem_pages(); + save_highmem = count_highmem_pages(); saveable = count_data_pages(); /* @@ -1174,7 +1254,8 @@ int swsusp_shrink_memory(void) * number of pages needed for image metadata (size). */ count = saveable; - saveable += highmem; + saveable += save_highmem; + highmem = save_highmem; size = 0; for_each_populated_zone(zone) { size += snapshot_additional_pages(zone); @@ -1193,10 +1274,13 @@ int swsusp_shrink_memory(void) size = max_size; /* * If the maximum is not less than the current number of saveable pages - * in memory, we don't need to do anything more. + * in memory, allocate page frames for the image and we're done. */ - if (size >= saveable) + if (size >= saveable) { + pages = preallocate_image_highmem(save_highmem); + pages += preallocate_image_memory(saveable - pages); goto out; + } /* * Let the memory management subsystem know that we're going to need a @@ -1216,10 +1300,8 @@ int swsusp_shrink_memory(void) pages_highmem = preallocate_image_highmem(highmem / 2); alloc = (count - max_size) - pages_highmem; pages = preallocate_image_memory(alloc); - if (pages < alloc) { - error = -ENOMEM; - goto free_out; - } + if (pages < alloc) + goto err_out; size = max_size - size; alloc = size; size = preallocate_highmem_fraction(size, highmem, count); @@ -1228,21 +1310,24 @@ int swsusp_shrink_memory(void) pages += preallocate_image_memory(alloc); pages += pages_highmem; - free_out: - /* Release all of the preallocated page frames. */ - swsusp_free(); - - if (error) { - printk(KERN_CONT "\n"); - return error; - } + /* + * We only need as many page frames for the image as there are saveable + * pages in memory, but we have allocated more. Release the excessive + * ones now. + */ + free_unnecessary_pages(); out: do_gettimeofday(&stop); - printk(KERN_CONT "done (preallocated %lu free pages)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Freed"); + printk(KERN_CONT "done (allocated %lu pages)\n", pages); + swsusp_show_speed(&start, &stop, pages, "Allocated"); return 0; + + err_out: + printk(KERN_CONT "\n"); + swsusp_free(); + return -ENOMEM; } #ifdef CONFIG_HIGHMEM @@ -1253,7 +1338,7 @@ int swsusp_shrink_memory(void) static unsigned int count_pages_for_highmem(unsigned int nr_highmem) { - unsigned int free_highmem = count_free_highmem_pages(); + unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; if (free_highmem >= nr_highmem) nr_highmem = 0; @@ -1275,19 +1360,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; } static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) { struct zone *zone; - unsigned int free = 0, meta = 0; + unsigned int free = alloc_normal; - for_each_zone(zone) { - meta += snapshot_additional_pages(zone); + for_each_zone(zone) if (!is_highmem(zone)) free += zone_page_state(zone, NR_FREE_PAGES); - } nr_pages += count_pages_for_highmem(nr_highmem); - pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", - nr_pages, PAGES_FOR_IO, meta, free); + pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n", + nr_pages, PAGES_FOR_IO, free); - return free > nr_pages + PAGES_FOR_IO + meta; + return free > nr_pages + PAGES_FOR_IO; } #ifdef CONFIG_HIGHMEM @@ -1309,7 +1392,7 @@ static inline int get_highmem_buffer(int safe_needed) */ static inline unsigned int -alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) { unsigned int to_alloc = count_free_highmem_pages(); @@ -1329,7 +1412,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) static inline int get_highmem_buffer(int safe_needed) { return 0; } static inline unsigned int -alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } #endif /* CONFIG_HIGHMEM */ /** @@ -1348,51 +1431,36 @@ static int swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) { - int error; - - error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (error) - goto Free; - - error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); - if (error) - goto Free; + int error = 0; if (nr_highmem > 0) { error = get_highmem_buffer(PG_ANY); if (error) - goto Free; - - nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); + goto err_out; + if (nr_highmem > alloc_highmem) { + nr_highmem -= alloc_highmem; + nr_pages += alloc_highmem_pages(copy_bm, nr_highmem); + } } - while (nr_pages-- > 0) { - struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); - - if (!page) - goto Free; + if (nr_pages > alloc_normal) { + nr_pages -= alloc_normal; + while (nr_pages-- > 0) { + struct page *page; - memory_bm_set_bit(copy_bm, page_to_pfn(page)); + page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + if (!page) + goto err_out; + memory_bm_set_bit(copy_bm, page_to_pfn(page)); + } } + return 0; - Free: + err_out: swsusp_free(); - return -ENOMEM; + return error; } -/* Memory bitmap used for marking saveable pages (during suspend) or the - * suspend image pages (during resume) - */ -static struct memory_bitmap orig_bm; -/* Memory bitmap used on suspend for marking allocated pages that will contain - * the copies of saveable pages. During resume it is initially used for - * marking the suspend image pages, but then its set bits are duplicated in - * @orig_bm and it is released. Next, on systems with high memory, it may be - * used for marking "safe" highmem pages, but it has to be reinitialized for - * this purpose. - */ -static struct memory_bitmap copy_bm; - asmlinkage int swsusp_save(void) { unsigned int nr_pages, nr_highmem; -- cgit v1.2.3 From ef4aede3f10d82adef1fb044b565ba5f08f851e0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 8 Jul 2009 13:24:12 +0200 Subject: PM/Hibernate: Do not try to allocate too much memory too hard (rev. 2) We want to avoid attempting to free too much memory too hard during hibernation, so estimate the minimum size of the image to use as the lower limit for preallocating memory. The approach here is based on the (experimental) observation that we can't free more page frames than the sum of: * global_page_state(NR_SLAB_RECLAIMABLE) * global_page_state(NR_ACTIVE_ANON) * global_page_state(NR_INACTIVE_ANON) * global_page_state(NR_ACTIVE_FILE) * global_page_state(NR_INACTIVE_FILE) minus * global_page_state(NR_FILE_MAPPED) Namely, if this number is subtracted from the number of saveable pages in the system, we get a good estimate of the minimum reasonable size of a hibernation image. Signed-off-by: Rafael J. Wysocki Acked-by: Wu Fengguang --- kernel/power/snapshot.c | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 2b1a7bc24c9..0a06b114dd3 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1203,6 +1203,36 @@ static void free_unnecessary_pages(void) } } +/** + * minimum_image_size - Estimate the minimum acceptable size of an image + * @saveable: Number of saveable pages in the system. + * + * We want to avoid attempting to free too much memory too hard, so estimate the + * minimum acceptable size of a hibernation image to use as the lower limit for + * preallocating memory. + * + * We assume that the minimum image size should be proportional to + * + * [number of saveable pages] - [number of pages that can be freed in theory] + * + * where the second term is the sum of (1) reclaimable slab pages, (2) active + * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, + * minus mapped file pages. + */ +static unsigned long minimum_image_size(unsigned long saveable) +{ + unsigned long size; + + size = global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE) + - global_page_state(NR_FILE_MAPPED); + + return saveable <= size ? 0 : saveable - size; +} + /** * hibernate_preallocate_memory - Preallocate memory for hibernation image * @@ -1220,8 +1250,8 @@ static void free_unnecessary_pages(void) * * If image_size is set below the number following from the above formula, * the preallocation of memory is continued until the total number of saveable - * pages in the system is below the requested image size or it is impossible to - * allocate more memory, whichever happens first. + * pages in the system is below the requested image size or the minimum + * acceptable image size returned by minimum_image_size(), whichever is greater. */ int hibernate_preallocate_memory(void) { @@ -1282,6 +1312,11 @@ int hibernate_preallocate_memory(void) goto out; } + /* Estimate the minimum size of the image. */ + pages = minimum_image_size(saveable); + if (size < pages) + size = min_t(unsigned long, pages, max_size); + /* * Let the memory management subsystem know that we're going to need a * large number of page frames to allocate and make it free some memory. @@ -1294,8 +1329,8 @@ int hibernate_preallocate_memory(void) * The number of saveable pages in memory was too high, so apply some * pressure to decrease it. First, make room for the largest possible * image and fail if that doesn't work. Next, try to decrease the size - * of the image as much as indicated by image_size using allocations - * from highmem and non-highmem zones separately. + * of the image as much as indicated by 'size' using allocations from + * highmem and non-highmem zones separately. */ pages_highmem = preallocate_image_highmem(highmem / 2); alloc = (count - max_size) - pages_highmem; -- cgit v1.2.3 From 98e73dc5d2dadfcb95305ad71ac9239f4e361870 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Wed, 22 Jul 2009 00:36:56 +0200 Subject: PM / Hibernate / Memory hotplug: Always use for_each_populated_zone() Use for_each_populated_zone() instead of for_each_zone() in hibernation code. This fixes a bug on s390, where we allow both config options HIBERNATION and MEMORY_HOTPLUG, so that we also have a ZONE_MOVABLE here. We only allow hibernation if no memory hotplug operation was performed, so in fact both features can only be used exclusively, but this way we don't need 2 differently configured (distribution) kernels. If we have an unpopulated ZONE_MOVABLE, we allow hibernation but run into a BUG_ON() in memory_bm_test/set/clear_bit() because hibernation code iterates through all zones, not only the populated zones, in several places. For example, swsusp_free() does for_each_zone() and then checks for pfn_valid(), which is true even if the zone is not populated, resulting in a BUG_ON() later because the pfn cannot be found in the memory bitmap. Replacing all occurences of for_each_zone() in hibernation code with for_each_populated_zone() would fix this issue. [rjw: Rebased on top of linux-next hibernation patches.] Signed-off-by: Gerald Schaefer Acked-by: KOSAKI Motohiro Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0a06b114dd3..bf06658f205 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void) struct zone *zone; unsigned int n = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long pfn, max_zone_pfn; if (!is_highmem(zone)) @@ -916,7 +916,7 @@ static unsigned int count_data_pages(void) unsigned long pfn, max_zone_pfn; unsigned int n = 0; - for_each_zone(zone) { + for_each_populated_zone(zone) { if (is_highmem(zone)) continue; @@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) struct zone *zone; unsigned long pfn; - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long max_zone_pfn; mark_free_pages(zone); @@ -1065,7 +1065,7 @@ void swsusp_free(void) struct zone *zone; unsigned long pfn, max_zone_pfn; - for_each_zone(zone) { + for_each_populated_zone(zone) { max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { @@ -1397,7 +1397,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) struct zone *zone; unsigned int free = alloc_normal; - for_each_zone(zone) + for_each_populated_zone(zone) if (!is_highmem(zone)) free += zone_page_state(zone, NR_FREE_PAGES); @@ -1688,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) unsigned long pfn, max_zone_pfn; /* Clear page flags */ - for_each_zone(zone) { + for_each_populated_zone(zone) { max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) -- cgit v1.2.3 From 8de0307326be94148436082a9abf365da8e3c66d Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 22 Jul 2009 19:56:10 +0200 Subject: PM: Trivial fixes Fix the definition of BM_BITS_PER_BLOCK and kerneldoc description of create_bm_block_list(). [rjw: Added changelog.] Signed-off-by: Wu Fengguang Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index bf06658f205..97955b0e44f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) #define BM_END_OF_MAP (~0UL) -#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) +#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) struct bm_block { struct list_head hook; /* hook into a list of bitmap blocks */ @@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects - * @nr_blocks - number of blocks to allocate + * @pages - number of pages to track * @list - list to put the allocated blocks into * @ca - chain allocator to be used for allocating memory */ -- cgit v1.2.3 From 4a5d6ba1914d1bf1fcfb5e15834c29d84a879219 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 14 Sep 2009 12:45:39 +0100 Subject: CRED: Allow put_cred() to cope with a NULL groups list put_cred() will oops if given a NULL groups list, but that is now possible with the existence of cred_alloc_blank(), as used in keyctl_session_to_parent(). Added in commit: commit ee18d64c1f632043a02e6f5ba5e045bb26a5465f Author: David Howells Date: Wed Sep 2 09:14:21 2009 +0100 KEYS: Add a keyctl to install a process's session keyring on its parent [try #6] Reported-by: Marc Dionne Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/cred.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 006fcab009d..d7f7a01082e 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -147,7 +147,8 @@ static void put_cred_rcu(struct rcu_head *rcu) key_put(cred->thread_keyring); key_put(cred->request_key_auth); release_tgcred(cred); - put_group_info(cred->group_info); + if (cred->group_info) + put_group_info(cred->group_info); free_uid(cred->user); kmem_cache_free(cred_jar, cred); } -- cgit v1.2.3 From 353f6dd2dec992ddd34620a94b051b0f76227379 Mon Sep 17 00:00:00 2001 From: Anirban Sinha Date: Mon, 14 Sep 2009 11:13:37 -0700 Subject: cleanup console_print() console_print() is an old legacy interface mostly unused in the entire kernel tree. It's best to clean up its existing use and let developers use their own implementation of it as they feel fit. Signed-off-by: Anirban Sinha Signed-off-by: Linus Torvalds --- kernel/printk.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index e10d193a833..602033acd6c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1075,12 +1075,6 @@ void __sched console_conditional_schedule(void) } EXPORT_SYMBOL(console_conditional_schedule); -void console_print(const char *s) -{ - printk(KERN_EMERG "%s", s); -} -EXPORT_SYMBOL(console_print); - void console_unblank(void) { struct console *c; -- cgit v1.2.3 From 555f386c98cc93890f48fdea098936755270304b Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 14 Sep 2009 20:10:15 -0400 Subject: ftrace: document function and function graph implementation While implementing function tracer and function tracer graph support, I found the exact arch implementation details to be a bit lacking (and my x86 foo ain't great). So after pounding out support for the Blackfin arch, start documenting the requirements/details. Signed-off-by: Mike Frysinger LKML-Reference: <1252973415-21264-1-git-send-email-vapier@gentoo.org> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index aa002cef924..e7163460440 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -11,12 +11,18 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_TRACER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_GRAPH_TRACER bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FUNCTION_GRAPH_FP_TEST bool @@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool help - This gets selected when the arch tests the function_trace_stop - variable at the mcount call site. Otherwise, this variable - is tested by the called function. + See Documentation/trace/ftrace-implementation.txt config HAVE_DYNAMIC_FTRACE bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_FTRACE_MCOUNT_RECORD bool + help + See Documentation/trace/ftrace-implementation.txt config HAVE_HW_BRANCH_TRACER bool config HAVE_SYSCALL_TRACEPOINTS bool + help + See Documentation/trace/ftrace-implementation.txt config TRACER_MAX_TRACE bool -- cgit v1.2.3 From b3e62e35058fc744ac794611f4e79bcd1c5a4b83 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 15 Sep 2009 14:44:36 +0800 Subject: perf_counter: Fix buffer overflow in perf_copy_attr() If we pass a big size data over perf_counter_open() syscall, the kernel will copy this data to a small buffer, it will cause kernel crash. This bug makes the kernel unsafe and non-root local user can trigger it. Signed-off-by: Xiao Guangrong Acked-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: LKML-Reference: <4AAF37D4.5010706@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d7cbc579fc8..a67a1dc3cfa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -4171,6 +4171,7 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr, if (val) goto err_size; } + size = sizeof(*attr); } ret = copy_from_user(attr, uattr, size); -- cgit v1.2.3 From b78bb868c54bebbf8d8786a3f8320700d6d2b864 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 14:23:18 +0200 Subject: sched: Fix double_rq_lock() compile warning Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e27a53685ed..17e4391ec2d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -119,8 +119,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -1695,6 +1693,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #ifdef CONFIG_PREEMPT +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + /* * fair double_lock_balance: Safely acquires both rq->locks in a fair * way at the expense of forcing extra atomic operations in all -- cgit v1.2.3 From e6b1b2c9c0461c4e0971ed905ce3cda6512ee82a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Sep 2009 11:59:22 +0200 Subject: sched: Split WAKEUP_OVERLAP It consists of two conditions, split them out in separate toggles so we can test them independently. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 7 ++++--- kernel/sched_features.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index aa7f8412101..cea5b82242e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1526,9 +1526,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && (sync || - (se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost))) { + if ((sched_feat(WAKEUP_SYNC) && sync) || + (sched_feat(WAKEUP_OVERLAP) && + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { resched_task(curr); return; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index e2dc63a5815..07c8250b404 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -12,6 +12,7 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) +SCHED_FEAT(WAKEUP_SYNC, 0) SCHED_FEAT(WAKEUP_OVERLAP, 0) SCHED_FEAT(LAST_BUDDY, 1) SCHED_FEAT(OWNER_SPIN, 1) -- cgit v1.2.3 From 3cb63d527f76e25dbccce4f577f21aecfa2abac7 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 11 Sep 2009 12:01:17 +0200 Subject: sched: Complete buddy switches Add a NEXT_BUDDY feature flag to aid in debugging. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 ++- kernel/sched_features.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cea5b82242e..4f6356e70ad 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1501,7 +1501,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) */ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) set_last_buddy(se); - set_next_buddy(pse); + if (sched_feat(NEXT_BUDDY)) + set_next_buddy(pse); /* * We can come here with TIF_NEED_RESCHED already set from new task diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 07c8250b404..6174c123399 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -14,5 +14,6 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) SCHED_FEAT(WAKEUP_SYNC, 0) SCHED_FEAT(WAKEUP_OVERLAP, 0) +SCHED_FEAT(NEXT_BUDDY, 1) SCHED_FEAT(LAST_BUDDY, 1) SCHED_FEAT(OWNER_SPIN, 1) -- cgit v1.2.3 From e26af0e8b2c9916f1e8a12ddaf52057bbe8ff600 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Sep 2009 12:31:23 +0200 Subject: sched: Add come comments to the sched features Add text... Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 93 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 6174c123399..891ea0f72b4 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,19 +1,96 @@ +/* + * Disregards a certain amount of sleep time (sched_latency_ns) and + * considers the task to be running during that period. This gives it + * a service deficit on wakeup, allowing it to run sooner. + */ SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) + +/* + * By not normalizing the sleep time, heavy tasks get an effective + * longer period, and lighter task an effective shorter period they + * are considered running. + */ SCHED_FEAT(NORMALIZED_SLEEPER, 0) -SCHED_FEAT(ADAPTIVE_GRAN, 1) -SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ SCHED_FEAT(START_DEBIT, 1) + +/* + * Should wakeups try to preempt running tasks. + */ +SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Compute wakeup_gran based on task behaviour, clipped to + * [0, sched_wakeup_gran_ns] + */ +SCHED_FEAT(ADAPTIVE_GRAN, 1) + +/* + * When converting the wakeup granularity to virtual time, do it such + * that heavier tasks preempting a lighter task have an edge. + */ +SCHED_FEAT(ASYM_GRAN, 1) + +/* + * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS. + */ +SCHED_FEAT(WAKEUP_SYNC, 0) + +/* + * Wakeup preempt based on task behaviour. Tasks that do not overlap + * don't get preempted. + */ +SCHED_FEAT(WAKEUP_OVERLAP, 0) + +/* + * Use the SYNC wakeup hint, pipes and the likes use this to indicate + * the remote end is likely to consume the data we just wrote, and + * therefore has cache benefit from being placed on the same cpu, see + * also AFFINE_WAKEUPS. + */ +SCHED_FEAT(SYNC_WAKEUPS, 1) + +/* + * Based on load and program behaviour, see if it makes sense to place + * a newly woken task on the same cpu as the task that woke it -- + * improve cache locality. Typically used with SYNC wakeups as + * generated by pipes and the like, see also SYNC_WAKEUPS. + */ SCHED_FEAT(AFFINE_WAKEUPS, 1) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, 1) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */ SCHED_FEAT(CACHE_HOT_BUDDY, 1) -SCHED_FEAT(SYNC_WAKEUPS, 1) + SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) -SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) -SCHED_FEAT(WAKEUP_SYNC, 0) -SCHED_FEAT(WAKEUP_OVERLAP, 0) -SCHED_FEAT(NEXT_BUDDY, 1) -SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Spin-wait on mutex acquisition when the mutex owner is running on + * another cpu -- assumes that when the owner is running, it will soon + * release the lock. Decreases scheduling overhead. + */ SCHED_FEAT(OWNER_SPIN, 1) -- cgit v1.2.3 From f5f08f39ee4c5fd0a757d25d9e04d696676b3df7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:35:28 +0200 Subject: sched: Move code around In preparation to other code movement, move weighted_cpuload(), source_load() and target_load() before the class includes. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 81 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 39 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 17e4391ec2d..b56d1505d05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1507,8 +1507,45 @@ static int tg_nop(struct task_group *tg, void *data) #endif #ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) @@ -1959,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SMP - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - /* * Is this task likely cache-hot: */ @@ -2240,39 +2270,6 @@ void kick_process(struct task_struct *p) } EXPORT_SYMBOL_GPL(kick_process); -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - /* * find_idlest_group finds and returns the least busy CPU group within the * domain. -- cgit v1.2.3 From aaee1203ca52b9db799433c33c9bffc33cdf8909 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:36:25 +0200 Subject: sched: Move sched_balance_self() into sched_fair.c Move the sched_balance_self() code into sched_fair.c This facilitates the merger of sched_balance_self() and sched_fair::select_task_rq(). Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 146 ---------------------------------------------------- kernel/sched_fair.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 146 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b56d1505d05..60400a22401 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2269,152 +2269,6 @@ void kick_process(struct task_struct *p) preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) -{ - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; - unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; - - do { - unsigned long load, avg_load; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_cpus(group), - &p->cpus_allowed)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); - - /* Tally up the load of all CPUs in the group */ - avg_load = 0; - - for_each_cpu(i, sched_group_cpus(group)) { - /* Bias balancing toward cpus of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, load_idx); - - avg_load += load; - } - - /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - - if (local_group) { - this_load = avg_load; - this = group; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; - } - } while (group = group->next, group != sd->groups); - - if (!idlest || 100*this_load < imbalance*min_load) - return NULL; - return idlest; -} - -/* - * find_idlest_cpu - find the idlest cpu among the cpus in group. - */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) -{ - unsigned long load, min_load = ULONG_MAX; - int idlest = -1; - int i; - - /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { - load = weighted_cpuload(i); - - if (load < min_load || (load == min_load && i == this_cpu)) { - min_load = load; - idlest = i; - } - } - - return idlest; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. - */ -static int sched_balance_self(int cpu, int flag) -{ - struct task_struct *t = current; - struct sched_domain *tmp, *sd = NULL; - - for_each_domain(cpu, tmp) { - /* - * If power savings logic is enabled for a domain, stop there. - */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - break; - if (tmp->flags & flag) - sd = tmp; - } - - if (sd) - update_shares(sd); - - while (sd) { - struct sched_group *group; - int new_cpu, weight; - - if (!(sd->flags & flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, t, cpu); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, t, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = cpumask_weight(sched_domain_span(sd)); - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= cpumask_weight(sched_domain_span(tmp))) - break; - if (tmp->flags & flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - return cpu; -} - #endif /* CONFIG_SMP */ /** diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4f6356e70ad..a82d71d3afe 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1360,6 +1360,151 @@ static int select_task_rq_fair(struct task_struct *p, int sync) out: return wake_idle(new_cpu, p); } + +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu(i, sched_group_cpus(group)) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_cpu - find the idlest cpu among the cpus in group. + */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + /* Traverse only the allowed CPUs */ + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + load = weighted_cpuload(i); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int sched_balance_self(int cpu, int flag) +{ + struct task_struct *t = current; + struct sched_domain *tmp, *sd = NULL; + + for_each_domain(cpu, tmp) { + /* + * If power savings logic is enabled for a domain, stop there. + */ + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + break; + if (tmp->flags & flag) + sd = tmp; + } + + if (sd) + update_shares(sd); + + while (sd) { + struct sched_group *group; + int new_cpu, weight; + + if (!(sd->flags & flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, t, cpu); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_cpu(group, t, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = cpumask_weight(sched_domain_span(sd)); + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= cpumask_weight(sched_domain_span(tmp))) + break; + if (tmp->flags & flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return cpu; +} #endif /* CONFIG_SMP */ /* -- cgit v1.2.3 From 5f3edc1b1ead6d9bd45a85c551f44eff8fe76b9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:42:00 +0200 Subject: sched: Hook sched_balance_self() into sched_class::select_task_rq() Rather ugly patch to fully place the sched_balance_self() code inside the fair class. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 +++++++------- kernel/sched_fair.c | 7 ++++++- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 5 ++++- 4 files changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 60400a22401..32b7a81230c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2350,7 +2350,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (unlikely(task_running(rq, p))) goto out_activate; - cpu = p->sched_class->select_task_rq(p, sync); + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync); if (cpu != orig_cpu) { set_task_cpu(p, cpu); task_rq_unlock(rq, &flags); @@ -2525,11 +2525,6 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); -#ifdef CONFIG_SMP - cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif - set_task_cpu(p, cpu); - /* * Make sure we do not leak PI boosting priority to the child. */ @@ -2560,6 +2555,11 @@ void sched_fork(struct task_struct *p, int clone_flags) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; +#ifdef CONFIG_SMP + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); +#endif + set_task_cpu(p, cpu); + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -3114,7 +3114,7 @@ out: void sched_exec(void) { int new_cpu, this_cpu = get_cpu(); - new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); + new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); put_cpu(); if (new_cpu != this_cpu) sched_migrate_task(current, new_cpu); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a82d71d3afe..f2eb5b93471 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1300,7 +1300,9 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, return 0; } -static int select_task_rq_fair(struct task_struct *p, int sync) +static int sched_balance_self(int cpu, int flag); + +static int select_task_rq_fair(struct task_struct *p, int flag, int sync) { struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; @@ -1314,6 +1316,9 @@ static int select_task_rq_fair(struct task_struct *p, int sync) this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; + if (flag != SD_BALANCE_WAKE) + return sched_balance_self(this_cpu, flag); + /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 499672c10cb..99b2f033760 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,7 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sync) +static int select_task_rq_idle(struct task_struct *p, int flag, int sync) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2eb4bd6a526..438380810ac 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int sync) +static int select_task_rq_rt(struct task_struct *p, int flag, int sync) { struct rq *rq = task_rq(p); + if (flag != SD_BALANCE_WAKE) + return smp_processor_id(); + /* * If the current task is an RT task, then * try to see if we can wake this RT task up on another -- cgit v1.2.3 From e9c8431185d6c406887190519f6dbdd112641686 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 14:43:03 +0200 Subject: sched: Add TASK_WAKING We're going to want to drop rq->lock in try_to_wake_up() for a longer period of time, however we also want to deal with concurrent waking of the same task, which is currently handled by holding rq->lock. So introduce a new TASK state, namely TASK_WAKING, which indicates someone is already waking the task (other wakers will fail p->state & state). We also keep preemption disabled over the whole ttwu(). Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 32b7a81230c..fc6fda881d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2310,7 +2310,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - long old_state; struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) @@ -2332,11 +2331,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) } #endif + this_cpu = get_cpu(); + smp_wmb(); rq = task_rq_lock(p, &flags); update_rq_clock(rq); - old_state = p->state; - if (!(old_state & state)) + if (!(p->state & state)) goto out; if (p->se.on_rq) @@ -2344,27 +2344,25 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); orig_cpu = cpu; - this_cpu = smp_processor_id(); #ifdef CONFIG_SMP if (unlikely(task_running(rq, p))) goto out_activate; + /* + * In order to handle concurrent wakeups and release the rq->lock + * we put the task in TASK_WAKING state. + */ + p->state = TASK_WAKING; + task_rq_unlock(rq, &flags); + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync); - if (cpu != orig_cpu) { + if (cpu != orig_cpu) set_task_cpu(p, cpu); - task_rq_unlock(rq, &flags); - /* might preempt at this point */ - rq = task_rq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out; - if (p->se.on_rq) - goto out_running; - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - } + rq = task_rq_lock(p, &flags); + WARN_ON(p->state != TASK_WAKING); + cpu = task_cpu(p); #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2422,6 +2420,7 @@ out_running: #endif out: task_rq_unlock(rq, &flags); + put_cpu(); return success; } -- cgit v1.2.3 From c88d5910890ad35af283344417891344604f0438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:50:02 +0200 Subject: sched: Merge select_task_rq_fair() and sched_balance_self() The problem with wake_idle() is that is doesn't respect things like cpu_power, which means it doesn't deal well with SMT nor the recent RT interaction. To cure this, it needs to do what sched_balance_self() does, which leads to the possibility of merging select_task_rq_fair() and sched_balance_self(). Modify sched_balance_self() to: - update_shares() when walking up the domain tree, (it only called it for the top domain, but it should have done this anyway), which allows us to remove this ugly bit from try_to_wake_up(). - do wake_affine() on the smallest domain that contains both this (the waking) and the prev (the wakee) cpu for WAKE invocations. Then use the top-down balance steps it had to replace wake_idle(). This leads to the dissapearance of SD_WAKE_BALANCE and SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE. SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective. Touch all topology bits to replace the old with new SD flags -- platforms might need re-tuning, enabling SD_BALANCE_WAKE conditionally on a NUMA distance seems like a good additional feature, magny-core and small nehalem systems would want this enabled, systems with slow interconnects would not. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 41 +-------- kernel/sched_fair.c | 233 ++++++++++++++-------------------------------------- 2 files changed, 66 insertions(+), 208 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index fc6fda881d2..6c819f338b1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -512,14 +512,6 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Preferred wake up cpu nominated by sched_mc balance that will be - * used when most cpus are idle in the system indicating overall very - * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) - */ - unsigned int sched_mc_preferred_wakeup_cpu; -#endif }; /* @@ -2315,22 +2307,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; -#ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { - struct sched_domain *sd; - - this_cpu = raw_smp_processor_id(); - cpu = task_cpu(p); - - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - update_shares(sd); - break; - } - } - } -#endif - this_cpu = get_cpu(); smp_wmb(); @@ -3533,11 +3509,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, *imbalance = sds->min_load_per_task; sds->busiest = sds->group_min; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds->group_leader); - } - return 1; } @@ -7850,9 +7821,7 @@ static int sd_degenerate(struct sched_domain *sd) } /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) + if (sd->flags & (SD_WAKE_AFFINE)) return 0; return 1; @@ -7869,10 +7838,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; - /* Does parent contain flags not in child? */ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { pflags &= ~(SD_LOAD_BALANCE | @@ -8558,10 +8523,10 @@ static void set_domain_attribute(struct sched_domain *sd, request = attr->relax_domain_level; if (request < sd->level) { /* turn off idle balance on this domain */ - sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } else { /* turn on idle balance on this domain */ - sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2eb5b93471..09d19f77eb3 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1062,83 +1062,6 @@ static void yield_task_fair(struct rq *rq) se->vruntime = rightmost->vruntime + 1; } -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available. The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. - * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (rq->rd->online) - * - * Returns the CPU we should wake onto. - */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) - -#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) - -static int wake_idle(int cpu, struct task_struct *p) -{ - struct sched_domain *sd; - int i; - unsigned int chosen_wakeup_cpu; - int this_cpu; - struct rq *task_rq = task_rq(p); - - /* - * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu - * are idle and this is not a kernel thread and this task's affinity - * allows it to be moved to preferred cpu, then just move! - */ - - this_cpu = smp_processor_id(); - chosen_wakeup_cpu = - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; - - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && - idle_cpu(cpu) && idle_cpu(this_cpu) && - p->mm && !(p->flags & PF_KTHREAD) && - cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) - return chosen_wakeup_cpu; - - /* - * If it is idle, then it is the best cpu to run this task. - * - * This cpu is also the best, if it has more than one task already. - * Siblings must be also busy(in most cases) as they didn't already - * pickup the extra load from this cpu and hence we need not check - * sibling runqueue info. This will avoid the checks and cache miss - * penalities associated with that. - */ - if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) - return cpu; - - for_each_domain(cpu, sd) { - if ((sd->flags & SD_WAKE_IDLE) - || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq->clock, sd))) { - for_each_cpu_and(i, sched_domain_span(sd), - &p->cpus_allowed) { - if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { - if (i != task_cpu(p)) { - schedstat_inc(p, - se.nr_wakeups_idle); - } - return i; - } - } - } else { - break; - } - } - return cpu; -} -#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ -static inline int wake_idle(int cpu, struct task_struct *p) -{ - return cpu; -} -#endif - #ifdef CONFIG_SMP #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1225,21 +1148,22 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif -static int -wake_affine(struct sched_domain *this_sd, struct rq *this_rq, - struct task_struct *p, int prev_cpu, int this_cpu, int sync, - int idx, unsigned long load, unsigned long this_load, - unsigned int imbalance) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = this_rq->curr; - struct task_group *tg; - unsigned long tl = this_load; + struct task_struct *curr = current; + unsigned long this_load, load; + int idx, this_cpu, prev_cpu; unsigned long tl_per_task; + unsigned int imbalance; + struct task_group *tg; unsigned long weight; int balanced; - if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) - return 0; + idx = sd->wake_idx; + this_cpu = smp_processor_id(); + prev_cpu = task_cpu(p); + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || p->se.avg_overlap > sysctl_sched_migration_cost)) @@ -1254,24 +1178,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, tg = task_group(current); weight = current->se.load.weight; - tl += effective_load(tg, this_cpu, -weight, -weight); + this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); weight = p->se.load.weight; + imbalance = 100 + (sd->imbalance_pct - 100) / 2; + /* * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped tl to 0, we'll always have - * an imbalance, but there's really nothing you can do about that, so - * that's good too. + * due to the sync cause above having dropped this_load to 0, we'll + * always have an imbalance, but there's really nothing you can do + * about that, so that's good too. * * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !tl || - 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + balanced = !this_load || + 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* @@ -1285,14 +1211,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= - tl_per_task)) { + if (balanced || + (this_load <= load && + this_load + target_load(prev_cpu, idx) <= tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and * there is no bad imbalance. */ - schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(sd, ttwu_move_affine); schedstat_inc(p, se.nr_wakeups_affine); return 1; @@ -1300,72 +1227,6 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, return 0; } -static int sched_balance_self(int cpu, int flag); - -static int select_task_rq_fair(struct task_struct *p, int flag, int sync) -{ - struct sched_domain *sd, *this_sd = NULL; - int prev_cpu, this_cpu, new_cpu; - unsigned long load, this_load; - struct rq *this_rq; - unsigned int imbalance; - int idx; - - prev_cpu = task_cpu(p); - this_cpu = smp_processor_id(); - this_rq = cpu_rq(this_cpu); - new_cpu = prev_cpu; - - if (flag != SD_BALANCE_WAKE) - return sched_balance_self(this_cpu, flag); - - /* - * 'this_sd' is the first domain that both - * this_cpu and prev_cpu are present in: - */ - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { - this_sd = sd; - break; - } - } - - if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) - goto out; - - /* - * Check for affine wakeup and passive balancing possibilities. - */ - if (!this_sd) - goto out; - - idx = this_sd->wake_idx; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); - - if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, - load, this_load, imbalance)) - return this_cpu; - - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - return this_cpu; - } - } - -out: - return wake_idle(new_cpu, p); -} - /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -1455,10 +1316,20 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) * * preempt must be disabled. */ -static int sched_balance_self(int cpu, int flag) +static int select_task_rq_fair(struct task_struct *p, int flag, int sync) { struct task_struct *t = current; struct sched_domain *tmp, *sd = NULL; + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int new_cpu = cpu; + int want_affine = 0; + + if (flag & SD_BALANCE_WAKE) { + if (sched_feat(AFFINE_WAKEUPS)) + want_affine = 1; + new_cpu = prev_cpu; + } for_each_domain(cpu, tmp) { /* @@ -1466,16 +1337,38 @@ static int sched_balance_self(int cpu, int flag) */ if (tmp->flags & SD_POWERSAVINGS_BALANCE) break; - if (tmp->flags & flag) - sd = tmp; - } - if (sd) - update_shares(sd); + switch (flag) { + case SD_BALANCE_WAKE: + if (!sched_feat(LB_WAKEUP_UPDATE)) + break; + case SD_BALANCE_FORK: + case SD_BALANCE_EXEC: + if (root_task_group_empty()) + break; + update_shares(tmp); + default: + break; + } + + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + + if (wake_affine(tmp, p, sync)) + return cpu; + + want_affine = 0; + } + + if (!(tmp->flags & flag)) + continue; + + sd = tmp; + } while (sd) { struct sched_group *group; - int new_cpu, weight; + int weight; if (!(sd->flags & flag)) { sd = sd->child; @@ -1508,7 +1401,7 @@ static int sched_balance_self(int cpu, int flag) /* while loop will break here if sd == NULL */ } - return cpu; + return new_cpu; } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From ae154be1f34a674e6cbb43ccf6e442f56acd7a70 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 14:40:57 +0200 Subject: sched: Weaken SD_POWERSAVINGS_BALANCE One of the problems of power-saving balancing is that under certain scenarios it is too slow and allows tons of real work to pile up. Avoid this by ignoring the powersave stuff when there's real work to be done. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 ++++++++++++++++++++-------------------- kernel/sched_fair.c | 21 ++++++++++++++++++--- 2 files changed, 38 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6c819f338b1..f0ccb8b926c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1538,6 +1538,26 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } +static struct sched_group *group_of(int cpu) +{ + struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + + if (!sd) + return NULL; + + return sd->groups; +} + +static unsigned long power_of(int cpu) +{ + struct sched_group *group = group_of(cpu); + + if (!group) + return SCHED_LOAD_SCALE; + + return group->cpu_power; +} + static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) @@ -3982,26 +4002,6 @@ ret: return NULL; } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - -static unsigned long power_of(int cpu) -{ - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; -} - /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 09d19f77eb3..eaa00014b49 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1333,10 +1333,25 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) for_each_domain(cpu, tmp) { /* - * If power savings logic is enabled for a domain, stop there. + * If power savings logic is enabled for a domain, see if we + * are not overloaded, if so, don't balance wider. */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - break; + if (tmp->flags & SD_POWERSAVINGS_BALANCE) { + unsigned long power = 0; + unsigned long nr_running = 0; + unsigned long capacity; + int i; + + for_each_cpu(i, sched_domain_span(tmp)) { + power += power_of(i); + nr_running += cpu_rq(i)->cfs.nr_running; + } + + capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + + if (nr_running/2 < capacity) + break; + } switch (flag) { case SD_BALANCE_WAKE: -- cgit v1.2.3 From 83f54960c11a14942ab00b54c51e91906b9d8235 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 18:18:47 +0200 Subject: sched: for_each_domain() vs RCU for_each_domain() uses RCU to serialize the sched_domains, except it doesn't actually use rcu_read_lock() and instead relies on disabling preemption -> FAIL. XXX: audit other sched_domain code. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index eaa00014b49..43dc6d1d9e8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,6 +1331,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) new_cpu = prev_cpu; } + rcu_read_lock(); for_each_domain(cpu, tmp) { /* * If power savings logic is enabled for a domain, see if we @@ -1369,8 +1370,10 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { - if (wake_affine(tmp, p, sync)) - return cpu; + if (wake_affine(tmp, p, sync)) { + new_cpu = cpu; + goto out; + } want_affine = 0; } @@ -1416,6 +1419,8 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) /* while loop will break here if sd == NULL */ } +out: + rcu_read_unlock(); return new_cpu; } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From d7c33c4930f569caf6b2ece597432853c4151a45 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Sep 2009 12:45:38 +0200 Subject: sched: Fix task affinity for select_task_rq_fair While merging select_task_rq_fair() and sched_balance_self() I made a mistake that leads to testing the wrong task affinty. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 43dc6d1d9e8..8b3eddbcf9a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1318,7 +1318,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) { - struct task_struct *t = current; struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); @@ -1393,13 +1392,13 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) continue; } - group = find_idlest_group(sd, t, cpu); + group = find_idlest_group(sd, p, cpu); if (!group) { sd = sd->child; continue; } - new_cpu = find_idlest_cpu(group, t, cpu); + new_cpu = find_idlest_cpu(group, p, cpu); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; -- cgit v1.2.3 From 78e7ed53c9f42f04f9401ada6f7047db60781676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Sep 2009 13:16:51 +0200 Subject: sched: Tweak wake_idx When merging select_task_rq_fair() and sched_balance_self() we lost the use of wake_idx, restore that and set them to 0 to make wake balancing more aggressive. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8b3eddbcf9a..19593568031 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1232,12 +1232,27 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * domain. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int flag) { struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + int load_idx = 0; + + switch (flag) { + case SD_BALANCE_FORK: + case SD_BALANCE_EXEC: + load_idx = sd->forkexec_idx; + break; + + case SD_BALANCE_WAKE: + load_idx = sd->wake_idx; + break; + + default: + break; + } do { unsigned long load, avg_load; @@ -1392,7 +1407,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) continue; } - group = find_idlest_group(sd, p, cpu); + group = find_idlest_group(sd, p, cpu, flag); if (!group) { sd = sd->child; continue; -- cgit v1.2.3 From 0ec9fab3d186d9cbb00c0f694d4a260d07c198d9 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 15 Sep 2009 15:07:03 +0200 Subject: sched: Improve latencies and throughput Make the idle balancer more agressive, to improve a x264 encoding workload provided by Jason Garrett-Glaser: NEXT_BUDDY NO_LB_BIAS encoded 600 frames, 252.82 fps, 22096.60 kb/s encoded 600 frames, 250.69 fps, 22096.60 kb/s encoded 600 frames, 245.76 fps, 22096.60 kb/s NO_NEXT_BUDDY LB_BIAS encoded 600 frames, 344.44 fps, 22096.60 kb/s encoded 600 frames, 346.66 fps, 22096.60 kb/s encoded 600 frames, 352.59 fps, 22096.60 kb/s NO_NEXT_BUDDY NO_LB_BIAS encoded 600 frames, 425.75 fps, 22096.60 kb/s encoded 600 frames, 425.45 fps, 22096.60 kb/s encoded 600 frames, 422.49 fps, 22096.60 kb/s Peter pointed out that this is better done via newidle_idx, not via LB_BIAS, newidle balancing should look for where there is load _now_, not where there was load 2 ticks ago. Worst-case latencies are improved as well as no buddies means less vruntime spread. (as per prior lkml discussions) This change improves kbuild-peak parallelism as well. Reported-by: Jason Garrett-Glaser Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1253011667.9128.16.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 891ea0f72b4..e98c2e8de1d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -67,7 +67,7 @@ SCHED_FEAT(AFFINE_WAKEUPS, 1) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, 1) +SCHED_FEAT(NEXT_BUDDY, 0) /* * Prefer to schedule the task that ran last (when we did -- cgit v1.2.3 From d6a59aa3a2b1ca8411884c833a313b33b5f76e20 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Sep 2009 13:28:02 +0200 Subject: sched: Provide arch_scale_freq_power Provide an ach specific hook for cpufreq based scaling of cpu_power. Signed-off-by: Peter Zijlstra [ego@in.ibm.com: spotting bugs] LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f0ccb8b926c..c210321adcb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3552,7 +3552,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_LOAD_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return default_scale_freq_power(sd, cpu); +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = cpumask_weight(sched_domain_span(sd)); unsigned long smt_gain = sd->smt_gain; @@ -3562,6 +3573,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) return smt_gain; } +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + return default_scale_smt_power(sd, cpu); +} + unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -3586,7 +3602,8 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - /* here we could scale based on cpufreq */ + power *= arch_scale_freq_power(sd, cpu); + power >>= SCHED_LOAD_SHIFT; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { power *= arch_scale_smt_power(sd, cpu); -- cgit v1.2.3 From 8e6598af3f35629c37249a610cf13e73f70db279 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Sep 2009 13:20:03 +0200 Subject: sched: Feature to disable APERF/MPERF cpu_power I suspect a feed-back loop between cpuidle and the aperf/mperf cpu_power bits, where when we have idle C-states lower the ratio, which leads to lower cpu_power and then less load, which generates more idle time, etc.. Put in a knob to disable it. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++++-- kernel/sched_features.h | 5 +++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c210321adcb..e8e603bf876 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3602,11 +3602,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - power *= arch_scale_freq_power(sd, cpu); + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + power >>= SCHED_LOAD_SHIFT; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= arch_scale_smt_power(sd, cpu); + if (sched_feat(ARCH_POWER)) + power *= arch_scale_smt_power(sd, cpu); + else + power *= default_scale_smt_power(sd, cpu); + power >>= SCHED_LOAD_SHIFT; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index e98c2e8de1d..294e10edd3c 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -82,6 +82,11 @@ SCHED_FEAT(LAST_BUDDY, 1) */ SCHED_FEAT(CACHE_HOT_BUDDY, 1) +/* + * Use arch dependent cpu power functions + */ +SCHED_FEAT(ARCH_POWER, 0) + SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(LB_BIAS, 1) -- cgit v1.2.3 From 0763a660a84220cc3900fd32abdd7ad109e2278d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 19:37:39 +0200 Subject: sched: Rename select_task_rq() argument In order to be able to rename the sync argument, we need to rename the current flag argument. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 14 +++++++------- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 19593568031..b554e63c521 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,7 +1331,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) * * preempt must be disabled. */ -static int select_task_rq_fair(struct task_struct *p, int flag, int sync) +static int select_task_rq_fair(struct task_struct *p, int sd_flag, int sync) { struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); @@ -1339,7 +1339,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) int new_cpu = cpu; int want_affine = 0; - if (flag & SD_BALANCE_WAKE) { + if (sd_flag & SD_BALANCE_WAKE) { if (sched_feat(AFFINE_WAKEUPS)) want_affine = 1; new_cpu = prev_cpu; @@ -1368,7 +1368,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) break; } - switch (flag) { + switch (sd_flag) { case SD_BALANCE_WAKE: if (!sched_feat(LB_WAKEUP_UPDATE)) break; @@ -1392,7 +1392,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) want_affine = 0; } - if (!(tmp->flags & flag)) + if (!(tmp->flags & sd_flag)) continue; sd = tmp; @@ -1402,12 +1402,12 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) struct sched_group *group; int weight; - if (!(sd->flags & flag)) { + if (!(sd->flags & sd_flag)) { sd = sd->child; continue; } - group = find_idlest_group(sd, p, cpu, flag); + group = find_idlest_group(sd, p, cpu, sd_flag); if (!group) { sd = sd->child; continue; @@ -1427,7 +1427,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) for_each_domain(cpu, tmp) { if (weight <= cpumask_weight(sched_domain_span(tmp))) break; - if (tmp->flags & flag) + if (tmp->flags & sd_flag) sd = tmp; } /* while loop will break here if sd == NULL */ diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 99b2f033760..9ff7697e5dc 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,7 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int flag, int sync) +static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 438380810ac..97c53f3f51a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -938,11 +938,11 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int flag, int sync) +static int select_task_rq_rt(struct task_struct *p, int sd_flag, int sync) { struct rq *rq = task_rq(p); - if (flag != SD_BALANCE_WAKE) + if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); /* -- cgit v1.2.3 From 7d47872146398dbede13223299fe1cb368ebc781 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 19:55:44 +0200 Subject: sched: Rename sync arguments In order to extend the functions to have more than 1 flag (sync), rename the argument to flags, and explicitly define a WF_ space for individual flags. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 30 ++++++++++++++++-------------- kernel/sched_fair.c | 6 ++++-- kernel/sched_idletask.c | 4 ++-- kernel/sched_rt.c | 4 ++-- 4 files changed, 24 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e8e603bf876..4da335cec8e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -636,9 +636,10 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +static inline +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - rq->curr->sched_class->check_preempt_curr(rq, p, sync); + rq->curr->sched_class->check_preempt_curr(rq, p, flags); } static inline int cpu_of(struct rq *rq) @@ -2318,14 +2319,15 @@ void task_oncpu_function_call(struct task_struct *p, * * returns failure only if the task is already active. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int try_to_wake_up(struct task_struct *p, unsigned int state, + int wake_flags) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) - sync = 0; + wake_flags &= ~WF_SYNC; this_cpu = get_cpu(); @@ -2352,7 +2354,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) p->state = TASK_WAKING; task_rq_unlock(rq, &flags); - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync); + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); @@ -2378,7 +2380,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); - if (sync) + if (wake_flags & WF_SYNC) schedstat_inc(p, se.nr_wakeups_sync); if (orig_cpu != cpu) schedstat_inc(p, se.nr_wakeups_migrate); @@ -2407,7 +2409,7 @@ out_activate: out_running: trace_sched_wakeup(rq, p, success); - check_preempt_curr(rq, p, sync); + check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -5562,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void) #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, +int default_wake_function(wait_queue_t *curr, unsigned mode, int flags, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode, flags); } EXPORT_SYMBOL(default_wake_function); @@ -5579,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) + int nr_exclusive, int flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; - if (curr->func(curr, mode, sync, key) && + if (curr->func(curr, mode, flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } @@ -5647,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; - int sync = 1; + int wake_flags = WF_SYNC; if (unlikely(!q)) return; if (unlikely(!nr_exclusive)) - sync = 0; + wake_flags = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, key); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b554e63c521..007958e3c93 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,13 +1331,14 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) * * preempt must be disabled. */ -static int select_task_rq_fair(struct task_struct *p, int sd_flag, int sync) +static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) { struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; + int sync = flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { if (sched_feat(AFFINE_WAKEUPS)) @@ -1548,11 +1549,12 @@ static void set_next_buddy(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); + int sync = flags & WF_SYNC; update_curr(cfs_rq); diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9ff7697e5dc..a8b448af004 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,7 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync) +static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) { resched_task(rq->idle); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 97c53f3f51a..13de7126a6a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -938,7 +938,7 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int sd_flag, int sync) +static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { struct rq *rq = task_rq(p); @@ -1002,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); -- cgit v1.2.3 From a7558e01056f5191ff2ecff53b075dcb9e484188 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 20:02:34 +0200 Subject: sched: Add WF_FORK Avoid the cache buddies from biasing the time distribution away from fork()ers. Normally the next buddy will be the preferred scheduling target, but this makes fork()s prefer to run the new child, whereas we prefer to run the parent, since that will generate more work. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 4da335cec8e..0d4c4fea331 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2602,7 +2602,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) inc_nr_running(rq); } trace_sched_wakeup_new(rq, p, 1); - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 007958e3c93..6766959c7f4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1580,7 +1580,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags */ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) set_last_buddy(se); - if (sched_feat(NEXT_BUDDY)) + if (sched_feat(NEXT_BUDDY) && !(flags & WF_FORK)) set_next_buddy(pse); /* -- cgit v1.2.3 From 6ca6cca31ddc7cc8b1dc38b12d7593d2667defe8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 15 Sep 2009 12:24:22 -0400 Subject: tracing: optimize global_trace_clock cachelines The prev_trace_clock_time is only read or written to when the trace_clock_lock is taken. For better perfomance, they should share the same cache line. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace_clock.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index b588fd81f7f..20c5f92e28a 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -66,10 +66,14 @@ u64 notrace trace_clock(void) * Used by plugins that need globally coherent timestamps. */ -static u64 prev_trace_clock_time; - -static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +/* keep prev_time and lock in the same cacheline. */ +static struct { + u64 prev_time; + raw_spinlock_t lock; +} trace_clock_struct ____cacheline_aligned_in_smp = + { + .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, + }; u64 notrace trace_clock_global(void) { @@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void) if (unlikely(in_nmi())) goto out; - __raw_spin_lock(&trace_clock_lock); + __raw_spin_lock(&trace_clock_struct.lock); /* * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_trace_clock_time+1, to make sure + * my_scd->clock to prev_time+1, to make sure * we start ticking with the local clock from now on? */ - if ((s64)(now - prev_trace_clock_time) < 0) - now = prev_trace_clock_time + 1; + if ((s64)(now - trace_clock_struct.prev_time) < 0) + now = trace_clock_struct.prev_time + 1; - prev_trace_clock_time = now; + trace_clock_struct.prev_time = now; - __raw_spin_unlock(&trace_clock_lock); + __raw_spin_unlock(&trace_clock_struct.lock); out: raw_local_irq_restore(flags); -- cgit v1.2.3 From a56af87648054089d89874b52e3fc23ed4f274ad Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 12 Jul 2009 21:44:55 +0800 Subject: driver-core: move dma-coherent.c from kernel to driver/base Placing dma-coherent.c in driver/base is better than in kernel, since it contains code to do per-device coherent dma memory handling. Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- kernel/Makefile | 1 - kernel/dma-coherent.c | 176 -------------------------------------------------- 2 files changed, 177 deletions(-) delete mode 100644 kernel/dma-coherent.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 961379caf66..3d9c7e27e3f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -90,7 +90,6 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c deleted file mode 100644 index 962a3b574f2..00000000000 --- a/kernel/dma-coherent.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Coherent per-device memory handling. - * Borrowed from i386 - */ -#include -#include - -struct dma_coherent_mem { - void *virt_base; - u32 device_base; - int size; - int flags; - unsigned long *bitmap; -}; - -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pos, err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -/** - * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area - * - * @dev: device from which we allocate memory - * @size: size of requested memory area - * @dma_handle: This will be filled with the correct dma handle - * @ret: This pointer will be filled with the virtual address - * to allocated area. - * - * This function should be only called from per-arch dma_alloc_coherent() - * to support allocation from per-device coherent memory pools. - * - * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return @ret. - */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem; - int order = get_order(size); - int pageno; - - if (!dev) - return 0; - mem = dev->dma_mem; - if (!mem) - return 0; - - *ret = NULL; - - if (unlikely(size > (mem->size << PAGE_SHIFT))) - goto err; - - pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); - if (unlikely(pageno < 0)) - goto err; - - /* - * Memory was found in the per-device area. - */ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); - *ret = mem->virt_base + (pageno << PAGE_SHIFT); - memset(*ret, 0, size); - - return 1; - -err: - /* - * In the case where the allocation can not be satisfied from the - * per-device area, try to fall back to generic memory if the - * constraints allow it. - */ - return mem->flags & DMA_MEMORY_EXCLUSIVE; -} -EXPORT_SYMBOL(dma_alloc_from_coherent); - -/** - * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool - * @dev: device from which the memory was allocated - * @order: the order of pages allocated - * @vaddr: virtual address of allocated pages - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, releases that memory. - * - * Returns 1 if we correctly released the memory, or 0 if - * dma_release_coherent() should proceed with releasing memory from - * generic pools. - */ -int dma_release_from_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - return 1; - } - return 0; -} -EXPORT_SYMBOL(dma_release_from_coherent); -- cgit v1.2.3 From 63859d4fe4c97b737e7adbfe60acb1c2b2e668cb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 19:14:42 +0200 Subject: sched: Fix sync wakeups again The sync argument rename to introduce WF_* broke stuff by missing a local alias for an argument in __wake_up_common, fix it by using the more descriptive wake_flags name. This restores WF_SYNC propagation, which fixes wake_affine() behaviour, which fixes pipe-test. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0d4c4fea331..af04ede6dd2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5564,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void) #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int flags, +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, flags); + return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -5581,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int flags, void *key) + int nr_exclusive, int wake_flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; - if (curr->func(curr, mode, flags, key) && + if (curr->func(curr, mode, wake_flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } -- cgit v1.2.3 From e69b0f1b41c0e57bb1e29100b5810a5914efcb45 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 19:38:52 +0200 Subject: sched: Add a few SYNC hint knobs to play with Currently we use overlap to weaken the SYNC hint, but allow it to set the hint as well. echo NO_SYNC_WAKEUP > /debug/sched_features echo SYNC_MORE > /debug/sched_features preserves pipe-test behaviour without using the WF_SYNC hint. Worth playing with on more workloads... Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 14 +++++++++++--- kernel/sched_features.h | 10 ++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6766959c7f4..280892e9d85 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1165,9 +1165,17 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || - p->se.avg_overlap > sysctl_sched_migration_cost)) - sync = 0; + if (sync) { + if (sched_feat(SYNC_LESS) && + (curr->se.avg_overlap > sysctl_sched_migration_cost || + p->se.avg_overlap > sysctl_sched_migration_cost)) + sync = 0; + } else { + if (sched_feat(SYNC_MORE) && + (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost)) + sync = 1; + } /* * If sync wakeup then subtract the (maximum possible) diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 294e10edd3c..70115c69c7a 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -62,6 +62,16 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) */ SCHED_FEAT(AFFINE_WAKEUPS, 1) +/* + * Weaken SYNC hint based on overlap + */ +SCHED_FEAT(SYNC_LESS, 1) + +/* + * Add SYNC hint based on overlap + */ +SCHED_FEAT(SYNC_MORE, 0) + /* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we -- cgit v1.2.3 From cb684b5bcd6a79ae7e2360c6b158c4aa7b0a293a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Sep 2009 21:53:11 +0200 Subject: block: fix linkage problem with blk_iopoll and !CONFIG_BLOCK kernel/built-in.o:(.data+0x17b0): undefined reference to `blk_iopoll_enabled' Since the extern declaration makes the compile work, but the actual symbol is missing when block/blk-iopoll.o isn't linked in. Signed-off-by: Jens Axboe --- kernel/sysctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6bb59f70740..1a631ba684a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -91,7 +91,9 @@ extern int sysctl_nr_trim_pages; #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#ifdef CONFIG_BLOCK extern int blk_iopoll_enabled; +#endif /* Constants used for minimum and maximum */ #ifdef CONFIG_DETECT_SOFTLOCKUP @@ -998,6 +1000,7 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_BLOCK { .ctl_name = CTL_UNNUMBERED, .procname = "blk_iopoll", @@ -1006,6 +1009,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3 From 59abf02644c45f1591e1374ee7bb45dc757fcb88 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 08:28:30 +0200 Subject: sched: Add SD_PREFER_LOCAL And turn it on for NUMA and MC domains. This improves locality in balancing decisions by keeping up to capacity amount of tasks local before looking for idle CPUs. (and twice the capacity if SD_POWERSAVINGS_BALANCE is set.) Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 280892e9d85..a37f311f436 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1360,7 +1360,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) { + if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { unsigned long power = 0; unsigned long nr_running = 0; unsigned long capacity; @@ -1373,7 +1373,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); - if (nr_running/2 < capacity) + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + nr_running /= 2; + + if (nr_running < capacity) break; } -- cgit v1.2.3 From 51e0304ce6e55a6e59658558916b4f74da085ff0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 16 Sep 2009 08:54:45 +0200 Subject: sched: Implement a gentler fair-sleepers feature Add back FAIR_SLEEPERS and GENTLE_FAIR_SLEEPERS. FAIR_SLEEPERS is the old logic: credit sleepers with their sleep time. GENTLE_FAIR_SLEEPERS dampens this a bit: 50% of their sleep time gets credited. The hope here is to still give the benefits of fair-sleepers logic (quick wakeups, etc.) while not allow them to have 100% of their sleep time as if they were running. Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++++++- kernel/sched_features.h | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a37f311f436..acf16a8d934 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -711,7 +711,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) { + if (sched_feat(FAIR_SLEEPERS)) { unsigned long thresh = sysctl_sched_latency; /* @@ -725,6 +725,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) task_of(se)->policy != SCHED_IDLE)) thresh = calc_delta_fair(thresh, se); + /* + * Halve their sleep time's effect, to allow + * for a gentler effect of sleepers: + */ + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; + vruntime -= thresh; } } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 70115c69c7a..fd375675f83 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -3,7 +3,14 @@ * considers the task to be running during that period. This gives it * a service deficit on wakeup, allowing it to run sooner. */ -SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) +SCHED_FEAT(FAIR_SLEEPERS, 1) + +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) /* * By not normalizing the sleep time, heavy tasks get an effective -- cgit v1.2.3 From b36461da2a0389149d7f88f3cbc05a30d1db9faa Mon Sep 17 00:00:00 2001 From: Atsushi Tsuji Date: Tue, 15 Sep 2009 19:06:30 +0900 Subject: tracing: Fix minor bugs for __unregister_ftrace_function_probe Fix the condition of strcmp for "*". Also fix NULL pointer dereference when glob is NULL. Signed-off-by: Atsushi Tsuji LKML-Reference: <4AAF6726.5090905@bk.jp.nec.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8b23d567008..f7ab7fc162c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2062,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int i, len = 0; char *search; - if (glob && (strcmp(glob, "*") || !strlen(glob))) + if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) glob = NULL; - else { + else if (glob) { int not; type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); -- cgit v1.2.3 From 3b6408942206f940dd538e980e9904e48f4b64f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 13:44:33 +0200 Subject: sched: Optimize cgroup vs wakeup a bit We don't need to call update_shares() for each domain we iterate, just got the largets one. However, we should call it before wake_affine() as well, so that that can use up-to-date values too. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 ------- kernel/sched_fair.c | 23 +++++++++-------------- kernel/sched_features.h | 2 +- 3 files changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index af04ede6dd2..5049d959bb2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -376,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return 1; -} -#endif - static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline struct task_group *task_group(struct task_struct *p) { diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index acf16a8d934..722d392b0da 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1348,7 +1348,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) { - struct sched_domain *tmp, *sd = NULL; + struct sched_domain *tmp, *shares = NULL, *sd = NULL; int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int new_cpu = cpu; @@ -1387,22 +1387,14 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) break; } - switch (sd_flag) { - case SD_BALANCE_WAKE: - if (!sched_feat(LB_WAKEUP_UPDATE)) - break; - case SD_BALANCE_FORK: - case SD_BALANCE_EXEC: - if (root_task_group_empty()) - break; - update_shares(tmp); - default: - break; - } - if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + if (sched_feat(LB_SHARES_UPDATE)) { + update_shares(tmp); + shares = tmp; + } + if (wake_affine(tmp, p, sync)) { new_cpu = cpu; goto out; @@ -1417,6 +1409,9 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) sd = tmp; } + if (sd && sd != shares && sched_feat(LB_SHARES_UPDATE)) + update_shares(sd); + while (sd) { struct sched_group *group; int weight; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index fd375675f83..d5059fd761d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -107,7 +107,7 @@ SCHED_FEAT(ARCH_POWER, 0) SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_WAKEUP_UPDATE, 1) +SCHED_FEAT(LB_SHARES_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) /* -- cgit v1.2.3 From 5158f4e4428c6b8d52796b3b460e95796123a114 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 13:46:59 +0200 Subject: sched: Clean up the load_idx selection in select_task_rq_fair Clean up the code a little. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 722d392b0da..aeff40e7ec1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1248,26 +1248,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int flag) + int this_cpu, int load_idx) { struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; int imbalance = 100 + (sd->imbalance_pct-100)/2; - int load_idx = 0; - - switch (flag) { - case SD_BALANCE_FORK: - case SD_BALANCE_EXEC: - load_idx = sd->forkexec_idx; - break; - - case SD_BALANCE_WAKE: - load_idx = sd->wake_idx; - break; - - default: - break; - } do { unsigned long load, avg_load; @@ -1346,14 +1331,14 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) * * preempt must be disabled. */ -static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) +static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *shares = NULL, *sd = NULL; int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; - int sync = flags & WF_SYNC; + int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { if (sched_feat(AFFINE_WAKEUPS)) @@ -1413,6 +1398,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) update_shares(sd); while (sd) { + int load_idx = sd->forkexec_idx; struct sched_group *group; int weight; @@ -1421,7 +1407,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) continue; } - group = find_idlest_group(sd, p, cpu, sd_flag); + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; + + group = find_idlest_group(sd, p, cpu, load_idx); if (!group) { sd = sd->child; continue; -- cgit v1.2.3 From 5a9b86f647a56862cdc0a1362bfb015ae921af7f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 13:47:58 +0200 Subject: sched: Rename flags to wake_flags For consistencies sake, rename the argument (again). Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index aeff40e7ec1..c741cd9d38d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1551,12 +1551,12 @@ static void set_next_buddy(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int sync = flags & WF_SYNC; + int sync = wake_flags & WF_SYNC; update_curr(cfs_rq); @@ -1582,7 +1582,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags */ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) set_last_buddy(se); - if (sched_feat(NEXT_BUDDY) && !(flags & WF_FORK)) + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) set_next_buddy(pse); /* -- cgit v1.2.3 From eb24073bc1fe3e569a855cf38d529fb650c35524 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 16 Sep 2009 21:09:13 +0200 Subject: sched: Fix TASK_WAKING & loadaverage breakage Fix this: top - 21:54:00 up 2:59, 1 user, load average: 432512.33, 426421.74, 417432.74 Which happens because we now set TASK_WAKING before activate_task(). Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5049d959bb2..969dfaef246 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2343,7 +2343,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, /* * In order to handle concurrent wakeups and release the rq->lock * we put the task in TASK_WAKING state. + * + * First fix up the nr_uninterruptible count: */ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; p->state = TASK_WAKING; task_rq_unlock(rq, &flags); -- cgit v1.2.3 From ad4b78bbcbab66998b05d422ac6106b645796e54 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 12:31:31 +0200 Subject: sched: Add new wakeup preemption mode: WAKEUP_RUNNING Create a new wakeup preemption mode, preempt towards tasks that run shorter on avg. It sets next buddy to be sure we actually run the task we preempted for. Test results: root@twins:~# while :; do :; done & [1] 6537 root@twins:~# while :; do :; done & [2] 6538 root@twins:~# while :; do :; done & [3] 6539 root@twins:~# while :; do :; done & [4] 6540 root@twins:/home/peter# ./latt -c4 sleep 4 Entries: 48 (clients=4) Averages: ------------------------------ Max 4750 usec Avg 497 usec Stdev 737 usec root@twins:/home/peter# echo WAKEUP_RUNNING > /debug/sched_features root@twins:/home/peter# ./latt -c4 sleep 4 Entries: 48 (clients=4) Averages: ------------------------------ Max 14 usec Avg 5 usec Stdev 3 usec Disabled by default - needs more testing. Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar LKML-Reference: --- kernel/sched.c | 17 ++++++++++------- kernel/sched_debug.c | 1 + kernel/sched_fair.c | 14 +++++++++++--- kernel/sched_features.h | 5 +++++ 4 files changed, 27 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 969dfaef246..3bb4ea2ee6f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,6 +2458,7 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; + p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -5310,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *prev) +static void put_prev_task(struct rq *rq, struct task_struct *p) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; + u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_running, runtime); + if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5327,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) * correlates to the amount of cache footprint a task can * build up. */ - update_avg(&prev->se.avg_overlap, runtime); + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_overlap, runtime); + } else { + update_avg(&p->se.avg_running, 0); } - prev->sched_class->put_prev_task(rq, prev); + p->sched_class->put_prev_task(rq, p); } /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ddbd089126..efb84409bc4 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.sum_exec_runtime); PN(se.avg_overlap); PN(se.avg_wakeup); + PN(se.avg_running); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c741cd9d38d..3e6f78c6687 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1605,9 +1605,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; } - if (!sched_feat(WAKEUP_PREEMPT)) - return; - if ((sched_feat(WAKEUP_SYNC) && sync) || (sched_feat(WAKEUP_OVERLAP) && (se->avg_overlap < sysctl_sched_migration_cost && @@ -1616,6 +1613,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; } + if (sched_feat(WAKEUP_RUNNING)) { + if (pse->avg_running < se->avg_running) { + set_next_buddy(pse); + resched_task(curr); + return; + } + } + + if (!sched_feat(WAKEUP_PREEMPT)) + return; + find_matching_se(&se, &pse); BUG_ON(!pse); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index d5059fd761d..0d94083582c 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -53,6 +53,11 @@ SCHED_FEAT(WAKEUP_SYNC, 0) */ SCHED_FEAT(WAKEUP_OVERLAP, 0) +/* + * Wakeup preemption towards tasks that run short + */ +SCHED_FEAT(WAKEUP_RUNNING, 0) + /* * Use the SYNC wakeup hint, pipes and the likes use this to indicate * the remote end is likely to consume the data we just wrote, and -- cgit v1.2.3 From de69a80be32445b0a71e8e3b757e584d7beb90f7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Sep 2009 09:01:20 +0200 Subject: sched: Stop buddies from hogging the system Clear buddies more agressively. The (theoretical, haven't actually observed any of this) problem is that when we do not select either buddy in pick_next_entity() because they are too far ahead of the left-most task, we do not clear the buddies. This means that as soon as we service the left-most task, these same buddies will be tried again on the next schedule. Now if the left-most task was a pure hog, it wouldn't have done any wakeups and it wouldn't have set buddies of its own. That leads to the old buddies dominating, which would lead to bad latencies. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3e6f78c6687..ffee827fa22 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -764,10 +764,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->last == se) + if (!se || cfs_rq->last == se) cfs_rq->last = NULL; - if (cfs_rq->next == se) + if (!se || cfs_rq->next == se) cfs_rq->next = NULL; } @@ -1646,8 +1646,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) /* * If se was a buddy, clear it so that it will have to earn * the favour again. + * + * If se was not a buddy, clear the buddies because neither + * was elegible to run, let them earn it again. + * + * IOW. unconditionally clear buddies. */ - __clear_buddies(cfs_rq, se); + __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.2.3 From 29cd8bae396583a2ee9a3340db8c5102acf9f6fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Sep 2009 09:01:14 +0200 Subject: sched: Fix SD_POWERSAVING_BALANCE|SD_PREFER_LOCAL vs SD_WAKE_AFFINE The SD_POWERSAVING_BALANCE|SD_PREFER_LOCAL code can break out of the domain iteration early, making us miss the SD_WAKE_AFFINE bits. Fix this by continuing iteration until there is no need for a larger domain. This also cleans up the cgroup stuff a bit, but not having two update_shares() invocations. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ffee827fa22..10d218ab69f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1333,11 +1333,12 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { - struct sched_domain *tmp, *shares = NULL, *sd = NULL; + struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; + int want_sd = 1; int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { @@ -1369,33 +1370,44 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag nr_running /= 2; if (nr_running < capacity) - break; + want_sd = 0; } if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { - if (sched_feat(LB_SHARES_UPDATE)) { - update_shares(tmp); - shares = tmp; - } - - if (wake_affine(tmp, p, sync)) { - new_cpu = cpu; - goto out; - } - + affine_sd = tmp; want_affine = 0; } + if (!want_sd && !want_affine) + break; + if (!(tmp->flags & sd_flag)) continue; - sd = tmp; + if (want_sd) + sd = tmp; + } + + if (sched_feat(LB_SHARES_UPDATE)) { + /* + * Pick the largest domain to update shares over + */ + tmp = sd; + if (affine_sd && (!tmp || + cpumask_weight(sched_domain_span(affine_sd)) > + cpumask_weight(sched_domain_span(sd)))) + tmp = affine_sd; + + if (tmp) + update_shares(tmp); } - if (sd && sd != shares && sched_feat(LB_SHARES_UPDATE)) - update_shares(sd); + if (affine_sd && wake_affine(affine_sd, p, sync)) { + new_cpu = cpu; + goto out; + } while (sd) { int load_idx = sd->forkexec_idx; -- cgit v1.2.3 From b375a11a239e9e1cac40c7f3ff28b343d9f7ac51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Sep 2009 00:05:58 -0400 Subject: tracing: switch function prints from %pf to %ps For direct function pointers (like what mcount provides) PowerPC64 requires the use of %ps, otherwise nothing is printed. This patch converts all prints of functions retrieved through mcount to use the %ps format from the %pf. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_functions_graph.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f7ab7fc162c..cc615f84751 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1405,7 +1405,7 @@ static int t_hash_show(struct seq_file *m, void *v) if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); - seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); + seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); if (rec->data) seq_printf(m, ":%p", rec->data); @@ -1515,7 +1515,7 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%pf\n", (void *)rec->ip); + seq_printf(m, "%ps\n", (void *)rec->ip); return 0; } @@ -2456,7 +2456,7 @@ static int g_show(struct seq_file *m, void *v) return 0; } - seq_printf(m, "%pf\n", (void *)*ptr); + seq_printf(m, "%ps\n", (void *)*ptr); return 0; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5b01b94518f..b3f3776b0cd 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, { long count = (long)data; - seq_printf(m, "%pf:", (void *)ip); + seq_printf(m, "%ps:", (void *)ip); if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 61f166707a0..45e6c01b2e4 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, if (unlikely(current->ret_stack[index].fp != frame_pointer)) { ftrace_graph_stop(); WARN(1, "Bad frame pointer: expected %lx, received %lx\n" - " from func %pF return to %lx\n", + " from func %ps return to %lx\n", current->ret_stack[index].fp, frame_pointer, (void *)current->ret_stack[index].func, @@ -669,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); + ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -712,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); + ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3 From 5dd4de587fd9c25cb32a7a0fe9feec3647509b6f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 17 Sep 2009 17:38:32 +0800 Subject: softirq: add BLOCK_IOPOLL to softirq_to_name With BLOCK_IOPOLL_SOFTIRQ added, softirq_to_name[] and show_softirq_name() needs to be updated. Signed-off-by: Li Zefan LKML-Reference: <4AB20398.8070209@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 7db25067cd2..f8749e5216e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; -- cgit v1.2.3