From 0a02ad9331dd4db56c29c60db2e99c4daaad8a48 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Sep 2009 12:12:54 +0200 Subject: perf: Add 'perf sched' tool This turn-key tool allows scheduler measurements to be conducted and the results be displayed numerically. First baby step towards that goal: clone the new command off of perf trace. Fix a few other details along the way: - add (minimal) perf trace documentation - reorder a few places - list perf trace in the mainporcelain list as well as it's a very useful utility. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 297 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 tools/perf/builtin-sched.c (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c new file mode 100644 index 00000000000..60228d9179a --- /dev/null +++ b/tools/perf/builtin-sched.c @@ -0,0 +1,297 @@ +#include "builtin.h" + +#include "util/util.h" +#include "util/cache.h" +#include "util/symbol.h" +#include "util/thread.h" +#include "util/header.h" + +#include "util/parse-options.h" + +#include "perf.h" +#include "util/debug.h" + +#include "util/trace-event.h" + +static char const *input_name = "perf.data"; +static int input; +static unsigned long page_size; +static unsigned long mmap_window = 32; + +static unsigned long total = 0; +static unsigned long total_comm = 0; + +static struct rb_root threads; +static struct thread *last_match; + +static struct perf_header *header; +static u64 sample_type; + + +static int +process_comm_event(event_t *event, unsigned long offset, unsigned long head) +{ + struct thread *thread; + + thread = threads__findnew(event->comm.pid, &threads, &last_match); + + dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + (void *)(offset + head), + (void *)(long)(event->header.size), + event->comm.comm, event->comm.pid); + + if (thread == NULL || + thread__set_comm(thread, event->comm.comm)) { + dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + return -1; + } + total_comm++; + + return 0; +} + +static int +process_sample_event(event_t *event, unsigned long offset, unsigned long head) +{ + char level; + int show = 0; + struct dso *dso = NULL; + struct thread *thread; + u64 ip = event->ip.ip; + u64 timestamp = -1; + u32 cpu = -1; + u64 period = 1; + void *more_data = event->ip.__more_data; + int cpumode; + + thread = threads__findnew(event->ip.pid, &threads, &last_match); + + if (sample_type & PERF_SAMPLE_TIME) { + timestamp = *(u64 *)more_data; + more_data += sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_CPU) { + cpu = *(u32 *)more_data; + more_data += sizeof(u32); + more_data += sizeof(u32); /* reserved */ + } + + if (sample_type & PERF_SAMPLE_PERIOD) { + period = *(u64 *)more_data; + more_data += sizeof(u64); + } + + dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + (void *)(offset + head), + (void *)(long)(event->header.size), + event->header.misc, + event->ip.pid, event->ip.tid, + (void *)(long)ip, + (long long)period); + + dump_printf(" ... 
thread: %s:%d\n", thread->comm, thread->pid); + + if (thread == NULL) { + eprintf("problem processing %d event, skipping it.\n", + event->header.type); + return -1; + } + + cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; + + if (cpumode == PERF_EVENT_MISC_KERNEL) { + show = SHOW_KERNEL; + level = 'k'; + + dso = kernel_dso; + + dump_printf(" ...... dso: %s\n", dso->name); + + } else if (cpumode == PERF_EVENT_MISC_USER) { + + show = SHOW_USER; + level = '.'; + + } else { + show = SHOW_HV; + level = 'H'; + + dso = hypervisor_dso; + + dump_printf(" ...... dso: [hypervisor]\n"); + } + + if (sample_type & PERF_SAMPLE_RAW) { + struct { + u32 size; + char data[0]; + } *raw = more_data; + + /* + * FIXME: better resolve from pid from the struct trace_entry + * field, although it should be the same than this perf + * event pid + */ + print_event(cpu, raw->data, raw->size, timestamp, thread->comm); + } + total += period; + + return 0; +} + +static int +process_event(event_t *event, unsigned long offset, unsigned long head) +{ + trace_event(event); + + switch (event->header.type) { + case PERF_EVENT_MMAP ... PERF_EVENT_LOST: + return 0; + + case PERF_EVENT_COMM: + return process_comm_event(event, offset, head); + + case PERF_EVENT_EXIT ... PERF_EVENT_READ: + return 0; + + case PERF_EVENT_SAMPLE: + return process_sample_event(event, offset, head); + + case PERF_EVENT_MAX: + default: + return -1; + } + + return 0; +} + +static int __cmd_sched(void) +{ + int ret, rc = EXIT_FAILURE; + unsigned long offset = 0; + unsigned long head = 0; + struct stat perf_stat; + event_t *event; + uint32_t size; + char *buf; + + trace_report(); + register_idle_thread(&threads, &last_match); + + input = open(input_name, O_RDONLY); + if (input < 0) { + perror("failed to open file"); + exit(-1); + } + + ret = fstat(input, &perf_stat); + if (ret < 0) { + perror("failed to stat file"); + exit(-1); + } + + if (!perf_stat.st_size) { + fprintf(stderr, "zero-sized file, nothing to do!\n"); + exit(0); + } + header = perf_header__read(input); + head = header->data_offset; + sample_type = perf_header__sample_type(header); + + if (!(sample_type & PERF_SAMPLE_RAW)) + die("No trace sample to read. Did you call perf record " + "without -R?"); + + if (load_kernel() < 0) { + perror("failed to load kernel symbols"); + return EXIT_FAILURE; + } + +remap: + buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ, + MAP_SHARED, input, offset); + if (buf == MAP_FAILED) { + perror("failed to mmap file"); + exit(-1); + } + +more: + event = (event_t *)(buf + head); + + size = event->header.size; + if (!size) + size = 8; + + if (head + event->header.size >= page_size * mmap_window) { + unsigned long shift = page_size * (head / page_size); + int res; + + res = munmap(buf, page_size * mmap_window); + assert(res == 0); + + offset += shift; + head -= shift; + goto remap; + } + + size = event->header.size; + + + if (!size || process_event(event, offset, head) < 0) { + + /* + * assume we lost track of the stream, check alignment, and + * increment a single u64 in the hope to catch on again 'soon'. 
+ */ + + if (unlikely(head & 7)) + head &= ~7ULL; + + size = 8; + } + + head += size; + + if (offset + head < (unsigned long)perf_stat.st_size) + goto more; + + rc = EXIT_SUCCESS; + close(input); + + return rc; +} + +static const char * const annotate_usage[] = { + "perf trace [] ", + NULL +}; + +static const struct option options[] = { + OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, + "dump raw trace in ASCII"), + OPT_BOOLEAN('v', "verbose", &verbose, + "be more verbose (show symbol address, etc)"), + OPT_END() +}; + +int cmd_sched(int argc, const char **argv, const char *prefix __used) +{ + symbol__init(); + page_size = getpagesize(); + + argc = parse_options(argc, argv, options, annotate_usage, 0); + if (argc) { + /* + * Special case: if there's an argument left then assume tha + * it's a symbol filter: + */ + if (argc > 1) + usage_with_options(annotate_usage, options); + } + + + setup_pager(); + + return __cmd_sched(); +} -- cgit v1.2.3 From ec156764d424dd67283c2cd5e9f6f1b8388364ac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Sep 2009 12:12:54 +0200 Subject: perf sched: Import schedbench.c Import the schedbench.c tool that i wrote some time ago to simulate scheduler behavior but never finished. It's a good basis for perf sched nevertheless. Most of its guts are not hooked up to the perf event loop yet - that will be done in the patches to come. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 842 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 817 insertions(+), 25 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 60228d9179a..c66e6a32137 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -12,22 +12,770 @@ #include "util/debug.h" #include "util/trace-event.h" +#include -static char const *input_name = "perf.data"; -static int input; -static unsigned long page_size; -static unsigned long mmap_window = 32; +static char const *input_name = "perf.data"; +static int input; +static unsigned long page_size; +static unsigned long mmap_window = 32; -static unsigned long total = 0; -static unsigned long total_comm = 0; +static unsigned long total_comm = 0; -static struct rb_root threads; -static struct thread *last_match; +static struct rb_root threads; +static struct thread *last_match; -static struct perf_header *header; -static u64 sample_type; +static struct perf_header *header; +static u64 sample_type; +/* + * Scheduler benchmarks + */ +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define PR_SET_NAME 15 /* Set process name */ + +#define BUG_ON(x) assert(!(x)) + +#define DEBUG 1 + +typedef unsigned long long nsec_t; + +#define printk(x...) do { printf(x); fflush(stdout); } while (0) + +nsec_t prev_printk; + +#define __dprintk(x,y...) do { \ + nsec_t __now = get_nsecs(), __delta = __now - prev_printk; \ + \ + prev_printk = __now; \ + \ + printf("%.3f [%Ld] [%.3f]: " x, (double)__now/1e6, __now, (double)__delta/1e6, y);\ +} while (0) + +#if !DEBUG +# define dprintk(x...) do { } while (0) +#else +# define dprintk(x...) 
__dprintk(x) +#endif + +#define __DP() __dprintk("parent: line %d\n", __LINE__) +#define DP() dprintk("parent: line %d\n", __LINE__) +#define D() dprintk("task %ld: line %d\n", this_task->nr, __LINE__) + + +static nsec_t run_measurement_overhead; +static nsec_t sleep_measurement_overhead; + +static nsec_t get_nsecs(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + + return ts.tv_sec * 1000000000ULL + ts.tv_nsec; +} + +static void burn_nsecs(nsec_t nsecs) +{ + nsec_t T0 = get_nsecs(), T1; + + do { + T1 = get_nsecs(); + } while (T1 + run_measurement_overhead < T0 + nsecs); +} + +static void sleep_nsecs(nsec_t nsecs) +{ + struct timespec ts; + + ts.tv_nsec = nsecs % 999999999; + ts.tv_sec = nsecs / 999999999; + + nanosleep(&ts, NULL); +} + +static void calibrate_run_measurement_overhead(void) +{ + nsec_t T0, T1, delta, min_delta = 1000000000ULL; + int i; + + for (i = 0; i < 10; i++) { + T0 = get_nsecs(); + burn_nsecs(0); + T1 = get_nsecs(); + delta = T1-T0; + min_delta = min(min_delta, delta); + } + run_measurement_overhead = min_delta; + + printk("run measurement overhead: %Ld nsecs\n", min_delta); +} + +static void calibrate_sleep_measurement_overhead(void) +{ + nsec_t T0, T1, delta, min_delta = 1000000000ULL; + int i; + + for (i = 0; i < 10; i++) { + T0 = get_nsecs(); + sleep_nsecs(10000); + T1 = get_nsecs(); + delta = T1-T0; + min_delta = min(min_delta, delta); + } + min_delta -= 10000; + sleep_measurement_overhead = min_delta; + + printk("sleep measurement overhead: %Ld nsecs\n", min_delta); +} + +#define COMM_LEN 20 +#define SYM_LEN 129 + +#define MAX_PID 65536 + +static unsigned long nr_tasks; + +struct sched_event; + +struct task_desc { + unsigned long nr; + unsigned long pid; + char comm[COMM_LEN]; + + unsigned long nr_events; + unsigned long curr_event; + struct sched_event **events; + + pthread_t thread; + sem_t sleep_sem; + + sem_t ready_for_work; + sem_t work_done_sem; + + nsec_t cpu_usage; +}; + +enum sched_event_type { + SCHED_EVENT_RUN, + SCHED_EVENT_SLEEP, + SCHED_EVENT_WAKEUP, +}; + +struct sched_event { + enum sched_event_type type; + nsec_t timestamp; + nsec_t duration; + unsigned long nr; + int specific_wait; + sem_t *wait_sem; + struct task_desc *wakee; +}; + +static struct task_desc *pid_to_task[MAX_PID]; + +static struct task_desc **tasks; + +static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER; +static nsec_t start_time; + +static pthread_mutex_t work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER; + +static unsigned long nr_run_events; +static unsigned long nr_sleep_events; +static unsigned long nr_wakeup_events; + +static unsigned long nr_sleep_corrections; +static unsigned long nr_run_events_optimized; + +static struct sched_event * +get_new_event(struct task_desc *task, nsec_t timestamp) +{ + struct sched_event *event = calloc(1, sizeof(*event)); + unsigned long idx = task->nr_events; + size_t size; + + event->timestamp = timestamp; + event->nr = idx; + + task->nr_events++; + size = sizeof(struct sched_event *) * task->nr_events; + task->events = realloc(task->events, size); + BUG_ON(!task->events); + + task->events[idx] = event; + + return event; +} + +static struct sched_event *last_event(struct task_desc *task) +{ + if (!task->nr_events) + return NULL; + + return task->events[task->nr_events - 1]; +} + +static void +add_sched_event_run(struct task_desc *task, nsec_t timestamp, + unsigned long duration) +{ + struct sched_event *event, *curr_event = last_event(task); + + /* + * optimize an existing RUN event by merging this one 
+ * to it: + */ + if (curr_event && curr_event->type == SCHED_EVENT_RUN) { + nr_run_events_optimized++; + curr_event->duration += duration; + return; + } + + event = get_new_event(task, timestamp); + + event->type = SCHED_EVENT_RUN; + event->duration = duration; + + nr_run_events++; +} + +static unsigned long targetless_wakeups; +static unsigned long multitarget_wakeups; + +static void +add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp, + struct task_desc *wakee) +{ + struct sched_event *event, *wakee_event; + + event = get_new_event(task, timestamp); + event->type = SCHED_EVENT_WAKEUP; + event->wakee = wakee; + + wakee_event = last_event(wakee); + if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) { + targetless_wakeups++; + return; + } + if (wakee_event->wait_sem) { + multitarget_wakeups++; + return; + } + + wakee_event->wait_sem = calloc(1, sizeof(*wakee_event->wait_sem)); + sem_init(wakee_event->wait_sem, 0, 0); + wakee_event->specific_wait = 1; + event->wait_sem = wakee_event->wait_sem; + + nr_wakeup_events++; +} + +static void +add_sched_event_sleep(struct task_desc *task, nsec_t timestamp, + unsigned long uninterruptible __used) +{ + struct sched_event *event = get_new_event(task, timestamp); + + event->type = SCHED_EVENT_SLEEP; + + nr_sleep_events++; +} + +static struct task_desc *register_pid(unsigned long pid, const char *comm) +{ + struct task_desc *task; + + BUG_ON(pid >= MAX_PID); + + task = pid_to_task[pid]; + + if (task) + return task; + + task = calloc(1, sizeof(*task)); + task->pid = pid; + task->nr = nr_tasks; + strcpy(task->comm, comm); + /* + * every task starts in sleeping state - this gets ignored + * if there's no wakeup pointing to this sleep state: + */ + add_sched_event_sleep(task, 0, 0); + + pid_to_task[pid] = task; + nr_tasks++; + tasks = realloc(tasks, nr_tasks*sizeof(struct task_task *)); + BUG_ON(!tasks); + tasks[task->nr] = task; + + printk("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm); + + return task; +} + + +static int first_trace_line = 1; + +static nsec_t first_timestamp; +static nsec_t prev_timestamp; + +void parse_line(char *line); + +void parse_line(char *line) +{ + unsigned long param1 = 0, param2 = 0; + char comm[COMM_LEN], comm2[COMM_LEN]; + unsigned long pid, pid2, timestamp0; + struct task_desc *task, *task2; + char func_str[SYM_LEN]; + nsec_t timestamp; + int ret; + + //" 0 0D.s3 0us+: try_to_wake_up (1 0)" + ret = sscanf(line, "%20s %5ld %*s %ldus%*c:" + " %128s <%20s %ld> (%ld %ld)\n", + comm, &pid, ×tamp0, + func_str, comm2, &pid2, ¶m1, ¶m2); + dprintk("ret: %d\n", ret); + if (ret != 8) + return; + + timestamp = timestamp0 * 1000LL; + + if (first_trace_line) { + first_trace_line = 0; + first_timestamp = timestamp; + } + + timestamp -= first_timestamp; + BUG_ON(timestamp < prev_timestamp); + prev_timestamp = timestamp; + + dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n", + comm, + pid, + timestamp, + func_str, + comm2, + pid2, + param1, + param2); + + task = register_pid(pid, comm); + task2 = register_pid(pid2, comm2); + + if (!strcmp(func_str, "update_curr")) { + dprintk("%Ld: task %ld runs for %ld nsecs\n", + timestamp, task->nr, param1); + add_sched_event_run(task, timestamp, param1); + } else if (!strcmp(func_str, "try_to_wake_up")) { + dprintk("%Ld: task %ld wakes up task %ld\n", + timestamp, task->nr, task2->nr); + add_sched_event_wakeup(task, timestamp, task2); + } else if (!strcmp(func_str, "deactivate_task")) { + dprintk("%Ld: task %ld goes to sleep (uninterruptible: %ld)\n", + 
timestamp, task->nr, param1); + add_sched_event_sleep(task, timestamp, param1); + } +} + +static void print_task_traces(void) +{ + struct task_desc *task; + unsigned long i; + + for (i = 0; i < nr_tasks; i++) { + task = tasks[i]; + printk("task %6ld (%20s:%10ld), nr_events: %ld\n", + task->nr, task->comm, task->pid, task->nr_events); + } +} + +static void add_cross_task_wakeups(void) +{ + struct task_desc *task1, *task2; + unsigned long i, j; + + for (i = 0; i < nr_tasks; i++) { + task1 = tasks[i]; + j = i + 1; + if (j == nr_tasks) + j = 0; + task2 = tasks[j]; + add_sched_event_wakeup(task1, 0, task2); + } +} + +static void +process_sched_event(struct task_desc *this_task, struct sched_event *event) +{ + int ret = 0; + nsec_t now; + long long delta; + + now = get_nsecs(); + delta = start_time + event->timestamp - now; + + dprintk("task %ld, event #%ld, %Ld, delta: %.3f (%Ld)\n", + this_task->nr, event->nr, event->timestamp, + (double)delta/1e6, delta); + + if (0 && delta > 0) { + dprintk("%.3f: task %ld FIX %.3f\n", + (double)event->timestamp/1e6, + this_task->nr, + (double)delta/1e6); + sleep_nsecs(start_time + event->timestamp - now); + nr_sleep_corrections++; + } + + switch (event->type) { + case SCHED_EVENT_RUN: + dprintk("%.3f: task %ld RUN for %.3f\n", + (double)event->timestamp/1e6, + this_task->nr, + (double)event->duration/1e6); + burn_nsecs(event->duration); + break; + case SCHED_EVENT_SLEEP: + dprintk("%.3f: task %ld %s SLEEP\n", + (double)event->timestamp/1e6, + this_task->nr, event->wait_sem ? "" : "SKIP"); + if (event->wait_sem) + ret = sem_wait(event->wait_sem); + BUG_ON(ret); + break; + case SCHED_EVENT_WAKEUP: + dprintk("%.3f: task %ld WAKEUP => task %ld\n", + (double)event->timestamp/1e6, + this_task->nr, + event->wakee->nr); + if (event->wait_sem) + ret = sem_post(event->wait_sem); + BUG_ON(ret); + break; + default: + BUG_ON(1); + } +} + +static nsec_t get_cpu_usage_nsec_parent(void) +{ + struct rusage ru; + nsec_t sum; + int err; + + err = getrusage(RUSAGE_SELF, &ru); + BUG_ON(err); + + sum = ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3; + sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3; + + return sum; +} + +static nsec_t get_cpu_usage_nsec_self(void) +{ + char filename [] = "/proc/1234567890/sched"; + unsigned long msecs, nsecs; + char *line = NULL; + nsec_t total = 0; + size_t len = 0; + ssize_t chars; + FILE *file; + int ret; + + sprintf(filename, "/proc/%d/sched", getpid()); + file = fopen(filename, "r"); + BUG_ON(!file); + + while ((chars = getline(&line, &len, file)) != -1) { + dprintk("got line with length %zu :\n", chars); + dprintk("%s", line); + ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n", + &msecs, &nsecs); + if (ret == 2) { + total = msecs*1e6 + nsecs; + dprintk("total: (%ld.%06ld) %Ld\n", + msecs, nsecs, total); + break; + } + } + if (line) + free(line); + fclose(file); + + return total; +} + +static void *thread_func(void *ctx) +{ + struct task_desc *this_task = ctx; + nsec_t cpu_usage_0, cpu_usage_1; + unsigned long i, ret; + char comm2[22]; + + dprintk("task %ld started up.\n", this_task->nr); + sprintf(comm2, ":%s", this_task->comm); + prctl(PR_SET_NAME, comm2); + +again: + ret = sem_post(&this_task->ready_for_work); + BUG_ON(ret); + D(); + ret = pthread_mutex_lock(&start_work_mutex); + BUG_ON(ret); + ret = pthread_mutex_unlock(&start_work_mutex); + BUG_ON(ret); + D(); + + cpu_usage_0 = get_cpu_usage_nsec_self(); + + for (i = 0; i < this_task->nr_events; i++) { + this_task->curr_event = i; + process_sched_event(this_task, 
this_task->events[i]); + } + + cpu_usage_1 = get_cpu_usage_nsec_self(); + this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; + + dprintk("task %ld cpu usage: %0.3f msecs\n", + this_task->nr, (double)this_task->cpu_usage / 1e6); + + D(); + ret = sem_post(&this_task->work_done_sem); + BUG_ON(ret); + D(); + + ret = pthread_mutex_lock(&work_done_wait_mutex); + BUG_ON(ret); + ret = pthread_mutex_unlock(&work_done_wait_mutex); + BUG_ON(ret); + D(); + + goto again; +} + +static void create_tasks(void) +{ + struct task_desc *task; + pthread_attr_t attr; + unsigned long i; + int err; + + err = pthread_attr_init(&attr); + BUG_ON(err); + err = pthread_attr_setstacksize(&attr, (size_t)(16*1024)); + BUG_ON(err); + err = pthread_mutex_lock(&start_work_mutex); + BUG_ON(err); + err = pthread_mutex_lock(&work_done_wait_mutex); + BUG_ON(err); + for (i = 0; i < nr_tasks; i++) { + task = tasks[i]; + sem_init(&task->sleep_sem, 0, 0); + sem_init(&task->ready_for_work, 0, 0); + sem_init(&task->work_done_sem, 0, 0); + task->curr_event = 0; + err = pthread_create(&task->thread, &attr, thread_func, task); + BUG_ON(err); + } +} + +static nsec_t cpu_usage; +static nsec_t runavg_cpu_usage; +static nsec_t parent_cpu_usage; +static nsec_t runavg_parent_cpu_usage; + +static void wait_for_tasks(void) +{ + nsec_t cpu_usage_0, cpu_usage_1; + struct task_desc *task; + unsigned long i, ret; + + DP(); + start_time = get_nsecs(); + DP(); + cpu_usage = 0; + pthread_mutex_unlock(&work_done_wait_mutex); + + for (i = 0; i < nr_tasks; i++) { + task = tasks[i]; + ret = sem_wait(&task->ready_for_work); + BUG_ON(ret); + sem_init(&task->ready_for_work, 0, 0); + } + ret = pthread_mutex_lock(&work_done_wait_mutex); + BUG_ON(ret); + + cpu_usage_0 = get_cpu_usage_nsec_parent(); + + pthread_mutex_unlock(&start_work_mutex); + +#if 0 + for (i = 0; i < nr_tasks; i++) { + unsigned long missed; + + task = tasks[i]; + while (task->curr_event + 1 < task->nr_events) { + dprintk("parent waiting for %ld (%ld != %ld)\n", + i, task->curr_event, task->nr_events); + sleep_nsecs(100000000); + } + missed = task->nr_events - 1 - task->curr_event; + if (missed) + printk("task %ld missed events: %ld\n", i, missed); + ret = sem_post(&task->sleep_sem); + BUG_ON(ret); + } +#endif + DP(); + for (i = 0; i < nr_tasks; i++) { + task = tasks[i]; + ret = sem_wait(&task->work_done_sem); + BUG_ON(ret); + sem_init(&task->work_done_sem, 0, 0); + cpu_usage += task->cpu_usage; + task->cpu_usage = 0; + } + + cpu_usage_1 = get_cpu_usage_nsec_parent(); + if (!runavg_cpu_usage) + runavg_cpu_usage = cpu_usage; + runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10; + + parent_cpu_usage = cpu_usage_1 - cpu_usage_0; + if (!runavg_parent_cpu_usage) + runavg_parent_cpu_usage = parent_cpu_usage; + runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 + + parent_cpu_usage)/10; + + ret = pthread_mutex_lock(&start_work_mutex); + BUG_ON(ret); + + for (i = 0; i < nr_tasks; i++) { + task = tasks[i]; + sem_init(&task->sleep_sem, 0, 0); + task->curr_event = 0; + } +} + +static int __cmd_sched(void); + +static void parse_trace(void) +{ + __cmd_sched(); + + printk("nr_run_events: %ld\n", nr_run_events); + printk("nr_sleep_events: %ld\n", nr_sleep_events); + printk("nr_wakeup_events: %ld\n", nr_wakeup_events); + + if (targetless_wakeups) + printk("target-less wakeups: %ld\n", targetless_wakeups); + if (multitarget_wakeups) + printk("multi-target wakeups: %ld\n", multitarget_wakeups); + if (nr_run_events_optimized) + printk("run events optimized: %ld\n", + nr_run_events_optimized); +} + +static 
unsigned long nr_runs; +static nsec_t sum_runtime; +static nsec_t sum_fluct; +static nsec_t run_avg; + +static void run_one_test(void) +{ + nsec_t T0, T1, delta, avg_delta, fluct, std_dev; + + T0 = get_nsecs(); + wait_for_tasks(); + T1 = get_nsecs(); + + delta = T1 - T0; + sum_runtime += delta; + nr_runs++; + + avg_delta = sum_runtime / nr_runs; + if (delta < avg_delta) + fluct = avg_delta - delta; + else + fluct = delta - avg_delta; + sum_fluct += fluct; + std_dev = sum_fluct / nr_runs / sqrt(nr_runs); + if (!run_avg) + run_avg = delta; + run_avg = (run_avg*9 + delta)/10; + + printk("#%-3ld: %0.3f, ", + nr_runs, (double)delta/1000000.0); + +#if 0 + printk("%0.2f +- %0.2f, ", + (double)avg_delta/1e6, (double)std_dev/1e6); +#endif + printk("ravg: %0.2f, ", + (double)run_avg/1e6); + + printk("cpu: %0.2f / %0.2f", + (double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6); + +#if 0 + /* + * rusage statistics done by the parent, these are less + * accurate than the sum_exec_runtime based statistics: + */ + printk(" [%0.2f / %0.2f]", + (double)parent_cpu_usage/1e6, + (double)runavg_parent_cpu_usage/1e6); +#endif + + printk("\n"); + + if (nr_sleep_corrections) + printk(" (%ld sleep corrections)\n", nr_sleep_corrections); + nr_sleep_corrections = 0; +} + +static void test_calibrations(void) +{ + nsec_t T0, T1; + + T0 = get_nsecs(); + burn_nsecs(1e6); + T1 = get_nsecs(); + + printk("the run test took %Ld nsecs\n", T1-T0); + + T0 = get_nsecs(); + sleep_nsecs(1e6); + T1 = get_nsecs(); + + printk("the sleep test took %Ld nsecs\n", T1-T0); +} + static int process_comm_event(event_t *event, unsigned long offset, unsigned long head) { @@ -50,6 +798,46 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) return 0; } +static void process_sched_wakeup_event(struct event *event, + int cpu __used, u64 timestamp __used, struct thread *thread __used) +{ + printf("sched_wakeup event %p\n", event); +} + +static void process_sched_switch_event(struct event *event, + int cpu __used, u64 timestamp __used, struct thread *thread __used) +{ + printf("sched_switch event %p\n", event); +} + +static void +process_raw_event(event_t *raw_event, void *more_data, + int cpu, u64 timestamp, struct thread *thread) +{ + struct { + u32 size; + char data[0]; + } *raw = more_data; + struct event *event; + int type; + + type = trace_parse_common_type(raw->data); + event = trace_find_event(type); + + /* + * FIXME: better resolve from pid from the struct trace_entry + * field, although it should be the same than this perf + * event pid + */ + printf("id %d, type: %d, event: %s\n", + raw_event->header.type, type, event->name); + + if (!strcmp(event->name, "sched_switch")) + process_sched_switch_event(event, cpu, timestamp, thread); + if (!strcmp(event->name, "sched_wakeup")) + process_sched_wakeup_event(event, cpu, timestamp, thread); +} + static int process_sample_event(event_t *event, unsigned long offset, unsigned long head) { @@ -122,20 +910,8 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... 
dso: [hypervisor]\n");
}
- if (sample_type & PERF_SAMPLE_RAW) {
- struct {
- u32 size;
- char data[0];
- } *raw = more_data;
-
- /*
- * FIXME: better resolve from pid from the struct trace_entry
- * field, although it should be the same than this perf
- * event pid
- */
- print_event(cpu, raw->data, raw->size, timestamp, thread->comm);
- }
- total += period;
+ if (sample_type & PERF_SAMPLE_RAW)
+ process_raw_event(event, more_data, cpu, timestamp, thread);
return 0;
}
@@ -277,6 +1053,8 @@ static const struct option options[] = {
int cmd_sched(int argc, const char **argv, const char *prefix __used)
{
+ long nr_iterations = LONG_MAX, i;
+
symbol__init();
page_size = getpagesize();
@@ -293,5 +1071,19 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
setup_pager();
- return __cmd_sched();
+ calibrate_run_measurement_overhead();
+ calibrate_sleep_measurement_overhead();
+
+ test_calibrations();
+
+ parse_trace();
+ print_task_traces();
+ add_cross_task_wakeups();
+
+ create_tasks();
+ printk("------------------------------------------------------------\n");
+ for (i = 0; i < nr_iterations; i++)
+ run_one_test();
+
+ return 0;
}
--
cgit v1.2.3

From fbf9482911825f965829567aea8acff3bbc5279c Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 11 Sep 2009 12:12:54 +0200
Subject: perf sched: Implement the scheduling workload replay engine

Integrate the schedbench.c bits with the raw trace events that we get
from the perf machinery, and activate the workload replayer/simulator.

Example of a captured 'make -j' workload:

$ perf sched

  run measurement overhead: 90 nsecs
  sleep measurement overhead: 2724743 nsecs
  the run test took 1000081 nsecs
  the sleep test took 2981111 nsecs
  version = 0.5
  ...
  nr_run_events: 70
  nr_sleep_events: 66
  nr_wakeup_events: 9
  target-less wakeups: 71
  multi-target wakeups: 47
  run events optimized: 139
  task 0 ( perf: 6607), nr_events: 2
  task 1 ( perf: 6608), nr_events: 6
  task 2 ( : 0), nr_events: 1
  task 3 ( make: 6609), nr_events: 5
  task 4 ( sh: 6610), nr_events: 4
  task 5 ( make: 6611), nr_events: 6
  task 6 ( sh: 6612), nr_events: 4
  task 7 ( make: 6613), nr_events: 5
  task 8 ( migration/11: 25), nr_events: 1
  task 9 ( migration/13: 29), nr_events: 1
  task 10 ( migration/15: 33), nr_events: 1
  task 11 ( migration/9: 21), nr_events: 1
  task 12 ( sh: 6614), nr_events: 4
  task 13 ( make: 6615), nr_events: 5
  task 14 ( sh: 6616), nr_events: 4
  task 15 ( make: 6617), nr_events: 7
  task 16 ( migration/3: 9), nr_events: 1
  task 17 ( migration/5: 13), nr_events: 1
  task 18 ( migration/7: 17), nr_events: 1
  task 19 ( migration/1: 5), nr_events: 1
  task 20 ( sh: 6618), nr_events: 4
  task 21 ( make: 6619), nr_events: 5
  task 22 ( sh: 6620), nr_events: 4
  task 23 ( make: 6621), nr_events: 10
  task 24 ( sh: 6623), nr_events: 3
  task 25 ( gcc: 6624), nr_events: 4
  task 26 ( gcc: 6625), nr_events: 4
  task 27 ( gcc: 6626), nr_events: 5
  task 28 ( collect2: 6627), nr_events: 5
  task 29 ( sh: 6622), nr_events: 1
  task 30 ( make: 6628), nr_events: 7
  task 31 ( sh: 6630), nr_events: 4
  task 32 ( gcc: 6631), nr_events: 4
  task 33 ( sh: 6629), nr_events: 1
  task 34 ( gcc: 6632), nr_events: 4
  task 35 ( gcc: 6633), nr_events: 4
  task 36 ( collect2: 6634), nr_events: 4
  task 37 ( make: 6635), nr_events: 8
  task 38 ( sh: 6637), nr_events: 4
  task 39 ( sh: 6636), nr_events: 1
  task 40 ( gcc: 6638), nr_events: 4
  task 41 ( gcc: 6639), nr_events: 4
  task 42 ( gcc: 6640), nr_events: 4
  task 43 ( collect2: 6641), nr_events: 4
  task 44 ( make: 6642), nr_events: 6
  task 45 ( sh: 6643), nr_events: 5
  task 46 ( sh: 6644), nr_events: 3
  task 47 ( sh: 6645), nr_events: 4
  task 48 ( make: 6646), nr_events: 6
  task 49 ( sh: 6647), nr_events: 3
  task 50 ( make: 6648), nr_events: 5
  task 51 ( sh: 6649), nr_events: 5
  task 52 ( sh: 6650), nr_events: 6
  task 53 ( make: 6651), nr_events: 4
  task 54 ( make: 6652), nr_events: 5
  task 55 ( make: 6653), nr_events: 4
  task 56 ( make: 6654), nr_events: 4
  task 57 ( make: 6655), nr_events: 5
  task 58 ( sh: 6656), nr_events: 4
  task 59 ( gcc: 6657), nr_events: 9
  task 60 ( ksoftirqd/3: 10), nr_events: 1
  task 61 ( gcc: 6658), nr_events: 4
  task 62 ( make: 6659), nr_events: 5
  task 63 ( sh: 6660), nr_events: 3
  task 64 ( gcc: 6661), nr_events: 5
  task 65 ( collect2: 6662), nr_events: 4
  ------------------------------------------------------------
  #1 : 256.745, ravg: 256.74, cpu: 0.00 / 0.00
  #2 : 439.372, ravg: 275.01, cpu: 0.00 / 0.00
  #3 : 411.971, ravg: 288.70, cpu: 0.00 / 0.00
  #4 : 385.500, ravg: 298.38, cpu: 0.00 / 0.00
  #5 : 366.526, ravg: 305.20, cpu: 0.00 / 0.00
  #6 : 381.281, ravg: 312.81, cpu: 0.00 / 0.00
  #7 : 410.756, ravg: 322.60, cpu: 0.00 / 0.00
  #8 : 368.009, ravg: 327.14, cpu: 0.00 / 0.00
  #9 : 408.098, ravg: 335.24, cpu: 0.00 / 0.00
  #10 : 368.582, ravg: 338.57, cpu: 0.00 / 0.00

I.e. we successfully analyzed the trace, replayed it via real threads
and measured the replayed workload's scheduling properties.

This is how it looked in 'top' output:

  PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
 7164 mingo 20 0 1434m 8080 888 R 57.0 0.1 0:02.04 :perf
 7165 mingo 20 0 1434m 8080 888 R 41.8 0.1 0:01.52 :perf
 7228 mingo 20 0 1434m 8080 888 R 39.8 0.1 0:01.44 :gcc
 7225 mingo 20 0 1434m 8080 888 R 33.8 0.1 0:01.26 :gcc
 7202 mingo 20 0 1434m 8080 888 R 31.2 0.1 0:01.16 :sh
 7222 mingo 20 0 1434m 8080 888 R 25.2 0.1 0:00.96 :sh
 7211 mingo 20 0 1434m 8080 888 R 21.9 0.1 0:00.82 :sh
 7213 mingo 20 0 1434m 8080 888 D 19.2 0.1 0:00.74 :sh
 7194 mingo 20 0 1434m 8080 888 D 18.6 0.1 0:00.72 :make

There are still various kinks in it - more patches to come.
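What makes the parallel replay work is the two-mutex "gate" imported with
schedbench.c above: create_tasks() pre-locks start_work_mutex, every worker
thread parks on a lock/unlock pair of that mutex, and wait_for_tasks()
releases the whole set at once by dropping the lock. A minimal,
self-contained sketch of just that pattern (the worker count and the names
here are illustrative, not the perf code itself):

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

#define NR_WORKERS 4

static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
static sem_t ready_sem;

static void *worker(void *arg)
{
	long nr = (long)arg;

	/* Tell the parent this thread is parked at the gate: */
	sem_post(&ready_sem);

	/* The gate itself: blocks until the parent drops the mutex. */
	pthread_mutex_lock(&start_work_mutex);
	pthread_mutex_unlock(&start_work_mutex);

	printf("worker %ld released\n", nr);	/* replay the events here */
	return NULL;
}

int main(void)
{
	pthread_t threads[NR_WORKERS];
	long i;
	int err;

	sem_init(&ready_sem, 0, 0);
	err = pthread_mutex_lock(&start_work_mutex);	/* close the gate */
	assert(!err);

	for (i = 0; i < NR_WORKERS; i++) {
		err = pthread_create(&threads[i], NULL, worker, (void *)i);
		assert(!err);
	}

	for (i = 0; i < NR_WORKERS; i++)	/* wait until all are parked */
		sem_wait(&ready_sem);

	pthread_mutex_unlock(&start_work_mutex);	/* open the gate */

	for (i = 0; i < NR_WORKERS; i++)
		pthread_join(threads[i], NULL);

	return 0;
}

The real code adds a second gate, work_done_wait_mutex, so the parent can
hold the workers again at the end of each iteration and rerun the recorded
trace nr_iterations times.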
Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 152 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 134 insertions(+), 18 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index c66e6a32137..6ec4f51d536 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -57,7 +57,7 @@ static u64 sample_type; #define BUG_ON(x) assert(!(x)) -#define DEBUG 1 +#define DEBUG 0 typedef unsigned long long nsec_t; @@ -238,15 +238,14 @@ static struct sched_event *last_event(struct task_desc *task) } static void -add_sched_event_run(struct task_desc *task, nsec_t timestamp, - unsigned long duration) +add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration) { struct sched_event *event, *curr_event = last_event(task); /* - * optimize an existing RUN event by merging this one - * to it: - */ + * optimize an existing RUN event by merging this one + * to it: + */ if (curr_event && curr_event->type == SCHED_EVENT_RUN) { nr_run_events_optimized++; curr_event->duration += duration; @@ -376,7 +375,7 @@ void parse_line(char *line) dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n", comm, pid, - timestamp, + timestamp, func_str, comm2, pid2, @@ -429,7 +428,7 @@ static void add_cross_task_wakeups(void) } static void -process_sched_event(struct task_desc *this_task, struct sched_event *event) +process_sched_event(struct task_desc *this_task __used, struct sched_event *event) { int ret = 0; nsec_t now; @@ -744,9 +743,9 @@ static void run_one_test(void) #if 0 /* - * rusage statistics done by the parent, these are less - * accurate than the sum_exec_runtime based statistics: - */ + * rusage statistics done by the parent, these are less + * accurate than the sum_exec_runtime based statistics: + */ printk(" [%0.2f / %0.2f]", (double)parent_cpu_usage/1e6, (double)runavg_parent_cpu_usage/1e6); @@ -798,16 +797,128 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) return 0; } -static void process_sched_wakeup_event(struct event *event, +struct trace_wakeup_event { + u32 size; + + u16 common_type; + u8 common_flags; + u8 common_preempt_count; + u32 common_pid; + u32 common_tgid; + + char comm[16]; + u32 pid; + + u32 prio; + u32 success; + u32 cpu; +}; + +static void +process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { + struct task_desc *waker, *wakee; + printf("sched_wakeup event %p\n", event); + + printf(" ... 
pid %d woke up %s/%d\n", + wakeup_event->common_pid, + wakeup_event->comm, + wakeup_event->pid); + + waker = register_pid(wakeup_event->common_pid, ""); + wakee = register_pid(wakeup_event->pid, wakeup_event->comm); + + add_sched_event_wakeup(waker, timestamp, wakee); } -static void process_sched_switch_event(struct event *event, +struct trace_switch_event { + u32 size; + + u16 common_type; + u8 common_flags; + u8 common_preempt_count; + u32 common_pid; + u32 common_tgid; + + char prev_comm[16]; + u32 prev_pid; + u32 prev_prio; + u64 prev_state; + char next_comm[16]; + u32 next_pid; + u32 next_prio; +}; + +#define MAX_CPUS 4096 + +unsigned long cpu_last_switched[MAX_CPUS]; + +static void +process_sched_switch_event(struct trace_switch_event *switch_event, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { + struct task_desc *prev, *next; + u64 timestamp0; + s64 delta; + printf("sched_switch event %p\n", event); + if (cpu >= MAX_CPUS || cpu < 0) + return; + + timestamp0 = cpu_last_switched[cpu]; + if (timestamp0) + delta = timestamp - timestamp0; + else + delta = 0; + + if (delta < 0) + die("hm, delta: %Ld < 0 ?\n", delta); + + printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n", + switch_event->prev_comm, switch_event->prev_pid, + switch_event->next_comm, switch_event->next_pid, + delta); + + prev = register_pid(switch_event->prev_pid, switch_event->prev_comm); + next = register_pid(switch_event->next_pid, switch_event->next_comm); + + cpu_last_switched[cpu] = timestamp; + + add_sched_event_run(prev, timestamp, delta); +} + +struct trace_fork_event { + u32 size; + + u16 common_type; + u8 common_flags; + u8 common_preempt_count; + u32 common_pid; + u32 common_tgid; + + char parent_comm[16]; + u32 parent_pid; + char child_comm[16]; + u32 child_pid; +}; + +static void +process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event, + int cpu __used, u64 timestamp __used, struct thread *thread __used) +{ + printf("sched_fork event %p\n", event); + printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid); + printf("... 
child: %s/%d\n", fork_event->child_comm, fork_event->child_pid); + register_pid(fork_event->parent_pid, fork_event->parent_comm); + register_pid(fork_event->child_pid, fork_event->child_comm); +} + +static void process_sched_exit_event(struct event *event, + int cpu __used, u64 timestamp __used, struct thread *thread __used) +{ + printf("sched_exit event %p\n", event); } static void @@ -833,9 +944,15 @@ process_raw_event(event_t *raw_event, void *more_data, raw_event->header.type, type, event->name); if (!strcmp(event->name, "sched_switch")) - process_sched_switch_event(event, cpu, timestamp, thread); + process_sched_switch_event(more_data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup")) - process_sched_wakeup_event(event, cpu, timestamp, thread); + process_sched_wakeup_event(more_data, event, cpu, timestamp, thread); + if (!strcmp(event->name, "sched_wakeup_new")) + process_sched_wakeup_event(more_data, event, cpu, timestamp, thread); + if (!strcmp(event->name, "sched_process_fork")) + process_sched_fork_event(more_data, event, cpu, timestamp, thread); + if (!strcmp(event->name, "sched_process_exit")) + process_sched_exit_event(event, cpu, timestamp, thread); } static int @@ -1053,7 +1170,7 @@ static const struct option options[] = { int cmd_sched(int argc, const char **argv, const char *prefix __used) { - long nr_iterations = LONG_MAX, i; + long nr_iterations = 10, i; symbol__init(); page_size = getpagesize(); @@ -1068,8 +1185,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) usage_with_options(annotate_usage, options); } - - setup_pager(); +// setup_pager(); calibrate_run_measurement_overhead(); calibrate_sleep_measurement_overhead(); -- cgit v1.2.3 From ad236fd23b6d6372dcacd549983cce051d2ccff6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Sep 2009 12:12:54 +0200 Subject: perf sched: Tighten up the code Various small cleanups - removal of debug printks and dead functions, etc. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 236 +++++++++------------------------------------ 1 file changed, 47 insertions(+), 189 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 6ec4f51d536..de93a260452 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -61,29 +61,6 @@ static u64 sample_type; typedef unsigned long long nsec_t; -#define printk(x...) do { printf(x); fflush(stdout); } while (0) - -nsec_t prev_printk; - -#define __dprintk(x,y...) do { \ - nsec_t __now = get_nsecs(), __delta = __now - prev_printk; \ - \ - prev_printk = __now; \ - \ - printf("%.3f [%Ld] [%.3f]: " x, (double)__now/1e6, __now, (double)__delta/1e6, y);\ -} while (0) - -#if !DEBUG -# define dprintk(x...) do { } while (0) -#else -# define dprintk(x...) 
__dprintk(x) -#endif - -#define __DP() __dprintk("parent: line %d\n", __LINE__) -#define DP() dprintk("parent: line %d\n", __LINE__) -#define D() dprintk("task %ld: line %d\n", this_task->nr, __LINE__) - - static nsec_t run_measurement_overhead; static nsec_t sleep_measurement_overhead; @@ -129,7 +106,7 @@ static void calibrate_run_measurement_overhead(void) } run_measurement_overhead = min_delta; - printk("run measurement overhead: %Ld nsecs\n", min_delta); + printf("run measurement overhead: %Ld nsecs\n", min_delta); } static void calibrate_sleep_measurement_overhead(void) @@ -147,7 +124,7 @@ static void calibrate_sleep_measurement_overhead(void) min_delta -= 10000; sleep_measurement_overhead = min_delta; - printk("sleep measurement overhead: %Ld nsecs\n", min_delta); + printf("sleep measurement overhead: %Ld nsecs\n", min_delta); } #define COMM_LEN 20 @@ -293,7 +270,7 @@ add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp, static void add_sched_event_sleep(struct task_desc *task, nsec_t timestamp, - unsigned long uninterruptible __used) + u64 task_state __used) { struct sched_event *event = get_new_event(task, timestamp); @@ -329,77 +306,13 @@ static struct task_desc *register_pid(unsigned long pid, const char *comm) BUG_ON(!tasks); tasks[task->nr] = task; - printk("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm); + if (verbose) + printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm); return task; } -static int first_trace_line = 1; - -static nsec_t first_timestamp; -static nsec_t prev_timestamp; - -void parse_line(char *line); - -void parse_line(char *line) -{ - unsigned long param1 = 0, param2 = 0; - char comm[COMM_LEN], comm2[COMM_LEN]; - unsigned long pid, pid2, timestamp0; - struct task_desc *task, *task2; - char func_str[SYM_LEN]; - nsec_t timestamp; - int ret; - - //" 0 0D.s3 0us+: try_to_wake_up (1 0)" - ret = sscanf(line, "%20s %5ld %*s %ldus%*c:" - " %128s <%20s %ld> (%ld %ld)\n", - comm, &pid, ×tamp0, - func_str, comm2, &pid2, ¶m1, ¶m2); - dprintk("ret: %d\n", ret); - if (ret != 8) - return; - - timestamp = timestamp0 * 1000LL; - - if (first_trace_line) { - first_trace_line = 0; - first_timestamp = timestamp; - } - - timestamp -= first_timestamp; - BUG_ON(timestamp < prev_timestamp); - prev_timestamp = timestamp; - - dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n", - comm, - pid, - timestamp, - func_str, - comm2, - pid2, - param1, - param2); - - task = register_pid(pid, comm); - task2 = register_pid(pid2, comm2); - - if (!strcmp(func_str, "update_curr")) { - dprintk("%Ld: task %ld runs for %ld nsecs\n", - timestamp, task->nr, param1); - add_sched_event_run(task, timestamp, param1); - } else if (!strcmp(func_str, "try_to_wake_up")) { - dprintk("%Ld: task %ld wakes up task %ld\n", - timestamp, task->nr, task2->nr); - add_sched_event_wakeup(task, timestamp, task2); - } else if (!strcmp(func_str, "deactivate_task")) { - dprintk("%Ld: task %ld goes to sleep (uninterruptible: %ld)\n", - timestamp, task->nr, param1); - add_sched_event_sleep(task, timestamp, param1); - } -} - static void print_task_traces(void) { struct task_desc *task; @@ -407,7 +320,7 @@ static void print_task_traces(void) for (i = 0; i < nr_tasks; i++) { task = tasks[i]; - printk("task %6ld (%20s:%10ld), nr_events: %ld\n", + printf("task %6ld (%20s:%10ld), nr_events: %ld\n", task->nr, task->comm, task->pid, task->nr_events); } } @@ -437,40 +350,16 @@ process_sched_event(struct task_desc *this_task __used, struct sched_event *even now = get_nsecs(); delta = start_time 
+ event->timestamp - now; - dprintk("task %ld, event #%ld, %Ld, delta: %.3f (%Ld)\n", - this_task->nr, event->nr, event->timestamp, - (double)delta/1e6, delta); - - if (0 && delta > 0) { - dprintk("%.3f: task %ld FIX %.3f\n", - (double)event->timestamp/1e6, - this_task->nr, - (double)delta/1e6); - sleep_nsecs(start_time + event->timestamp - now); - nr_sleep_corrections++; - } - switch (event->type) { case SCHED_EVENT_RUN: - dprintk("%.3f: task %ld RUN for %.3f\n", - (double)event->timestamp/1e6, - this_task->nr, - (double)event->duration/1e6); burn_nsecs(event->duration); break; case SCHED_EVENT_SLEEP: - dprintk("%.3f: task %ld %s SLEEP\n", - (double)event->timestamp/1e6, - this_task->nr, event->wait_sem ? "" : "SKIP"); if (event->wait_sem) ret = sem_wait(event->wait_sem); BUG_ON(ret); break; case SCHED_EVENT_WAKEUP: - dprintk("%.3f: task %ld WAKEUP => task %ld\n", - (double)event->timestamp/1e6, - this_task->nr, - event->wakee->nr); if (event->wait_sem) ret = sem_post(event->wait_sem); BUG_ON(ret); @@ -511,14 +400,10 @@ static nsec_t get_cpu_usage_nsec_self(void) BUG_ON(!file); while ((chars = getline(&line, &len, file)) != -1) { - dprintk("got line with length %zu :\n", chars); - dprintk("%s", line); ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n", &msecs, &nsecs); if (ret == 2) { total = msecs*1e6 + nsecs; - dprintk("total: (%ld.%06ld) %Ld\n", - msecs, nsecs, total); break; } } @@ -536,19 +421,16 @@ static void *thread_func(void *ctx) unsigned long i, ret; char comm2[22]; - dprintk("task %ld started up.\n", this_task->nr); sprintf(comm2, ":%s", this_task->comm); prctl(PR_SET_NAME, comm2); again: ret = sem_post(&this_task->ready_for_work); BUG_ON(ret); - D(); ret = pthread_mutex_lock(&start_work_mutex); BUG_ON(ret); ret = pthread_mutex_unlock(&start_work_mutex); BUG_ON(ret); - D(); cpu_usage_0 = get_cpu_usage_nsec_self(); @@ -560,19 +442,13 @@ again: cpu_usage_1 = get_cpu_usage_nsec_self(); this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; - dprintk("task %ld cpu usage: %0.3f msecs\n", - this_task->nr, (double)this_task->cpu_usage / 1e6); - - D(); ret = sem_post(&this_task->work_done_sem); BUG_ON(ret); - D(); ret = pthread_mutex_lock(&work_done_wait_mutex); BUG_ON(ret); ret = pthread_mutex_unlock(&work_done_wait_mutex); BUG_ON(ret); - D(); goto again; } @@ -614,9 +490,7 @@ static void wait_for_tasks(void) struct task_desc *task; unsigned long i, ret; - DP(); start_time = get_nsecs(); - DP(); cpu_usage = 0; pthread_mutex_unlock(&work_done_wait_mutex); @@ -633,24 +507,6 @@ static void wait_for_tasks(void) pthread_mutex_unlock(&start_work_mutex); -#if 0 - for (i = 0; i < nr_tasks; i++) { - unsigned long missed; - - task = tasks[i]; - while (task->curr_event + 1 < task->nr_events) { - dprintk("parent waiting for %ld (%ld != %ld)\n", - i, task->curr_event, task->nr_events); - sleep_nsecs(100000000); - } - missed = task->nr_events - 1 - task->curr_event; - if (missed) - printk("task %ld missed events: %ld\n", i, missed); - ret = sem_post(&task->sleep_sem); - BUG_ON(ret); - } -#endif - DP(); for (i = 0; i < nr_tasks; i++) { task = tasks[i]; ret = sem_wait(&task->work_done_sem); @@ -687,16 +543,16 @@ static void parse_trace(void) { __cmd_sched(); - printk("nr_run_events: %ld\n", nr_run_events); - printk("nr_sleep_events: %ld\n", nr_sleep_events); - printk("nr_wakeup_events: %ld\n", nr_wakeup_events); + printf("nr_run_events: %ld\n", nr_run_events); + printf("nr_sleep_events: %ld\n", nr_sleep_events); + printf("nr_wakeup_events: %ld\n", nr_wakeup_events); if (targetless_wakeups) - 
printk("target-less wakeups: %ld\n", targetless_wakeups); + printf("target-less wakeups: %ld\n", targetless_wakeups); if (multitarget_wakeups) - printk("multi-target wakeups: %ld\n", multitarget_wakeups); + printf("multi-target wakeups: %ld\n", multitarget_wakeups); if (nr_run_events_optimized) - printk("run events optimized: %ld\n", + printf("run events optimized: %ld\n", nr_run_events_optimized); } @@ -728,17 +584,17 @@ static void run_one_test(void) run_avg = delta; run_avg = (run_avg*9 + delta)/10; - printk("#%-3ld: %0.3f, ", + printf("#%-3ld: %0.3f, ", nr_runs, (double)delta/1000000.0); #if 0 - printk("%0.2f +- %0.2f, ", + printf("%0.2f +- %0.2f, ", (double)avg_delta/1e6, (double)std_dev/1e6); #endif - printk("ravg: %0.2f, ", + printf("ravg: %0.2f, ", (double)run_avg/1e6); - printk("cpu: %0.2f / %0.2f", + printf("cpu: %0.2f / %0.2f", (double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6); #if 0 @@ -746,15 +602,15 @@ static void run_one_test(void) * rusage statistics done by the parent, these are less * accurate than the sum_exec_runtime based statistics: */ - printk(" [%0.2f / %0.2f]", + printf(" [%0.2f / %0.2f]", (double)parent_cpu_usage/1e6, (double)runavg_parent_cpu_usage/1e6); #endif - printk("\n"); + printf("\n"); if (nr_sleep_corrections) - printk(" (%ld sleep corrections)\n", nr_sleep_corrections); + printf(" (%ld sleep corrections)\n", nr_sleep_corrections); nr_sleep_corrections = 0; } @@ -766,13 +622,13 @@ static void test_calibrations(void) burn_nsecs(1e6); T1 = get_nsecs(); - printk("the run test took %Ld nsecs\n", T1-T0); + printf("the run test took %Ld nsecs\n", T1-T0); T0 = get_nsecs(); sleep_nsecs(1e6); T1 = get_nsecs(); - printk("the sleep test took %Ld nsecs\n", T1-T0); + printf("the sleep test took %Ld nsecs\n", T1-T0); } static int @@ -820,12 +676,14 @@ process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event { struct task_desc *waker, *wakee; - printf("sched_wakeup event %p\n", event); + if (verbose) { + printf("sched_wakeup event %p\n", event); - printf(" ... pid %d woke up %s/%d\n", - wakeup_event->common_pid, - wakeup_event->comm, - wakeup_event->pid); + printf(" ... pid %d woke up %s/%d\n", + wakeup_event->common_pid, + wakeup_event->comm, + wakeup_event->pid); + } waker = register_pid(wakeup_event->common_pid, ""); wakee = register_pid(wakeup_event->pid, wakeup_event->comm); @@ -863,7 +721,9 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event u64 timestamp0; s64 delta; - printf("sched_switch event %p\n", event); + if (verbose) + printf("sched_switch event %p\n", event); + if (cpu >= MAX_CPUS || cpu < 0) return; @@ -876,10 +736,12 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event if (delta < 0) die("hm, delta: %Ld < 0 ?\n", delta); - printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n", - switch_event->prev_comm, switch_event->prev_pid, - switch_event->next_comm, switch_event->next_pid, - delta); + if (verbose) { + printf(" ... 
switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
+ switch_event->prev_comm, switch_event->prev_pid,
+ switch_event->next_comm, switch_event->next_pid,
+ delta);
+ }
prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
next = register_pid(switch_event->next_pid, switch_event->next_comm);
@@ -887,6 +749,7 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
cpu_last_switched[cpu] = timestamp;
add_sched_event_run(prev, timestamp, delta);
+ add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
}
struct trace_fork_event {
@@ -908,9 +771,11 @@ static void
process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event,
int cpu __used, u64 timestamp __used, struct thread *thread __used)
{
- printf("sched_fork event %p\n", event);
- printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
- printf("... child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+ if (verbose) {
+ printf("sched_fork event %p\n", event);
+ printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
+ printf("... child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+ }
register_pid(fork_event->parent_pid, fork_event->parent_comm);
register_pid(fork_event->child_pid, fork_event->child_comm);
}
@@ -918,11 +783,12 @@ process_sched_fork_event(struct trace_fork_event *fork_event, struct event *even
static void process_sched_exit_event(struct event *event,
int cpu __used, u64 timestamp __used, struct thread *thread __used)
{
- printf("sched_exit event %p\n", event);
+ if (verbose)
+ printf("sched_exit event %p\n", event);
}
static void
-process_raw_event(event_t *raw_event, void *more_data,
+process_raw_event(event_t *raw_event __used, void *more_data,
int cpu, u64 timestamp, struct thread *thread)
{
struct {
@@ -935,14 +801,6 @@ process_raw_event(event_t *raw_event, void *more_data,
type = trace_parse_common_type(raw->data);
event = trace_find_event(type);
- /*
- * FIXME: better resolve from pid from the struct trace_entry
- * field, although it should be the same than this perf
- * event pid
- */
- printf("id %d, type: %d, event: %s\n",
- raw_event->header.type, type, event->name);
-
if (!strcmp(event->name, "sched_switch"))
@@ -1053,7 +1055,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
add_cross_task_wakeups();
create_tasks();
- printk("------------------------------------------------------------\n");
+ printf("------------------------------------------------------------\n");
for (i = 0; i < nr_iterations; i++)
run_one_test();
--
cgit v1.2.3

From 46538818023e8ea94f656acfa1e38297e2df20e2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sat, 12 Sep 2009 02:43:45 +0200
Subject: perf sched: Fix bad event alignment

perf sched raises the following error when it meets a sched switch
event:

    perf: builtin-sched.c:286: register_pid: Assertion `!(pid >= 65536)' failed.
    Aborted

Currently in x86-64, the sched switch events have a hole in the
middle of the structure:

    u16 common_type;
    u8 common_flags;
    u8 common_preempt_count;
    u32 common_pid;
    u32 common_tgid;

    char prev_comm[16];
    u32 prev_pid;
    u32 prev_prio;

    <--- there

    u64 prev_state;
    char next_comm[16];
    u32 next_pid;
    u32 next_prio;

Gcc inserts a 4-byte hole there so that prev_state is u64-aligned.
And the events are exported to userspace with this hole.
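To make the hole concrete, here is a small compilable sketch of the two
layouts involved (the field types follow the commit message above; the
struct names are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Layout as the kernel records it: */
struct switch_payload {
	uint16_t common_type;		/* offset  0 */
	uint8_t  common_flags;		/* offset  2 */
	uint8_t  common_preempt_count;	/* offset  3 */
	uint32_t common_pid;		/* offset  4 */
	uint32_t common_tgid;		/* offset  8 */
	char     prev_comm[16];		/* offset 12 */
	uint32_t prev_pid;		/* offset 28 */
	uint32_t prev_prio;		/* offset 32 */
	/* 4-byte hole: prev_state must be 8-byte aligned */
	uint64_t prev_state;		/* offset 40 */
	char     next_comm[16];
	uint32_t next_pid;
	uint32_t next_prio;
};

/* The same fields with perf's u32 size prepended, as read in userspace: */
struct switch_with_size {
	uint32_t size;			/* offset  0 */
	uint16_t common_type;		/* offset  4 */
	uint8_t  common_flags;
	uint8_t  common_preempt_count;
	uint32_t common_pid;		/* offset  8 */
	uint32_t common_tgid;		/* offset 12 */
	char     prev_comm[16];		/* offset 16 */
	uint32_t prev_pid;		/* offset 32 */
	uint32_t prev_prio;		/* offset 36 */
	uint64_t prev_state;		/* offset 40: already aligned, no hole */
	char     next_comm[16];
	uint32_t next_pid;
	uint32_t next_prio;
};

int main(void)
{
	/*
	 * Prints "40 vs. 36": relative to the start of the payload, the
	 * kernel wrote prev_state 4 bytes later than the size-prefixed
	 * userspace struct expects it, so userspace reads the hole.
	 */
	printf("%zu vs. %zu\n",
	       offsetof(struct switch_payload, prev_state),
	       offsetof(struct switch_with_size, prev_state) - sizeof(uint32_t));
	return 0;
}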
But in userspace, from perf sched, we fetch it using a structure that has a new field in the beginning: u32 size. This is because our trace is exported with its size as a field. But now that we have this new field, the hole in the middle disappears because it makes prev_state becoming well aligned. And since we are using a pointer to the raw trace using this struct, instead of reading prev_state, we are reading the hole. We could fix it by keeping the size seperate from the struct but actually there a lot of other potential problems: some fields may be saved as long in a 64 bits system and later read as long in a 32 bits system. Also this direct cast doesn't care about the endianness differences between the host traced machine and the machine in which we do the post processing. So instead of using such dangerous direct casts, fetch the values using the trace parsing API that already takes care of all these problems. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 101 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 25 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index de93a260452..0215936696e 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -653,6 +653,30 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) return 0; } + +struct raw_event_sample { + u32 size; + char data[0]; +}; + +#define FILL_FIELD(ptr, field, event, data) \ + ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data) + +#define FILL_ARRAY(ptr, array, event, data) \ +do { \ + void *__array = raw_field_ptr(event, #array, data); \ + memcpy(ptr.array, __array, sizeof(ptr.array)); \ +} while(0) + +#define FILL_COMMON_FIELDS(ptr, event, data) \ +do { \ + FILL_FIELD(ptr, common_type, event, data); \ + FILL_FIELD(ptr, common_flags, event, data); \ + FILL_FIELD(ptr, common_preempt_count, event, data); \ + FILL_FIELD(ptr, common_pid, event, data); \ + FILL_FIELD(ptr, common_tgid, event, data); \ +} while (0) + struct trace_wakeup_event { u32 size; @@ -671,22 +695,32 @@ struct trace_wakeup_event { }; static void -process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event *event, +process_sched_wakeup_event(struct raw_event_sample *raw, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { struct task_desc *waker, *wakee; + struct trace_wakeup_event wakeup_event; + + FILL_COMMON_FIELDS(wakeup_event, event, raw->data); + + FILL_ARRAY(wakeup_event, comm, event, raw->data); + FILL_FIELD(wakeup_event, pid, event, raw->data); + FILL_FIELD(wakeup_event, prio, event, raw->data); + FILL_FIELD(wakeup_event, success, event, raw->data); + FILL_FIELD(wakeup_event, cpu, event, raw->data); + if (verbose) { printf("sched_wakeup event %p\n", event); printf(" ... 
pid %d woke up %s/%d\n", - wakeup_event->common_pid, - wakeup_event->comm, - wakeup_event->pid); + wakeup_event.common_pid, + wakeup_event.comm, + wakeup_event.pid); } - waker = register_pid(wakeup_event->common_pid, ""); - wakee = register_pid(wakeup_event->pid, wakeup_event->comm); + waker = register_pid(wakeup_event.common_pid, ""); + wakee = register_pid(wakeup_event.pid, wakeup_event.comm); add_sched_event_wakeup(waker, timestamp, wakee); } @@ -714,13 +748,24 @@ struct trace_switch_event { unsigned long cpu_last_switched[MAX_CPUS]; static void -process_sched_switch_event(struct trace_switch_event *switch_event, struct event *event, +process_sched_switch_event(struct raw_event_sample *raw, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { + struct trace_switch_event switch_event; struct task_desc *prev, *next; u64 timestamp0; s64 delta; + FILL_COMMON_FIELDS(switch_event, event, raw->data); + + FILL_ARRAY(switch_event, prev_comm, event, raw->data); + FILL_FIELD(switch_event, prev_pid, event, raw->data); + FILL_FIELD(switch_event, prev_prio, event, raw->data); + FILL_FIELD(switch_event, prev_state, event, raw->data); + FILL_ARRAY(switch_event, next_comm, event, raw->data); + FILL_FIELD(switch_event, next_pid, event, raw->data); + FILL_FIELD(switch_event, next_prio, event, raw->data); + if (verbose) printf("sched_switch event %p\n", event); @@ -738,18 +783,18 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event if (verbose) { printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n", - switch_event->prev_comm, switch_event->prev_pid, - switch_event->next_comm, switch_event->next_pid, + switch_event.prev_comm, switch_event.prev_pid, + switch_event.next_comm, switch_event.next_pid, delta); } - prev = register_pid(switch_event->prev_pid, switch_event->prev_comm); - next = register_pid(switch_event->next_pid, switch_event->next_comm); + prev = register_pid(switch_event.prev_pid, switch_event.prev_comm); + next = register_pid(switch_event.next_pid, switch_event.next_comm); cpu_last_switched[cpu] = timestamp; add_sched_event_run(prev, timestamp, delta); - add_sched_event_sleep(prev, timestamp, switch_event->prev_state); + add_sched_event_sleep(prev, timestamp, switch_event.prev_state); } struct trace_fork_event { @@ -768,16 +813,25 @@ struct trace_fork_event { }; static void -process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event, +process_sched_fork_event(struct raw_event_sample *raw, struct event *event, int cpu __used, u64 timestamp __used, struct thread *thread __used) { + struct trace_fork_event fork_event; + + FILL_COMMON_FIELDS(fork_event, event, raw->data); + + FILL_ARRAY(fork_event, parent_comm, event, raw->data); + FILL_FIELD(fork_event, parent_pid, event, raw->data); + FILL_ARRAY(fork_event, child_comm, event, raw->data); + FILL_FIELD(fork_event, child_pid, event, raw->data); + if (verbose) { printf("sched_fork event %p\n", event); - printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid); - printf("... child: %s/%d\n", fork_event->child_comm, fork_event->child_pid); + printf("... parent: %s/%d\n", fork_event.parent_comm, fork_event.parent_pid); + printf("... 
child: %s/%d\n", fork_event.child_comm, fork_event.child_pid); } - register_pid(fork_event->parent_pid, fork_event->parent_comm); - register_pid(fork_event->child_pid, fork_event->child_comm); + register_pid(fork_event.parent_pid, fork_event.parent_comm); + register_pid(fork_event.child_pid, fork_event.child_comm); } static void process_sched_exit_event(struct event *event, @@ -791,10 +845,7 @@ static void process_raw_event(event_t *raw_event __used, void *more_data, int cpu, u64 timestamp, struct thread *thread) { - struct { - u32 size; - char data[0]; - } *raw = more_data; + struct raw_event_sample *raw = more_data; struct event *event; int type; @@ -802,13 +853,13 @@ process_raw_event(event_t *raw_event __used, void *more_data, event = trace_find_event(type); if (!strcmp(event->name, "sched_switch")) - process_sched_switch_event(more_data, event, cpu, timestamp, thread); + process_sched_switch_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup")) - process_sched_wakeup_event(more_data, event, cpu, timestamp, thread); + process_sched_wakeup_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup_new")) - process_sched_wakeup_event(more_data, event, cpu, timestamp, thread); + process_sched_wakeup_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_process_fork")) - process_sched_fork_event(more_data, event, cpu, timestamp, thread); + process_sched_fork_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_process_exit")) process_sched_exit_event(event, cpu, timestamp, thread); } -- cgit v1.2.3 From 419ab0d6a959f41ec7fde807fe311aaafb05c3be Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 12 Sep 2009 03:59:01 +0200 Subject: perf sched: Make it easier to plug in new sub profilers Create a sched event structure of handlers in which various sched events reader can plug their own callbacks. This makes easier the addition of new perf sched sub commands. 
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 243 ++++++++++++++++++++++++++++++--------------- 1 file changed, 165 insertions(+), 78 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 0215936696e..756fe62eb04 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -14,6 +14,9 @@ #include "util/trace-event.h" #include + +#define MAX_CPUS 4096 + static char const *input_name = "perf.data"; static int input; static unsigned long page_size; @@ -27,6 +30,8 @@ static struct thread *last_match; static struct perf_header *header; static u64 sample_type; +static int replay_mode; + /* * Scheduler benchmarks @@ -677,6 +682,27 @@ do { \ FILL_FIELD(ptr, common_tgid, event, data); \ } while (0) + + +struct trace_switch_event { + u32 size; + + u16 common_type; + u8 common_flags; + u8 common_preempt_count; + u32 common_pid; + u32 common_tgid; + + char prev_comm[16]; + u32 prev_pid; + u32 prev_prio; + u64 prev_state; + char next_comm[16]; + u32 next_pid; + u32 next_prio; +}; + + struct trace_wakeup_event { u32 size; @@ -694,78 +720,79 @@ struct trace_wakeup_event { u32 cpu; }; -static void -process_sched_wakeup_event(struct raw_event_sample *raw, struct event *event, - int cpu __used, u64 timestamp __used, struct thread *thread __used) -{ - struct task_desc *waker, *wakee; - struct trace_wakeup_event wakeup_event; +struct trace_fork_event { + u32 size; - FILL_COMMON_FIELDS(wakeup_event, event, raw->data); + u16 common_type; + u8 common_flags; + u8 common_preempt_count; + u32 common_pid; + u32 common_tgid; + + char parent_comm[16]; + u32 parent_pid; + char child_comm[16]; + u32 child_pid; +}; + +struct trace_sched_handler { + void (*switch_event)(struct trace_switch_event *, + struct event *, + int cpu, + u64 timestamp, + struct thread *thread); + + void (*wakeup_event)(struct trace_wakeup_event *, + struct event *, + int cpu, + u64 timestamp, + struct thread *thread); + + void (*fork_event)(struct trace_fork_event *, + struct event *, + int cpu, + u64 timestamp, + struct thread *thread); +}; - FILL_ARRAY(wakeup_event, comm, event, raw->data); - FILL_FIELD(wakeup_event, pid, event, raw->data); - FILL_FIELD(wakeup_event, prio, event, raw->data); - FILL_FIELD(wakeup_event, success, event, raw->data); - FILL_FIELD(wakeup_event, cpu, event, raw->data); +static void +replay_wakeup_event(struct trace_wakeup_event *wakeup_event, + struct event *event, + int cpu __used, + u64 timestamp __used, + struct thread *thread __used) +{ + struct task_desc *waker, *wakee; if (verbose) { printf("sched_wakeup event %p\n", event); printf(" ... 
pid %d woke up %s/%d\n", - wakeup_event.common_pid, - wakeup_event.comm, - wakeup_event.pid); + wakeup_event->common_pid, + wakeup_event->comm, + wakeup_event->pid); } - waker = register_pid(wakeup_event.common_pid, ""); - wakee = register_pid(wakeup_event.pid, wakeup_event.comm); + waker = register_pid(wakeup_event->common_pid, ""); + wakee = register_pid(wakeup_event->pid, wakeup_event->comm); add_sched_event_wakeup(waker, timestamp, wakee); } -struct trace_switch_event { - u32 size; - - u16 common_type; - u8 common_flags; - u8 common_preempt_count; - u32 common_pid; - u32 common_tgid; - - char prev_comm[16]; - u32 prev_pid; - u32 prev_prio; - u64 prev_state; - char next_comm[16]; - u32 next_pid; - u32 next_prio; -}; - -#define MAX_CPUS 4096 - -unsigned long cpu_last_switched[MAX_CPUS]; +static unsigned long cpu_last_switched[MAX_CPUS]; static void -process_sched_switch_event(struct raw_event_sample *raw, struct event *event, - int cpu __used, u64 timestamp __used, struct thread *thread __used) +replay_switch_event(struct trace_switch_event *switch_event, + struct event *event, + int cpu, + u64 timestamp, + struct thread *thread __used) { - struct trace_switch_event switch_event; struct task_desc *prev, *next; u64 timestamp0; s64 delta; - FILL_COMMON_FIELDS(switch_event, event, raw->data); - - FILL_ARRAY(switch_event, prev_comm, event, raw->data); - FILL_FIELD(switch_event, prev_pid, event, raw->data); - FILL_FIELD(switch_event, prev_prio, event, raw->data); - FILL_FIELD(switch_event, prev_state, event, raw->data); - FILL_ARRAY(switch_event, next_comm, event, raw->data); - FILL_FIELD(switch_event, next_pid, event, raw->data); - FILL_FIELD(switch_event, next_prio, event, raw->data); - if (verbose) printf("sched_switch event %p\n", event); @@ -783,38 +810,94 @@ process_sched_switch_event(struct raw_event_sample *raw, struct event *event, if (verbose) { printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n", - switch_event.prev_comm, switch_event.prev_pid, - switch_event.next_comm, switch_event.next_pid, + switch_event->prev_comm, switch_event->prev_pid, + switch_event->next_comm, switch_event->next_pid, delta); } - prev = register_pid(switch_event.prev_pid, switch_event.prev_comm); - next = register_pid(switch_event.next_pid, switch_event.next_comm); + prev = register_pid(switch_event->prev_pid, switch_event->prev_comm); + next = register_pid(switch_event->next_pid, switch_event->next_comm); cpu_last_switched[cpu] = timestamp; add_sched_event_run(prev, timestamp, delta); - add_sched_event_sleep(prev, timestamp, switch_event.prev_state); + add_sched_event_sleep(prev, timestamp, switch_event->prev_state); } -struct trace_fork_event { - u32 size; - u16 common_type; - u8 common_flags; - u8 common_preempt_count; - u32 common_pid; - u32 common_tgid; +static void +replay_fork_event(struct trace_fork_event *fork_event, + struct event *event, + int cpu __used, + u64 timestamp __used, + struct thread *thread __used) +{ + if (verbose) { + printf("sched_fork event %p\n", event); + printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid); + printf("... 
child: %s/%d\n", fork_event->child_comm, fork_event->child_pid); + } + register_pid(fork_event->parent_pid, fork_event->parent_comm); + register_pid(fork_event->child_pid, fork_event->child_comm); +} - char parent_comm[16]; - u32 parent_pid; - char child_comm[16]; - u32 child_pid; +static struct trace_sched_handler replay_ops = { + .wakeup_event = replay_wakeup_event, + .switch_event = replay_switch_event, + .fork_event = replay_fork_event, }; + +static struct trace_sched_handler *trace_handler; + static void -process_sched_fork_event(struct raw_event_sample *raw, struct event *event, - int cpu __used, u64 timestamp __used, struct thread *thread __used) +process_sched_wakeup_event(struct raw_event_sample *raw, + struct event *event, + int cpu __used, + u64 timestamp __used, + struct thread *thread __used) +{ + struct trace_wakeup_event wakeup_event; + + FILL_COMMON_FIELDS(wakeup_event, event, raw->data); + + FILL_ARRAY(wakeup_event, comm, event, raw->data); + FILL_FIELD(wakeup_event, pid, event, raw->data); + FILL_FIELD(wakeup_event, prio, event, raw->data); + FILL_FIELD(wakeup_event, success, event, raw->data); + FILL_FIELD(wakeup_event, cpu, event, raw->data); + + trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread); +} + +static void +process_sched_switch_event(struct raw_event_sample *raw, + struct event *event, + int cpu __used, + u64 timestamp __used, + struct thread *thread __used) +{ + struct trace_switch_event switch_event; + + FILL_COMMON_FIELDS(switch_event, event, raw->data); + + FILL_ARRAY(switch_event, prev_comm, event, raw->data); + FILL_FIELD(switch_event, prev_pid, event, raw->data); + FILL_FIELD(switch_event, prev_prio, event, raw->data); + FILL_FIELD(switch_event, prev_state, event, raw->data); + FILL_ARRAY(switch_event, next_comm, event, raw->data); + FILL_FIELD(switch_event, next_pid, event, raw->data); + FILL_FIELD(switch_event, next_prio, event, raw->data); + + trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread); +} + +static void +process_sched_fork_event(struct raw_event_sample *raw, + struct event *event, + int cpu __used, + u64 timestamp __used, + struct thread *thread __used) { struct trace_fork_event fork_event; @@ -825,17 +908,14 @@ process_sched_fork_event(struct raw_event_sample *raw, struct event *event, FILL_ARRAY(fork_event, child_comm, event, raw->data); FILL_FIELD(fork_event, child_pid, event, raw->data); - if (verbose) { - printf("sched_fork event %p\n", event); - printf("... parent: %s/%d\n", fork_event.parent_comm, fork_event.parent_pid); - printf("... 
child: %s/%d\n", fork_event.child_comm, fork_event.child_pid); - } - register_pid(fork_event.parent_pid, fork_event.parent_comm); - register_pid(fork_event.child_pid, fork_event.child_comm); + trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread); } -static void process_sched_exit_event(struct event *event, - int cpu __used, u64 timestamp __used, struct thread *thread __used) +static void +process_sched_exit_event(struct event *event, + int cpu __used, + u64 timestamp __used, + struct thread *thread __used) { if (verbose) printf("sched_exit event %p\n", event); @@ -1072,6 +1152,8 @@ static const char * const annotate_usage[] = { static const struct option options[] = { OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), + OPT_BOOLEAN('r', "replay", &replay_mode, + "replay sched behaviour from traces"), OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_END() @@ -1096,6 +1178,11 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) // setup_pager(); + if (replay_mode) + trace_handler = &replay_ops; + else /* We may need a default subcommand */ + die("Please select a sub command (-r)\n"); + calibrate_run_measurement_overhead(); calibrate_sleep_measurement_overhead(); -- cgit v1.2.3 From cdce9d738b91e1ec686e4ef6ca46d46e2891e208 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 12 Sep 2009 08:06:14 +0200 Subject: perf sched: Add sched latency profiling Add the -l --latency option that reports statistics about the scheduler latencies. For now, the latencies are measured in the following sequence scope: - task A is sleeping (D or S state) - task B wakes up A ^ | | latency timeframe | | v - task A is scheduled in Start by recording every scheduler events: perf record -e sched:* and then fetch the results: perf sched -l Tasks count total avg max migration/0 2 39849 19924 28826 ksoftirqd/0 7 756383 108054 373014 migration/1 5 45391 9078 10452 ksoftirqd/1 2 399055 199527 359130 events/0 8 4780110 597513 4500250 events/1 9 6353057 705895 2986012 kblockd/0 42 37805097 900121 5077684 The snapshot are in nanoseconds. 
- Count: number of snapshots taken for the given task - Total: total latencies in nanosec - Avg : average of latency between wake up and sched in - Max : max snapshot latency Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 296 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 285 insertions(+), 11 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 756fe62eb04..4f9e943181a 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -31,6 +31,7 @@ static struct perf_header *header; static u64 sample_type; static int replay_mode; +static int lat_mode; /* @@ -847,6 +848,269 @@ static struct trace_sched_handler replay_ops = { .fork_event = replay_fork_event, }; +#define TASK_STATE_TO_CHAR_STR "RSDTtZX" + +enum thread_state { + THREAD_SLEEPING, + THREAD_WAKED_UP, + THREAD_SCHED_IN, + THREAD_IGNORE +}; + +struct lat_snapshot { + struct list_head list; + enum thread_state state; + u64 wake_up_time; + u64 sched_in_time; +}; + +struct thread_latency { + struct list_head snapshot_list; + struct thread *thread; + struct rb_node node; +}; + +static struct rb_root lat_snapshot_root; + +static struct thread_latency * +thread_latency_search(struct rb_root *root, struct thread *thread) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct thread_latency *lat; + + lat = container_of(node, struct thread_latency, node); + if (thread->pid < lat->thread->pid) + node = node->rb_left; + else if (thread->pid > lat->thread->pid) + node = node->rb_right; + else { + return lat; + } + } + return NULL; +} + +static void +__thread_latency_insert(struct rb_root *root, struct thread_latency *data) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + while (*new) { + struct thread_latency *this; + + this = container_of(*new, struct thread_latency, node); + parent = *new; + if (data->thread->pid < this->thread->pid) + new = &((*new)->rb_left); + else if (data->thread->pid > this->thread->pid) + new = &((*new)->rb_right); + else + die("Double thread insertion\n"); + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + +static void thread_latency_insert(struct thread *thread) +{ + struct thread_latency *lat; + lat = calloc(sizeof(*lat), 1); + if (!lat) + die("No memory"); + + lat->thread = thread; + INIT_LIST_HEAD(&lat->snapshot_list); + __thread_latency_insert(&lat_snapshot_root, lat); +} + +static void +latency_fork_event(struct trace_fork_event *fork_event __used, + struct event *event __used, + int cpu __used, + u64 timestamp __used, + struct thread *thread __used) +{ + /* should insert the newcomer */ +} + +static char sched_out_state(struct trace_switch_event *switch_event) +{ + const char *str = TASK_STATE_TO_CHAR_STR; + + return str[switch_event->prev_state]; +} + +static void +lat_sched_out(struct thread_latency *lat, + struct trace_switch_event *switch_event) +{ + struct lat_snapshot *snapshot; + + if (sched_out_state(switch_event) == 'R') + return; + + snapshot = calloc(sizeof(*snapshot), 1); + if (!snapshot) + die("Non memory"); + + list_add_tail(&snapshot->list, &lat->snapshot_list); +} + +static void +lat_sched_in(struct thread_latency *lat, u64 timestamp) +{ + struct lat_snapshot *snapshot; + + if (list_empty(&lat->snapshot_list)) + return; + + snapshot = list_entry(lat->snapshot_list.prev, struct lat_snapshot, 
+ list); + + if (snapshot->state != THREAD_WAKED_UP) + return; + + if (timestamp < snapshot->wake_up_time) { + snapshot->state = THREAD_IGNORE; + return; + } + + snapshot->state = THREAD_SCHED_IN; + snapshot->sched_in_time = timestamp; +} + + +static void +latency_switch_event(struct trace_switch_event *switch_event, + struct event *event __used, + int cpu __used, + u64 timestamp, + struct thread *thread __used) +{ + struct thread_latency *out_lat, *in_lat; + struct thread *sched_out, *sched_in; + + sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match); + sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match); + + in_lat = thread_latency_search(&lat_snapshot_root, sched_in); + if (!in_lat) { + thread_latency_insert(sched_in); + in_lat = thread_latency_search(&lat_snapshot_root, sched_in); + if (!in_lat) + die("Internal latency tree error"); + } + + out_lat = thread_latency_search(&lat_snapshot_root, sched_out); + if (!out_lat) { + thread_latency_insert(sched_out); + out_lat = thread_latency_search(&lat_snapshot_root, sched_out); + if (!out_lat) + die("Internal latency tree error"); + } + + lat_sched_in(in_lat, timestamp); + lat_sched_out(out_lat, switch_event); +} + +static void +latency_wakeup_event(struct trace_wakeup_event *wakeup_event, + struct event *event __used, + int cpu __used, + u64 timestamp, + struct thread *thread __used) +{ + struct thread_latency *lat; + struct lat_snapshot *snapshot; + struct thread *wakee; + + /* Note for later, it may be interesting to observe the failing cases */ + if (!wakeup_event->success) + return; + + wakee = threads__findnew(wakeup_event->pid, &threads, &last_match); + lat = thread_latency_search(&lat_snapshot_root, wakee); + if (!lat) { + thread_latency_insert(wakee); + return; + } + + if (list_empty(&lat->snapshot_list)) + return; + + snapshot = list_entry(lat->snapshot_list.prev, struct lat_snapshot, + list); + + if (snapshot->state != THREAD_SLEEPING) + return; + + snapshot->state = THREAD_WAKED_UP; + snapshot->wake_up_time = timestamp; +} + +static struct trace_sched_handler lat_ops = { + .wakeup_event = latency_wakeup_event, + .switch_event = latency_switch_event, + .fork_event = latency_fork_event, +}; + +static void output_lat_thread(struct thread_latency *lat) +{ + struct lat_snapshot *shot; + int count = 0; + int i; + int ret; + u64 max = 0, avg; + u64 total = 0, delta; + + list_for_each_entry(shot, &lat->snapshot_list, list) { + if (shot->state != THREAD_SCHED_IN) + continue; + + count++; + + delta = shot->sched_in_time - shot->wake_up_time; + if (delta > max) + max = delta; + total += delta; + } + + if (!count) + return; + + ret = printf("%s", lat->thread->comm); + + for (i = 0; i < 25 - ret; i++) + printf(" "); + + avg = total / count; + + printf("%5d %10llu %10llu %10llu\n", count, total, avg, max); +} + +static void output_lat_results(void) +{ + struct rb_node *next; + + printf(" Tasks"); + printf(" count"); + printf(" total"); + printf(" avg"); + printf(" max\n\n"); + + next = rb_first(&lat_snapshot_root); + + while (next) { + struct thread_latency *lat; + + lat = rb_entry(next, struct thread_latency, node); + output_lat_thread(lat); + next = rb_next(next); + } +} static struct trace_sched_handler *trace_handler; @@ -1154,6 +1418,8 @@ static const struct option options[] = { "dump raw trace in ASCII"), OPT_BOOLEAN('r', "replay", &replay_mode, "replay sched behaviour from traces"), + OPT_BOOLEAN('l', "latency", &lat_mode, + "measure various latencies"), OPT_BOOLEAN('v', "verbose", &verbose, 
"be more verbose (show symbol address, etc)"), OPT_END() @@ -1180,22 +1446,30 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) if (replay_mode) trace_handler = &replay_ops; - else /* We may need a default subcommand */ + else if (lat_mode) + trace_handler = &lat_ops; + else /* We may need a default subcommand (perf trace?) */ die("Please select a sub command (-r)\n"); - calibrate_run_measurement_overhead(); - calibrate_sleep_measurement_overhead(); + if (replay_mode) { + calibrate_run_measurement_overhead(); + calibrate_sleep_measurement_overhead(); - test_calibrations(); + test_calibrations(); - parse_trace(); - print_task_traces(); - add_cross_task_wakeups(); + parse_trace(); + print_task_traces(); + add_cross_task_wakeups(); - create_tasks(); - printf("------------------------------------------------------------\n"); - for (i = 0; i < nr_iterations; i++) - run_one_test(); + create_tasks(); + printf("------------------------------------------------------------\n"); + for (i = 0; i < nr_iterations; i++) + run_one_test(); + } else if (lat_mode) { + setup_pager(); + __cmd_sched(); + output_lat_results(); + } return 0; } -- cgit v1.2.3 From 46f392c97f9fd772426ed3361c5179a0d44b8c3f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 12 Sep 2009 10:08:34 +0200 Subject: perf sched: Clean up latency and replay sub-commands - Separate the latency and the replay commands more cleanly - Use consistent naming - Display help page on 'perf sched' outlining comments, instead of aborting Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 99 +++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 50 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 4f9e943181a..84699cf036a 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -543,24 +543,7 @@ static void wait_for_tasks(void) } } -static int __cmd_sched(void); - -static void parse_trace(void) -{ - __cmd_sched(); - - printf("nr_run_events: %ld\n", nr_run_events); - printf("nr_sleep_events: %ld\n", nr_sleep_events); - printf("nr_wakeup_events: %ld\n", nr_wakeup_events); - - if (targetless_wakeups) - printf("target-less wakeups: %ld\n", targetless_wakeups); - if (multitarget_wakeups) - printf("multi-target wakeups: %ld\n", multitarget_wakeups); - if (nr_run_events_optimized) - printf("run events optimized: %ld\n", - nr_run_events_optimized); -} +static int read_events(void); static unsigned long nr_runs; static nsec_t sum_runtime; @@ -637,6 +620,38 @@ static void test_calibrations(void) printf("the sleep test took %Ld nsecs\n", T1-T0); } +static void __cmd_replay(void) +{ + long nr_iterations = 10, i; + + calibrate_run_measurement_overhead(); + calibrate_sleep_measurement_overhead(); + + test_calibrations(); + + read_events(); + + printf("nr_run_events: %ld\n", nr_run_events); + printf("nr_sleep_events: %ld\n", nr_sleep_events); + printf("nr_wakeup_events: %ld\n", nr_wakeup_events); + + if (targetless_wakeups) + printf("target-less wakeups: %ld\n", targetless_wakeups); + if (multitarget_wakeups) + printf("multi-target wakeups: %ld\n", multitarget_wakeups); + if (nr_run_events_optimized) + printf("run events optimized: %ld\n", + nr_run_events_optimized); + + print_task_traces(); + add_cross_task_wakeups(); + + create_tasks(); + 
printf("------------------------------------------------------------\n"); + for (i = 0; i < nr_iterations; i++) + run_one_test(); +} + static int process_comm_event(event_t *event, unsigned long offset, unsigned long head) { @@ -1091,10 +1106,13 @@ static void output_lat_thread(struct thread_latency *lat) printf("%5d %10llu %10llu %10llu\n", count, total, avg, max); } -static void output_lat_results(void) +static void __cmd_lat(void) { struct rb_node *next; + setup_pager(); + read_events(); + printf(" Tasks"); printf(" count"); printf(" total"); @@ -1312,7 +1330,7 @@ process_event(event_t *event, unsigned long offset, unsigned long head) return 0; } -static int __cmd_sched(void) +static int read_events(void) { int ret, rc = EXIT_FAILURE; unsigned long offset = 0; @@ -1408,8 +1426,8 @@ more: return rc; } -static const char * const annotate_usage[] = { - "perf trace [] ", +static const char * const sched_usage[] = { + "perf sched [] ", NULL }; @@ -1427,49 +1445,30 @@ static const struct option options[] = { int cmd_sched(int argc, const char **argv, const char *prefix __used) { - long nr_iterations = 10, i; - symbol__init(); page_size = getpagesize(); - argc = parse_options(argc, argv, options, annotate_usage, 0); + argc = parse_options(argc, argv, options, sched_usage, 0); if (argc) { /* * Special case: if there's an argument left then assume tha * it's a symbol filter: */ if (argc > 1) - usage_with_options(annotate_usage, options); + usage_with_options(sched_usage, options); } -// setup_pager(); - if (replay_mode) trace_handler = &replay_ops; else if (lat_mode) trace_handler = &lat_ops; - else /* We may need a default subcommand (perf trace?) */ - die("Please select a sub command (-r)\n"); - - if (replay_mode) { - calibrate_run_measurement_overhead(); - calibrate_sleep_measurement_overhead(); - - test_calibrations(); - - parse_trace(); - print_task_traces(); - add_cross_task_wakeups(); - - create_tasks(); - printf("------------------------------------------------------------\n"); - for (i = 0; i < nr_iterations; i++) - run_one_test(); - } else if (lat_mode) { - setup_pager(); - __cmd_sched(); - output_lat_results(); - } + else + usage_with_options(sched_usage, options); + + if (replay_mode) + __cmd_replay(); + else if (lat_mode) + __cmd_lat(); return 0; } -- cgit v1.2.3 From d9340c1db3f52460a8335eeb127a2728c5bba6ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 12 Sep 2009 10:08:34 +0200 Subject: perf sched: Display time in milliseconds, reorganize output After: ----------------------------------------------------------------------------------- Task | runtime ms | switches | average delay ms | maximum delay ms | ----------------------------------------------------------------------------------- migration/0 | 0.000 ms | 1 | avg: 0.047 ms | max: 0.047 ms | ksoftirqd/0 | 0.000 ms | 1 | avg: 0.039 ms | max: 0.039 ms | migration/1 | 0.000 ms | 3 | avg: 0.013 ms | max: 0.016 ms | migration/3 | 0.000 ms | 2 | avg: 0.003 ms | max: 0.004 ms | migration/4 | 0.000 ms | 1 | avg: 0.022 ms | max: 0.022 ms | distccd | 0.000 ms | 1 | avg: 0.004 ms | max: 0.004 ms | distccd | 0.000 ms | 1 | avg: 0.014 ms | max: 0.014 ms | distccd | 0.000 ms | 2 | avg: 0.000 ms | max: 0.000 ms | distccd | 0.000 ms | 2 | avg: 0.012 ms | max: 0.019 ms | distccd | 0.000 ms | 1 | avg: 0.002 ms | max: 0.002 ms | as | 0.000 ms | 2 | avg: 0.019 ms | max: 0.019 ms | as | 0.000 ms | 3 | avg: 0.015 ms | max: 0.017 ms | as | 0.000 ms | 1 | avg: 0.009 ms | max: 0.009 ms | perf | 0.000 ms | 1 | avg: 0.001 ms | max: 0.001 ms | gcc 
| 0.000 ms | 1 | avg: 0.021 ms | max: 0.021 ms | run-mozilla.sh | 0.000 ms | 2 | avg: 0.010 ms | max: 0.017 ms | mozilla-plugin- | 0.000 ms | 1 | avg: 0.006 ms | max: 0.006 ms | gcc | 0.000 ms | 2 | avg: 0.013 ms | max: 0.013 ms | ----------------------------------------------------------------------------------- (The runtime ms column is not filled in yet.) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 84699cf036a..a084c284e19 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1096,14 +1096,15 @@ static void output_lat_thread(struct thread_latency *lat) if (!count) return; - ret = printf("%s", lat->thread->comm); + ret = printf(" %s ", lat->thread->comm); - for (i = 0; i < 25 - ret; i++) + for (i = 0; i < 19 - ret; i++) printf(" "); avg = total / count; - printf("%5d %10llu %10llu %10llu\n", count, total, avg, max); + printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n", + 0.0, count, (double)avg/1e9, (double)max/1e9); } static void __cmd_lat(void) @@ -1113,11 +1114,9 @@ static void __cmd_lat(void) setup_pager(); read_events(); - printf(" Tasks"); - printf(" count"); - printf(" total"); - printf(" avg"); - printf(" max\n\n"); + printf("-----------------------------------------------------------------------------------\n"); + printf(" Task | runtime ms | switches | average delay ms | maximum delay ms |\n"); + printf("-----------------------------------------------------------------------------------\n"); next = rb_first(&lat_snapshot_root); @@ -1128,6 +1127,8 @@ static void __cmd_lat(void) output_lat_thread(lat); next = rb_next(next); } + + printf("-----------------------------------------------------------------------------------\n"); } static struct trace_sched_handler *trace_handler; -- cgit v1.2.3 From ea92ed5a8f4e6c638efe7de2efe8a875d580ad3f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 12 Sep 2009 10:08:34 +0200 Subject: perf sched: Add runtime stats Extend the latency tracking structure with scheduling atom runtime info - and sum it up during per task display. (Also clean up a few details.) 
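In outline, the accounting added below does the following on every sched_switch: charge the time since the previous switch on that CPU to the work atom of the task being switched out, then sum those atoms per task at display time. A condensed sketch (paraphrasing the hunks below, not verbatim):

	/* in latency_switch_event(): time the outgoing task ran on this CPU */
	timestamp0 = cpu_last_switched[cpu];
	cpu_last_switched[cpu] = timestamp;
	delta = timestamp0 ? timestamp - timestamp0 : 0;

	/* in lat_sched_out(): stored per work atom */
	snapshot->runtime = delta;

	/* in output_lat_thread(): summed per task */
	list_for_each_entry(shot, &lat->snapshot_list, list)
		total_runtime += shot->runtime;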
Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 58 ++++++++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 20 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index a084c284e19..c382f530d4c 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -243,8 +243,8 @@ add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration) nr_run_events++; } -static unsigned long targetless_wakeups; -static unsigned long multitarget_wakeups; +static unsigned long targetless_wakeups; +static unsigned long multitarget_wakeups; static void add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp, @@ -485,10 +485,10 @@ static void create_tasks(void) } } -static nsec_t cpu_usage; -static nsec_t runavg_cpu_usage; -static nsec_t parent_cpu_usage; -static nsec_t runavg_parent_cpu_usage; +static nsec_t cpu_usage; +static nsec_t runavg_cpu_usage; +static nsec_t parent_cpu_usage; +static nsec_t runavg_parent_cpu_usage; static void wait_for_tasks(void) { @@ -858,9 +858,9 @@ replay_fork_event(struct trace_fork_event *fork_event, } static struct trace_sched_handler replay_ops = { - .wakeup_event = replay_wakeup_event, - .switch_event = replay_switch_event, - .fork_event = replay_fork_event, + .wakeup_event = replay_wakeup_event, + .switch_event = replay_switch_event, + .fork_event = replay_fork_event, }; #define TASK_STATE_TO_CHAR_STR "RSDTtZX" @@ -877,6 +877,7 @@ struct lat_snapshot { enum thread_state state; u64 wake_up_time; u64 sched_in_time; + u64 runtime; }; struct thread_latency { @@ -951,6 +952,7 @@ latency_fork_event(struct trace_fork_event *fork_event __used, /* should insert the newcomer */ } +__used static char sched_out_state(struct trace_switch_event *switch_event) { const char *str = TASK_STATE_TO_CHAR_STR; @@ -960,17 +962,15 @@ static char sched_out_state(struct trace_switch_event *switch_event) static void lat_sched_out(struct thread_latency *lat, - struct trace_switch_event *switch_event) + struct trace_switch_event *switch_event __used, u64 delta) { struct lat_snapshot *snapshot; - if (sched_out_state(switch_event) == 'R') - return; - snapshot = calloc(sizeof(*snapshot), 1); if (!snapshot) die("Non memory"); + snapshot->runtime = delta; list_add_tail(&snapshot->list, &lat->snapshot_list); } @@ -997,16 +997,31 @@ lat_sched_in(struct thread_latency *lat, u64 timestamp) snapshot->sched_in_time = timestamp; } - static void latency_switch_event(struct trace_switch_event *switch_event, struct event *event __used, - int cpu __used, + int cpu, u64 timestamp, struct thread *thread __used) { struct thread_latency *out_lat, *in_lat; struct thread *sched_out, *sched_in; + u64 timestamp0; + s64 delta; + + if (cpu >= MAX_CPUS || cpu < 0) + return; + + timestamp0 = cpu_last_switched[cpu]; + cpu_last_switched[cpu] = timestamp; + if (timestamp0) + delta = timestamp - timestamp0; + else + delta = 0; + + if (delta < 0) + die("hm, delta: %Ld < 0 ?\n", delta); + sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match); sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match); @@ -1028,7 +1043,7 @@ latency_switch_event(struct trace_switch_event *switch_event, } lat_sched_in(in_lat, timestamp); - lat_sched_out(out_lat, switch_event); + lat_sched_out(out_lat, switch_event, delta); } static void @@ -1067,9 +1082,9 
@@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, } static struct trace_sched_handler lat_ops = { - .wakeup_event = latency_wakeup_event, - .switch_event = latency_switch_event, - .fork_event = latency_fork_event, + .wakeup_event = latency_wakeup_event, + .switch_event = latency_switch_event, + .fork_event = latency_fork_event, }; static void output_lat_thread(struct thread_latency *lat) @@ -1080,8 +1095,11 @@ static void output_lat_thread(struct thread_latency *lat) int ret; u64 max = 0, avg; u64 total = 0, delta; + u64 total_runtime = 0; list_for_each_entry(shot, &lat->snapshot_list, list) { + total_runtime += shot->runtime; + if (shot->state != THREAD_SCHED_IN) continue; @@ -1104,7 +1122,7 @@ static void output_lat_thread(struct thread_latency *lat) avg = total / count; printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n", - 0.0, count, (double)avg/1e9, (double)max/1e9); + (double)total_runtime/1e9, count, (double)avg/1e9, (double)max/1e9); } static void __cmd_lat(void) -- cgit v1.2.3 From 3e304147cdb404ce6d1dd0e50cb19f52142bb363 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 12 Sep 2009 10:08:34 +0200 Subject: perf sched: Output runtime and context switch totals After: ----------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | ----------------------------------------------------------------------------------- make | 0.678 ms | 13 | avg: 0.018 ms | max: 0.050 ms | gcc | 0.014 ms | 2 | avg: 0.320 ms | max: 0.627 ms | gcc | 0.000 ms | 2 | avg: 0.185 ms | max: 0.369 ms | ... ----------------------------------------------------------------------------------- TOTAL: | 21.316 ms | 63 | --------------------------------------------- Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index c382f530d4c..727cc5b852c 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1087,6 +1087,9 @@ static struct trace_sched_handler lat_ops = { .fork_event = latency_fork_event, }; +static u64 all_runtime; +static u64 all_count; + static void output_lat_thread(struct thread_latency *lat) { struct lat_snapshot *shot; @@ -1111,6 +1114,9 @@ static void output_lat_thread(struct thread_latency *lat) total += delta; } + all_runtime += total_runtime; + all_count += count; + if (!count) return; @@ -1133,7 +1139,7 @@ static void __cmd_lat(void) read_events(); printf("-----------------------------------------------------------------------------------\n"); - printf(" Task | runtime ms | switches | average delay ms | maximum delay ms |\n"); + printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); printf("-----------------------------------------------------------------------------------\n"); next = rb_first(&lat_snapshot_root); @@ -1147,6 +1153,9 @@ static void __cmd_lat(void) } printf("-----------------------------------------------------------------------------------\n"); + printf(" TOTAL: |%9.3f ms |%9Ld |\n", + (double)all_runtime/1e9, all_count); + printf("---------------------------------------------\n"); } static struct trace_sched_handler *trace_handler; -- cgit v1.2.3 From 175622053069afbd366ba3c6030b5af82f378d40 Mon Sep 17 00:00:00 2001 
From: Frederic Weisbecker Date: Sat, 12 Sep 2009 23:11:32 +0200 Subject: perf sched: Rename struct lat_snapshot to struct work atoms To measures the latencies, we capture the sched atoms data into a specific structure named struct lat_snapshot. As this structure can be used for other purposes of scheduler profiling and mirrors what happens in a thread work atom, lets rename it to struct work_atom and propagate this renaming in other functions and structures names to keep it coherent. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 112 ++++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 56 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 727cc5b852c..7e57a986c05 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -872,7 +872,7 @@ enum thread_state { THREAD_IGNORE }; -struct lat_snapshot { +struct work_atom { struct list_head list; enum thread_state state; u64 wake_up_time; @@ -880,7 +880,7 @@ struct lat_snapshot { u64 runtime; }; -struct thread_latency { +struct task_atoms { struct list_head snapshot_list; struct thread *thread; struct rb_node node; @@ -888,35 +888,35 @@ struct thread_latency { static struct rb_root lat_snapshot_root; -static struct thread_latency * -thread_latency_search(struct rb_root *root, struct thread *thread) +static struct task_atoms * +thread_atom_list_search(struct rb_root *root, struct thread *thread) { struct rb_node *node = root->rb_node; while (node) { - struct thread_latency *lat; + struct task_atoms *atoms; - lat = container_of(node, struct thread_latency, node); - if (thread->pid < lat->thread->pid) + atoms = container_of(node, struct task_atoms, node); + if (thread->pid < atoms->thread->pid) node = node->rb_left; - else if (thread->pid > lat->thread->pid) + else if (thread->pid > atoms->thread->pid) node = node->rb_right; else { - return lat; + return atoms; } } return NULL; } static void -__thread_latency_insert(struct rb_root *root, struct thread_latency *data) +__thread_latency_insert(struct rb_root *root, struct task_atoms *data) { struct rb_node **new = &(root->rb_node), *parent = NULL; while (*new) { - struct thread_latency *this; + struct task_atoms *this; - this = container_of(*new, struct thread_latency, node); + this = container_of(*new, struct task_atoms, node); parent = *new; if (data->thread->pid < this->thread->pid) new = &((*new)->rb_left); @@ -930,16 +930,16 @@ __thread_latency_insert(struct rb_root *root, struct thread_latency *data) rb_insert_color(&data->node, root); } -static void thread_latency_insert(struct thread *thread) +static void thread_atom_list_insert(struct thread *thread) { - struct thread_latency *lat; - lat = calloc(sizeof(*lat), 1); - if (!lat) + struct task_atoms *atoms; + atoms = calloc(sizeof(*atoms), 1); + if (!atoms) die("No memory"); - lat->thread = thread; - INIT_LIST_HEAD(&lat->snapshot_list); - __thread_latency_insert(&lat_snapshot_root, lat); + atoms->thread = thread; + INIT_LIST_HEAD(&atoms->snapshot_list); + __thread_latency_insert(&lat_snapshot_root, atoms); } static void @@ -961,28 +961,28 @@ static char sched_out_state(struct trace_switch_event *switch_event) } static void -lat_sched_out(struct thread_latency *lat, +lat_sched_out(struct task_atoms *atoms, struct trace_switch_event *switch_event __used, u64 delta) { - struct lat_snapshot 
*snapshot; + struct work_atom *snapshot; snapshot = calloc(sizeof(*snapshot), 1); if (!snapshot) die("Non memory"); snapshot->runtime = delta; - list_add_tail(&snapshot->list, &lat->snapshot_list); + list_add_tail(&snapshot->list, &atoms->snapshot_list); } static void -lat_sched_in(struct thread_latency *lat, u64 timestamp) +lat_sched_in(struct task_atoms *atoms, u64 timestamp) { - struct lat_snapshot *snapshot; + struct work_atom *snapshot; - if (list_empty(&lat->snapshot_list)) + if (list_empty(&atoms->snapshot_list)) return; - snapshot = list_entry(lat->snapshot_list.prev, struct lat_snapshot, + snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom, list); if (snapshot->state != THREAD_WAKED_UP) @@ -1004,7 +1004,7 @@ latency_switch_event(struct trace_switch_event *switch_event, u64 timestamp, struct thread *thread __used) { - struct thread_latency *out_lat, *in_lat; + struct task_atoms *out_atoms, *in_atoms; struct thread *sched_out, *sched_in; u64 timestamp0; s64 delta; @@ -1026,24 +1026,24 @@ latency_switch_event(struct trace_switch_event *switch_event, sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match); sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match); - in_lat = thread_latency_search(&lat_snapshot_root, sched_in); - if (!in_lat) { - thread_latency_insert(sched_in); - in_lat = thread_latency_search(&lat_snapshot_root, sched_in); - if (!in_lat) + in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in); + if (!in_atoms) { + thread_atom_list_insert(sched_in); + in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in); + if (!in_atoms) die("Internal latency tree error"); } - out_lat = thread_latency_search(&lat_snapshot_root, sched_out); - if (!out_lat) { - thread_latency_insert(sched_out); - out_lat = thread_latency_search(&lat_snapshot_root, sched_out); - if (!out_lat) + out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out); + if (!out_atoms) { + thread_atom_list_insert(sched_out); + out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out); + if (!out_atoms) die("Internal latency tree error"); } - lat_sched_in(in_lat, timestamp); - lat_sched_out(out_lat, switch_event, delta); + lat_sched_in(in_atoms, timestamp); + lat_sched_out(out_atoms, switch_event, delta); } static void @@ -1053,8 +1053,8 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, u64 timestamp, struct thread *thread __used) { - struct thread_latency *lat; - struct lat_snapshot *snapshot; + struct task_atoms *atoms; + struct work_atom *snapshot; struct thread *wakee; /* Note for later, it may be interesting to observe the failing cases */ @@ -1062,16 +1062,16 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, return; wakee = threads__findnew(wakeup_event->pid, &threads, &last_match); - lat = thread_latency_search(&lat_snapshot_root, wakee); - if (!lat) { - thread_latency_insert(wakee); + atoms = thread_atom_list_search(&lat_snapshot_root, wakee); + if (!atoms) { + thread_atom_list_insert(wakee); return; } - if (list_empty(&lat->snapshot_list)) + if (list_empty(&atoms->snapshot_list)) return; - snapshot = list_entry(lat->snapshot_list.prev, struct lat_snapshot, + snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom, list); if (snapshot->state != THREAD_SLEEPING) @@ -1090,9 +1090,9 @@ static struct trace_sched_handler lat_ops = { static u64 all_runtime; static u64 all_count; -static void output_lat_thread(struct thread_latency *lat) +static void output_lat_thread(struct task_atoms 
*atom_list) { - struct lat_snapshot *shot; + struct work_atom *atom; int count = 0; int i; int ret; @@ -1100,15 +1100,15 @@ static void output_lat_thread(struct thread_latency *lat) u64 total = 0, delta; u64 total_runtime = 0; - list_for_each_entry(shot, &lat->snapshot_list, list) { - total_runtime += shot->runtime; + list_for_each_entry(atom, &atom_list->snapshot_list, list) { + total_runtime += atom->runtime; - if (shot->state != THREAD_SCHED_IN) + if (atom->state != THREAD_SCHED_IN) continue; count++; - delta = shot->sched_in_time - shot->wake_up_time; + delta = atom->sched_in_time - atom->wake_up_time; if (delta > max) max = delta; total += delta; @@ -1120,7 +1120,7 @@ static void output_lat_thread(struct thread_latency *lat) if (!count) return; - ret = printf(" %s ", lat->thread->comm); + ret = printf(" %s ", atom_list->thread->comm); for (i = 0; i < 19 - ret; i++) printf(" "); @@ -1145,10 +1145,10 @@ static void __cmd_lat(void) next = rb_first(&lat_snapshot_root); while (next) { - struct thread_latency *lat; + struct task_atoms *atom_list; - lat = rb_entry(next, struct thread_latency, node); - output_lat_thread(lat); + atom_list = rb_entry(next, struct task_atoms, node); + output_lat_thread(atom_list); next = rb_next(next); } -- cgit v1.2.3 From c6ced61112f1e6139914149fab65695801a74f0f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 13 Sep 2009 00:46:19 +0200 Subject: perf sched: Add involuntarily sleeping task in work atoms Currently in perf sched, we are measuring the scheduler wakeup latencies. Now we also want measure the time a task wait to be scheduled after it gets preempted. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 7e57a986c05..61a80e8c9d0 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -866,8 +866,8 @@ static struct trace_sched_handler replay_ops = { #define TASK_STATE_TO_CHAR_STR "RSDTtZX" enum thread_state { - THREAD_SLEEPING, - THREAD_WAKED_UP, + THREAD_SLEEPING = 0, + THREAD_WAIT_CPU, THREAD_SCHED_IN, THREAD_IGNORE }; @@ -962,7 +962,9 @@ static char sched_out_state(struct trace_switch_event *switch_event) static void lat_sched_out(struct task_atoms *atoms, - struct trace_switch_event *switch_event __used, u64 delta) + struct trace_switch_event *switch_event __used, + u64 delta, + u64 timestamp) { struct work_atom *snapshot; @@ -970,6 +972,11 @@ lat_sched_out(struct task_atoms *atoms, if (!snapshot) die("Non memory"); + if (sched_out_state(switch_event) == 'R') { + snapshot->state = THREAD_WAIT_CPU; + snapshot->wake_up_time = timestamp; + } + snapshot->runtime = delta; list_add_tail(&snapshot->list, &atoms->snapshot_list); } @@ -985,7 +992,7 @@ lat_sched_in(struct task_atoms *atoms, u64 timestamp) snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom, list); - if (snapshot->state != THREAD_WAKED_UP) + if (snapshot->state != THREAD_WAIT_CPU) return; if (timestamp < snapshot->wake_up_time) { @@ -1043,7 +1050,7 @@ latency_switch_event(struct trace_switch_event *switch_event, } lat_sched_in(in_atoms, timestamp); - lat_sched_out(out_atoms, switch_event, delta); + lat_sched_out(out_atoms, switch_event, delta, timestamp); } static void @@ -1077,7 +1084,7 @@ latency_wakeup_event(struct trace_wakeup_event 
*wakeup_event, if (snapshot->state != THREAD_SLEEPING) return; - snapshot->state = THREAD_WAKED_UP; + snapshot->state = THREAD_WAIT_CPU; snapshot->wake_up_time = timestamp; } -- cgit v1.2.3 From 66685678a03d0d8e8ba015472f280fb4f12f84c1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 13 Sep 2009 01:56:25 +0200 Subject: perf sched: Export the total, max latency and total runtime to thread atoms list Add a field in the thread atom list that keeps track of the total and max latencies and also the total runtime. This makes a faster output and also prepares for sorting. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 48 +++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 61a80e8c9d0..43570270278 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -884,6 +884,10 @@ struct task_atoms { struct list_head snapshot_list; struct thread *thread; struct rb_node node; + u64 max_lat; + u64 total_lat; + u64 nb_atoms; + u64 total_runtime; }; static struct rb_root lat_snapshot_root; @@ -985,6 +989,7 @@ static void lat_sched_in(struct task_atoms *atoms, u64 timestamp) { struct work_atom *snapshot; + u64 delta; if (list_empty(&atoms->snapshot_list)) return; @@ -1002,6 +1007,13 @@ lat_sched_in(struct task_atoms *atoms, u64 timestamp) snapshot->state = THREAD_SCHED_IN; snapshot->sched_in_time = timestamp; + + delta = snapshot->sched_in_time - snapshot->wake_up_time; + atoms->total_lat += delta; + if (delta > atoms->max_lat) + atoms->max_lat = delta; + atoms->nb_atoms++; + atoms->total_runtime += snapshot->runtime; } static void @@ -1099,43 +1111,27 @@ static u64 all_count; static void output_lat_thread(struct task_atoms *atom_list) { - struct work_atom *atom; - int count = 0; int i; int ret; - u64 max = 0, avg; - u64 total = 0, delta; - u64 total_runtime = 0; - - list_for_each_entry(atom, &atom_list->snapshot_list, list) { - total_runtime += atom->runtime; - - if (atom->state != THREAD_SCHED_IN) - continue; - - count++; + u64 avg; - delta = atom->sched_in_time - atom->wake_up_time; - if (delta > max) - max = delta; - total += delta; - } - - all_runtime += total_runtime; - all_count += count; - - if (!count) + if (!atom_list->nb_atoms) return; + all_runtime += atom_list->total_runtime; + all_count += atom_list->nb_atoms; + ret = printf(" %s ", atom_list->thread->comm); for (i = 0; i < 19 - ret; i++) printf(" "); - avg = total / count; + avg = atom_list->total_lat / atom_list->nb_atoms; - printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n", - (double)total_runtime/1e9, count, (double)avg/1e9, (double)max/1e9); + printf("|%9.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n", + (double)atom_list->total_runtime / 1e9, + atom_list->nb_atoms, (double)avg / 1e9, + (double)atom_list->max_lat / 1e9); } static void __cmd_lat(void) -- cgit v1.2.3 From 7362262687b21b0d04927a7615c162a3d064849e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 13 Sep 2009 01:59:05 +0200 Subject: perf sched: Fix nsec to msec conversion We are dividing a time in ns by 1e9. This is a nsec to sec conversion. What we want is msecs. Fix it by dividing by 1e6. 
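For reference, the arithmetic: 1 ms = 1e6 ns, so dividing by 1e9 was printing seconds in a column labeled ms. A minimal helper expressing the corrected conversion (illustrative sketch, not code from the patch):

	static inline double nsecs_to_msecs(u64 nsecs)
	{
		return (double)nsecs / 1e6;	/* 1 ms = 1e6 ns */
	}

	/* e.g. nsecs_to_msecs(4998531000ULL) == 4998.531, matching the
	 * "avg: 4998.531 ms" figure in the example output further below */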
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 43570270278..67a0ba88aec 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1129,9 +1129,9 @@ static void output_lat_thread(struct task_atoms *atom_list) avg = atom_list->total_lat / atom_list->nb_atoms; printf("|%9.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n", - (double)atom_list->total_runtime / 1e9, - atom_list->nb_atoms, (double)avg / 1e9, - (double)atom_list->max_lat / 1e9); + (double)atom_list->total_runtime / 1e6, + atom_list->nb_atoms, (double)avg / 1e6, + (double)atom_list->max_lat / 1e6); } static void __cmd_lat(void) @@ -1157,7 +1157,7 @@ static void __cmd_lat(void) printf("-----------------------------------------------------------------------------------\n"); printf(" TOTAL: |%9.3f ms |%9Ld |\n", - (double)all_runtime/1e9, all_count); + (double)all_runtime/1e6, all_count); printf("---------------------------------------------\n"); } -- cgit v1.2.3 From daa1d7a5eafc0a3a91a9add6a9a9f1bcaed63108 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 13 Sep 2009 03:36:29 +0200 Subject: perf sched: Implement multidimensional sorting Implement multidimensional sorting on perf sched so that you can sort either by number of switches, latency average, latency maximum, runtime. perf sched -l -s avg,max (this is the default) ----------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | ----------------------------------------------------------------------------------- gnome-power-man | 0.113 ms | 1 | avg: 4998.531 ms | max: 4998.531 ms | xfdesktop | 1.190 ms | 7 | avg: 136.475 ms | max: 940.933 ms | xfce-mcs-manage | 2.194 ms | 22 | avg: 38.534 ms | max: 735.174 ms | notification-da | 2.749 ms | 31 | avg: 27.436 ms | max: 731.791 ms | xfce4-session | 3.343 ms | 28 | avg: 26.796 ms | max: 734.891 ms | xfwm4 | 3.159 ms | 22 | avg: 12.406 ms | max: 241.333 ms | xchat | 42.789 ms | 214 | avg: 11.886 ms | max: 100.349 ms | xfce4-terminal | 5.386 ms | 22 | avg: 11.414 ms | max: 241.611 ms | firefox | 151.992 ms | 123 | avg: 9.543 ms | max: 153.717 ms | xfce4-panel | 24.324 ms | 47 | avg: 8.189 ms | max: 242.352 ms | :5090 | 6.932 ms | 111 | avg: 8.131 ms | max: 102.665 ms | events/0 | 0.758 ms | 12 | avg: 1.964 ms | max: 21.879 ms | Xorg | 280.558 ms | 340 | avg: 1.864 ms | max: 99.526 ms | geany | 63.391 ms | 295 | avg: 1.099 ms | max: 9.334 ms | reiserfs/0 | 0.039 ms | 2 | avg: 0.854 ms | max: 1.487 ms | kondemand/0 | 8.251 ms | 245 | avg: 0.691 ms | max: 34.372 ms | Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 206 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 196 insertions(+), 10 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 67a0ba88aec..10fcd49e298 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -33,6 +33,9 @@ static u64 sample_type; static int replay_mode; static int lat_mode; +static char default_sort_order[] = "avg, max, switch, runtime"; +static char 
*sort_order = default_sort_order; + /* * Scheduler benchmarks @@ -890,7 +893,17 @@ struct task_atoms { u64 total_runtime; }; -static struct rb_root lat_snapshot_root; +typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *); + +struct sort_dimension { + const char *name; + sort_thread_lat cmp; + struct list_head list; +}; + +static LIST_HEAD(cmp_pid); + +static struct rb_root lat_snapshot_root, sorted_lat_snapshot_root; static struct task_atoms * thread_atom_list_search(struct rb_root *root, struct thread *thread) @@ -901,9 +914,9 @@ thread_atom_list_search(struct rb_root *root, struct thread *thread) struct task_atoms *atoms; atoms = container_of(node, struct task_atoms, node); - if (thread->pid < atoms->thread->pid) + if (thread->pid > atoms->thread->pid) node = node->rb_left; - else if (thread->pid > atoms->thread->pid) + else if (thread->pid < atoms->thread->pid) node = node->rb_right; else { return atoms; @@ -912,22 +925,41 @@ thread_atom_list_search(struct rb_root *root, struct thread *thread) return NULL; } +static int +thread_lat_cmp(struct list_head *list, struct task_atoms *l, + struct task_atoms *r) +{ + struct sort_dimension *sort; + int ret = 0; + + list_for_each_entry(sort, list, list) { + ret = sort->cmp(l, r); + if (ret) + return ret; + } + + return ret; +} + static void -__thread_latency_insert(struct rb_root *root, struct task_atoms *data) +__thread_latency_insert(struct rb_root *root, struct task_atoms *data, + struct list_head *sort_list) { struct rb_node **new = &(root->rb_node), *parent = NULL; while (*new) { struct task_atoms *this; + int cmp; this = container_of(*new, struct task_atoms, node); parent = *new; - if (data->thread->pid < this->thread->pid) + + cmp = thread_lat_cmp(sort_list, data, this); + + if (cmp > 0) new = &((*new)->rb_left); - else if (data->thread->pid > this->thread->pid) - new = &((*new)->rb_right); else - die("Double thread insertion\n"); + new = &((*new)->rb_right); } rb_link_node(&data->node, parent, new); @@ -943,7 +975,7 @@ static void thread_atom_list_insert(struct thread *thread) atoms->thread = thread; INIT_LIST_HEAD(&atoms->snapshot_list); - __thread_latency_insert(&lat_snapshot_root, atoms); + __thread_latency_insert(&lat_snapshot_root, atoms, &cmp_pid); } static void @@ -1134,18 +1166,151 @@ static void output_lat_thread(struct task_atoms *atom_list) (double)atom_list->max_lat / 1e6); } +static int pid_cmp(struct task_atoms *l, struct task_atoms *r) +{ + + if (l->thread->pid < r->thread->pid) + return -1; + if (l->thread->pid > r->thread->pid) + return 1; + + return 0; +} + +static struct sort_dimension pid_sort_dimension = { + .name = "pid", + .cmp = pid_cmp, +}; + +static int avg_cmp(struct task_atoms *l, struct task_atoms *r) +{ + u64 avgl, avgr; + + if (!l->nb_atoms) + return -1; + + if (!r->nb_atoms) + return 1; + + avgl = l->total_lat / l->nb_atoms; + avgr = r->total_lat / r->nb_atoms; + + if (avgl < avgr) + return -1; + if (avgl > avgr) + return 1; + + return 0; +} + +static struct sort_dimension avg_sort_dimension = { + .name = "avg", + .cmp = avg_cmp, +}; + +static int max_cmp(struct task_atoms *l, struct task_atoms *r) +{ + if (l->max_lat < r->max_lat) + return -1; + if (l->max_lat > r->max_lat) + return 1; + + return 0; +} + +static struct sort_dimension max_sort_dimension = { + .name = "max", + .cmp = max_cmp, +}; + +static int switch_cmp(struct task_atoms *l, struct task_atoms *r) +{ + if (l->nb_atoms < r->nb_atoms) + return -1; + if (l->nb_atoms > r->nb_atoms) + return 1; + + return 0; +} + +static 
struct sort_dimension switch_sort_dimension = { + .name = "switch", + .cmp = switch_cmp, +}; + +static int runtime_cmp(struct task_atoms *l, struct task_atoms *r) +{ + if (l->total_runtime < r->total_runtime) + return -1; + if (l->total_runtime > r->total_runtime) + return 1; + + return 0; +} + +static struct sort_dimension runtime_sort_dimension = { + .name = "runtime", + .cmp = runtime_cmp, +}; + +static struct sort_dimension *available_sorts[] = { + &pid_sort_dimension, + &avg_sort_dimension, + &max_sort_dimension, + &switch_sort_dimension, + &runtime_sort_dimension, +}; + +#define NB_AVAILABLE_SORTS (int)(sizeof(available_sorts) / sizeof(struct sort_dimension *)) + +static LIST_HEAD(sort_list); + +static int sort_dimension__add(char *tok, struct list_head *list) +{ + int i; + + for (i = 0; i < NB_AVAILABLE_SORTS; i++) { + if (!strcmp(available_sorts[i]->name, tok)) { + list_add_tail(&available_sorts[i]->list, list); + + return 0; + } + } + + return -1; +} + +static void setup_sorting(void); + +static void sort_lat(void) +{ + struct rb_node *node; + + for (;;) { + struct task_atoms *data; + node = rb_first(&lat_snapshot_root); + if (!node) + break; + + rb_erase(node, &lat_snapshot_root); + data = rb_entry(node, struct task_atoms, node); + __thread_latency_insert(&sorted_lat_snapshot_root, data, &sort_list); + } +} + static void __cmd_lat(void) { struct rb_node *next; setup_pager(); read_events(); + sort_lat(); printf("-----------------------------------------------------------------------------------\n"); printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); printf("-----------------------------------------------------------------------------------\n"); - next = rb_first(&lat_snapshot_root); + next = rb_first(&sorted_lat_snapshot_root); while (next) { struct task_atoms *atom_list; @@ -1469,11 +1634,30 @@ static const struct option options[] = { "replay sched behaviour from traces"), OPT_BOOLEAN('l', "latency", &lat_mode, "measure various latencies"), + OPT_STRING('s', "sort", &sort_order, "key[,key2...]", + "sort by key(s): runtime, switch, avg, max"), OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_END() }; +static void setup_sorting(void) +{ + char *tmp, *tok, *str = strdup(sort_order); + + for (tok = strtok_r(str, ", ", &tmp); + tok; tok = strtok_r(NULL, ", ", &tmp)) { + if (sort_dimension__add(tok, &sort_list) < 0) { + error("Unknown --sort key: `%s'", tok); + usage_with_options(sched_usage, options); + } + } + + free(str); + + sort_dimension__add((char *)"pid", &cmp_pid); +} + int cmd_sched(int argc, const char **argv, const char *prefix __used) { symbol__init(); @@ -1496,6 +1680,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) else usage_with_options(sched_usage, options); + setup_sorting(); + if (replay_mode) __cmd_replay(); else if (lat_mode) -- cgit v1.2.3 From f2858d8ad9858e63c87257553c5721cba5db95ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Sep 2009 12:12:54 +0200 Subject: perf sched: Add 'perf sched latency' and 'perf sched replay' Separate the option parsing cleanly and add two variants: - 'perf sched latency' (can be abbreviated via 'perf sched lat') - 'perf sched replay' (can be abbreviated via 'perf sched rep') Also add a repeat count option to replay and add a separation set of options for replay. Do the sorting setup only in the latency sub-command. Display separate help screens for 'perf sched' and 'perf sched replay -h' - i.e. 
further separation of the sub-commands. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 90 +++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 32 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 10fcd49e298..e01cc63b98c 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -30,9 +30,6 @@ static struct thread *last_match; static struct perf_header *header; static u64 sample_type; -static int replay_mode; -static int lat_mode; - static char default_sort_order[] = "avg, max, switch, runtime"; static char *sort_order = default_sort_order; @@ -623,9 +620,11 @@ static void test_calibrations(void) printf("the sleep test took %Ld nsecs\n", T1-T0); } +static unsigned long replay_repeat = 10; + static void __cmd_replay(void) { - long nr_iterations = 10, i; + unsigned long i; calibrate_run_measurement_overhead(); calibrate_sleep_measurement_overhead(); @@ -651,7 +650,7 @@ static void __cmd_replay(void) create_tasks(); printf("------------------------------------------------------------\n"); - for (i = 0; i < nr_iterations; i++) + for (i = 0; i < replay_repeat; i++) run_one_test(); } @@ -1623,21 +1622,45 @@ more: } static const char * const sched_usage[] = { - "perf sched [] ", + "perf sched [] {record|latency|replay}", NULL }; -static const struct option options[] = { +static const struct option sched_options[] = { + OPT_BOOLEAN('v', "verbose", &verbose, + "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), - OPT_BOOLEAN('r', "replay", &replay_mode, - "replay sched behaviour from traces"), - OPT_BOOLEAN('l', "latency", &lat_mode, - "measure various latencies"), + OPT_END() +}; + +static const char * const latency_usage[] = { + "perf sched latency []", + NULL +}; + +static const struct option latency_options[] = { OPT_STRING('s', "sort", &sort_order, "key[,key2...]", "sort by key(s): runtime, switch, avg, max"), OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), + OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, + "dump raw trace in ASCII"), + OPT_END() +}; + +static const char * const replay_usage[] = { + "perf sched replay []", + NULL +}; + +static const struct option replay_options[] = { + OPT_INTEGER('r', "repeat", &replay_repeat, + "repeat the workload replay N times (-1: infinite)"), + OPT_BOOLEAN('v', "verbose", &verbose, + "be more verbose (show symbol address, etc)"), + OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, + "dump raw trace in ASCII"), OPT_END() }; @@ -1649,7 +1672,7 @@ static void setup_sorting(void) tok; tok = strtok_r(NULL, ", ", &tmp)) { if (sort_dimension__add(tok, &sort_list) < 0) { error("Unknown --sort key: `%s'", tok); - usage_with_options(sched_usage, options); + usage_with_options(latency_usage, latency_options); } } @@ -1663,29 +1686,32 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) symbol__init(); page_size = getpagesize(); - argc = parse_options(argc, argv, options, sched_usage, 0); - if (argc) { - /* - * Special case: if there's an argument left then assume tha - * it's a symbol filter: - */ - if (argc > 1) - usage_with_options(sched_usage, options); - } + argc = parse_options(argc, argv, sched_options, sched_usage, + PARSE_OPT_STOP_AT_NON_OPTION); + if (!argc) + 
usage_with_options(sched_usage, sched_options); - if (replay_mode) - trace_handler = &replay_ops; - else if (lat_mode) + if (!strncmp(argv[0], "lat", 3)) { trace_handler = &lat_ops; - else - usage_with_options(sched_usage, options); - - setup_sorting(); - - if (replay_mode) - __cmd_replay(); - else if (lat_mode) + if (argc > 1) { + argc = parse_options(argc, argv, latency_options, latency_usage, 0); + if (argc) + usage_with_options(latency_usage, latency_options); + setup_sorting(); + } __cmd_lat(); + } else if (!strncmp(argv[0], "rep", 3)) { + trace_handler = &replay_ops; + if (argc) { + argc = parse_options(argc, argv, replay_options, replay_usage, 0); + if (argc) + usage_with_options(replay_usage, replay_options); + } + __cmd_replay(); + } else { + usage_with_options(sched_usage, sched_options); + } + return 0; } -- cgit v1.2.3 From b1ffe8f3e0c96f5527a89e24410d6b0e59b3554a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Sep 2009 12:12:54 +0200 Subject: perf sched: Finish latency => atom rename and misc cleanups - Rename 'latency' field/variable names to the better 'atom' ones - Reduce the number of #include lines and consolidate them - Gather file scope variables at the top of the file - Remove unused bits No change in functionality. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 399 +++++++++++++++++++++------------------------ 1 file changed, 184 insertions(+), 215 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index e01cc63b98c..cc2dbd5b50e 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1,4 +1,5 @@ #include "builtin.h" +#include "perf.h" #include "util/util.h" #include "util/cache.h" @@ -7,15 +8,16 @@ #include "util/header.h" #include "util/parse-options.h" +#include "util/trace-event.h" -#include "perf.h" #include "util/debug.h" -#include "util/trace-event.h" #include +#include - -#define MAX_CPUS 4096 +#include +#include +#include static char const *input_name = "perf.data"; static int input; @@ -33,44 +35,126 @@ static u64 sample_type; static char default_sort_order[] = "avg, max, switch, runtime"; static char *sort_order = default_sort_order; +#define PR_SET_NAME 15 /* Set process name */ +#define MAX_CPUS 4096 -/* - * Scheduler benchmarks - */ -#include -#include -#include -#include -#include +#define BUG_ON(x) assert(!(x)) -#include +static u64 run_measurement_overhead; +static u64 sleep_measurement_overhead; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#define COMM_LEN 20 +#define SYM_LEN 129 -#include +#define MAX_PID 65536 -#define PR_SET_NAME 15 /* Set process name */ +static unsigned long nr_tasks; -#define BUG_ON(x) assert(!(x)) +struct sched_event; -#define DEBUG 0 +struct task_desc { + unsigned long nr; + unsigned long pid; + char comm[COMM_LEN]; -typedef unsigned long long nsec_t; + unsigned long nr_events; + unsigned long curr_event; + struct sched_event **events; + + pthread_t thread; + sem_t sleep_sem; -static nsec_t run_measurement_overhead; -static nsec_t sleep_measurement_overhead; + sem_t ready_for_work; + sem_t work_done_sem; + + u64 cpu_usage; +}; + +enum sched_event_type { + SCHED_EVENT_RUN, + SCHED_EVENT_SLEEP, + SCHED_EVENT_WAKEUP, +}; + +struct sched_event { + enum sched_event_type type; + u64 timestamp; + u64 duration; + unsigned long nr; + 
int specific_wait; + sem_t *wait_sem; + struct task_desc *wakee; +}; + +static struct task_desc *pid_to_task[MAX_PID]; + +static struct task_desc **tasks; + +static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER; +static u64 start_time; + +static pthread_mutex_t work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER; -static nsec_t get_nsecs(void) +static unsigned long nr_run_events; +static unsigned long nr_sleep_events; +static unsigned long nr_wakeup_events; + +static unsigned long nr_sleep_corrections; +static unsigned long nr_run_events_optimized; + +static unsigned long targetless_wakeups; +static unsigned long multitarget_wakeups; + +static u64 cpu_usage; +static u64 runavg_cpu_usage; +static u64 parent_cpu_usage; +static u64 runavg_parent_cpu_usage; + +static unsigned long nr_runs; +static u64 sum_runtime; +static u64 sum_fluct; +static u64 run_avg; + +static unsigned long replay_repeat = 10; + +#define TASK_STATE_TO_CHAR_STR "RSDTtZX" + +enum thread_state { + THREAD_SLEEPING = 0, + THREAD_WAIT_CPU, + THREAD_SCHED_IN, + THREAD_IGNORE +}; + +struct work_atom { + struct list_head list; + enum thread_state state; + u64 wake_up_time; + u64 sched_in_time; + u64 runtime; +}; + +struct task_atoms { + struct list_head atom_list; + struct thread *thread; + struct rb_node node; + u64 max_lat; + u64 total_lat; + u64 nb_atoms; + u64 total_runtime; +}; + +typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *); + +static struct rb_root atom_root, sorted_atom_root; + +static u64 all_runtime; +static u64 all_count; + +static int read_events(void); + + +static u64 get_nsecs(void) { struct timespec ts; @@ -79,16 +163,16 @@ static nsec_t get_nsecs(void) return ts.tv_sec * 1000000000ULL + ts.tv_nsec; } -static void burn_nsecs(nsec_t nsecs) +static void burn_nsecs(u64 nsecs) { - nsec_t T0 = get_nsecs(), T1; + u64 T0 = get_nsecs(), T1; do { T1 = get_nsecs(); } while (T1 + run_measurement_overhead < T0 + nsecs); } -static void sleep_nsecs(nsec_t nsecs) +static void sleep_nsecs(u64 nsecs) { struct timespec ts; @@ -100,7 +184,7 @@ static void sleep_nsecs(nsec_t nsecs) static void calibrate_run_measurement_overhead(void) { - nsec_t T0, T1, delta, min_delta = 1000000000ULL; + u64 T0, T1, delta, min_delta = 1000000000ULL; int i; for (i = 0; i < 10; i++) { @@ -117,7 +201,7 @@ static void calibrate_run_measurement_overhead(void) static void calibrate_sleep_measurement_overhead(void) { - nsec_t T0, T1, delta, min_delta = 1000000000ULL; + u64 T0, T1, delta, min_delta = 1000000000ULL; int i; for (i = 0; i < 10; i++) { @@ -133,67 +217,8 @@ static void calibrate_sleep_measurement_overhead(void) printf("sleep measurement overhead: %Ld nsecs\n", min_delta); } -#define COMM_LEN 20 -#define SYM_LEN 129 - -#define MAX_PID 65536 - -static unsigned long nr_tasks; - -struct sched_event; - -struct task_desc { - unsigned long nr; - unsigned long pid; - char comm[COMM_LEN]; - - unsigned long nr_events; - unsigned long curr_event; - struct sched_event **events; - - pthread_t thread; - sem_t sleep_sem; - - sem_t ready_for_work; - sem_t work_done_sem; - - nsec_t cpu_usage; -}; - -enum sched_event_type { - SCHED_EVENT_RUN, - SCHED_EVENT_SLEEP, - SCHED_EVENT_WAKEUP, -}; - -struct sched_event { - enum sched_event_type type; - nsec_t timestamp; - nsec_t duration; - unsigned long nr; - int specific_wait; - sem_t *wait_sem; - struct task_desc *wakee; -}; - -static struct task_desc *pid_to_task[MAX_PID]; - -static struct task_desc **tasks; - -static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER; 
-static nsec_t start_time; - -static pthread_mutex_t work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER; - -static unsigned long nr_run_events; -static unsigned long nr_sleep_events; -static unsigned long nr_wakeup_events; - -static unsigned long nr_sleep_corrections; -static unsigned long nr_run_events_optimized; - static struct sched_event * -get_new_event(struct task_desc *task, nsec_t timestamp) +get_new_event(struct task_desc *task, u64 timestamp) { struct sched_event *event = calloc(1, sizeof(*event)); unsigned long idx = task->nr_events; @@ -221,7 +246,7 @@ static struct sched_event *last_event(struct task_desc *task) } static void -add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration) +add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration) { struct sched_event *event, *curr_event = last_event(task); @@ -243,11 +268,8 @@ add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration) nr_run_events++; } -static unsigned long targetless_wakeups; -static unsigned long multitarget_wakeups; - static void -add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp, +add_sched_event_wakeup(struct task_desc *task, u64 timestamp, struct task_desc *wakee) { struct sched_event *event, *wakee_event; @@ -275,7 +297,7 @@ add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp, } static void -add_sched_event_sleep(struct task_desc *task, nsec_t timestamp, +add_sched_event_sleep(struct task_desc *task, u64 timestamp, u64 task_state __used) { struct sched_event *event = get_new_event(task, timestamp); @@ -350,7 +372,7 @@ static void process_sched_event(struct task_desc *this_task __used, struct sched_event *event) { int ret = 0; - nsec_t now; + u64 now; long long delta; now = get_nsecs(); @@ -375,10 +397,10 @@ process_sched_event(struct task_desc *this_task __used, struct sched_event *even } } -static nsec_t get_cpu_usage_nsec_parent(void) +static u64 get_cpu_usage_nsec_parent(void) { struct rusage ru; - nsec_t sum; + u64 sum; int err; err = getrusage(RUSAGE_SELF, &ru); @@ -390,12 +412,12 @@ static nsec_t get_cpu_usage_nsec_parent(void) return sum; } -static nsec_t get_cpu_usage_nsec_self(void) +static u64 get_cpu_usage_nsec_self(void) { char filename [] = "/proc/1234567890/sched"; unsigned long msecs, nsecs; char *line = NULL; - nsec_t total = 0; + u64 total = 0; size_t len = 0; ssize_t chars; FILE *file; @@ -423,7 +445,7 @@ static nsec_t get_cpu_usage_nsec_self(void) static void *thread_func(void *ctx) { struct task_desc *this_task = ctx; - nsec_t cpu_usage_0, cpu_usage_1; + u64 cpu_usage_0, cpu_usage_1; unsigned long i, ret; char comm2[22]; @@ -485,14 +507,9 @@ static void create_tasks(void) } } -static nsec_t cpu_usage; -static nsec_t runavg_cpu_usage; -static nsec_t parent_cpu_usage; -static nsec_t runavg_parent_cpu_usage; - static void wait_for_tasks(void) { - nsec_t cpu_usage_0, cpu_usage_1; + u64 cpu_usage_0, cpu_usage_1; struct task_desc *task; unsigned long i, ret; @@ -543,16 +560,9 @@ static void wait_for_tasks(void) } } -static int read_events(void); - -static unsigned long nr_runs; -static nsec_t sum_runtime; -static nsec_t sum_fluct; -static nsec_t run_avg; - static void run_one_test(void) { - nsec_t T0, T1, delta, avg_delta, fluct, std_dev; + u64 T0, T1, delta, avg_delta, fluct, std_dev; T0 = get_nsecs(); wait_for_tasks(); @@ -576,10 +586,6 @@ static void run_one_test(void) printf("#%-3ld: %0.3f, ", nr_runs, (double)delta/1000000.0); -#if 0 - printf("%0.2f +- %0.2f, ", - (double)avg_delta/1e6, (double)std_dev/1e6); -#endif 
printf("ravg: %0.2f, ", (double)run_avg/1e6); @@ -605,7 +611,7 @@ static void run_one_test(void) static void test_calibrations(void) { - nsec_t T0, T1; + u64 T0, T1; T0 = get_nsecs(); burn_nsecs(1e6); @@ -620,8 +626,6 @@ static void test_calibrations(void) printf("the sleep test took %Ld nsecs\n", T1-T0); } -static unsigned long replay_repeat = 10; - static void __cmd_replay(void) { unsigned long i; @@ -865,47 +869,8 @@ static struct trace_sched_handler replay_ops = { .fork_event = replay_fork_event, }; -#define TASK_STATE_TO_CHAR_STR "RSDTtZX" - -enum thread_state { - THREAD_SLEEPING = 0, - THREAD_WAIT_CPU, - THREAD_SCHED_IN, - THREAD_IGNORE -}; - -struct work_atom { - struct list_head list; - enum thread_state state; - u64 wake_up_time; - u64 sched_in_time; - u64 runtime; -}; - -struct task_atoms { - struct list_head snapshot_list; - struct thread *thread; - struct rb_node node; - u64 max_lat; - u64 total_lat; - u64 nb_atoms; - u64 total_runtime; -}; - -typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *); - -struct sort_dimension { - const char *name; - sort_thread_lat cmp; - struct list_head list; -}; - -static LIST_HEAD(cmp_pid); - -static struct rb_root lat_snapshot_root, sorted_lat_snapshot_root; - static struct task_atoms * -thread_atom_list_search(struct rb_root *root, struct thread *thread) +thread_atoms_search(struct rb_root *root, struct thread *thread) { struct rb_node *node = root->rb_node; @@ -924,6 +889,14 @@ thread_atom_list_search(struct rb_root *root, struct thread *thread) return NULL; } +struct sort_dimension { + const char *name; + sort_thread_lat cmp; + struct list_head list; +}; + +static LIST_HEAD(cmp_pid); + static int thread_lat_cmp(struct list_head *list, struct task_atoms *l, struct task_atoms *r) @@ -965,16 +938,17 @@ __thread_latency_insert(struct rb_root *root, struct task_atoms *data, rb_insert_color(&data->node, root); } -static void thread_atom_list_insert(struct thread *thread) +static void thread_atoms_insert(struct thread *thread) { struct task_atoms *atoms; + atoms = calloc(sizeof(*atoms), 1); if (!atoms) die("No memory"); atoms->thread = thread; - INIT_LIST_HEAD(&atoms->snapshot_list); - __thread_latency_insert(&lat_snapshot_root, atoms, &cmp_pid); + INIT_LIST_HEAD(&atoms->atom_list); + __thread_latency_insert(&atom_root, atoms, &cmp_pid); } static void @@ -1001,50 +975,49 @@ lat_sched_out(struct task_atoms *atoms, u64 delta, u64 timestamp) { - struct work_atom *snapshot; + struct work_atom *atom; - snapshot = calloc(sizeof(*snapshot), 1); - if (!snapshot) + atom = calloc(sizeof(*atom), 1); + if (!atom) die("Non memory"); if (sched_out_state(switch_event) == 'R') { - snapshot->state = THREAD_WAIT_CPU; - snapshot->wake_up_time = timestamp; + atom->state = THREAD_WAIT_CPU; + atom->wake_up_time = timestamp; } - snapshot->runtime = delta; - list_add_tail(&snapshot->list, &atoms->snapshot_list); + atom->runtime = delta; + list_add_tail(&atom->list, &atoms->atom_list); } static void lat_sched_in(struct task_atoms *atoms, u64 timestamp) { - struct work_atom *snapshot; + struct work_atom *atom; u64 delta; - if (list_empty(&atoms->snapshot_list)) + if (list_empty(&atoms->atom_list)) return; - snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom, - list); + atom = list_entry(atoms->atom_list.prev, struct work_atom, list); - if (snapshot->state != THREAD_WAIT_CPU) + if (atom->state != THREAD_WAIT_CPU) return; - if (timestamp < snapshot->wake_up_time) { - snapshot->state = THREAD_IGNORE; + if (timestamp < atom->wake_up_time) { + 
atom->state = THREAD_IGNORE; return; } - snapshot->state = THREAD_SCHED_IN; - snapshot->sched_in_time = timestamp; + atom->state = THREAD_SCHED_IN; + atom->sched_in_time = timestamp; - delta = snapshot->sched_in_time - snapshot->wake_up_time; + delta = atom->sched_in_time - atom->wake_up_time; atoms->total_lat += delta; if (delta > atoms->max_lat) atoms->max_lat = delta; atoms->nb_atoms++; - atoms->total_runtime += snapshot->runtime; + atoms->total_runtime += atom->runtime; } static void @@ -1076,20 +1049,20 @@ latency_switch_event(struct trace_switch_event *switch_event, sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match); sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match); - in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in); + in_atoms = thread_atoms_search(&atom_root, sched_in); if (!in_atoms) { - thread_atom_list_insert(sched_in); - in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in); + thread_atoms_insert(sched_in); + in_atoms = thread_atoms_search(&atom_root, sched_in); if (!in_atoms) - die("Internal latency tree error"); + die("in-atom: Internal tree error"); } - out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out); + out_atoms = thread_atoms_search(&atom_root, sched_out); if (!out_atoms) { - thread_atom_list_insert(sched_out); - out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out); + thread_atoms_insert(sched_out); + out_atoms = thread_atoms_search(&atom_root, sched_out); if (!out_atoms) - die("Internal latency tree error"); + die("out-atom: Internal tree error"); } lat_sched_in(in_atoms, timestamp); @@ -1104,7 +1077,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, struct thread *thread __used) { struct task_atoms *atoms; - struct work_atom *snapshot; + struct work_atom *atom; struct thread *wakee; /* Note for later, it may be interesting to observe the failing cases */ @@ -1112,23 +1085,22 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, return; wakee = threads__findnew(wakeup_event->pid, &threads, &last_match); - atoms = thread_atom_list_search(&lat_snapshot_root, wakee); + atoms = thread_atoms_search(&atom_root, wakee); if (!atoms) { - thread_atom_list_insert(wakee); + thread_atoms_insert(wakee); return; } - if (list_empty(&atoms->snapshot_list)) + if (list_empty(&atoms->atom_list)) return; - snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom, - list); + atom = list_entry(atoms->atom_list.prev, struct work_atom, list); - if (snapshot->state != THREAD_SLEEPING) + if (atom->state != THREAD_SLEEPING) return; - snapshot->state = THREAD_WAIT_CPU; - snapshot->wake_up_time = timestamp; + atom->state = THREAD_WAIT_CPU; + atom->wake_up_time = timestamp; } static struct trace_sched_handler lat_ops = { @@ -1137,9 +1109,6 @@ static struct trace_sched_handler lat_ops = { .fork_event = latency_fork_event, }; -static u64 all_runtime; -static u64 all_count; - static void output_lat_thread(struct task_atoms *atom_list) { int i; @@ -1287,13 +1256,13 @@ static void sort_lat(void) for (;;) { struct task_atoms *data; - node = rb_first(&lat_snapshot_root); + node = rb_first(&atom_root); if (!node) break; - rb_erase(node, &lat_snapshot_root); + rb_erase(node, &atom_root); data = rb_entry(node, struct task_atoms, node); - __thread_latency_insert(&sorted_lat_snapshot_root, data, &sort_list); + __thread_latency_insert(&sorted_atom_root, data, &sort_list); } } @@ -1309,7 +1278,7 @@ static void __cmd_lat(void) printf(" Task | Runtime ms | Switches | 
Average delay ms | Maximum delay ms |\n"); printf("-----------------------------------------------------------------------------------\n"); - next = rb_first(&sorted_lat_snapshot_root); + next = rb_first(&sorted_atom_root); while (next) { struct task_atoms *atom_list; -- cgit v1.2.3 From b5fae128e41021889777f8ead810cbd2a8b249fc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Sep 2009 12:12:54 +0200 Subject: perf sched: Clean up PID sorting logic Use a sort list for thread atoms insertion as well - instead of hardcoded for PID. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 88 +++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 41 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index cc2dbd5b50e..b72544f2b96 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -144,7 +144,7 @@ struct task_atoms { u64 total_runtime; }; -typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *); +typedef int (*sort_fn_t)(struct task_atoms *, struct task_atoms *); static struct rb_root atom_root, sorted_atom_root; @@ -869,41 +869,22 @@ static struct trace_sched_handler replay_ops = { .fork_event = replay_fork_event, }; -static struct task_atoms * -thread_atoms_search(struct rb_root *root, struct thread *thread) -{ - struct rb_node *node = root->rb_node; - - while (node) { - struct task_atoms *atoms; - - atoms = container_of(node, struct task_atoms, node); - if (thread->pid > atoms->thread->pid) - node = node->rb_left; - else if (thread->pid < atoms->thread->pid) - node = node->rb_right; - else { - return atoms; - } - } - return NULL; -} - struct sort_dimension { const char *name; - sort_thread_lat cmp; + sort_fn_t cmp; struct list_head list; }; static LIST_HEAD(cmp_pid); static int -thread_lat_cmp(struct list_head *list, struct task_atoms *l, - struct task_atoms *r) +thread_lat_cmp(struct list_head *list, struct task_atoms *l, struct task_atoms *r) { struct sort_dimension *sort; int ret = 0; + BUG_ON(list_empty(list)); + list_for_each_entry(sort, list, list) { ret = sort->cmp(l, r); if (ret) @@ -913,6 +894,32 @@ thread_lat_cmp(struct list_head *list, struct task_atoms *l, return ret; } +static struct task_atoms * +thread_atoms_search(struct rb_root *root, struct thread *thread, + struct list_head *sort_list) +{ + struct rb_node *node = root->rb_node; + struct task_atoms key = { .thread = thread }; + + while (node) { + struct task_atoms *atoms; + int cmp; + + atoms = container_of(node, struct task_atoms, node); + + cmp = thread_lat_cmp(sort_list, &key, atoms); + if (cmp > 0) + node = node->rb_left; + else if (cmp < 0) + node = node->rb_right; + else { + BUG_ON(thread != atoms->thread); + return atoms; + } + } + return NULL; +} + static void __thread_latency_insert(struct rb_root *root, struct task_atoms *data, struct list_head *sort_list) @@ -1049,18 +1056,18 @@ latency_switch_event(struct trace_switch_event *switch_event, sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match); sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match); - in_atoms = thread_atoms_search(&atom_root, sched_in); + in_atoms = thread_atoms_search(&atom_root, sched_in, &cmp_pid); if (!in_atoms) { thread_atoms_insert(sched_in); - in_atoms = thread_atoms_search(&atom_root, sched_in); + in_atoms = 
thread_atoms_search(&atom_root, sched_in, &cmp_pid); if (!in_atoms) die("in-atom: Internal tree error"); } - out_atoms = thread_atoms_search(&atom_root, sched_out); + out_atoms = thread_atoms_search(&atom_root, sched_out, &cmp_pid); if (!out_atoms) { thread_atoms_insert(sched_out); - out_atoms = thread_atoms_search(&atom_root, sched_out); + out_atoms = thread_atoms_search(&atom_root, sched_out, &cmp_pid); if (!out_atoms) die("out-atom: Internal tree error"); } @@ -1085,7 +1092,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, return; wakee = threads__findnew(wakeup_event->pid, &threads, &last_match); - atoms = thread_atoms_search(&atom_root, wakee); + atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid); if (!atoms) { thread_atoms_insert(wakee); return; @@ -1136,7 +1143,6 @@ static void output_lat_thread(struct task_atoms *atom_list) static int pid_cmp(struct task_atoms *l, struct task_atoms *r) { - if (l->thread->pid < r->thread->pid) return -1; if (l->thread->pid > r->thread->pid) @@ -1146,8 +1152,8 @@ static int pid_cmp(struct task_atoms *l, struct task_atoms *r) } static struct sort_dimension pid_sort_dimension = { - .name = "pid", - .cmp = pid_cmp, + .name = "pid", + .cmp = pid_cmp, }; static int avg_cmp(struct task_atoms *l, struct task_atoms *r) @@ -1172,8 +1178,8 @@ static int avg_cmp(struct task_atoms *l, struct task_atoms *r) } static struct sort_dimension avg_sort_dimension = { - .name = "avg", - .cmp = avg_cmp, + .name = "avg", + .cmp = avg_cmp, }; static int max_cmp(struct task_atoms *l, struct task_atoms *r) @@ -1187,8 +1193,8 @@ static int max_cmp(struct task_atoms *l, struct task_atoms *r) } static struct sort_dimension max_sort_dimension = { - .name = "max", - .cmp = max_cmp, + .name = "max", + .cmp = max_cmp, }; static int switch_cmp(struct task_atoms *l, struct task_atoms *r) @@ -1202,8 +1208,8 @@ static int switch_cmp(struct task_atoms *l, struct task_atoms *r) } static struct sort_dimension switch_sort_dimension = { - .name = "switch", - .cmp = switch_cmp, + .name = "switch", + .cmp = switch_cmp, }; static int runtime_cmp(struct task_atoms *l, struct task_atoms *r) @@ -1217,8 +1223,8 @@ static int runtime_cmp(struct task_atoms *l, struct task_atoms *r) } static struct sort_dimension runtime_sort_dimension = { - .name = "runtime", - .cmp = runtime_cmp, + .name = "runtime", + .cmp = runtime_cmp, }; static struct sort_dimension *available_sorts[] = { @@ -1666,8 +1672,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) argc = parse_options(argc, argv, latency_options, latency_usage, 0); if (argc) usage_with_options(latency_usage, latency_options); - setup_sorting(); } + setup_sorting(); __cmd_lat(); } else if (!strncmp(argv[0], "rep", 3)) { trace_handler = &replay_ops; -- cgit v1.2.3 From 1fc35b29b4098aa3bf9fc9acb4c1615d0b5dd95d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 13 Sep 2009 09:44:29 +0200 Subject: perf sched: Implement the 'perf sched record' subcommand Implement the 'perf sched record' subcommand that adds a default list of events, turns on raw sampling and system-wide tracing and passes off the rest of the command to perf record. This is more convenient than having to specify the events all the time. 
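Everything past the 'record' keyword is forwarded verbatim to perf record, so the usual record options and workload syntax keep working on top of the canned event list. A couple of illustrative invocations (a sketch: 'sched.data' and the workloads are placeholders, -o is the standard perf record output-file option):

	$ perf sched record sleep 1              # system-wide sched trace, bounded by the 1s workload
	$ perf sched record -o sched.data make   # hypothetical: custom output file, 'make' as workload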
Before:

  $ perf record -a -R -e sched:sched_switch:r -e sched:sched_stat_wait:r -e sched:sched_stat_sleep:r -e sched:sched_stat_iowait:r -e sched:sched_process_exit:r -e sched:sched_process_fork:r -e sched:sched_wakeup:r -e sched:sched_migrate_task:r -c 1 sleep 1

After:

  $ perf sched record -f sleep 1

Also fix an assumption in the event string parser that assumed that strings passed in could be modified. (In this case they won't be, as they come from a read-only constant section.)

Signed-off-by: Ingo Molnar
---
 tools/perf/builtin-sched.c | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

(limited to 'tools/perf/builtin-sched.c')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index b72544f2b96..ede40c1429a 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1656,6 +1656,40 @@ static void setup_sorting(void)
 	sort_dimension__add((char *)"pid", &cmp_pid);
 }
 
+static const char *record_args[] = {
+	"record",
+	"-a",
+	"-R",
+	"-c", "1",
+	"-e", "sched:sched_switch:r",
+	"-e", "sched:sched_stat_wait:r",
+	"-e", "sched:sched_stat_sleep:r",
+	"-e", "sched:sched_stat_iowait:r",
+	"-e", "sched:sched_process_exit:r",
+	"-e", "sched:sched_process_fork:r",
+	"-e", "sched:sched_wakeup:r",
+	"-e", "sched:sched_migrate_task:r",
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+	unsigned int rec_argc, i, j;
+	const char **rec_argv;
+
+	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+	for (i = 0; i < ARRAY_SIZE(record_args); i++)
+		rec_argv[i] = strdup(record_args[i]);
+
+	for (j = 1; j < (unsigned int)argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	BUG_ON(i != rec_argc);
+
+	return cmd_record(i, rec_argv, NULL);
+}
+
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
 	symbol__init();
@@ -1666,7 +1700,9 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 	if (!argc)
 		usage_with_options(sched_usage, sched_options);
 
-	if (!strncmp(argv[0], "lat", 3)) {
+	if (!strncmp(argv[0], "rec", 3)) {
+		return __cmd_record(argc, argv);
+	} else if (!strncmp(argv[0], "lat", 3)) {
 		trace_handler = &lat_ops;
 		if (argc > 1) {
 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
@@ -1687,6 +1723,5 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(sched_usage, sched_options);
 	}
 
-	return 0;
 }
-- cgit v1.2.3

From c13f0d3c8165e9592102687fa999da0a0d9c3724 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Sun, 13 Sep 2009 16:51:04 +0200
Subject: perf sched: Add 'perf sched trace', improve documentation

Alias 'perf sched trace' to 'perf trace', for workflow completeness.

Add a bit of documentation for perf sched.
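With the alias in place, the whole workflow can stay inside one tool. For instance (a sketch; the first two subcommands come from the previous patches):

	$ perf sched record -f sleep 1    # record the sched tracepoints
	$ perf sched latency              # summarize wakeup latencies
	$ perf sched trace                # same as: perf trace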
Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
Cc: Frederic Weisbecker
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 tools/perf/builtin-sched.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'tools/perf/builtin-sched.c')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index ede40c1429a..8db0fd222f8 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1597,7 +1597,7 @@ more:
 }
 
 static const char * const sched_usage[] = {
-	"perf sched [<options>] {record|latency|replay}",
+	"perf sched [<options>] {record|latency|replay|trace}",
 	NULL
 };
@@ -1719,6 +1719,11 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 			usage_with_options(replay_usage, replay_options);
 		}
 		__cmd_replay();
+	} else if (!strcmp(argv[0], "trace")) {
+		/*
+		 * Aliased to 'perf trace' for now:
+		 */
+		return cmd_trace(argc, argv, prefix);
 	} else {
 		usage_with_options(sched_usage, sched_options);
 	}
-- cgit v1.2.3

From d13025222cdb0043e2239b3b819389358bb31bc0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 14 Sep 2009 08:57:15 +0200
Subject: perf tools: Add an option to multiplex counters in a single channel

Add an option to multiplex counters output in the channel of the group leader, i.e. the first counter opened:

	-M --multiplex

The effect is better serialized samples. This is especially useful for tracepoint samples that need to be well serialized for their post-processing.

Also make use of this option in 'perf sched'.

Signed-off-by: Frederic Weisbecker
Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
Signed-off-by: Ingo Molnar
---
 tools/perf/builtin-sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'tools/perf/builtin-sched.c')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 8db0fd222f8..686af633b35 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1660,6 +1660,8 @@ static const char *record_args[] = {
 	"record",
 	"-a",
 	"-R",
+	"-M",
+	"-g",
 	"-c", "1",
 	"-e", "sched:sched_switch:r",
-- cgit v1.2.3

From aa1ab9d26ae9fe2566a9036e3cb83e7d555b3987 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 14 Sep 2009 03:01:12 +0200
Subject: perf tools: Fix processing of randomly serialized sched traces

Currently it's possible to get absurdly high latency results with 'perf sched latency':

 -----------------------------------------------------------------------------------
  Task            |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
 -----------------------------------------------------------------------------------
  xfce4-panel     |    0.222 ms |        2 | avg: 4718.345 ms | max: 9436.493 ms |
  scsi_eh_3       |    3.962 ms |       36 | avg:   55.957 ms | max: 1977.829 ms |

The origin is traces that are sometimes badly serialized across CPUs. For example, here are the raw traces that produced such results for xfce4-panel:

 (1)         [init]-0     [000]  1494.663899990: sched_switch: task swapper:0 [140] (R) ==> xfce4-panel:4569 [120]
 (2)    xfce4-panel-4569  [000]  1494.663928373: sched_switch: task xfce4-panel:4569 [120] (S) ==> swapper:0 [140]
 (3)           Xorg-4276  [001]  1494.663860125: sched_wakeup: task xfce4-panel:4569 [120] success=1 [000]
 (4)           Xorg-4276  [001]  1504.098252756: sched_wakeup: task xfce4-panel:4569 [120] success=1 [000]
 (5)           perf-5219  [000]  1504.100353302: sched_switch: task perf:5219 [120] (S) ==> xfce4-panel:4569 [120]

The traces are processed in the order they arrive.
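Note that "the order they arrive" is stream order, not global timestamp order, which is how events recorded on different CPUs can show up swapped relative to their timestamps. The fix boils down to a one-line sanity check, quoted here from the patch below with an added comment:

	/* A wakeup older than the atom's sched-out is mis-ordered: ignore it */
	if (atom->sched_out_time > timestamp)
		return;

Walking through the example above shows how the mis-ordering produces the bogus numbers.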
In (2), xfce4-panel sleeps; it is first woken up in (3) and eventually scheduled in (5). The latency reported is then 1504 - 1495 = 9 secs, as reported by perf sched. But this is wrong: we trusted the traces to be nicely serialized, when we should instead have trusted the timestamps. If we reorder by timestamps we get:

 (1)           Xorg-4276  [001]  1494.663860125: sched_wakeup: task xfce4-panel:4569 [120] success=1 [000]
 (2)         [init]-0     [000]  1494.663899990: sched_switch: task swapper:0 [140] (R) ==> xfce4-panel:4569 [120]
 (3)    xfce4-panel-4569  [000]  1494.663928373: sched_switch: task xfce4-panel:4569 [120] (S) ==> swapper:0 [140]
 (4)           Xorg-4276  [001]  1504.098252756: sched_wakeup: task xfce4-panel:4569 [120] success=1 [000]
 (5)           perf-5219  [000]  1504.100353302: sched_switch: task perf:5219 [120] (S) ==> xfce4-panel:4569 [120]

Now the trace makes more sense: xfce4-panel is sleeping. It is woken up in (1) and scheduled in (2). It goes to sleep in (3), is woken up in (4) and scheduled in (5). Now the latency captured between (1) and (2) is 39 us, and between (4) and (5) it is 2.1 ms. Such patterns of bad serialization are the origin of the high latencies reported by perf sched.

Basically, we need to check whether the wake-up time is higher than the schedule-out time. If it is not, we need to tag the current work atom as invalid. Besides that, we may later need to work on a better ordering of the traces given by the kernel.

After this patch:

  xfce4-session   |    0.221 ms |        1 | avg:    0.538 ms | max:    0.538 ms |

Signed-off-by: Frederic Weisbecker
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
Signed-off-by: Ingo Molnar
---
 tools/perf/builtin-sched.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'tools/perf/builtin-sched.c')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 686af633b35..3e003237c42 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -129,6 +129,7 @@ enum thread_state {
 struct work_atom {
 	struct list_head list;
 	enum thread_state state;
+	u64 sched_out_time;
 	u64 wake_up_time;
 	u64 sched_in_time;
 	u64 runtime;
@@ -988,9 +989,11 @@ lat_sched_out(struct task_atoms *atoms,
 	if (!atom)
 		die("Non memory");
 
+	atom->sched_out_time = timestamp;
+
 	if (sched_out_state(switch_event) == 'R') {
 		atom->state = THREAD_WAIT_CPU;
-		atom->wake_up_time = timestamp;
+		atom->wake_up_time = atom->sched_out_time;
 	}
 
 	atom->runtime = delta;
@@ -1106,6 +1109,9 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 	if (atom->state != THREAD_SLEEPING)
 		return;
 
+	if (atom->sched_out_time > timestamp)
+		return;
+
 	atom->state = THREAD_WAIT_CPU;
 	atom->wake_up_time = timestamp;
 }
-- cgit v1.2.3

From ea57c4f5203d82c7844c54cdef54e972cf4e9d1f Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Sun, 13 Sep 2009 18:15:54 +0200
Subject: perf tools: Implement counter output multiplexing

Finish the -M/--multiplex option implementation:

- separate it out from group_fd
- correctly set it via the ioctl and don't mmap counters that are multiplexed
- modify the perf record event loop to deal with buffer-less counters.
- remove the -g option from perf sched record - account for unordered events in perf sched latency - (add -f to perf sched record to ease measurements) - skip idle threads (pid==0) in latency output The result is better latency output by 'perf sched latency': ----------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | ----------------------------------------------------------------------------------- ksoftirqd/8 | 0.071 ms | 2 | avg: 0.458 ms | max: 0.913 ms | at-spi-registry | 0.609 ms | 19 | avg: 0.013 ms | max: 0.023 ms | perf | 3.316 ms | 16 | avg: 0.013 ms | max: 0.054 ms | Xorg | 0.392 ms | 19 | avg: 0.011 ms | max: 0.018 ms | sleep | 0.537 ms | 2 | avg: 0.009 ms | max: 0.009 ms | ----------------------------------------------------------------------------------- TOTAL: | 4.925 ms | 58 | --------------------------------------------- Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 3e003237c42..2ce87ef5a3e 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -116,6 +116,8 @@ static u64 sum_fluct; static u64 run_avg; static unsigned long replay_repeat = 10; +static unsigned long nr_timestamps; +static unsigned long unordered_timestamps; #define TASK_STATE_TO_CHAR_STR "RSDTtZX" @@ -1109,8 +1111,11 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, if (atom->state != THREAD_SLEEPING) return; - if (atom->sched_out_time > timestamp) + nr_timestamps++; + if (atom->sched_out_time > timestamp) { + unordered_timestamps++; return; + } atom->state = THREAD_WAIT_CPU; atom->wake_up_time = timestamp; @@ -1130,6 +1135,11 @@ static void output_lat_thread(struct task_atoms *atom_list) if (!atom_list->nb_atoms) return; + /* + * Ignore idle threads: + */ + if (!atom_list->thread->pid) + return; all_runtime += atom_list->total_runtime; all_count += atom_list->nb_atoms; @@ -1301,8 +1311,16 @@ static void __cmd_lat(void) } printf("-----------------------------------------------------------------------------------\n"); - printf(" TOTAL: |%9.3f ms |%9Ld |\n", + printf(" TOTAL: |%9.3f ms |%9Ld |", (double)all_runtime/1e6, all_count); + + if (unordered_timestamps && nr_timestamps) { + printf(" INFO: %.2f%% unordered events.\n", + (double)unordered_timestamps/(double)nr_timestamps*100.0); + } else { + printf("\n"); + } + printf("---------------------------------------------\n"); } @@ -1667,12 +1685,13 @@ static const char *record_args[] = { "-a", "-R", "-M", - "-g", + "-f", "-c", "1", "-e", "sched:sched_switch:r", "-e", "sched:sched_stat_wait:r", "-e", "sched:sched_stat_sleep:r", "-e", "sched:sched_stat_iowait:r", + "-e", "sched:sched_stat_runtime:r", "-e", "sched:sched_process_exit:r", "-e", "sched:sched_process_fork:r", "-e", "sched:sched_wakeup:r", -- cgit v1.2.3 From d11533893b31ab7806ff04bfa69ae646068610ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 14 Sep 2009 18:22:53 +0200 Subject: perf sched: Fix 'perf sched latency' output on 32-bit systems Before: ----------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | 
----------------------------------------------------------------------------------- perf |4853313.251 ms | 10 | avg: 0.046 ms | max: 0.337 ms | flush-8:0 |2426659.202 ms | 5 | avg: 0.015 ms | max: 0.016 ms | sleep |485331.966 ms | 1 | avg: 0.012 ms | max: 0.012 ms | ksoftirqd/1 |485331.320 ms | 1 | avg: 0.005 ms | max: 0.005 ms | ----------------------------------------------------------------------------------- TOTAL: |8250635.739 ms | 17 | --------------------------------------------- After: ----------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | ----------------------------------------------------------------------------------- perf | 0.206 ms | 10 | avg: 0.046 ms | max: 0.337 ms | flush-8:0 | 2.680 ms | 5 | avg: 0.015 ms | max: 0.016 ms | sleep | 0.662 ms | 1 | avg: 0.012 ms | max: 0.012 ms | ksoftirqd/1 | 0.015 ms | 1 | avg: 0.005 ms | max: 0.005 ms | ----------------------------------------------------------------------------------- TOTAL: | 3.563 ms | 17 | --------------------------------------------- Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 2ce87ef5a3e..f856a02cd4f 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -805,7 +805,7 @@ replay_wakeup_event(struct trace_wakeup_event *wakeup_event, add_sched_event_wakeup(waker, timestamp, wakee); } -static unsigned long cpu_last_switched[MAX_CPUS]; +static u64 cpu_last_switched[MAX_CPUS]; static void replay_switch_event(struct trace_switch_event *switch_event, -- cgit v1.2.3 From 08f69e6c2e59b3d73343f8c9ecf758e0133dbc22 Mon Sep 17 00:00:00 2001 From: mingo Date: Mon, 14 Sep 2009 18:30:44 +0200 Subject: perf sched: Print PIDs too Often it's useful to know the PID of the task as well - print it out too. ( While at it, reformat the output to be a bit more paste-into-commit-logs friendly. 
) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index f856a02cd4f..93ef7b215ab 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1144,9 +1144,9 @@ static void output_lat_thread(struct task_atoms *atom_list) all_runtime += atom_list->total_runtime; all_count += atom_list->nb_atoms; - ret = printf(" %s ", atom_list->thread->comm); + ret = printf(" %s-%d ", atom_list->thread->comm, atom_list->thread->pid); - for (i = 0; i < 19 - ret; i++) + for (i = 0; i < 24 - ret; i++) printf(" "); avg = atom_list->total_lat / atom_list->nb_atoms; @@ -1296,9 +1296,9 @@ static void __cmd_lat(void) read_events(); sort_lat(); - printf("-----------------------------------------------------------------------------------\n"); - printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); - printf("-----------------------------------------------------------------------------------\n"); + printf("\n ---------------------------------------------------------------------------------------\n"); + printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); + printf(" ---------------------------------------------------------------------------------------\n"); next = rb_first(&sorted_atom_root); @@ -1310,8 +1310,8 @@ static void __cmd_lat(void) next = rb_next(next); } - printf("-----------------------------------------------------------------------------------\n"); - printf(" TOTAL: |%9.3f ms |%9Ld |", + printf(" ---------------------------------------------------------------------------------------\n"); + printf(" TOTAL: |%9.3f ms |%9Ld |", (double)all_runtime/1e6, all_count); if (unordered_timestamps && nr_timestamps) { @@ -1321,7 +1321,7 @@ static void __cmd_lat(void) printf("\n"); } - printf("---------------------------------------------\n"); + printf(" -------------------------------------------------\n\n"); } static struct trace_sched_handler *trace_handler; -- cgit v1.2.3 From 39aeb52f99f2380c1f16036deed2f7bb8b2e0559 Mon Sep 17 00:00:00 2001 From: mingo Date: Mon, 14 Sep 2009 20:04:48 +0200 Subject: perf sched: Add support for sched:sched_stat_runtime events This allows more precise 'perf sched latency' output: --------------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | --------------------------------------------------------------------------------------- ksoftirqd/0-4 | 0.010 ms | 2 | avg: 2.476 ms | max: 2.977 ms | perf-12328 | 15.844 ms | 66 | avg: 1.118 ms | max: 9.979 ms | bdi-default-235 | 0.009 ms | 1 | avg: 0.998 ms | max: 0.998 ms | events/1-8 | 0.020 ms | 2 | avg: 0.998 ms | max: 0.998 ms | events/0-7 | 0.018 ms | 2 | avg: 0.992 ms | max: 0.996 ms | sleep-12329 | 0.742 ms | 3 | avg: 0.906 ms | max: 2.289 ms | sshd-12122 | 0.163 ms | 2 | avg: 0.283 ms | max: 0.562 ms | loop-getpid-lon-12322 | 1023.636 ms | 69 | avg: 0.208 ms | max: 5.996 ms | loop-getpid-lon-12321 | 1038.638 ms | 5 | avg: 0.073 ms | max: 0.171 ms | migration/1-5 | 0.000 ms | 1 | avg: 0.006 ms | max: 0.006 ms | --------------------------------------------------------------------------------------- TOTAL: | 2079.078 ms | 153 | 
------------------------------------------------- Also, streamline the code a bit more, add asserts for various state machine failures (they should be debugged if they occur) and fix a few odd ends. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 264 +++++++++++++++++++++++++++++---------------- 1 file changed, 173 insertions(+), 91 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 93ef7b215ab..adcb563ec4d 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -50,7 +50,7 @@ static u64 sleep_measurement_overhead; static unsigned long nr_tasks; -struct sched_event; +struct sched_atom; struct task_desc { unsigned long nr; @@ -59,7 +59,7 @@ struct task_desc { unsigned long nr_events; unsigned long curr_event; - struct sched_event **events; + struct sched_atom **atoms; pthread_t thread; sem_t sleep_sem; @@ -76,7 +76,7 @@ enum sched_event_type { SCHED_EVENT_WAKEUP, }; -struct sched_event { +struct sched_atom { enum sched_event_type type; u64 timestamp; u64 duration; @@ -137,8 +137,8 @@ struct work_atom { u64 runtime; }; -struct task_atoms { - struct list_head atom_list; +struct work_atoms { + struct list_head work_list; struct thread *thread; struct rb_node node; u64 max_lat; @@ -147,7 +147,7 @@ struct task_atoms { u64 total_runtime; }; -typedef int (*sort_fn_t)(struct task_atoms *, struct task_atoms *); +typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *); static struct rb_root atom_root, sorted_atom_root; @@ -220,10 +220,10 @@ static void calibrate_sleep_measurement_overhead(void) printf("sleep measurement overhead: %Ld nsecs\n", min_delta); } -static struct sched_event * +static struct sched_atom * get_new_event(struct task_desc *task, u64 timestamp) { - struct sched_event *event = calloc(1, sizeof(*event)); + struct sched_atom *event = calloc(1, sizeof(*event)); unsigned long idx = task->nr_events; size_t size; @@ -231,27 +231,27 @@ get_new_event(struct task_desc *task, u64 timestamp) event->nr = idx; task->nr_events++; - size = sizeof(struct sched_event *) * task->nr_events; - task->events = realloc(task->events, size); - BUG_ON(!task->events); + size = sizeof(struct sched_atom *) * task->nr_events; + task->atoms = realloc(task->atoms, size); + BUG_ON(!task->atoms); - task->events[idx] = event; + task->atoms[idx] = event; return event; } -static struct sched_event *last_event(struct task_desc *task) +static struct sched_atom *last_event(struct task_desc *task) { if (!task->nr_events) return NULL; - return task->events[task->nr_events - 1]; + return task->atoms[task->nr_events - 1]; } static void add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration) { - struct sched_event *event, *curr_event = last_event(task); + struct sched_atom *event, *curr_event = last_event(task); /* * optimize an existing RUN event by merging this one @@ -275,7 +275,7 @@ static void add_sched_event_wakeup(struct task_desc *task, u64 timestamp, struct task_desc *wakee) { - struct sched_event *event, *wakee_event; + struct sched_atom *event, *wakee_event; event = get_new_event(task, timestamp); event->type = SCHED_EVENT_WAKEUP; @@ -303,7 +303,7 @@ static void add_sched_event_sleep(struct task_desc *task, u64 timestamp, u64 task_state __used) { - struct sched_event *event = get_new_event(task, timestamp); + struct sched_atom *event = 
get_new_event(task, timestamp); event->type = SCHED_EVENT_SLEEP; @@ -372,27 +372,27 @@ static void add_cross_task_wakeups(void) } static void -process_sched_event(struct task_desc *this_task __used, struct sched_event *event) +process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom) { int ret = 0; u64 now; long long delta; now = get_nsecs(); - delta = start_time + event->timestamp - now; + delta = start_time + atom->timestamp - now; - switch (event->type) { + switch (atom->type) { case SCHED_EVENT_RUN: - burn_nsecs(event->duration); + burn_nsecs(atom->duration); break; case SCHED_EVENT_SLEEP: - if (event->wait_sem) - ret = sem_wait(event->wait_sem); + if (atom->wait_sem) + ret = sem_wait(atom->wait_sem); BUG_ON(ret); break; case SCHED_EVENT_WAKEUP: - if (event->wait_sem) - ret = sem_post(event->wait_sem); + if (atom->wait_sem) + ret = sem_post(atom->wait_sem); BUG_ON(ret); break; default: @@ -467,7 +467,7 @@ again: for (i = 0; i < this_task->nr_events; i++) { this_task->curr_event = i; - process_sched_event(this_task, this_task->events[i]); + process_sched_event(this_task, this_task->atoms[i]); } cpu_usage_1 = get_cpu_usage_nsec_self(); @@ -649,7 +649,7 @@ static void __cmd_replay(void) if (multitarget_wakeups) printf("multi-target wakeups: %ld\n", multitarget_wakeups); if (nr_run_events_optimized) - printf("run events optimized: %ld\n", + printf("run atoms optimized: %ld\n", nr_run_events_optimized); print_task_traces(); @@ -727,6 +727,20 @@ struct trace_switch_event { u32 next_prio; }; +struct trace_runtime_event { + u32 size; + + u16 common_type; + u8 common_flags; + u8 common_preempt_count; + u32 common_pid; + u32 common_tgid; + + char comm[16]; + u32 pid; + u64 runtime; + u64 vruntime; +}; struct trace_wakeup_event { u32 size; @@ -767,6 +781,12 @@ struct trace_sched_handler { u64 timestamp, struct thread *thread); + void (*runtime_event)(struct trace_runtime_event *, + struct event *, + int cpu, + u64 timestamp, + struct thread *thread); + void (*wakeup_event)(struct trace_wakeup_event *, struct event *, int cpu, @@ -881,7 +901,7 @@ struct sort_dimension { static LIST_HEAD(cmp_pid); static int -thread_lat_cmp(struct list_head *list, struct task_atoms *l, struct task_atoms *r) +thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r) { struct sort_dimension *sort; int ret = 0; @@ -897,18 +917,18 @@ thread_lat_cmp(struct list_head *list, struct task_atoms *l, struct task_atoms * return ret; } -static struct task_atoms * +static struct work_atoms * thread_atoms_search(struct rb_root *root, struct thread *thread, struct list_head *sort_list) { struct rb_node *node = root->rb_node; - struct task_atoms key = { .thread = thread }; + struct work_atoms key = { .thread = thread }; while (node) { - struct task_atoms *atoms; + struct work_atoms *atoms; int cmp; - atoms = container_of(node, struct task_atoms, node); + atoms = container_of(node, struct work_atoms, node); cmp = thread_lat_cmp(sort_list, &key, atoms); if (cmp > 0) @@ -924,16 +944,16 @@ thread_atoms_search(struct rb_root *root, struct thread *thread, } static void -__thread_latency_insert(struct rb_root *root, struct task_atoms *data, +__thread_latency_insert(struct rb_root *root, struct work_atoms *data, struct list_head *sort_list) { struct rb_node **new = &(root->rb_node), *parent = NULL; while (*new) { - struct task_atoms *this; + struct work_atoms *this; int cmp; - this = container_of(*new, struct task_atoms, node); + this = container_of(*new, struct work_atoms, node); parent = 
*new; cmp = thread_lat_cmp(sort_list, data, this); @@ -950,14 +970,14 @@ __thread_latency_insert(struct rb_root *root, struct task_atoms *data, static void thread_atoms_insert(struct thread *thread) { - struct task_atoms *atoms; + struct work_atoms *atoms; atoms = calloc(sizeof(*atoms), 1); if (!atoms) die("No memory"); atoms->thread = thread; - INIT_LIST_HEAD(&atoms->atom_list); + INIT_LIST_HEAD(&atoms->work_list); __thread_latency_insert(&atom_root, atoms, &cmp_pid); } @@ -980,10 +1000,9 @@ static char sched_out_state(struct trace_switch_event *switch_event) } static void -lat_sched_out(struct task_atoms *atoms, - struct trace_switch_event *switch_event __used, - u64 delta, - u64 timestamp) +add_sched_out_event(struct work_atoms *atoms, + char run_state, + u64 timestamp) { struct work_atom *atom; @@ -993,25 +1012,37 @@ lat_sched_out(struct task_atoms *atoms, atom->sched_out_time = timestamp; - if (sched_out_state(switch_event) == 'R') { + if (run_state == 'R') { atom->state = THREAD_WAIT_CPU; atom->wake_up_time = atom->sched_out_time; } - atom->runtime = delta; - list_add_tail(&atom->list, &atoms->atom_list); + list_add_tail(&atom->list, &atoms->work_list); +} + +static void +add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used) +{ + struct work_atom *atom; + + BUG_ON(list_empty(&atoms->work_list)); + + atom = list_entry(atoms->work_list.prev, struct work_atom, list); + + atom->runtime += delta; + atoms->total_runtime += delta; } static void -lat_sched_in(struct task_atoms *atoms, u64 timestamp) +add_sched_in_event(struct work_atoms *atoms, u64 timestamp) { struct work_atom *atom; u64 delta; - if (list_empty(&atoms->atom_list)) + if (list_empty(&atoms->work_list)) return; - atom = list_entry(atoms->atom_list.prev, struct work_atom, list); + atom = list_entry(atoms->work_list.prev, struct work_atom, list); if (atom->state != THREAD_WAIT_CPU) return; @@ -1029,7 +1060,6 @@ lat_sched_in(struct task_atoms *atoms, u64 timestamp) if (delta > atoms->max_lat) atoms->max_lat = delta; atoms->nb_atoms++; - atoms->total_runtime += atom->runtime; } static void @@ -1039,13 +1069,12 @@ latency_switch_event(struct trace_switch_event *switch_event, u64 timestamp, struct thread *thread __used) { - struct task_atoms *out_atoms, *in_atoms; + struct work_atoms *out_events, *in_events; struct thread *sched_out, *sched_in; u64 timestamp0; s64 delta; - if (cpu >= MAX_CPUS || cpu < 0) - return; + BUG_ON(cpu >= MAX_CPUS || cpu < 0); timestamp0 = cpu_last_switched[cpu]; cpu_last_switched[cpu] = timestamp; @@ -1061,34 +1090,63 @@ latency_switch_event(struct trace_switch_event *switch_event, sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match); sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match); - in_atoms = thread_atoms_search(&atom_root, sched_in, &cmp_pid); - if (!in_atoms) { + out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid); + if (!out_events) { + thread_atoms_insert(sched_out); + out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid); + if (!out_events) + die("out-event: Internal tree error"); + } + add_sched_out_event(out_events, sched_out_state(switch_event), timestamp); + + in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid); + if (!in_events) { thread_atoms_insert(sched_in); - in_atoms = thread_atoms_search(&atom_root, sched_in, &cmp_pid); - if (!in_atoms) - die("in-atom: Internal tree error"); + in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid); + if (!in_events) + die("in-event: 
Internal tree error"); + /* + * Take came in we have not heard about yet, + * add in an initial atom in runnable state: + */ + add_sched_out_event(in_events, 'R', timestamp); } + add_sched_in_event(in_events, timestamp); +} - out_atoms = thread_atoms_search(&atom_root, sched_out, &cmp_pid); - if (!out_atoms) { - thread_atoms_insert(sched_out); - out_atoms = thread_atoms_search(&atom_root, sched_out, &cmp_pid); - if (!out_atoms) - die("out-atom: Internal tree error"); +static void +latency_runtime_event(struct trace_runtime_event *runtime_event, + struct event *event __used, + int cpu, + u64 timestamp, + struct thread *this_thread __used) +{ + struct work_atoms *atoms; + struct thread *thread; + + BUG_ON(cpu >= MAX_CPUS || cpu < 0); + + thread = threads__findnew(runtime_event->pid, &threads, &last_match); + atoms = thread_atoms_search(&atom_root, thread, &cmp_pid); + if (!atoms) { + thread_atoms_insert(thread); + atoms = thread_atoms_search(&atom_root, thread, &cmp_pid); + if (!atoms) + die("in-event: Internal tree error"); + add_sched_out_event(atoms, 'R', timestamp); } - lat_sched_in(in_atoms, timestamp); - lat_sched_out(out_atoms, switch_event, delta, timestamp); + add_runtime_event(atoms, runtime_event->runtime, timestamp); } static void latency_wakeup_event(struct trace_wakeup_event *wakeup_event, - struct event *event __used, + struct event *__event __used, int cpu __used, u64 timestamp, struct thread *thread __used) { - struct task_atoms *atoms; + struct work_atoms *atoms; struct work_atom *atom; struct thread *wakee; @@ -1100,16 +1158,20 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid); if (!atoms) { thread_atoms_insert(wakee); - return; + atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid); + if (!atoms) + die("wakeup-event: Internal tree error"); + add_sched_out_event(atoms, 'S', timestamp); } - if (list_empty(&atoms->atom_list)) - return; + BUG_ON(list_empty(&atoms->work_list)); - atom = list_entry(atoms->atom_list.prev, struct work_atom, list); + atom = list_entry(atoms->work_list.prev, struct work_atom, list); - if (atom->state != THREAD_SLEEPING) + if (atom->state != THREAD_SLEEPING) { + printf("boo2\n"); return; + } nr_timestamps++; if (atom->sched_out_time > timestamp) { @@ -1124,40 +1186,41 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, static struct trace_sched_handler lat_ops = { .wakeup_event = latency_wakeup_event, .switch_event = latency_switch_event, + .runtime_event = latency_runtime_event, .fork_event = latency_fork_event, }; -static void output_lat_thread(struct task_atoms *atom_list) +static void output_lat_thread(struct work_atoms *work_list) { int i; int ret; u64 avg; - if (!atom_list->nb_atoms) + if (!work_list->nb_atoms) return; /* * Ignore idle threads: */ - if (!atom_list->thread->pid) + if (!work_list->thread->pid) return; - all_runtime += atom_list->total_runtime; - all_count += atom_list->nb_atoms; + all_runtime += work_list->total_runtime; + all_count += work_list->nb_atoms; - ret = printf(" %s-%d ", atom_list->thread->comm, atom_list->thread->pid); + ret = printf(" %s-%d ", work_list->thread->comm, work_list->thread->pid); for (i = 0; i < 24 - ret; i++) printf(" "); - avg = atom_list->total_lat / atom_list->nb_atoms; + avg = work_list->total_lat / work_list->nb_atoms; printf("|%9.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n", - (double)atom_list->total_runtime / 1e6, - atom_list->nb_atoms, (double)avg / 1e6, - (double)atom_list->max_lat / 1e6); + 
(double)work_list->total_runtime / 1e6, + work_list->nb_atoms, (double)avg / 1e6, + (double)work_list->max_lat / 1e6); } -static int pid_cmp(struct task_atoms *l, struct task_atoms *r) +static int pid_cmp(struct work_atoms *l, struct work_atoms *r) { if (l->thread->pid < r->thread->pid) return -1; @@ -1172,7 +1235,7 @@ static struct sort_dimension pid_sort_dimension = { .cmp = pid_cmp, }; -static int avg_cmp(struct task_atoms *l, struct task_atoms *r) +static int avg_cmp(struct work_atoms *l, struct work_atoms *r) { u64 avgl, avgr; @@ -1198,7 +1261,7 @@ static struct sort_dimension avg_sort_dimension = { .cmp = avg_cmp, }; -static int max_cmp(struct task_atoms *l, struct task_atoms *r) +static int max_cmp(struct work_atoms *l, struct work_atoms *r) { if (l->max_lat < r->max_lat) return -1; @@ -1213,7 +1276,7 @@ static struct sort_dimension max_sort_dimension = { .cmp = max_cmp, }; -static int switch_cmp(struct task_atoms *l, struct task_atoms *r) +static int switch_cmp(struct work_atoms *l, struct work_atoms *r) { if (l->nb_atoms < r->nb_atoms) return -1; @@ -1228,7 +1291,7 @@ static struct sort_dimension switch_sort_dimension = { .cmp = switch_cmp, }; -static int runtime_cmp(struct task_atoms *l, struct task_atoms *r) +static int runtime_cmp(struct work_atoms *l, struct work_atoms *r) { if (l->total_runtime < r->total_runtime) return -1; @@ -1277,13 +1340,13 @@ static void sort_lat(void) struct rb_node *node; for (;;) { - struct task_atoms *data; + struct work_atoms *data; node = rb_first(&atom_root); if (!node) break; rb_erase(node, &atom_root); - data = rb_entry(node, struct task_atoms, node); + data = rb_entry(node, struct work_atoms, node); __thread_latency_insert(&sorted_atom_root, data, &sort_list); } } @@ -1303,10 +1366,10 @@ static void __cmd_lat(void) next = rb_first(&sorted_atom_root); while (next) { - struct task_atoms *atom_list; + struct work_atoms *work_list; - atom_list = rb_entry(next, struct task_atoms, node); - output_lat_thread(atom_list); + work_list = rb_entry(next, struct work_atoms, node); + output_lat_thread(work_list); next = rb_next(next); } @@ -1368,6 +1431,23 @@ process_sched_switch_event(struct raw_event_sample *raw, trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread); } +static void +process_sched_runtime_event(struct raw_event_sample *raw, + struct event *event, + int cpu __used, + u64 timestamp __used, + struct thread *thread __used) +{ + struct trace_runtime_event runtime_event; + + FILL_ARRAY(runtime_event, comm, event, raw->data); + FILL_FIELD(runtime_event, pid, event, raw->data); + FILL_FIELD(runtime_event, runtime, event, raw->data); + FILL_FIELD(runtime_event, vruntime, event, raw->data); + + trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread); +} + static void process_sched_fork_event(struct raw_event_sample *raw, struct event *event, @@ -1410,6 +1490,8 @@ process_raw_event(event_t *raw_event __used, void *more_data, if (!strcmp(event->name, "sched_switch")) process_sched_switch_event(raw, event, cpu, timestamp, thread); + if (!strcmp(event->name, "sched_stat_runtime")) + process_sched_runtime_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup")) process_sched_wakeup_event(raw, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup_new")) -- cgit v1.2.3 From dc02bf7178c8e2cb3d442ae19027b736d51c7dd5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 16 Sep 2009 13:45:00 +0200 Subject: perf sched: Account for lost events, increase default buffering Output 
such lost event and state machine weirdness stats: TOTAL: | 14974.910 ms | 46384 | --------------------------------------------------- INFO: 8.865% lost events (19132 out of 215819, in 8 chunks) INFO: 0.198% state machine bugs (49 out of 24708) (due to lost events?) And increase buffering to -m 1024 (4 MB) by default. Since we use output multiplexing that kind of space is needed. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 60 +++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 19 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index adcb563ec4d..1f0f9be34fa 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -117,7 +117,11 @@ static u64 run_avg; static unsigned long replay_repeat = 10; static unsigned long nr_timestamps; -static unsigned long unordered_timestamps; +static unsigned long nr_unordered_timestamps; +static unsigned long nr_state_machine_bugs; +static unsigned long nr_events; +static unsigned long nr_lost_chunks; +static unsigned long nr_lost_events; #define TASK_STATE_TO_CHAR_STR "RSDTtZX" @@ -668,14 +672,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: perf_event_comm: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing perf_event_comm, skipping event.\n"); return -1; } total_comm++; @@ -1168,14 +1172,12 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, atom = list_entry(atoms->work_list.prev, struct work_atom, list); - if (atom->state != THREAD_SLEEPING) { - printf("boo2\n"); - return; - } + if (atom->state != THREAD_SLEEPING) + nr_state_machine_bugs++; nr_timestamps++; if (atom->sched_out_time > timestamp) { - unordered_timestamps++; + nr_unordered_timestamps++; return; } @@ -1214,7 +1216,7 @@ static void output_lat_thread(struct work_atoms *work_list) avg = work_list->total_lat / work_list->nb_atoms; - printf("|%9.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n", + printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n", (double)work_list->total_runtime / 1e6, work_list->nb_atoms, (double)avg / 1e6, (double)work_list->max_lat / 1e6); @@ -1359,9 +1361,9 @@ static void __cmd_lat(void) read_events(); sort_lat(); - printf("\n ---------------------------------------------------------------------------------------\n"); - printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); - printf(" ---------------------------------------------------------------------------------------\n"); + printf("\n -----------------------------------------------------------------------------------------\n"); + printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); + printf(" -----------------------------------------------------------------------------------------\n"); next = rb_first(&sorted_atom_root); @@ -1373,18 +1375,32 @@ static void __cmd_lat(void) next = rb_next(next); } - printf(" 
---------------------------------------------------------------------------------------\n"); - printf(" TOTAL: |%9.3f ms |%9Ld |", + printf(" -----------------------------------------------------------------------------------------\n"); + printf(" TOTAL: |%11.3f ms |%9Ld |\n", (double)all_runtime/1e6, all_count); - if (unordered_timestamps && nr_timestamps) { - printf(" INFO: %.2f%% unordered events.\n", - (double)unordered_timestamps/(double)nr_timestamps*100.0); + printf(" ---------------------------------------------------\n"); + if (nr_unordered_timestamps && nr_timestamps) { + printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n", + (double)nr_unordered_timestamps/(double)nr_timestamps*100.0, + nr_unordered_timestamps, nr_timestamps); } else { + } + if (nr_lost_events && nr_events) { + printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n", + (double)nr_lost_events/(double)nr_events*100.0, + nr_lost_events, nr_events, nr_lost_chunks); + } + if (nr_state_machine_bugs && nr_timestamps) { + printf(" INFO: %.3f%% state machine bugs (%ld out of %ld)", + (double)nr_state_machine_bugs/(double)nr_timestamps*100.0, + nr_state_machine_bugs, nr_timestamps); + if (nr_lost_events) + printf(" (due to lost events?)"); printf("\n"); } + printf("\n"); - printf(" -------------------------------------------------\n\n"); } static struct trace_sched_handler *trace_handler; @@ -1585,8 +1601,13 @@ process_event(event_t *event, unsigned long offset, unsigned long head) { trace_event(event); + nr_events++; switch (event->header.type) { - case PERF_EVENT_MMAP ... PERF_EVENT_LOST: + case PERF_EVENT_MMAP: + return 0; + case PERF_EVENT_LOST: + nr_lost_chunks++; + nr_lost_events += event->lost.lost; return 0; case PERF_EVENT_COMM: @@ -1768,6 +1789,7 @@ static const char *record_args[] = { "-R", "-M", "-f", + "-m", "1024", "-c", "1", "-e", "sched:sched_switch:r", "-e", "sched:sched_stat_wait:r", -- cgit v1.2.3 From c8a37751043427c6e4397a2cbfd617cb5f215c72 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 16 Sep 2009 14:07:00 +0200 Subject: perf sched: Sanity check context switch events Use 'perf sched latency' to track the current task based on context-switch events, and flag the cases where there's some impossible transition: such as a PID being switched out that was not switched in. 
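The check itself is small. A self-contained sketch of the idea in plain C (not the patch itself; the function name and the sample PIDs below are illustrative, and the GNU range-initializer syntax is the same one the tool uses):

#include <stdio.h>

#define MAX_CPUS 256

/*
 * Per-CPU "who is current" table; (unsigned)-1 means no switch
 * event has been seen on that CPU yet, so nothing can be checked:
 */
static unsigned int curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
static unsigned long nr_context_switch_bugs;

static void check_switch(int cpu, unsigned int prev_pid, unsigned int next_pid)
{
	/*
	 * If we already know who was running on this CPU, switching
	 * out any other PID is an impossible transition (typically
	 * the result of lost events):
	 */
	if (curr_pid[cpu] != (unsigned int)-1 && curr_pid[cpu] != prev_pid)
		nr_context_switch_bugs++;

	curr_pid[cpu] = next_pid;
}

int main(void)
{
	check_switch(0, 100, 200);	/* first event on CPU 0: trusted */
	check_switch(0, 200, 300);	/* consistent: 200 was current */
	check_switch(0, 999, 100);	/* flagged: 300 was current, not 999 */

	printf("context switch bugs: %lu\n", nr_context_switch_bugs);
	return 0;
}

The patch below wires the same test into the sched_switch handler, keyed on the raw prev_pid/next_pid fields, and reports the counter alongside the other weirdness stats.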
Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 1f0f9be34fa..2d542368de3 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -119,6 +119,7 @@ static unsigned long replay_repeat = 10; static unsigned long nr_timestamps; static unsigned long nr_unordered_timestamps; static unsigned long nr_state_machine_bugs; +static unsigned long nr_context_switch_bugs; static unsigned long nr_events; static unsigned long nr_lost_chunks; static unsigned long nr_lost_events; @@ -1399,6 +1400,14 @@ static void __cmd_lat(void) printf(" (due to lost events?)"); printf("\n"); } + if (nr_context_switch_bugs && nr_timestamps) { + printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)", + (double)nr_context_switch_bugs/(double)nr_timestamps*100.0, + nr_context_switch_bugs, nr_timestamps); + if (nr_lost_events) + printf(" (due to lost events?)"); + printf("\n"); + } printf("\n"); } @@ -1425,10 +1434,16 @@ process_sched_wakeup_event(struct raw_event_sample *raw, trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread); } +/* + * Track the current task - that way we can know whether there's any + * weird events, such as a task being switched away that is not current. + */ +static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 }; + static void process_sched_switch_event(struct raw_event_sample *raw, struct event *event, - int cpu __used, + int cpu, u64 timestamp __used, struct thread *thread __used) { @@ -1444,6 +1459,16 @@ process_sched_switch_event(struct raw_event_sample *raw, FILL_FIELD(switch_event, next_pid, event, raw->data); FILL_FIELD(switch_event, next_prio, event, raw->data); + if (curr_pid[cpu] != (u32)-1) { + /* + * Are we trying to switch away a PID that is + * not current? + */ + if (curr_pid[cpu] != switch_event.prev_pid) + nr_context_switch_bugs++; + } + curr_pid[cpu] = switch_event.next_pid; + trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread); } -- cgit v1.2.3 From 80ed0987f363d7eb50193df3e6f6d71451f74bc3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 16 Sep 2009 14:12:36 +0200 Subject: perf sched: Make idle thread and comm/pid names more consistent Peter noticed that we have 3 ways of referring to the idle thread: [idle]:0 swapper:0 swapper-0 Standardize on 'swapper:0'. 
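As a side note on how the comm:pid column stays aligned: the latency output exploits the fact that printf() returns the number of characters written. A minimal standalone illustration of that idiom, with made-up task data (the 24-column width matches the tool's, everything else is illustrative):

#include <stdio.h>

struct task { const char *comm; int pid; };

int main(void)
{
	struct task tasks[] = { { "swapper", 0 }, { "irqbalance", 2089 } };
	int i, n, pad;

	for (i = 0; i < 2; i++) {
		/* printf() returns the number of characters written: */
		n = printf(" %s:%d ", tasks[i].comm, tasks[i].pid);
		/* pad the label out to a fixed 24-character column: */
		for (pad = 0; pad < 24 - n; pad++)
			printf(" ");
		printf("|\n");
	}
	return 0;
}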
Reported-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 2d542368de3..da8f67483ae 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1204,13 +1204,13 @@ static void output_lat_thread(struct work_atoms *work_list) /* * Ignore idle threads: */ - if (!work_list->thread->pid) + if (!strcmp(work_list->thread->comm, "swapper")) return; all_runtime += work_list->total_runtime; all_count += work_list->nb_atoms; - ret = printf(" %s-%d ", work_list->thread->comm, work_list->thread->pid); + ret = printf(" %s:%d ", work_list->thread->comm, work_list->thread->pid); for (i = 0; i < 24 - ret; i++) printf(" "); -- cgit v1.2.3 From 0ec04e16d08b69d8da46abbcfa3e3f2cd9738852 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 16 Sep 2009 17:40:48 +0200 Subject: perf sched: Add 'perf sched map' scheduling event map printout This prints a textual context-switching outline of workload captured via perf sched record. For example, on a 16 CPU box it outputs: N1 O1 . . . S1 . . . B0 . *I0 C1 . M1 . 23002.773423 secs N1 O1 . *Q0 . S1 . . . B0 . I0 C1 . M1 . 23002.773423 secs N1 O1 . Q0 . S1 . . . B0 . *R1 C1 . M1 . 23002.773485 secs N1 O1 . Q0 . S1 . *S0 . B0 . R1 C1 . M1 . 23002.773478 secs *L0 O1 . Q0 . S1 . S0 . B0 . R1 C1 . M1 . 23002.773523 secs L0 O1 . *. . S1 . S0 . B0 . R1 C1 . M1 . 23002.773531 secs L0 O1 . . . S1 . S0 . B0 . R1 C1 *T1 M1 . 23002.773547 secs T1 => irqbalance:2089 L0 O1 . . . S1 . S0 . *P0 . R1 C1 T1 M1 . 23002.773549 secs *N1 O1 . . . S1 . S0 . P0 . R1 C1 T1 M1 . 23002.773566 secs N1 O1 . . . *J0 . S0 . P0 . R1 C1 T1 M1 . 23002.773571 secs N1 O1 . . . J0 . S0 *B0 P0 . R1 C1 T1 M1 . 23002.773592 secs N1 O1 . . . J0 . *U0 B0 P0 . R1 C1 T1 M1 . 23002.773582 secs N1 O1 . . . *S1 . U0 B0 P0 . R1 C1 T1 M1 . 23002.773604 secs N1 O1 . . . S1 . U0 B0 *. . R1 C1 T1 M1 . 23002.773615 secs N1 O1 . . . S1 . U0 B0 . . *K0 C1 T1 M1 . 23002.773631 secs N1 O1 . *M0 . S1 . U0 B0 . . K0 C1 T1 M1 . 23002.773624 secs N1 O1 . M0 . S1 . U0 *. . . K0 C1 T1 M1 . 23002.773644 secs N1 O1 . M0 . S1 . U0 . . . *R1 C1 T1 M1 . 23002.773662 secs N1 O1 . M0 . S1 . *. . . . R1 C1 T1 M1 . 23002.773648 secs N1 O1 . *. . S1 . . . . . R1 C1 T1 M1 . 23002.773680 secs N1 O1 . . . *L0 . . . . . R1 C1 T1 M1 . 23002.773717 secs *N0 O1 . . . L0 . . . . . R1 C1 T1 M1 . 23002.773709 secs *N1 O1 . . . L0 . . . . . R1 C1 T1 M1 . 23002.773747 secs Columns stand for individual CPUs, from CPU0 to CPU15, and the two-letter shortcuts stand for tasks that are running on a CPU. '*' denotes the CPU that had the event. A dot signals an idle CPU. New tasks are assigned new two-letter shortcuts - when they occur first they are printed. 
In the above example 'T1' stood for irqbalance: T1 => irqbalance:2089 Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 312 ++++++++++++++++++++++++++++++--------------- 1 file changed, 212 insertions(+), 100 deletions(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index da8f67483ae..f67e351b050 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -159,8 +159,6 @@ static struct rb_root atom_root, sorted_atom_root; static u64 all_runtime; static u64 all_count; -static int read_events(void); - static u64 get_nsecs(void) { @@ -634,38 +632,6 @@ static void test_calibrations(void) printf("the sleep test took %Ld nsecs\n", T1-T0); } -static void __cmd_replay(void) -{ - unsigned long i; - - calibrate_run_measurement_overhead(); - calibrate_sleep_measurement_overhead(); - - test_calibrations(); - - read_events(); - - printf("nr_run_events: %ld\n", nr_run_events); - printf("nr_sleep_events: %ld\n", nr_sleep_events); - printf("nr_wakeup_events: %ld\n", nr_wakeup_events); - - if (targetless_wakeups) - printf("target-less wakeups: %ld\n", targetless_wakeups); - if (multitarget_wakeups) - printf("multi-target wakeups: %ld\n", multitarget_wakeups); - if (nr_run_events_optimized) - printf("run atoms optimized: %ld\n", - nr_run_events_optimized); - - print_task_traces(); - add_cross_task_wakeups(); - - create_tasks(); - printf("------------------------------------------------------------\n"); - for (i = 0; i < replay_repeat; i++) - run_one_test(); -} - static int process_comm_event(event_t *event, unsigned long offset, unsigned long head) { @@ -1354,64 +1320,6 @@ static void sort_lat(void) } } -static void __cmd_lat(void) -{ - struct rb_node *next; - - setup_pager(); - read_events(); - sort_lat(); - - printf("\n -----------------------------------------------------------------------------------------\n"); - printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); - printf(" -----------------------------------------------------------------------------------------\n"); - - next = rb_first(&sorted_atom_root); - - while (next) { - struct work_atoms *work_list; - - work_list = rb_entry(next, struct work_atoms, node); - output_lat_thread(work_list); - next = rb_next(next); - } - - printf(" -----------------------------------------------------------------------------------------\n"); - printf(" TOTAL: |%11.3f ms |%9Ld |\n", - (double)all_runtime/1e6, all_count); - - printf(" ---------------------------------------------------\n"); - if (nr_unordered_timestamps && nr_timestamps) { - printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n", - (double)nr_unordered_timestamps/(double)nr_timestamps*100.0, - nr_unordered_timestamps, nr_timestamps); - } else { - } - if (nr_lost_events && nr_events) { - printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n", - (double)nr_lost_events/(double)nr_events*100.0, - nr_lost_events, nr_events, nr_lost_chunks); - } - if (nr_state_machine_bugs && nr_timestamps) { - printf(" INFO: %.3f%% state machine bugs (%ld out of %ld)", - (double)nr_state_machine_bugs/(double)nr_timestamps*100.0, - nr_state_machine_bugs, nr_timestamps); - if (nr_lost_events) - printf(" (due to lost events?)"); - printf("\n"); - } - if (nr_context_switch_bugs && nr_timestamps) { - printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)", - 
(double)nr_context_switch_bugs/(double)nr_timestamps*100.0, - nr_context_switch_bugs, nr_timestamps); - if (nr_lost_events) - printf(" (due to lost events?)"); - printf("\n"); - } - printf("\n"); - -} - static struct trace_sched_handler *trace_handler; static void @@ -1431,19 +1339,106 @@ process_sched_wakeup_event(struct raw_event_sample *raw, FILL_FIELD(wakeup_event, success, event, raw->data); FILL_FIELD(wakeup_event, cpu, event, raw->data); - trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread); + if (trace_handler->wakeup_event) + trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread); } /* * Track the current task - that way we can know whether there's any * weird events, such as a task being switched away that is not current. */ +static int max_cpu = 15; + static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 }; +static struct thread *curr_thread[MAX_CPUS]; + +static char next_shortname1 = 'A'; +static char next_shortname2 = '0'; + +static void +map_switch_event(struct trace_switch_event *switch_event, + struct event *event __used, + int this_cpu, + u64 timestamp, + struct thread *thread __used) +{ + struct thread *sched_out, *sched_in; + int new_shortname; + u64 timestamp0; + s64 delta; + int cpu; + + BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0); + + if (this_cpu > max_cpu) + max_cpu = this_cpu; + + timestamp0 = cpu_last_switched[this_cpu]; + cpu_last_switched[this_cpu] = timestamp; + if (timestamp0) + delta = timestamp - timestamp0; + else + delta = 0; + + if (delta < 0) + die("hm, delta: %Ld < 0 ?\n", delta); + + + sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match); + sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match); + + curr_thread[this_cpu] = sched_in; + + printf(" "); + + new_shortname = 0; + if (!sched_in->shortname[0]) { + sched_in->shortname[0] = next_shortname1; + sched_in->shortname[1] = next_shortname2; + + if (next_shortname1 < 'Z') { + next_shortname1++; + } else { + next_shortname1='A'; + if (next_shortname2 < '9') { + next_shortname2++; + } else { + next_shortname2='0'; + } + } + new_shortname = 1; + } + + for (cpu = 0; cpu <= max_cpu; cpu++) { + if (cpu != this_cpu) + printf(" "); + else + printf("*"); + + if (curr_thread[cpu]) { + if (curr_thread[cpu]->pid) + printf("%2s ", curr_thread[cpu]->shortname); + else + printf(". "); + } else + printf(" "); + } + + printf(" %12.6f secs ", (double)timestamp/1e9); + if (new_shortname) { + printf("%s => %s:%d\n", + sched_in->shortname, sched_in->comm, sched_in->pid); + } else { + printf("\n"); + } +} + + static void process_sched_switch_event(struct raw_event_sample *raw, struct event *event, - int cpu, + int this_cpu, u64 timestamp __used, struct thread *thread __used) { @@ -1459,17 +1454,18 @@ process_sched_switch_event(struct raw_event_sample *raw, FILL_FIELD(switch_event, next_pid, event, raw->data); FILL_FIELD(switch_event, next_prio, event, raw->data); - if (curr_pid[cpu] != (u32)-1) { + if (curr_pid[this_cpu] != (u32)-1) { /* * Are we trying to switch away a PID that is * not current? 
*/ - if (curr_pid[cpu] != switch_event.prev_pid) + if (curr_pid[this_cpu] != switch_event.prev_pid) nr_context_switch_bugs++; } - curr_pid[cpu] = switch_event.next_pid; + if (trace_handler->switch_event) + trace_handler->switch_event(&switch_event, event, this_cpu, timestamp, thread); - trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread); + curr_pid[this_cpu] = switch_event.next_pid; } static void @@ -1486,7 +1482,8 @@ process_sched_runtime_event(struct raw_event_sample *raw, FILL_FIELD(runtime_event, runtime, event, raw->data); FILL_FIELD(runtime_event, vruntime, event, raw->data); - trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread); + if (trace_handler->runtime_event) + trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread); } static void @@ -1505,7 +1502,8 @@ process_sched_fork_event(struct raw_event_sample *raw, FILL_ARRAY(fork_event, child_comm, event, raw->data); FILL_FIELD(fork_event, child_pid, event, raw->data); - trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread); + if (trace_handler->fork_event) + trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread); } static void @@ -1748,6 +1746,116 @@ more: return rc; } +static void print_bad_events(void) +{ + if (nr_unordered_timestamps && nr_timestamps) { + printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n", + (double)nr_unordered_timestamps/(double)nr_timestamps*100.0, + nr_unordered_timestamps, nr_timestamps); + } + if (nr_lost_events && nr_events) { + printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n", + (double)nr_lost_events/(double)nr_events*100.0, + nr_lost_events, nr_events, nr_lost_chunks); + } + if (nr_state_machine_bugs && nr_timestamps) { + printf(" INFO: %.3f%% state machine bugs (%ld out of %ld)", + (double)nr_state_machine_bugs/(double)nr_timestamps*100.0, + nr_state_machine_bugs, nr_timestamps); + if (nr_lost_events) + printf(" (due to lost events?)"); + printf("\n"); + } + if (nr_context_switch_bugs && nr_timestamps) { + printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)", + (double)nr_context_switch_bugs/(double)nr_timestamps*100.0, + nr_context_switch_bugs, nr_timestamps); + if (nr_lost_events) + printf(" (due to lost events?)"); + printf("\n"); + } +} + +static void __cmd_lat(void) +{ + struct rb_node *next; + + setup_pager(); + read_events(); + sort_lat(); + + printf("\n -----------------------------------------------------------------------------------------\n"); + printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); + printf(" -----------------------------------------------------------------------------------------\n"); + + next = rb_first(&sorted_atom_root); + + while (next) { + struct work_atoms *work_list; + + work_list = rb_entry(next, struct work_atoms, node); + output_lat_thread(work_list); + next = rb_next(next); + } + + printf(" -----------------------------------------------------------------------------------------\n"); + printf(" TOTAL: |%11.3f ms |%9Ld |\n", + (double)all_runtime/1e6, all_count); + + printf(" ---------------------------------------------------\n"); + + print_bad_events(); + printf("\n"); + +} + +static struct trace_sched_handler map_ops = { + .wakeup_event = NULL, + .switch_event = map_switch_event, + .runtime_event = NULL, + .fork_event = NULL, +}; + +static void __cmd_map(void) +{ + setup_pager(); + read_events(); + print_bad_events(); +} + +static void __cmd_replay(void) +{ + unsigned long i; + + 
calibrate_run_measurement_overhead(); + calibrate_sleep_measurement_overhead(); + + test_calibrations(); + + read_events(); + + printf("nr_run_events: %ld\n", nr_run_events); + printf("nr_sleep_events: %ld\n", nr_sleep_events); + printf("nr_wakeup_events: %ld\n", nr_wakeup_events); + + if (targetless_wakeups) + printf("target-less wakeups: %ld\n", targetless_wakeups); + if (multitarget_wakeups) + printf("multi-target wakeups: %ld\n", multitarget_wakeups); + if (nr_run_events_optimized) + printf("run atoms optimized: %ld\n", + nr_run_events_optimized); + + print_task_traces(); + add_cross_task_wakeups(); + + create_tasks(); + printf("------------------------------------------------------------\n"); + for (i = 0; i < replay_repeat; i++) + run_one_test(); +} + + static const char * const sched_usage[] = { "perf sched [] {record|latency|replay|trace}", NULL @@ -1867,6 +1975,10 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) } setup_sorting(); __cmd_lat(); + } else if (!strcmp(argv[0], "map")) { + trace_handler = &map_ops; + setup_sorting(); + __cmd_map(); } else if (!strncmp(argv[0], "rep", 3)) { trace_handler = &replay_ops; if (argc) { -- cgit v1.2.3 From 40749d0ff49f99c3661b336fe5e5625207bd925a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 17 Sep 2009 18:24:55 +0200 Subject: perf sched: Determine the number of CPUs automatically For 'perf sched map' output, determine max_cpu automatically, instead of the static default of 15. [ v2: use sysconf() pointed out by Arjan van de Ven ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools/perf/builtin-sched.c') diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index f67e351b050..9e04827d16b 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1347,7 +1347,7 @@ process_sched_wakeup_event(struct raw_event_sample *raw, * Track the current task - that way we can know whether there's any * weird events, such as a task being switched away that is not current. */ -static int max_cpu = 15; +static int max_cpu; static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 }; @@ -1818,6 +1818,8 @@ static struct trace_sched_handler map_ops = { static void __cmd_map(void) { + max_cpu = sysconf(_SC_NPROCESSORS_CONF); + setup_pager(); read_events(); print_bad_events(); -- cgit v1.2.3 From 4b77a7297795229eca96c41e1709a3c87909fabe Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 18 Sep 2009 08:22:24 +0200 Subject: perf sched: Add --input=file option to builtin-sched.c perf sched record passes unparsed args on to perf record, so specifying an output file via perf sched record -o FILE (cmd) just works. Ergo, provide an option to specify input file as well. Also add the missing 'map' command to help. 
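The read-side effect of the new flag is just to redirect input_name before read_events() opens the file. Modelled standalone with getopt_long() rather than perf's parse-options helpers (a sketch only; the option table in the real patch is shown in the diff below):

#include <stdio.h>
#include <getopt.h>

/* Illustrative stand-in for the tool's global: */
static const char *input_name = "perf.data";

int main(int argc, char **argv)
{
	static const struct option opts[] = {
		{ "input", required_argument, NULL, 'i' },
		{ NULL, 0, NULL, 0 },
	};
	int c;

	while ((c = getopt_long(argc, argv, "i:", opts, NULL)) != -1) {
		if (c == 'i')
			input_name = optarg;	/* override the default */
	}

	printf("reading events from: %s\n", input_name);
	return 0;
}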
Signed-off-by: Mike Galbraith
Acked-by: Peter Zijlstra
LKML-Reference: <1253254944.20589.11.camel@marge.simson.net>
Signed-off-by: Ingo Molnar
---
 tools/perf/builtin-sched.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'tools/perf/builtin-sched.c')

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 9e04827d16b..275d79c6627 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1859,11 +1859,13 @@ static void __cmd_replay(void)
 
 
 static const char * const sched_usage[] = {
-	"perf sched [] {record|latency|replay|trace}",
+	"perf sched [] {record|latency|map|replay|trace}",
 	NULL
 };
 
 static const struct option sched_options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
--
cgit v1.2.3
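Taken together, the latency pass built up over this series is a small per-task state machine: a sched_switch out opens a work atom (state 'R' means the task stayed runnable, so it starts waiting for the CPU immediately), sched_wakeup stamps the atom's wake-up time, the next sched_switch in closes the atom and records the wakeup-to-run delay, and sched_stat_runtime accumulates on-CPU time. A self-contained sketch of that bookkeeping for a single task (plain C; a flat array stands in for the rbtree, and all event timestamps are illustrative):

#include <stdio.h>

typedef unsigned long long u64;

enum thread_state { THREAD_SLEEPING, THREAD_WAIT_CPU };

struct work_atom {
	enum thread_state state;
	u64 wake_up_time;
};

static struct work_atom atoms[64];
static int nb_atoms;
static u64 total_lat, max_lat, total_runtime;

static void sched_out(u64 t, char run_state)
{
	struct work_atom *atom = &atoms[nb_atoms];

	atom->state = THREAD_SLEEPING;
	if (run_state == 'R') {		/* preempted: waiting starts now */
		atom->state = THREAD_WAIT_CPU;
		atom->wake_up_time = t;
	}
}

static void wakeup(u64 t)
{
	struct work_atom *atom = &atoms[nb_atoms];

	if (atom->state == THREAD_SLEEPING) {
		atom->state = THREAD_WAIT_CPU;
		atom->wake_up_time = t;
	}
}

static void sched_in(u64 t)
{
	struct work_atom *atom = &atoms[nb_atoms];
	u64 delta;

	if (atom->state != THREAD_WAIT_CPU)
		return;			/* never woken: no latency sample */

	delta = t - atom->wake_up_time;	/* wakeup -> on-CPU delay */
	total_lat += delta;
	if (delta > max_lat)
		max_lat = delta;
	nb_atoms++;			/* atom closed, counted as a switch */
}

static void stat_runtime(u64 ns)
{
	total_runtime += ns;		/* from sched_stat_runtime events */
}

int main(void)
{
	wakeup(1000);
	sched_in(3000);			/* 2000 ns wakeup latency */
	stat_runtime(500);
	sched_out(3500, 'R');		/* preempted while still runnable */
	sched_in(4000);			/* 500 ns wait-for-CPU latency */
	stat_runtime(800);

	printf("runtime: %llu ns, switches: %d, avg delay: %llu ns, max: %llu ns\n",
	       total_runtime, nb_atoms, total_lat / nb_atoms, max_lat);
	return 0;
}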