diff options
Diffstat (limited to 'kernel')
70 files changed, 6102 insertions, 2120 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0669b70fa6a..9fdba03dc1f 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -52,8 +52,23 @@ config PREEMPT endchoice +config PREEMPT_RCU + bool "Preemptible RCU" + depends on PREEMPT + default n + help + This option reduces the latency of the kernel by making certain + RCU sections preemptible. Normally RCU code is non-preemptible, if + this option is selected then read-only RCU sections become + preemptible. This helps latency, but may expose bugs due to + now-naive assumptions about each RCU read-side critical section + remaining on a given CPU through its execution. + + Say N if you are unsure. + config RCU_TRACE bool "Enable tracing for RCU - currently stats in debugfs" + depends on PREEMPT_RCU select DEBUG_FS default y help diff --git a/kernel/Makefile b/kernel/Makefile index 6c584c55a6e..6c5f081132a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o nsproxy.o srcu.o \ + hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o obj-$(CONFIG_SYSCTL) += sysctl_check.o @@ -53,6 +53,7 @@ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o diff --git a/kernel/acct.c b/kernel/acct.c index 521dfa53cb9..91e1cfd734d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -58,6 +58,7 @@ #include <asm/uaccess.h> #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ +#include <linux/pid_namespace.h> /* * These constants control the amount of freespace that suspend and @@ -74,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct file *); +static void do_acct_process(struct pid_namespace *ns, struct file *); /* * This structure is used so that all the data protected by lock @@ -86,6 +87,7 @@ struct acct_glbs { volatile int active; volatile int needcheck; struct file *file; + struct pid_namespace *ns; struct timer_list timer; }; @@ -175,9 +177,11 @@ out: static void acct_file_reopen(struct file *file) { struct file *old_acct = NULL; + struct pid_namespace *old_ns = NULL; if (acct_globals.file) { old_acct = acct_globals.file; + old_ns = acct_globals.ns; del_timer(&acct_globals.timer); acct_globals.active = 0; acct_globals.needcheck = 0; @@ -185,6 +189,7 @@ static void acct_file_reopen(struct file *file) } if (file) { acct_globals.file = file; + acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); acct_globals.needcheck = 0; acct_globals.active = 1; /* It's been deleted if it was used before so this is safe */ @@ -196,8 +201,9 @@ static void acct_file_reopen(struct file *file) if (old_acct) { mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_globals.lock); - do_acct_process(old_acct); + do_acct_process(old_ns, old_acct); filp_close(old_acct, NULL); + put_pid_ns(old_ns); spin_lock(&acct_globals.lock); } } @@ -419,7 +425,7 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(struct file *file) +static void do_acct_process(struct pid_namespace *ns, struct file *file) { struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; @@ -481,8 +487,10 @@ static void do_acct_process(struct file *file) ac.ac_gid16 = current->gid; #endif #if ACCT_VERSION==3 - ac.ac_pid = current->tgid; - ac.ac_ppid = current->real_parent->tgid; + ac.ac_pid = task_tgid_nr_ns(current, ns); + rcu_read_lock(); + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); + rcu_read_unlock(); #endif spin_lock_irq(¤t->sighand->siglock); @@ -578,6 +586,7 @@ void acct_collect(long exitcode, int group_dead) void acct_process(void) { struct file *file = NULL; + struct pid_namespace *ns; /* * accelerate the common fastpath: @@ -592,8 +601,10 @@ void acct_process(void) return; } get_file(file); + ns = get_pid_ns(acct_globals.ns); spin_unlock(&acct_globals.lock); - do_acct_process(file); + do_acct_process(ns, file); fput(file); + put_pid_ns(ns); } diff --git a/kernel/audit.c b/kernel/audit.c index 2eeea9a1424..a7b16086d36 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -21,7 +21,7 @@ * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * - * Goals: 1) Integrate fully with SELinux. + * Goals: 1) Integrate fully with Security Modules. * 2) Minimal run-time overhead: * a) Minimal when syscall auditing is disabled (audit_enable=0). * b) Small when syscall auditing is enabled and no audit record @@ -55,7 +55,6 @@ #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/netlink.h> -#include <linux/selinux.h> #include <linux/inotify.h> #include <linux/freezer.h> #include <linux/tty.h> @@ -78,9 +77,13 @@ static int audit_default; /* If auditing cannot proceed, audit_failure selects what happens. */ static int audit_failure = AUDIT_FAIL_PRINTK; -/* If audit records are to be written to the netlink socket, audit_pid - * contains the (non-zero) pid. */ +/* + * If audit records are to be written to the netlink socket, audit_pid + * contains the pid of the auditd process and audit_nlk_pid contains + * the pid to use to send netlink messages to that process. + */ int audit_pid; +static int audit_nlk_pid; /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -170,7 +173,9 @@ void audit_panic(const char *message) printk(KERN_ERR "audit: %s\n", message); break; case AUDIT_FAIL_PANIC: - panic("audit: %s\n", message); + /* test audit_pid since printk is always losey, why bother? */ + if (audit_pid) + panic("audit: %s\n", message); break; } } @@ -259,13 +264,13 @@ static int audit_log_config_change(char *function_name, int new, int old, char *ctx = NULL; u32 len; - rc = selinux_sid_to_string(sid, &ctx, &len); + rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) { audit_log_format(ab, " sid=%u", sid); allow_changes = 0; /* Something weird, deny request */ } else { audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); } } audit_log_format(ab, " res=%d", allow_changes); @@ -348,10 +353,11 @@ static int kauditd_thread(void *dummy) wake_up(&audit_backlog_wait); if (skb) { if (audit_pid) { - int err = netlink_unicast(audit_sock, skb, audit_pid, 0); + int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); + audit_log_lost("auditd dissapeared\n"); audit_pid = 0; } } else { @@ -543,12 +549,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, audit_log_format(*ab, "user pid=%d uid=%u auid=%u", pid, uid, auid); if (sid) { - rc = selinux_sid_to_string(sid, &ctx, &len); + rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) audit_log_format(*ab, " ssid=%u", sid); - else + else { audit_log_format(*ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } return rc; @@ -623,6 +630,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sid, 1); audit_pid = new_pid; + audit_nlk_pid = NETLINK_CB(skb).pid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) err = audit_set_rate_limit(status_get->rate_limit, @@ -750,18 +758,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_SIGNAL_INFO: - err = selinux_sid_to_string(audit_sig_sid, &ctx, &len); + err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); if (err) return err; sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); if (!sig_data) { - kfree(ctx); + security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = audit_sig_uid; sig_data->pid = audit_sig_pid; memcpy(sig_data->ctx, ctx, len); - kfree(ctx); + security_release_secctx(ctx, len); audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); @@ -873,10 +881,6 @@ static int __init audit_init(void) audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; - /* Register the callback with selinux. This callback will be invoked - * when a new policy is loaded. */ - selinux_audit_set_callback(&selinux_audit_rule_update); - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); #ifdef CONFIG_AUDITSYSCALL @@ -1261,8 +1265,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, /** * audit_string_contains_control - does a string need to be logged in hex - * @string - string to be checked - * @len - max length of the string to check + * @string: string to be checked + * @len: max length of the string to check */ int audit_string_contains_control(const char *string, size_t len) { @@ -1277,7 +1281,7 @@ int audit_string_contains_control(const char *string, size_t len) /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer - * @len: lenth of string (not including trailing null) + * @len: length of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string @@ -1350,17 +1354,19 @@ void audit_log_end(struct audit_buffer *ab) if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { + struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); if (audit_pid) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); skb_queue_tail(&audit_skb_queue, ab->skb); ab->skb = NULL; wake_up_interruptible(&kauditd_wait); - } else if (printk_ratelimit()) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0)); - } else { - audit_log_lost("printk limit exceeded\n"); + } else if (nlh->nlmsg_type != AUDIT_EOE) { + if (printk_ratelimit()) { + printk(KERN_NOTICE "type=%d %s\n", + nlh->nlmsg_type, + ab->skb->data + NLMSG_SPACE(0)); + } else + audit_log_lost("printk limit exceeded\n"); } } audit_buffer_free(ab); diff --git a/kernel/audit.h b/kernel/audit.h index 2554bd524fd..3cfc54ee3e1 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -65,34 +65,9 @@ struct audit_watch { struct list_head rules; /* associated rules */ }; -struct audit_field { - u32 type; - u32 val; - u32 op; - char *se_str; - struct selinux_audit_rule *se_rule; -}; - struct audit_tree; struct audit_chunk; -struct audit_krule { - int vers_ops; - u32 flags; - u32 listnr; - u32 action; - u32 mask[AUDIT_BITMASK_SIZE]; - u32 buflen; /* for data alloc on list rules */ - u32 field_count; - char *filterkey; /* ties events to rules */ - struct audit_field *fields; - struct audit_field *arch_f; /* quick access to arch field */ - struct audit_field *inode_f; /* quick access to an inode field */ - struct audit_watch *watch; /* associated watch */ - struct audit_tree *tree; /* associated watched tree */ - struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ -}; - struct audit_entry { struct list_head list; struct rcu_head rcu; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 2f2914b7cc3..28fef6bf853 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -28,7 +28,7 @@ #include <linux/netlink.h> #include <linux/sched.h> #include <linux/inotify.h> -#include <linux/selinux.h> +#include <linux/security.h> #include "audit.h" /* @@ -38,7 +38,7 @@ * Synchronizes writes and blocking reads of audit's filterlist * data. Rcu is used to traverse the filterlist and access * contents of structs audit_entry, audit_watch and opaque - * selinux rules during filtering. If modified, these structures + * LSM rules during filtering. If modified, these structures * must be copied and replace their counterparts in the filterlist. * An audit_parent struct is not accessed during filtering, so may * be written directly provided audit_filter_mutex is held. @@ -139,8 +139,8 @@ static inline void audit_free_rule(struct audit_entry *e) if (e->rule.fields) for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; - kfree(f->se_str); - selinux_audit_rule_free(f->se_rule); + kfree(f->lsm_str); + security_audit_rule_free(f->lsm_rule); } kfree(e->rule.fields); kfree(e->rule.filterkey); @@ -554,8 +554,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->op = data->fieldflags[i] & AUDIT_OPERATORS; f->type = data->fields[i]; f->val = data->values[i]; - f->se_str = NULL; - f->se_rule = NULL; + f->lsm_str = NULL; + f->lsm_rule = NULL; switch(f->type) { case AUDIT_PID: case AUDIT_UID: @@ -597,12 +597,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; entry->rule.buflen += f->val; - err = selinux_audit_rule_init(f->type, f->op, str, - &f->se_rule); + err = security_audit_rule_init(f->type, f->op, str, + (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { - printk(KERN_WARNING "audit rule for selinux " + printk(KERN_WARNING "audit rule for LSM " "\'%s\' is invalid\n", str); err = 0; } @@ -610,7 +610,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, kfree(str); goto exit_free; } else - f->se_str = str; + f->lsm_str = str; break; case AUDIT_WATCH: str = audit_unpack_string(&bufp, &remain, f->val); @@ -754,7 +754,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: data->buflen += data->values[i] = - audit_pack_string(&bufp, f->se_str); + audit_pack_string(&bufp, f->lsm_str); break; case AUDIT_WATCH: data->buflen += data->values[i] = @@ -806,7 +806,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) + if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str)) return 1; break; case AUDIT_WATCH: @@ -862,28 +862,28 @@ out: return new; } -/* Duplicate selinux field information. The se_rule is opaque, so must be +/* Duplicate LSM field information. The lsm_rule is opaque, so must be * re-initialized. */ -static inline int audit_dupe_selinux_field(struct audit_field *df, +static inline int audit_dupe_lsm_field(struct audit_field *df, struct audit_field *sf) { int ret = 0; - char *se_str; + char *lsm_str; - /* our own copy of se_str */ - se_str = kstrdup(sf->se_str, GFP_KERNEL); - if (unlikely(!se_str)) + /* our own copy of lsm_str */ + lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL); + if (unlikely(!lsm_str)) return -ENOMEM; - df->se_str = se_str; + df->lsm_str = lsm_str; - /* our own (refreshed) copy of se_rule */ - ret = selinux_audit_rule_init(df->type, df->op, df->se_str, - &df->se_rule); + /* our own (refreshed) copy of lsm_rule */ + ret = security_audit_rule_init(df->type, df->op, df->lsm_str, + (void **)&df->lsm_rule); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { - printk(KERN_WARNING "audit rule for selinux \'%s\' is " - "invalid\n", df->se_str); + printk(KERN_WARNING "audit rule for LSM \'%s\' is " + "invalid\n", df->lsm_str); ret = 0; } @@ -891,7 +891,7 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, } /* Duplicate an audit rule. This will be a deep copy with the exception - * of the watch - that pointer is carried over. The selinux specific fields + * of the watch - that pointer is carried over. The LSM specific fields * will be updated in the copy. The point is to be able to replace the old * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from @@ -930,7 +930,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->tree = old->tree; memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); - /* deep copy this information, updating the se_rule fields, because + /* deep copy this information, updating the lsm_rule fields, because * the originals will all be freed when the old rule is freed. */ for (i = 0; i < fcount; i++) { switch (new->fields[i].type) { @@ -944,7 +944,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - err = audit_dupe_selinux_field(&new->fields[i], + err = audit_dupe_lsm_field(&new->fields[i], &old->fields[i]); break; case AUDIT_FILTERKEY: @@ -1515,11 +1515,12 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, if (sid) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string(sid, &ctx, &len)) + if (security_secid_to_secctx(sid, &ctx, &len)) audit_log_format(ab, " ssid=%u", sid); - else + else { audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } audit_log_format(ab, " op=%s rule key=", action); if (rule->filterkey) @@ -1761,38 +1762,12 @@ unlock_and_return: return result; } -/* Check to see if the rule contains any selinux fields. Returns 1 if there - are selinux fields specified in the rule, 0 otherwise. */ -static inline int audit_rule_has_selinux(struct audit_krule *rule) -{ - int i; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &rule->fields[i]; - switch (f->type) { - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - return 1; - } - } - - return 0; -} - -/* This function will re-initialize the se_rule field of all applicable rules. - * It will traverse the filter lists serarching for rules that contain selinux +/* This function will re-initialize the lsm_rule field of all applicable rules. + * It will traverse the filter lists serarching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the - * selinux field is re-initialized, and the old rule is replaced with the + * LSM field is re-initialized, and the old rule is replaced with the * updated rule. */ -int selinux_audit_rule_update(void) +int audit_update_lsm_rules(void) { struct audit_entry *entry, *n, *nentry; struct audit_watch *watch; @@ -1804,7 +1779,7 @@ int selinux_audit_rule_update(void) for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { - if (!audit_rule_has_selinux(&entry->rule)) + if (!security_audit_rule_known(&entry->rule)) continue; watch = entry->rule.watch; @@ -1815,7 +1790,7 @@ int selinux_audit_rule_update(void) * return value */ if (!err) err = PTR_ERR(nentry); - audit_panic("error updating selinux filters"); + audit_panic("error updating LSM filters"); if (watch) list_del(&entry->rule.rlist); list_del_rcu(&entry->list); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2087d6de67e..56e56ed594a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -61,7 +61,6 @@ #include <linux/security.h> #include <linux/list.h> #include <linux/tty.h> -#include <linux/selinux.h> #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> @@ -528,14 +527,14 @@ static int audit_filter_rules(struct task_struct *tsk, match for now to avoid losing information that may be wanted. An error message will also be logged upon error */ - if (f->se_rule) { + if (f->lsm_rule) { if (need_sid) { - selinux_get_task_sid(tsk, &sid); + security_task_getsecid(tsk, &sid); need_sid = 0; } - result = selinux_audit_rule_match(sid, f->type, + result = security_audit_rule_match(sid, f->type, f->op, - f->se_rule, + f->lsm_rule, ctx); } break; @@ -546,18 +545,18 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_OBJ_LEV_HIGH: /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR also applies here */ - if (f->se_rule) { + if (f->lsm_rule) { /* Find files that match */ if (name) { - result = selinux_audit_rule_match( + result = security_audit_rule_match( name->osid, f->type, f->op, - f->se_rule, ctx); + f->lsm_rule, ctx); } else if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (selinux_audit_rule_match( + if (security_audit_rule_match( ctx->names[j].osid, f->type, f->op, - f->se_rule, ctx)) { + f->lsm_rule, ctx)) { ++result; break; } @@ -570,7 +569,7 @@ static int audit_filter_rules(struct task_struct *tsk, aux = aux->next) { if (aux->type == AUDIT_IPC) { struct audit_aux_data_ipcctl *axi = (void *)aux; - if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { + if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) { ++result; break; } @@ -885,11 +884,11 @@ void audit_log_task_context(struct audit_buffer *ab) int error; u32 sid; - selinux_get_task_sid(current, &sid); + security_task_getsecid(current, &sid); if (!sid) return; - error = selinux_sid_to_string(sid, &ctx, &len); + error = security_secid_to_secctx(sid, &ctx, &len); if (error) { if (error != -EINVAL) goto error_path; @@ -897,7 +896,7 @@ void audit_log_task_context(struct audit_buffer *ab) } audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); return; error_path: @@ -941,7 +940,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, u32 sid, char *comm) { struct audit_buffer *ab; - char *s = NULL; + char *ctx = NULL; u32 len; int rc = 0; @@ -951,15 +950,16 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, uid, sessionid); - if (selinux_sid_to_string(sid, &s, &len)) { + if (security_secid_to_secctx(sid, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); rc = 1; - } else - audit_log_format(ab, " obj=%s", s); + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); - kfree(s); return rc; } @@ -1070,7 +1070,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * so we can be sure nothing was lost. */ if ((i == 0) && (too_long)) - audit_log_format(*ab, "a%d_len=%ld ", arg_num, + audit_log_format(*ab, "a%d_len=%zu ", arg_num, has_cntl ? 2*len : len); /* @@ -1271,14 +1271,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (axi->osid != 0) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string( + if (security_secid_to_secctx( axi->osid, &ctx, &len)) { audit_log_format(ab, " osid=%u", axi->osid); call_panic = 1; - } else + } else { audit_log_format(ab, " obj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } break; } @@ -1392,13 +1393,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (n->osid != 0) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string( + if (security_secid_to_secctx( n->osid, &ctx, &len)) { audit_log_format(ab, " osid=%u", n->osid); call_panic = 2; - } else + } else { audit_log_format(ab, " obj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } audit_log_end(ab); @@ -1775,7 +1777,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode name->uid = inode->i_uid; name->gid = inode->i_gid; name->rdev = inode->i_rdev; - selinux_get_inode_sid(inode, &name->osid); + security_inode_getsecid(inode, &name->osid); } /** @@ -2190,8 +2192,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) ax->uid = ipcp->uid; ax->gid = ipcp->gid; ax->mode = ipcp->mode; - selinux_get_ipc_sid(ipcp, &ax->osid); - + security_ipc_getsecid(ipcp, &ax->osid); ax->d.type = AUDIT_IPC; ax->d.next = context->aux; context->aux = (void *)ax; @@ -2343,7 +2344,7 @@ void __audit_ptrace(struct task_struct *t) context->target_auid = audit_get_loginuid(t); context->target_uid = t->uid; context->target_sessionid = audit_get_sessionid(t); - selinux_get_task_sid(t, &context->target_sid); + security_task_getsecid(t, &context->target_sid); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); } @@ -2371,7 +2372,7 @@ int __audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = tsk->loginuid; else audit_sig_uid = tsk->uid; - selinux_get_task_sid(tsk, &audit_sig_sid); + security_task_getsecid(tsk, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) return 0; @@ -2384,7 +2385,7 @@ int __audit_signal_info(int sig, struct task_struct *t) ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t->uid; ctx->target_sessionid = audit_get_sessionid(t); - selinux_get_task_sid(t, &ctx->target_sid); + security_task_getsecid(t, &ctx->target_sid); memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); return 0; } @@ -2405,7 +2406,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t->uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); - selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); + security_task_getsecid(t, &axp->target_sid[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); axp->pid_count++; @@ -2435,16 +2436,17 @@ void audit_core_dumps(long signr) ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", auid, current->uid, current->gid, sessionid); - selinux_get_task_sid(current, &sid); + security_task_getsecid(current, &sid); if (sid) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string(sid, &ctx, &len)) + if (security_secid_to_secctx(sid, &ctx, &len)) audit_log_format(ab, " ssid=%u", sid); - else + else { audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/bounds.c b/kernel/bounds.c new file mode 100644 index 00000000000..c3c55544db2 --- /dev/null +++ b/kernel/bounds.c @@ -0,0 +1,23 @@ +/* + * Generate definitions needed by the preprocessor. + * This code generates raw asm output which is post-processed + * to extract and format the required data. + */ + +#define __GENERATING_BOUNDS_H +/* Include headers that define the enum constants of interest */ +#include <linux/page-flags.h> +#include <linux/mmzone.h> + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define BLANK() asm volatile("\n->" : : ) + +void foo(void) +{ + /* The enum constants to put into include/linux/bounds.h */ + DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); + DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); + /* End of constants */ +} diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4766bb65e4d..6d8de051382 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -113,9 +113,9 @@ static int root_count; #define dummytop (&rootnode.top_cgroup) /* This flag indicates whether tasks in the fork and exit paths should - * take callback_mutex and check for fork/exit handlers to call. This - * avoids us having to do extra work in the fork/exit path if none of the - * subsystems need to be called. + * check for fork/exit handlers to call. This avoids us having to do + * extra work in the fork/exit path if none of the subsystems need to + * be called. */ static int need_forkexit_callback; @@ -307,7 +307,6 @@ static inline void put_css_set_taskexit(struct css_set *cg) * template: location in which to build the desired set of subsystem * state objects for the new cgroup group */ - static struct css_set *find_existing_css_set( struct css_set *oldcg, struct cgroup *cgrp, @@ -320,7 +319,7 @@ static struct css_set *find_existing_css_set( /* Built the set of subsystem state objects that we want to * see in the new css_set */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (root->subsys_bits & (1ull << i)) { + if (root->subsys_bits & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -354,7 +353,6 @@ static struct css_set *find_existing_css_set( * and chains them on tmp through their cgrp_link_list fields. Returns 0 on * success or a negative error */ - static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; @@ -396,7 +394,6 @@ static void free_cg_links(struct list_head *tmp) * substituted into the appropriate hierarchy. Must be called with * cgroup_mutex held */ - static struct css_set *find_css_set( struct css_set *oldcg, struct cgroup *cgrp) { @@ -473,7 +470,6 @@ static struct css_set *find_css_set( /* Link this cgroup group into the list */ list_add(&res->list, &init_css_set.list); css_set_count++; - INIT_LIST_HEAD(&res->tasks); write_unlock(&css_set_lock); return res; @@ -507,8 +503,8 @@ static struct css_set *find_css_set( * critical pieces of code here. The exception occurs on cgroup_exit(), * when a task in a notify_on_release cgroup exits. Then cgroup_mutex * is taken, and if the cgroup count is zero, a usermode call made - * to /sbin/cgroup_release_agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. + * to the release agent with the name of the cgroup (path relative to + * the root of cgroup file system) as the argument. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all @@ -521,7 +517,7 @@ static struct css_set *find_css_set( * * The need for this exception arises from the action of * cgroup_attach_task(), which overwrites one tasks cgroup pointer with - * another. It does so using cgroup_mutexe, however there are + * another. It does so using cgroup_mutex, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as @@ -537,7 +533,6 @@ static struct css_set *find_css_set( * cgroup_lock - lock out any changes to cgroup structures * */ - void cgroup_lock(void) { mutex_lock(&cgroup_mutex); @@ -548,7 +543,6 @@ void cgroup_lock(void) * * Undo the lock taken in a previous cgroup_lock() call. */ - void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); @@ -590,7 +584,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) * Call subsys's pre_destroy handler. * This is called before css refcnt check. */ - static void cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -600,7 +593,6 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) return; } - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -696,7 +688,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, added_bits = final_bits & ~root->actual_subsys_bits; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long long bit = 1ull << i; + unsigned long bit = 1UL << i; struct cgroup_subsys *ss = subsys[i]; if (!(bit & added_bits)) continue; @@ -790,7 +782,14 @@ static int parse_cgroupfs_options(char *data, if (!*token) return -EINVAL; if (!strcmp(token, "all")) { - opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1; + /* Add all non-disabled subsystems */ + int i; + opts->subsys_bits = 0; + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (!ss->disabled) + opts->subsys_bits |= 1ul << i; + } } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { @@ -808,7 +807,8 @@ static int parse_cgroupfs_options(char *data, for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ss = subsys[i]; if (!strcmp(token, ss->name)) { - set_bit(i, &opts->subsys_bits); + if (!ss->disabled) + set_bit(i, &opts->subsys_bits); break; } } @@ -927,7 +927,6 @@ static int cgroup_get_rootdir(struct super_block *sb) if (!inode) return -ENOMEM; - inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." entry) */ @@ -961,8 +960,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, } root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) + if (!root) { + if (opts.release_agent) + kfree(opts.release_agent); return -ENOMEM; + } init_cgroup_root(root); root->subsys_bits = opts.subsys_bits; @@ -1129,8 +1131,13 @@ static inline struct cftype *__d_cft(struct dentry *dentry) return dentry->d_fsdata; } -/* - * Called with cgroup_mutex held. Writes path of cgroup into buf. +/** + * cgroup_path - generate the path of a cgroup + * @cgrp: the cgroup in question + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Called with cgroup_mutex held. Writes path of cgroup into buf. * Returns 0 on success, -errno on error. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) @@ -1188,11 +1195,13 @@ static void get_first_subsys(const struct cgroup *cgrp, *subsys_id = test_ss->subsys_id; } -/* - * Attach task 'tsk' to cgroup 'cgrp' +/** + * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' + * @cgrp: the cgroup the task is attaching to + * @tsk: the task to be attached * - * Call holding cgroup_mutex. May take task_lock of - * the task 'pid' during call. + * Call holding cgroup_mutex. May take task_lock of + * the task 'tsk' during call. */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { @@ -1293,7 +1302,6 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) } /* The various types of files and directories in a cgroup file system */ - enum cgroup_filetype { FILE_ROOT, FILE_DIR, @@ -1584,12 +1592,11 @@ static int cgroup_create_file(struct dentry *dentry, int mode, } /* - * cgroup_create_dir - create a directory for an object. - * cgrp: the cgroup we create the directory for. - * It must have a valid ->parent field - * And we are going to fill its ->dentry field. - * dentry: dentry of the new cgroup - * mode: mode to set on new directory. + * cgroup_create_dir - create a directory for an object. + * @cgrp: the cgroup we create the directory for. It must have a valid + * ->parent field. And we are going to fill its ->dentry field. + * @dentry: dentry of the new cgroup + * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, int mode) @@ -1651,8 +1658,12 @@ int cgroup_add_files(struct cgroup *cgrp, return 0; } -/* Count the number of tasks in a cgroup. */ - +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. + */ int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; @@ -1711,7 +1722,12 @@ void cgroup_enable_task_cg_lists(void) use_task_css_set_links = 1; do_each_thread(g, p) { task_lock(p); - if (list_empty(&p->cg_list)) + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. + */ + if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); @@ -1962,12 +1978,13 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) } /** - * Build and fill cgroupstats so that taskstats can export it to user - * space. - * + * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { @@ -2078,7 +2095,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) kfree(pidarray); } else { - ctr->buf = 0; + ctr->buf = NULL; ctr->bufsz = 0; } file->private_data = ctr; @@ -2199,14 +2216,13 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, } /* - * cgroup_create - create a cgroup - * parent: cgroup that will be parent of the new cgroup. - * name: name of the new cgroup. Will be strcpy'ed. - * mode: mode to set on new inode + * cgroup_create - create a cgroup + * @parent: cgroup that will be parent of the new cgroup + * @dentry: dentry of the new cgroup + * @mode: mode to set on new inode * - * Must be called with the mutex on the parent inode held + * Must be called with the mutex on the parent inode held */ - static long cgroup_create(struct cgroup *parent, struct dentry *dentry, int mode) { @@ -2229,7 +2245,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, mutex_lock(&cgroup_mutex); - cgrp->flags = 0; INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); @@ -2239,6 +2254,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->root = parent->root; cgrp->top_cgroup = parent->top_cgroup; + if (notify_on_release(parent)) + set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); if (IS_ERR(css)) { @@ -2349,13 +2367,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) parent = cgrp->parent; root = cgrp->root; sb = root->sb; + /* - * Call pre_destroy handlers of subsys + * Call pre_destroy handlers of subsys. Notify subsystems + * that rmdir() request comes. */ cgroup_call_pre_destroy(cgrp); - /* - * Notify subsyses that rmdir() request comes. - */ if (cgroup_has_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); @@ -2431,8 +2448,10 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss) } /** - * cgroup_init_early - initialize cgroups at system boot, and - * initialize any subsystems that request early init. + * cgroup_init_early - cgroup initialization at system boot + * + * Initialize cgroups at system boot, and initialize any + * subsystems that request early init. */ int __init cgroup_init_early(void) { @@ -2474,8 +2493,10 @@ int __init cgroup_init_early(void) } /** - * cgroup_init - register cgroup filesystem and /proc file, and - * initialize any subsystems that didn't request early init. + * cgroup_init - cgroup initialization + * + * Register cgroup filesystem and /proc file, and initialize + * any subsystems that didn't request early init. */ int __init cgroup_init(void) { @@ -2553,6 +2574,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v) /* Skip this hierarchy if it has no active subsystems */ if (!root->actual_subsys_bits) continue; + seq_printf(m, "%lu:", root->subsys_bits); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); seq_putc(m, ':'); @@ -2592,13 +2614,13 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) { int i; - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n"); + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%lu\t%d\n", + seq_printf(m, "%s\t%lu\t%d\t%d\n", ss->name, ss->root->subsys_bits, - ss->root->number_of_cgroups); + ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); return 0; @@ -2606,7 +2628,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) static int cgroupstats_open(struct inode *inode, struct file *file) { - return single_open(file, proc_cgroupstats_show, 0); + return single_open(file, proc_cgroupstats_show, NULL); } static struct file_operations proc_cgroupstats_operations = { @@ -2618,7 +2640,7 @@ static struct file_operations proc_cgroupstats_operations = { /** * cgroup_fork - attach newly forked task to its parents cgroup. - * @tsk: pointer to task_struct of forking parent process. + * @child: pointer to task_struct of forking parent process. * * Description: A task inherits its parent's cgroup at fork(). * @@ -2642,9 +2664,12 @@ void cgroup_fork(struct task_struct *child) } /** - * cgroup_fork_callbacks - called on a new task very soon before - * adding it to the tasklist. No need to take any locks since no-one - * can be operating on this task + * cgroup_fork_callbacks - run fork callbacks + * @child: the new task + * + * Called on a new task very soon before adding it to the + * tasklist. No need to take any locks since no-one can + * be operating on this task. */ void cgroup_fork_callbacks(struct task_struct *child) { @@ -2659,11 +2684,14 @@ void cgroup_fork_callbacks(struct task_struct *child) } /** - * cgroup_post_fork - called on a new task after adding it to the - * task list. Adds the task to the list running through its css_set - * if necessary. Has to be after the task is visible on the task list - * in case we race with the first call to cgroup_iter_start() - to - * guarantee that the new task ends up on its list. */ + * cgroup_post_fork - called on a new task after adding it to the task list + * @child: the task in question + * + * Adds the task to the list running through its css_set if necessary. + * Has to be after the task is visible on the task list in case we race + * with the first call to cgroup_iter_start() - to guarantee that the + * new task ends up on its list. + */ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { @@ -2676,6 +2704,7 @@ void cgroup_post_fork(struct task_struct *child) /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process + * @run_callback: run exit callbacks? * * Description: Detach cgroup from @tsk and release it. * @@ -2706,7 +2735,6 @@ void cgroup_post_fork(struct task_struct *child) * top_cgroup isn't going away, and either task has PF_EXITING set, * which wards off any cgroup_attach_task() attempts, or task is a failed * fork, never visible to cgroup_attach_task. - * */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { @@ -2743,9 +2771,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } /** - * cgroup_clone - duplicate the current cgroup in the hierarchy - * that the given subsystem is attached to, and move this task into - * the new child + * cgroup_clone - clone the cgroup the given subsystem is attached to + * @tsk: the task to be moved + * @subsys: the given subsystem + * + * Duplicate the current cgroup in the hierarchy that the given + * subsystem is attached to, and move this task into the new + * child. */ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) { @@ -2858,9 +2890,12 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) return ret; } -/* - * See if "cgrp" is a descendant of the current task's cgroup in - * the appropriate hierarchy +/** + * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp + * @cgrp: the cgroup in question + * + * See if @cgrp is a descendant of the current task's cgroup in + * the appropriate hierarchy. * * If we are sending in dummytop, then presumably we are creating * the top cgroup in the subsystem. @@ -2939,9 +2974,7 @@ void __css_put(struct cgroup_subsys_state *css) * release agent task. We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. - * */ - static void cgroup_release_agent(struct work_struct *work) { BUG_ON(work != &release_agent_work); @@ -2991,3 +3024,27 @@ static void cgroup_release_agent(struct work_struct *work) spin_unlock(&release_list_lock); mutex_unlock(&cgroup_mutex); } + +static int __init cgroup_disable(char *str) +{ + int i; + char *token; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + if (!strcmp(token, ss->name)) { + ss->disabled = 1; + printk(KERN_INFO "Disabling %s control group" + " subsystem\n", ss->name); + break; + } + } + } + return 1; +} +__setup("cgroup_disable=", cgroup_disable); diff --git a/kernel/compat.c b/kernel/compat.c index 5f0e201bcfd..e1ef04870c2 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -47,15 +47,14 @@ static long compat_nanosleep_restart(struct restart_block *restart) mm_segment_t oldfs; long ret; - rmtp = (struct compat_timespec __user *)(restart->arg1); - restart->arg1 = (unsigned long)&rmt; + restart->nanosleep.rmtp = (struct timespec __user *) &rmt; oldfs = get_fs(); set_fs(KERNEL_DS); ret = hrtimer_nanosleep_restart(restart); set_fs(oldfs); if (ret) { - restart->arg1 = (unsigned long)rmtp; + rmtp = restart->nanosleep.compat_rmtp; if (rmtp && put_compat_timespec(&rmt, rmtp)) return -EFAULT; @@ -89,7 +88,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, = ¤t_thread_info()->restart_block; restart->fn = compat_nanosleep_restart; - restart->arg1 = (unsigned long)rmtp; + restart->nanosleep.compat_rmtp = rmtp; if (rmtp && put_compat_timespec(&rmt, rmtp)) return -EFAULT; @@ -446,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, @@ -607,9 +606,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) long err; mm_segment_t oldfs; struct timespec tu; - struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); + struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; - restart->arg1 = (unsigned long) &tu; + restart->nanosleep.rmtp = (struct timespec __user *) &tu; oldfs = get_fs(); set_fs(KERNEL_DS); err = clock_nanosleep_restart(restart); @@ -621,7 +620,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) if (err == -ERESTART_RESTARTBLOCK) { restart->fn = compat_clock_nanosleep_restart; - restart->arg1 = (unsigned long) rmtp; + restart->nanosleep.compat_rmtp = rmtp; } return err; } @@ -652,7 +651,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, if (err == -ERESTART_RESTARTBLOCK) { restart = ¤t_thread_info()->restart_block; restart->fn = compat_clock_nanosleep_restart; - restart->arg1 = (unsigned long) rmtp; + restart->nanosleep.compat_rmtp = rmtp; } return err; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 2eff3f63abe..2011ad8d269 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -232,9 +232,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* Ensure that we are not runnable on dying cpu */ old_allowed = current->cpus_allowed; - tmp = CPU_MASK_ALL; + cpus_setall(tmp); cpu_clear(cpu, tmp); - set_cpus_allowed(current, tmp); + set_cpus_allowed_ptr(current, &tmp); p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); @@ -268,7 +268,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) out_thread: err = kthread_stop(p); out_allowed: - set_cpus_allowed(current, old_allowed); + set_cpus_allowed_ptr(current, &old_allowed); out_release: cpu_hotplug_done(); return err; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e296ed81d4..48a976c52cf 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -98,6 +98,9 @@ struct cpuset { /* partition number for rebuild_sched_domains() */ int pn; + /* for custom sched domain */ + int relax_domain_level; + /* used for walking a cpuset heirarchy */ struct list_head stack_list; }; @@ -322,8 +325,8 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Call without callback_mutex or task_lock() held. May be * called with or without cgroup_mutex held. Thanks in part to * 'the_top_cpuset_hack', the task's cpuset pointer will never - * be NULL. This routine also might acquire callback_mutex and - * current->mm->mmap_sem during call. + * be NULL. This routine also might acquire callback_mutex during + * call. * * Reading current->cpuset->mems_generation doesn't need task_lock * to guard the current->cpuset derefence, because it is guarded @@ -478,6 +481,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) return cpus_intersects(a->cpus_allowed, b->cpus_allowed); } +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (!dattr) + return; + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; + return; +} + /* * rebuild_sched_domains() * @@ -553,12 +566,14 @@ static void rebuild_sched_domains(void) int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_t *doms; /* resulting partition; i.e. sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ q = NULL; csa = NULL; doms = NULL; + dattr = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { @@ -566,6 +581,11 @@ static void rebuild_sched_domains(void) doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr(dattr, &top_cpuset); + } *doms = top_cpuset.cpus_allowed; goto rebuild; } @@ -622,6 +642,7 @@ restart: doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; + dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; @@ -644,12 +665,15 @@ restart: } cpus_clear(*dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { struct cpuset *b = csa[j]; if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; + update_domain_attr(dattr, b); } } nslot++; @@ -660,7 +684,7 @@ restart: rebuild: /* Have scheduler rebuild sched domains */ get_online_cpus(); - partition_sched_domains(ndoms, doms); + partition_sched_domains(ndoms, doms, dattr); put_online_cpus(); done: @@ -668,6 +692,7 @@ done: kfifo_free(q); kfree(csa); /* Don't kfree(doms) -- partition_sched_domains() does that. */ + /* Don't kfree(dattr) -- partition_sched_domains() does that. */ } static inline int started_after_time(struct task_struct *t1, @@ -729,7 +754,7 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) */ void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); + set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); } /** @@ -916,7 +941,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); - cpuset_being_rebound = cs; /* causes mpol_copy() rebind */ + cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ fudge = 10; /* spare mmarray[] slots */ fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ @@ -967,7 +992,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) * rebind the vma mempolicies of each mm in mmarray[] to their * new cpuset, and release that mm. The mpol_rebind_mm() * call takes mmap_sem, which we couldn't take while holding - * tasklist_lock. Forks can happen again now - the mpol_copy() + * tasklist_lock. Forks can happen again now - the mpol_dup() * cpuset_being_rebound check will catch such forks, and rebind * their vma mempolicies too. Because we still hold the global * cgroup_mutex, we know that no other rebind effort will @@ -1011,6 +1036,21 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) return 0; } +static int update_relax_domain_level(struct cpuset *cs, char *buf) +{ + int val = simple_strtol(buf, NULL, 10); + + if (val < 0) + val = -1; + + if (val != cs->relax_domain_level) { + cs->relax_domain_level = val; + rebuild_sched_domains(); + } + + return 0; +} + /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, @@ -1178,7 +1218,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed(tsk, cpus); + set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); from = oldcs->mems_allowed; @@ -1202,6 +1242,7 @@ typedef enum { FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_SCHED_LOAD_BALANCE, + FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, @@ -1224,7 +1265,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, return -E2BIG; /* +1 for nul-terminator */ - if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) + buffer = kmalloc(nbytes + 1, GFP_KERNEL); + if (!buffer) return -ENOMEM; if (copy_from_user(buffer, userbuf, nbytes)) { @@ -1256,6 +1298,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, case FILE_SCHED_LOAD_BALANCE: retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); break; + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, buffer); + break; case FILE_MEMORY_MIGRATE: retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); break; @@ -1354,6 +1399,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, case FILE_SCHED_LOAD_BALANCE: *s++ = is_sched_load_balance(cs) ? '1' : '0'; break; + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + s += sprintf(s, "%d", cs->relax_domain_level); + break; case FILE_MEMORY_MIGRATE: *s++ = is_memory_migrate(cs) ? '1' : '0'; break; @@ -1424,6 +1472,13 @@ static struct cftype cft_sched_load_balance = { .private = FILE_SCHED_LOAD_BALANCE, }; +static struct cftype cft_sched_relax_domain_level = { + .name = "sched_relax_domain_level", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, +}; + static struct cftype cft_memory_migrate = { .name = "memory_migrate", .read = cpuset_common_file_read, @@ -1475,6 +1530,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) return err; if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) return err; + if ((err = cgroup_add_file(cont, ss, + &cft_sched_relax_domain_level)) < 0) + return err; if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) return err; if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) @@ -1555,10 +1613,11 @@ static struct cgroup_subsys_state *cpuset_create( if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cs->cpus_allowed = CPU_MASK_NONE; - cs->mems_allowed = NODE_MASK_NONE; + cpus_clear(cs->cpus_allowed); + nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); + cs->relax_domain_level = -1; cs->parent = parent; number_of_cpusets++; @@ -1625,12 +1684,13 @@ int __init cpuset_init(void) { int err = 0; - top_cpuset.cpus_allowed = CPU_MASK_ALL; - top_cpuset.mems_allowed = NODE_MASK_ALL; + cpus_setall(top_cpuset.cpus_allowed); + nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); top_cpuset.mems_generation = cpuset_mems_generation++; set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); + top_cpuset.relax_domain_level = -1; err = register_filesystem(&cpuset_fs_type); if (err < 0) @@ -1844,6 +1904,7 @@ void __init cpuset_init_smp(void) * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. * * Description: Returns the cpumask_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty @@ -1851,35 +1912,27 @@ void __init cpuset_init_smp(void) * tasks cpuset. **/ -cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) +void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) { - cpumask_t mask; - mutex_lock(&callback_mutex); - mask = cpuset_cpus_allowed_locked(tsk); + cpuset_cpus_allowed_locked(tsk, pmask); mutex_unlock(&callback_mutex); - - return mask; } /** * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. * Must be called with callback_mutex held. **/ -cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) +void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) { - cpumask_t mask; - task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), &mask); + guarantee_online_cpus(task_cs(tsk), pmask); task_unlock(tsk); - - return mask; } void cpuset_init_current_mems_allowed(void) { - current->mems_allowed = NODE_MASK_ALL; + nodes_setall(current->mems_allowed); } /** @@ -1906,22 +1959,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) } /** - * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed - * @zl: the zonelist to be checked + * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed + * @nodemask: the nodemask to be checked * - * Are any of the nodes on zonelist zl allowed in current->mems_allowed? + * Are any of the nodes in the nodemask allowed in current->mems_allowed? */ -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { - int i; - - for (i = 0; zl->zones[i]; i++) { - int nid = zone_to_nid(zl->zones[i]); - - if (node_isset(nid, current->mems_allowed)) - return 1; - } - return 0; + return nodes_intersects(*nodemask, current->mems_allowed); } /* @@ -2261,8 +2306,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, task->cpus_allowed); seq_printf(m, "\n"); + seq_printf(m, "Cpus_allowed_list:\t"); + m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, + task->cpus_allowed); + seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, task->mems_allowed); seq_printf(m, "\n"); + seq_printf(m, "Mems_allowed_list:\t"); + m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, + task->mems_allowed); + seq_printf(m, "\n"); } diff --git a/kernel/exit.c b/kernel/exit.c index 506a957b665..2a9d98c641a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -214,20 +214,19 @@ struct pid *session_of_pgrp(struct pid *pgrp) static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; - int ret = 1; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (p == ignored_task - || p->exit_state - || is_global_init(p->real_parent)) + if ((p == ignored_task) || + (p->exit_state && thread_group_empty(p)) || + is_global_init(p->real_parent)) continue; + if (task_pgrp(p->real_parent) != pgrp && - task_session(p->real_parent) == task_session(p)) { - ret = 0; - break; - } + task_session(p->real_parent) == task_session(p)) + return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return ret; /* (sighing) "Often!" */ + + return 1; } int is_current_pgrp_orphaned(void) @@ -255,6 +254,37 @@ static int has_stopped_jobs(struct pid *pgrp) return retval; } +/* + * Check to see if any process groups have become orphaned as + * a result of our exiting, and if they have any stopped jobs, + * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void +kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) +{ + struct pid *pgrp = task_pgrp(tsk); + struct task_struct *ignored_task = tsk; + + if (!parent) + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ + parent = tsk->real_parent; + else + /* reparent: our child is in a different pgrp than + * we are, and it was the only connection outside. + */ + ignored_task = NULL; + + if (task_pgrp(parent) != pgrp && + task_session(parent) == task_session(tsk) && + will_become_orphaned_pgrp(pgrp, ignored_task) && + has_stopped_jobs(pgrp)) { + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); + } +} + /** * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd * @@ -477,10 +507,9 @@ void put_files_struct(struct files_struct *files) } } -EXPORT_SYMBOL(put_files_struct); - -void reset_files_struct(struct task_struct *tsk, struct files_struct *files) +void reset_files_struct(struct files_struct *files) { + struct task_struct *tsk = current; struct files_struct *old; old = tsk->files; @@ -489,9 +518,8 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files) task_unlock(tsk); put_files_struct(old); } -EXPORT_SYMBOL(reset_files_struct); -static void __exit_files(struct task_struct *tsk) +void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; @@ -503,12 +531,7 @@ static void __exit_files(struct task_struct *tsk) } } -void exit_files(struct task_struct *tsk) -{ - __exit_files(tsk); -} - -static void __put_fs_struct(struct fs_struct *fs) +void put_fs_struct(struct fs_struct *fs) { /* No need to hold fs->lock if we are killing it */ if (atomic_dec_and_test(&fs->count)) { @@ -520,12 +543,7 @@ static void __put_fs_struct(struct fs_struct *fs) } } -void put_fs_struct(struct fs_struct *fs) -{ - __put_fs_struct(fs); -} - -static void __exit_fs(struct task_struct *tsk) +void exit_fs(struct task_struct *tsk) { struct fs_struct * fs = tsk->fs; @@ -533,15 +551,10 @@ static void __exit_fs(struct task_struct *tsk) task_lock(tsk); tsk->fs = NULL; task_unlock(tsk); - __put_fs_struct(fs); + put_fs_struct(fs); } } -void exit_fs(struct task_struct *tsk) -{ - __exit_fs(tsk); -} - EXPORT_SYMBOL_GPL(exit_fs); /* @@ -635,22 +648,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) p->exit_signal != -1 && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); - /* - * process group orphan check - * Case ii: Our child is in a different pgrp - * than we are, and it was the only connection - * outside, so the child pgrp is now orphaned. - */ - if ((task_pgrp(p) != task_pgrp(father)) && - (task_session(p) == task_session(father))) { - struct pid *pgrp = task_pgrp(p); - - if (will_become_orphaned_pgrp(pgrp, NULL) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } - } + kill_orphaned_pgrp(p, father); } /* @@ -735,11 +733,9 @@ static void forget_original_parent(struct task_struct *father) * Send signals to all our closest relatives so that they know * to properly mourn us.. */ -static void exit_notify(struct task_struct *tsk) +static void exit_notify(struct task_struct *tsk, int group_dead) { int state; - struct task_struct *t; - struct pid *pgrp; /* * This does two things: @@ -753,25 +749,8 @@ static void exit_notify(struct task_struct *tsk) exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); - /* - * Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - * - * Case i: Our father is in a different pgrp than we are - * and we were the only connection outside, so our pgrp - * is about to become orphaned. - */ - t = tsk->real_parent; - - pgrp = task_pgrp(tsk); - if ((task_pgrp(t) != pgrp) && - (task_session(t) == task_session(tsk)) && - will_become_orphaned_pgrp(pgrp, tsk) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } + if (group_dead) + kill_orphaned_pgrp(tsk->group_leader, NULL); /* Let father know we died * @@ -788,8 +767,8 @@ static void exit_notify(struct task_struct *tsk) * the same after a fork. */ if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && - ( tsk->parent_exec_id != t->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id) + (tsk->parent_exec_id != tsk->real_parent->self_exec_id || + tsk->self_exec_id != tsk->parent_exec_id) && !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; @@ -971,8 +950,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) acct_process(); exit_sem(tsk); - __exit_files(tsk); - __exit_fs(tsk); + exit_files(tsk); + exit_fs(tsk); check_stack_usage(); exit_thread(); cgroup_exit(tsk, 1); @@ -986,9 +965,9 @@ NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); - exit_notify(tsk); + exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA - mpol_free(tsk->mempolicy); + mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX @@ -1382,7 +1361,7 @@ unlock_sig: if (!retval && infop) retval = put_user(0, &infop->si_errno); if (!retval && infop) - retval = put_user(why, &infop->si_code); + retval = put_user((short)why, &infop->si_code); if (!retval && infop) retval = put_user(exit_code, &infop->si_status); if (!retval && infop) @@ -1612,7 +1591,7 @@ asmlinkage long sys_waitid(int which, pid_t upid, put_pid(pid); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(5, ret, which, upid, infop, options, ru); return ret; } @@ -1644,7 +1623,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, put_pid(pid); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index dd249c37b3a..6067e429f28 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -132,6 +132,14 @@ void __put_task_struct(struct task_struct *tsk) free_task(tsk); } +/* + * macro override instead of weak attribute alias, to workaround + * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. + */ +#ifndef arch_task_cache_init +#define arch_task_cache_init() +#endif + void __init fork_init(unsigned long mempages) { #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR @@ -144,6 +152,9 @@ void __init fork_init(unsigned long mempages) ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); #endif + /* do the arch specific task caches init */ + arch_task_cache_init(); + /* * The default maximum number of threads is set to a safe * value: the thread structures can take up at most half @@ -163,6 +174,13 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC]; } +int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src) +{ + *dst = *src; + return 0; +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; @@ -181,15 +199,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; } - *tsk = *orig; + err = arch_dup_task_struct(tsk, orig); + if (err) + goto out; + tsk->stack = ti; err = prop_local_init_single(&tsk->dirties); - if (err) { - free_thread_info(ti); - free_task_struct(tsk); - return NULL; - } + if (err) + goto out; setup_thread_stack(tsk, orig); @@ -205,6 +223,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) #endif tsk->splice_pipe = NULL; return tsk; + +out: + free_thread_info(ti); + free_task_struct(tsk); + return NULL; } #ifdef CONFIG_MMU @@ -256,7 +279,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (!tmp) goto fail_nomem; *tmp = *mpnt; - pol = mpol_copy(vma_policy(mpnt)); + pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; @@ -394,7 +417,6 @@ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); - mm_free_cgroup(mm); destroy_context(mm); free_mm(mm); } @@ -416,6 +438,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + mm_free_cgroup(mm); mmdrop(mm); } } @@ -498,7 +521,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. */ -static struct mm_struct *dup_mm(struct task_struct *tsk) +struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; @@ -782,12 +805,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) goto out; } - /* - * Note: we may be using current for both targets (See exec.c) - * This works because we cache current->files (old) as oldf. Don't - * break this. - */ - tsk->files = NULL; newf = dup_fd(oldf, &error); if (!newf) goto out; @@ -823,34 +840,6 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return 0; } -/* - * Helper to unshare the files of the current task. - * We don't want to expose copy_files internals to - * the exec layer of the kernel. - */ - -int unshare_files(void) -{ - struct files_struct *files = current->files; - int rc; - - BUG_ON(!files); - - /* This can race but the race causes us to copy when we don't - need to and drop the copy */ - if(atomic_read(&files->count) == 1) - { - atomic_inc(&files->count); - return 0; - } - rc = copy_files(0, current); - if(rc) - current->files = files; - return rc; -} - -EXPORT_SYMBOL(unshare_files); - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -1127,7 +1116,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->audit_context = NULL; cgroup_fork(p); #ifdef CONFIG_NUMA - p->mempolicy = mpol_copy(p->mempolicy); + p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; @@ -1385,7 +1374,7 @@ bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_policy: #ifdef CONFIG_NUMA - mpol_free(p->mempolicy); + mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); @@ -1788,3 +1777,27 @@ bad_unshare_cleanup_thread: bad_unshare_out: return err; } + +/* + * Helper to unshare the files of the current task. + * We don't want to expose copy_files internals to + * the exec layer of the kernel. + */ + +int unshare_files(struct files_struct **displaced) +{ + struct task_struct *task = current; + struct files_struct *copy = NULL; + int error; + + error = unshare_fd(CLONE_FILES, ©); + if (error || !copy) { + *displaced = NULL; + return error; + } + *displaced = task->files; + task_lock(task); + task->files = copy; + task_unlock(task); + return 0; +} diff --git a/kernel/futex.c b/kernel/futex.c index 221f2128a43..e43945e995f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -60,6 +60,8 @@ #include "rtmutex_common.h" +int __read_mostly futex_cmpxchg_enabled; + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* @@ -279,7 +281,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, */ static void get_futex_key_refs(union futex_key *key) { - if (key->both.ptr == 0) + if (key->both.ptr == NULL) return; switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: @@ -469,6 +471,8 @@ void exit_pi_state_list(struct task_struct *curr) struct futex_hash_bucket *hb; union futex_key key; + if (!futex_cmpxchg_enabled) + return; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful @@ -1870,6 +1874,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; /* * The kernel knows only one size for now: */ @@ -1894,6 +1900,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, struct robust_list_head __user *head; unsigned long ret; + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (!pid) head = current->robust_list; else { @@ -1997,6 +2006,9 @@ void exit_robust_list(struct task_struct *curr) unsigned long futex_offset; int rc; + if (!futex_cmpxchg_enabled) + return; + /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -2051,7 +2063,7 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int ret; + int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; struct rw_semaphore *fshared = NULL; @@ -2083,13 +2095,16 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); break; case FUTEX_UNLOCK_PI: - ret = futex_unlock_pi(uaddr, fshared); + if (futex_cmpxchg_enabled) + ret = futex_unlock_pi(uaddr, fshared); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; default: ret = -ENOSYS; @@ -2143,10 +2158,31 @@ static struct file_system_type futex_fs_type = { .kill_sb = kill_anon_super, }; -static int __init init(void) +static int __init futex_init(void) { - int i = register_filesystem(&futex_fs_type); + u32 curval; + int i; + + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non functional ones will return + * -ENOSYS. + */ + curval = cmpxchg_futex_value_locked(NULL, 0, 0); + if (curval == -EFAULT) + futex_cmpxchg_enabled = 1; + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); + spin_lock_init(&futex_queues[i].lock); + } + + i = register_filesystem(&futex_fs_type); if (i) return i; @@ -2156,10 +2192,6 @@ static int __init init(void) return PTR_ERR(futex_mnt); } - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); - spin_lock_init(&futex_queues[i].lock); - } return 0; } -__initcall(init); +__initcall(futex_init); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 7d5e4b016f3..04ac3a9e42c 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, return 0; } -static void __user *futex_uaddr(struct robust_list *entry, +static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); @@ -54,6 +54,9 @@ void compat_exit_robust_list(struct task_struct *curr) compat_long_t futex_offset; int rc; + if (!futex_cmpxchg_enabled) + return; + /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -115,6 +118,9 @@ asmlinkage long compat_sys_set_robust_list(struct compat_robust_list_head __user *head, compat_size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (unlikely(len != sizeof(*head))) return -EINVAL; @@ -130,6 +136,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, struct compat_robust_list_head __user *head; unsigned long ret; + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (!pid) head = current->compat_robust_list; else { diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 98bee013f71..dea4c9124ac 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -590,7 +590,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, list_add_tail(&timer->cb_entry, &base->cpu_base->cb_pending); timer->state = HRTIMER_STATE_PENDING; - raise_softirq(HRTIMER_SOFTIRQ); return 1; default: BUG(); @@ -633,6 +632,11 @@ static int hrtimer_switch_to_hres(void) return 1; } +static inline void hrtimer_raise_softirq(void) +{ + raise_softirq(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -651,6 +655,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer, { return 0; } +static inline void hrtimer_raise_softirq(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -850,7 +855,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret; + int ret, raise; base = lock_hrtimer_base(timer, &flags); @@ -884,8 +889,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) enqueue_hrtimer(timer, new_base, new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); + /* + * The timer may be expired and moved to the cb_pending + * list. We can not raise the softirq with base lock held due + * to a possible deadlock with runqueue lock. + */ + raise = timer->state == HRTIMER_STATE_PENDING; + unlock_hrtimer_base(timer, &flags); + if (raise) + hrtimer_raise_softirq(); + return ret; } EXPORT_SYMBOL_GPL(hrtimer_start); @@ -1080,8 +1095,19 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) * If the timer was rearmed on another CPU, reprogram * the event device. */ - if (timer->base->first == &timer->node) - hrtimer_reprogram(timer, timer->base); + struct hrtimer_clock_base *base = timer->base; + + if (base->first == &timer->node && + hrtimer_reprogram(timer, base)) { + /* + * Timer is expired. Thus move it from tree to + * pending list again. + */ + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + } } } spin_unlock_irq(&cpu_base->lock); @@ -1238,51 +1264,50 @@ void hrtimer_run_pending(void) /* * Called from hardirq context every jiffy */ -static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, - int index) +void hrtimer_run_queues(void) { struct rb_node *node; - struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base; + int index, gettime = 1; - if (!base->first) + if (hrtimer_hres_active()) return; - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - - spin_lock(&cpu_base->lock); - - while ((node = base->first)) { - struct hrtimer *timer; - - timer = rb_entry(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= timer->expires.tv64) - break; + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; - if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { - __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); + if (!base->first) continue; + + if (base->get_softirq_time) + base->softirq_time = base->get_softirq_time(); + else if (gettime) { + hrtimer_get_softirq_time(cpu_base); + gettime = 0; } - __run_hrtimer(timer); - } - spin_unlock(&cpu_base->lock); -} + spin_lock(&cpu_base->lock); -void hrtimer_run_queues(void) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - int i; + while ((node = base->first)) { + struct hrtimer *timer; - if (hrtimer_hres_active()) - return; + timer = rb_entry(node, struct hrtimer, node); + if (base->softirq_time.tv64 <= timer->expires.tv64) + break; - hrtimer_get_softirq_time(cpu_base); + if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + continue; + } - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - run_hrtimer_queue(cpu_base, i); + __run_hrtimer(timer); + } + spin_unlock(&cpu_base->lock); + } } /* @@ -1354,13 +1379,13 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) struct hrtimer_sleeper t; struct timespec __user *rmtp; - hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); - t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; + hrtimer_init(&t.timer, restart->nanosleep.index, HRTIMER_MODE_ABS); + t.timer.expires.tv64 = restart->nanosleep.expires; if (do_nanosleep(&t, HRTIMER_MODE_ABS)) return 0; - rmtp = (struct timespec __user *)restart->arg1; + rmtp = restart->nanosleep.rmtp; if (rmtp) { int ret = update_rmtp(&t.timer, rmtp); if (ret <= 0) @@ -1394,10 +1419,9 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, restart = ¤t_thread_info()->restart_block; restart->fn = hrtimer_nanosleep_restart; - restart->arg0 = (unsigned long) t.timer.base->index; - restart->arg1 = (unsigned long) rmtp; - restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; - restart->arg3 = t.timer.expires.tv64 >> 32; + restart->nanosleep.index = t.timer.base->index; + restart->nanosleep.rmtp = rmtp; + restart->nanosleep.expires = t.timer.expires.tv64; return -ERESTART_RESTARTBLOCK; } @@ -1425,7 +1449,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) int i; spin_lock_init(&cpu_base->lock); - lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) cpu_base->clock_base[i].cpu_base = cpu_base; @@ -1466,16 +1489,16 @@ static void migrate_hrtimers(int cpu) tick_cancel_sched_timer(cpu); local_irq_disable(); - double_spin_lock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_lock(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } - double_spin_unlock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fdb3fbe2b0c..964964baefa 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - desc->affinity = CPU_MASK_ALL; + cpus_setall(desc->affinity); #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/kexec.c b/kernel/kexec.c index 06a0e277565..cb85c79989b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -29,7 +29,6 @@ #include <asm/uaccess.h> #include <asm/io.h> #include <asm/system.h> -#include <asm/semaphore.h> #include <asm/sections.h> /* Per cpu memory for storing cpu states in case of system crash. */ @@ -1406,6 +1405,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); arch_crash_save_vmcoreinfo(); diff --git a/kernel/kgdb.c b/kernel/kgdb.c new file mode 100644 index 00000000000..1bd0ec1c80b --- /dev/null +++ b/kernel/kgdb.c @@ -0,0 +1,1700 @@ +/* + * KGDB stub. + * + * Maintainer: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> + * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2008 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger <george@mvista.com> + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe <dave@gcom.com>, + * Tigran Aivazian <tigran@sco.com> + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ +#include <linux/pid_namespace.h> +#include <linux/clocksource.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/console.h> +#include <linux/threads.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/reboot.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/sysrq.h> +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/pid.h> +#include <linux/smp.h> +#include <linux/mm.h> + +#include <asm/cacheflush.h> +#include <asm/byteorder.h> +#include <asm/atomic.h> +#include <asm/system.h> + +static int kgdb_break_asap; + +struct kgdb_state { + int ex_vector; + int signo; + int err_code; + int cpu; + int pass_exception; + long threadid; + long kgdb_usethreadid; + struct pt_regs *linux_regs; +}; + +static struct debuggerinfo_struct { + void *debuggerinfo; + struct task_struct *task; +} kgdb_info[NR_CPUS]; + +/** + * kgdb_connected - Is a host GDB connected to us? + */ +int kgdb_connected; +EXPORT_SYMBOL_GPL(kgdb_connected); + +/* All the KGDB handlers are installed */ +static int kgdb_io_module_registered; + +/* Guard for recursive entry */ +static int exception_level; + +static struct kgdb_io *kgdb_io_ops; +static DEFINE_SPINLOCK(kgdb_registration_lock); + +/* kgdb console driver is loaded */ +static int kgdb_con_registered; +/* determine if kgdb console output should be used */ +static int kgdb_use_con; + +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + +module_param(kgdb_use_con, int, 0644); + +/* + * Holds information about breakpoints in a kernel. These breakpoints are + * added and removed by gdb. + */ +static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { + [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } +}; + +/* + * The CPU# of the active CPU, or -1 if none: + */ +atomic_t kgdb_active = ATOMIC_INIT(-1); + +/* + * We use NR_CPUs not PERCPU, in case kgdb is used to debug early + * bootup code (which might not have percpu set up yet): + */ +static atomic_t passive_cpu_wait[NR_CPUS]; +static atomic_t cpu_in_kgdb[NR_CPUS]; +atomic_t kgdb_setting_breakpoint; + +struct task_struct *kgdb_usethread; +struct task_struct *kgdb_contthread; + +int kgdb_single_step; + +/* Our I/O buffers. */ +static char remcom_in_buffer[BUFMAX]; +static char remcom_out_buffer[BUFMAX]; + +/* Storage for the registers, in GDB format. */ +static unsigned long gdb_regs[(NUMREGBYTES + + sizeof(unsigned long) - 1) / + sizeof(unsigned long)]; + +/* to keep track of the CPU which is doing the single stepping*/ +atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); + +/* + * If you are debugging a problem where roundup (the collection of + * all other CPUs) is a problem [this should be extremely rare], + * then use the nokgdbroundup option to avoid roundup. In that case + * the other CPUs might interfere with your debugging context, so + * use this with care: + */ +int kgdb_do_roundup = 1; + +static int __init opt_nokgdbroundup(char *str) +{ + kgdb_do_roundup = 0; + + return 0; +} + +early_param("nokgdbroundup", opt_nokgdbroundup); + +/* + * Finally, some KGDB code :-) + */ + +/* + * Weak aliases for breakpoint management, + * can be overriden by architectures when needed: + */ +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + + return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) +{ + int err; + + err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); + if (err) + return err; + + return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) +{ + return probe_kernel_write((char *)addr, + (char *)bundle, BREAK_INSTR_SIZE); +} + +unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int __weak kgdb_arch_init(void) +{ + return 0; +} + +int __weak kgdb_skipexception(int exception, struct pt_regs *regs) +{ + return 0; +} + +void __weak +kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) +{ + return; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. + */ +void __weak kgdb_disable_hw_debug(struct pt_regs *regs) +{ +} + +/* + * GDB remote protocol parser: + */ + +static const char hexchars[] = "0123456789abcdef"; + +static int hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return ch - 'a' + 10; + if ((ch >= '0') && (ch <= '9')) + return ch - '0'; + if ((ch >= 'A') && (ch <= 'F')) + return ch - 'A' + 10; + return -1; +} + +/* scan for the sequence $<data>#<checksum> */ +static void get_packet(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int count; + char ch; + + do { + /* + * Spin and wait around for the start character, ignore all + * other characters: + */ + while ((ch = (kgdb_io_ops->read_char())) != '$') + /* nothing */; + + kgdb_connected = 1; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* + * now, read until a # or end of buffer is found: + */ + while (count < (BUFMAX - 1)) { + ch = kgdb_io_ops->read_char(); + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(kgdb_io_ops->read_char()) << 4; + xmitcsum += hex(kgdb_io_ops->read_char()); + + if (checksum != xmitcsum) + /* failed checksum */ + kgdb_io_ops->write_char('-'); + else + /* successful transfer */ + kgdb_io_ops->write_char('+'); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + } + } while (checksum != xmitcsum); +} + +/* + * Send the packet in buffer. + * Check for gdb connection if asked for. + */ +static void put_packet(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* + * $<packet info>#<checksum>. + */ + while (1) { + kgdb_io_ops->write_char('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + kgdb_io_ops->write_char(ch); + checksum += ch; + count++; + } + + kgdb_io_ops->write_char('#'); + kgdb_io_ops->write_char(hexchars[checksum >> 4]); + kgdb_io_ops->write_char(hexchars[checksum & 0xf]); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + + /* Now see what we get in reply. */ + ch = kgdb_io_ops->read_char(); + + if (ch == 3) + ch = kgdb_io_ops->read_char(); + + /* If we get an ACK, we are done. */ + if (ch == '+') + return; + + /* + * If we get the start of another packet, this means + * that GDB is attempting to reconnect. We will NAK + * the packet being sent, and stop trying to send this + * packet. + */ + if (ch == '$') { + kgdb_io_ops->write_char('-'); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + return; + } + } +} + +static char *pack_hex_byte(char *pkt, u8 byte) +{ + *pkt++ = hexchars[byte >> 4]; + *pkt++ = hexchars[byte & 0xf]; + + return pkt; +} + +/* + * Convert the memory pointed to by mem into hex, placing result in buf. + * Return a pointer to the last char put in buf (null). May return an error. + */ +int kgdb_mem2hex(char *mem, char *buf, int count) +{ + char *tmp; + int err; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory copy. Hex conversion will work against this one. + */ + tmp = buf + count; + + err = probe_kernel_read(tmp, mem, count); + if (!err) { + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; + } + + *buf = 0; + } + + return err; +} + +/* + * Copy the binary array pointed to by buf into mem. Fix $, #, and + * 0x7d escaped with 0x7d. Return a pointer to the character after + * the last byte written. + */ +static int kgdb_ebin2mem(char *buf, char *mem, int count) +{ + int err = 0; + char c; + + while (count-- > 0) { + c = *buf++; + if (c == 0x7d) + c = *buf++ ^ 0x20; + + err = probe_kernel_write(mem, &c, 1); + if (err) + break; + + mem++; + } + + return err; +} + +/* + * Convert the hex array pointed to by buf into binary to be placed in mem. + * Return a pointer to the character AFTER the last byte written. + * May return an error. + */ +int kgdb_hex2mem(char *buf, char *mem, int count) +{ + char *tmp_raw; + char *tmp_hex; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory that is converted from hex. + */ + tmp_raw = buf + count * 2; + + tmp_hex = tmp_raw - 1; + while (tmp_hex >= buf) { + tmp_raw--; + *tmp_raw = hex(*tmp_hex--); + *tmp_raw |= hex(*tmp_hex--) << 4; + } + + return probe_kernel_write(mem, tmp_raw, count); +} + +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. + */ +int kgdb_hex2long(char **ptr, long *long_val) +{ + int hex_val; + int num = 0; + + *long_val = 0; + + while (**ptr) { + hex_val = hex(**ptr); + if (hex_val < 0) + break; + + *long_val = (*long_val << 4) | hex_val; + num++; + (*ptr)++; + } + + return num; +} + +/* Write memory due to an 'M' or 'X' packet. */ +static int write_mem_msg(int binary) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long addr; + unsigned long length; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && + kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { + if (binary) + err = kgdb_ebin2mem(ptr, (char *)addr, length); + else + err = kgdb_hex2mem(ptr, (char *)addr, length); + if (err) + return err; + if (CACHE_FLUSH_IS_SAFE) + flush_icache_range(addr, addr + length + 1); + return 0; + } + + return -EINVAL; +} + +static void error_packet(char *pkt, int error) +{ + error = -error; + pkt[0] = 'E'; + pkt[1] = hexchars[(error / 10)]; + pkt[2] = hexchars[(error % 10)]; + pkt[3] = '\0'; +} + +/* + * Thread ID accessors. We represent a flat TID space to GDB, where + * the per CPU idle threads (which under Linux all have PID 0) are + * remapped to negative TIDs. + */ + +#define BUF_THREAD_ID_SIZE 16 + +static char *pack_threadid(char *pkt, unsigned char *id) +{ + char *limit; + + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *id++); + + return pkt; +} + +static void int_to_threadref(unsigned char *id, int value) +{ + unsigned char *scan; + int i = 4; + + scan = (unsigned char *)id; + while (i--) + *scan++ = 0; + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} + +static struct task_struct *getthread(struct pt_regs *regs, int tid) +{ + /* + * Non-positive TIDs are remapped idle tasks: + */ + if (tid <= 0) + return idle_task(-tid); + + /* + * find_task_by_pid_ns() does not take the tasklist lock anymore + * but is nicely RCU locked - hence is a pretty resilient + * thing to use: + */ + return find_task_by_pid_ns(tid, &init_pid_ns); +} + +/* + * CPU debug state control: + */ + +#ifdef CONFIG_SMP +static void kgdb_wait(struct pt_regs *regs) +{ + unsigned long flags; + int cpu; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + kgdb_info[cpu].debuggerinfo = regs; + kgdb_info[cpu].task = current; + /* + * Make sure the above info reaches the primary CPU before + * our cpu_in_kgdb[] flag setting does: + */ + smp_wmb(); + atomic_set(&cpu_in_kgdb[cpu], 1); + + /* Wait till primary CPU is done with debugging */ + while (atomic_read(&passive_cpu_wait[cpu])) + cpu_relax(); + + kgdb_info[cpu].debuggerinfo = NULL; + kgdb_info[cpu].task = NULL; + + /* fix up hardware debug registers on local cpu */ + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + + /* Signal the primary CPU that we are done: */ + atomic_set(&cpu_in_kgdb[cpu], 0); + clocksource_touch_watchdog(); + local_irq_restore(flags); +} +#endif + +/* + * Some architectures need cache flushes when we set/clear a + * breakpoint: + */ +static void kgdb_flush_swbreak_addr(unsigned long addr) +{ + if (!CACHE_FLUSH_IS_SAFE) + return; + + if (current->mm && current->mm->mmap_cache) { + flush_cache_range(current->mm->mmap_cache, + addr, addr + BREAK_INSTR_SIZE); + } + /* Force flush instruction cache if it was outside the mm */ + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); +} + +/* + * SW breakpoint management: + */ +static int kgdb_activate_sw_breakpoints(void) +{ + unsigned long addr; + int error = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_SET) + continue; + + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_set_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + return error; + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_ACTIVE; + } + return 0; +} + +static int kgdb_set_sw_break(unsigned long addr) +{ + int err = kgdb_validate_break_address(addr); + int breakno = -1; + int i; + + if (err) + return err; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) + return -EEXIST; + } + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_REMOVED && + kgdb_break[i].bpt_addr == addr) { + breakno = i; + break; + } + } + + if (breakno == -1) { + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_UNDEFINED) { + breakno = i; + break; + } + } + } + + if (breakno == -1) + return -E2BIG; + + kgdb_break[breakno].state = BP_SET; + kgdb_break[breakno].type = BP_BREAKPOINT; + kgdb_break[breakno].bpt_addr = addr; + + return 0; +} + +static int kgdb_deactivate_sw_breakpoints(void) +{ + unsigned long addr; + int error = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + continue; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + return error; + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_SET; + } + return 0; +} + +static int kgdb_remove_sw_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) { + kgdb_break[i].state = BP_REMOVED; + return 0; + } + } + return -ENOENT; +} + +int kgdb_isremovedbreak(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_REMOVED) && + (kgdb_break[i].bpt_addr == addr)) + return 1; + } + return 0; +} + +int remove_all_break(void) +{ + unsigned long addr; + int error; + int i; + + /* Clear memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + goto setundefined; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + addr); +setundefined: + kgdb_break[i].state = BP_UNDEFINED; + } + + /* Clear hardware breakpoints. */ + if (arch_kgdb_ops.remove_all_hw_break) + arch_kgdb_ops.remove_all_hw_break(); + + return 0; +} + +/* + * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: + */ +static inline int shadow_pid(int realpid) +{ + if (realpid) + return realpid; + + return -1-raw_smp_processor_id(); +} + +static char gdbmsgbuf[BUFMAX + 1]; + +static void kgdb_msg_write(const char *s, int len) +{ + char *bufptr; + int wcount; + int i; + + /* 'O'utput */ + gdbmsgbuf[0] = 'O'; + + /* Fill and send buffers... */ + while (len > 0) { + bufptr = gdbmsgbuf + 1; + + /* Calculate how many this time */ + if ((len << 1) > (BUFMAX - 2)) + wcount = (BUFMAX - 2) >> 1; + else + wcount = len; + + /* Pack in hex chars */ + for (i = 0; i < wcount; i++) + bufptr = pack_hex_byte(bufptr, s[i]); + *bufptr = '\0'; + + /* Move up */ + s += wcount; + len -= wcount; + + /* Write packet */ + put_packet(gdbmsgbuf); + } +} + +/* + * Return true if there is a valid kgdb I/O module. Also if no + * debugger is attached a message can be printed to the console about + * waiting for the debugger to attach. + * + * The print_wait argument is only to be true when called from inside + * the core kgdb_handle_exception, because it will wait for the + * debugger to attach. + */ +static int kgdb_io_ready(int print_wait) +{ + if (!kgdb_io_ops) + return 0; + if (kgdb_connected) + return 1; + if (atomic_read(&kgdb_setting_breakpoint)) + return 1; + if (print_wait) + printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); + return 1; +} + +/* + * All the functions that start with gdb_cmd are the various + * operations to implement the handlers for the gdbserial protocol + * where KGDB is communicating with an external debugger + */ + +/* Handle the '?' status packets */ +static void gdb_cmd_status(struct kgdb_state *ks) +{ + /* + * We know that this packet is only sent + * during initial connect. So to be safe, + * we clear out our breakpoints now in case + * GDB is reconnecting. + */ + remove_all_break(); + + remcom_out_buffer[0] = 'S'; + pack_hex_byte(&remcom_out_buffer[1], ks->signo); +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + struct task_struct *thread; + void *local_debuggerinfo; + int i; + + thread = kgdb_usethread; + if (!thread) { + thread = kgdb_info[ks->cpu].task; + local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; + } else { + local_debuggerinfo = NULL; + for (i = 0; i < NR_CPUS; i++) { + /* + * Try to find the task on some other + * or possibly this node if we do not + * find the matching task then we try + * to approximate the results. + */ + if (thread == kgdb_info[i].task) + local_debuggerinfo = kgdb_info[i].debuggerinfo; + } + } + + /* + * All threads that don't have debuggerinfo should be + * in __schedule() sleeping, since all other CPUs + * are in kgdb_wait, and thus have debuggerinfo. + */ + if (local_debuggerinfo) { + pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); + } else { + /* + * Pull stuff saved during switch_to; nothing + * else is accessible (or even particularly + * relevant). + * + * This should be enough for a stack trace. + */ + sleeping_thread_to_gdb_regs(gdb_regs, thread); + } + kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); +} + +/* Handle the 'G' set registers request */ +static void gdb_cmd_setregs(struct kgdb_state *ks) +{ + kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); + + if (kgdb_usethread && kgdb_usethread != current) { + error_packet(remcom_out_buffer, -EINVAL); + } else { + gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); + } +} + +/* Handle the 'm' memory read bytes */ +static void gdb_cmd_memread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long length; + unsigned long addr; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && + kgdb_hex2long(&ptr, &length) > 0) { + err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); + if (err) + error_packet(remcom_out_buffer, err); + } else { + error_packet(remcom_out_buffer, -EINVAL); + } +} + +/* Handle the 'M' memory write bytes */ +static void gdb_cmd_memwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(0); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'X' memory binary write bytes */ +static void gdb_cmd_binwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(1); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'D' or 'k', detach or kill packets */ +static void gdb_cmd_detachkill(struct kgdb_state *ks) +{ + int error; + + /* The detach case */ + if (remcom_in_buffer[0] == 'D') { + error = remove_all_break(); + if (error < 0) { + error_packet(remcom_out_buffer, error); + } else { + strcpy(remcom_out_buffer, "OK"); + kgdb_connected = 0; + } + put_packet(remcom_out_buffer); + } else { + /* + * Assume the kill case, with no exit code checking, + * trying to force detach the debugger: + */ + remove_all_break(); + kgdb_connected = 0; + } +} + +/* Handle the 'R' reboot packets */ +static int gdb_cmd_reboot(struct kgdb_state *ks) +{ + /* For now, only honor R0 */ + if (strcmp(remcom_in_buffer, "R0") == 0) { + printk(KERN_CRIT "Executing emergency reboot\n"); + strcpy(remcom_out_buffer, "OK"); + put_packet(remcom_out_buffer); + + /* + * Execution should not return from + * machine_emergency_restart() + */ + machine_emergency_restart(); + kgdb_connected = 0; + + return 1; + } + return 0; +} + +/* Handle the 'q' query packets */ +static void gdb_cmd_query(struct kgdb_state *ks) +{ + struct task_struct *thread; + unsigned char thref[8]; + char *ptr; + int i; + + switch (remcom_in_buffer[1]) { + case 's': + case 'f': + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + + if (remcom_in_buffer[1] == 'f') + ks->threadid = 1; + + remcom_out_buffer[0] = 'm'; + ptr = remcom_out_buffer + 1; + + for (i = 0; i < 17; ks->threadid++) { + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) { + int_to_threadref(thref, ks->threadid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + i++; + } + } + *(--ptr) = '\0'; + break; + + case 'C': + /* Current thread id */ + strcpy(remcom_out_buffer, "QC"); + ks->threadid = shadow_pid(current->pid); + int_to_threadref(thref, ks->threadid); + pack_threadid(remcom_out_buffer + 2, thref); + break; + case 'T': + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + ks->threadid = 0; + ptr = remcom_in_buffer + 17; + kgdb_hex2long(&ptr, &ks->threadid); + if (!getthread(ks->linux_regs, ks->threadid)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + if (ks->threadid > 0) { + kgdb_mem2hex(getthread(ks->linux_regs, + ks->threadid)->comm, + remcom_out_buffer, 16); + } else { + static char tmpstr[23 + BUF_THREAD_ID_SIZE]; + + sprintf(tmpstr, "Shadow task %d for pid 0", + (int)(-ks->threadid-1)); + kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); + } + break; + } +} + +/* Handle the 'H' task query packets */ +static void gdb_cmd_task(struct kgdb_state *ks) +{ + struct task_struct *thread; + char *ptr; + + switch (remcom_in_buffer[1]) { + case 'g': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_usethread = thread; + ks->kgdb_usethreadid = ks->threadid; + strcpy(remcom_out_buffer, "OK"); + break; + case 'c': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + if (!ks->threadid) { + kgdb_contthread = NULL; + } else { + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_contthread = thread; + } + strcpy(remcom_out_buffer, "OK"); + break; + } +} + +/* Handle the 'T' thread query packets */ +static void gdb_cmd_thread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + struct task_struct *thread; + + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, -EINVAL); +} + +/* Handle the 'z' or 'Z' breakpoint remove or set packets */ +static void gdb_cmd_break(struct kgdb_state *ks) +{ + /* + * Since GDB-5.3, it's been drafted that '0' is a software + * breakpoint, '1' is a hardware breakpoint, so let's do that. + */ + char *bpt_type = &remcom_in_buffer[1]; + char *ptr = &remcom_in_buffer[2]; + unsigned long addr; + unsigned long length; + int error = 0; + + if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { + /* Unsupported */ + if (*bpt_type > '4') + return; + } else { + if (*bpt_type != '0' && *bpt_type != '1') + /* Unsupported. */ + return; + } + + /* + * Test if this is a hardware breakpoint, and + * if we support it: + */ + if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) + /* Unsupported. */ + return; + + if (*(ptr++) != ',') { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (!kgdb_hex2long(&ptr, &addr)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (*(ptr++) != ',' || + !kgdb_hex2long(&ptr, &length)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + + if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') + error = kgdb_set_sw_break(addr); + else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') + error = kgdb_remove_sw_break(addr); + else if (remcom_in_buffer[0] == 'Z') + error = arch_kgdb_ops.set_hw_breakpoint(addr, + (int)length, *bpt_type - '0'); + else if (remcom_in_buffer[0] == 'z') + error = arch_kgdb_ops.remove_hw_breakpoint(addr, + (int) length, *bpt_type - '0'); + + if (error == 0) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, error); +} + +/* Handle the 'C' signal / exception passing packets */ +static int gdb_cmd_exception_pass(struct kgdb_state *ks) +{ + /* C09 == pass exception + * C15 == detach kgdb, pass exception + */ + if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'c'; + + } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'D'; + remove_all_break(); + kgdb_connected = 0; + return 1; + + } else { + error_packet(remcom_out_buffer, -EINVAL); + return 0; + } + + /* Indicate fall through */ + return -1; +} + +/* + * This function performs all gdbserial command procesing + */ +static int gdb_serial_stub(struct kgdb_state *ks) +{ + int error = 0; + int tmp; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + if (kgdb_connected) { + unsigned char thref[8]; + char *ptr; + + /* Reply to host that an exception has occurred */ + ptr = remcom_out_buffer; + *ptr++ = 'T'; + ptr = pack_hex_byte(ptr, ks->signo); + ptr += strlen(strcpy(ptr, "thread:")); + int_to_threadref(thref, shadow_pid(current->pid)); + ptr = pack_threadid(ptr, thref); + *ptr++ = ';'; + put_packet(remcom_out_buffer); + } + + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; + + while (1) { + error = 0; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + get_packet(remcom_in_buffer); + + switch (remcom_in_buffer[0]) { + case '?': /* gdbserial status */ + gdb_cmd_status(ks); + break; + case 'g': /* return the value of the CPU registers */ + gdb_cmd_getregs(ks); + break; + case 'G': /* set the value of the CPU registers - return OK */ + gdb_cmd_setregs(ks); + break; + case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + gdb_cmd_memread(ks); + break; + case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_memwrite(ks); + break; + case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_binwrite(ks); + break; + /* kill or detach. KGDB should treat this like a + * continue. + */ + case 'D': /* Debugger detach */ + case 'k': /* Debugger detach via kill */ + gdb_cmd_detachkill(ks); + goto default_handle; + case 'R': /* Reboot */ + if (gdb_cmd_reboot(ks)) + goto default_handle; + break; + case 'q': /* query command */ + gdb_cmd_query(ks); + break; + case 'H': /* task related */ + gdb_cmd_task(ks); + break; + case 'T': /* Query thread status */ + gdb_cmd_thread(ks); + break; + case 'z': /* Break point remove */ + case 'Z': /* Break point set */ + gdb_cmd_break(ks); + break; + case 'C': /* Exception passing */ + tmp = gdb_cmd_exception_pass(ks); + if (tmp > 0) + goto default_handle; + if (tmp == 0) + break; + /* Fall through on tmp < 0 */ + case 'c': /* Continue packet */ + case 's': /* Single step packet */ + if (kgdb_contthread && kgdb_contthread != current) { + /* Can't switch threads in kgdb */ + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_activate_sw_breakpoints(); + /* Fall through to default processing */ + default: +default_handle: + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + /* + * Leave cmd processing on error, detach, + * kill, continue, or single step. + */ + if (error >= 0 || remcom_in_buffer[0] == 'D' || + remcom_in_buffer[0] == 'k') { + error = 0; + goto kgdb_exit; + } + + } + + /* reply to the request */ + put_packet(remcom_out_buffer); + } + +kgdb_exit: + if (ks->pass_exception) + error = 1; + return error; +} + +static int kgdb_reenter_check(struct kgdb_state *ks) +{ + unsigned long addr; + + if (atomic_read(&kgdb_active) != raw_smp_processor_id()) + return 0; + + /* Panic on recursive debugger calls: */ + exception_level++; + addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + kgdb_deactivate_sw_breakpoints(); + + /* + * If the break point removed ok at the place exception + * occurred, try to recover and print a warning to the end + * user because the user planted a breakpoint in a place that + * KGDB needs in order to function. + */ + if (kgdb_remove_sw_break(addr) == 0) { + exception_level = 0; + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + kgdb_activate_sw_breakpoints(); + printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", + addr); + WARN_ON_ONCE(1); + + return 1; + } + remove_all_break(); + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + + if (exception_level > 1) { + dump_stack(); + panic("Recursive entry to debugger"); + } + + printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); + dump_stack(); + panic("Recursive entry to debugger"); + + return 1; +} + +/* + * kgdb_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + * interface locks, if any (begin_session) + * kgdb lock (kgdb_active) + */ +int +kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + unsigned long flags; + int error = 0; + int i, cpu; + + ks->cpu = raw_smp_processor_id(); + ks->ex_vector = evector; + ks->signo = signo; + ks->ex_vector = evector; + ks->err_code = ecode; + ks->kgdb_usethreadid = 0; + ks->linux_regs = regs; + + if (kgdb_reenter_check(ks)) + return 0; /* Ouch, double exception ! */ + +acquirelock: + /* + * Interrupts will be restored by the 'trap return' code, except when + * single stepping. + */ + local_irq_save(flags); + + cpu = raw_smp_processor_id(); + + /* + * Acquire the kgdb_active lock: + */ + while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) + cpu_relax(); + + /* + * Do not start the debugger connection on this CPU if the last + * instance of the exception handler wanted to come into the + * debugger on a different CPU via a single step + */ + if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && + atomic_read(&kgdb_cpu_doing_single_step) != cpu) { + + atomic_set(&kgdb_active, -1); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + goto acquirelock; + } + + if (!kgdb_io_ready(1)) { + error = 1; + goto kgdb_restore; /* No I/O connection, so resume the system */ + } + + /* + * Don't enter if we have hit a removed breakpoint. + */ + if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) + goto kgdb_restore; + + /* Call the I/O driver's pre_exception routine */ + if (kgdb_io_ops->pre_exception) + kgdb_io_ops->pre_exception(); + + kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; + kgdb_info[ks->cpu].task = current; + + kgdb_disable_hw_debug(ks->linux_regs); + + /* + * Get the passive CPU lock which will hold all the non-primary + * CPU in a spin state while the debugger is active + */ + if (!kgdb_single_step || !kgdb_contthread) { + for (i = 0; i < NR_CPUS; i++) + atomic_set(&passive_cpu_wait[i], 1); + } + + /* + * spin_lock code is good enough as a barrier so we don't + * need one here: + */ + atomic_set(&cpu_in_kgdb[ks->cpu], 1); + +#ifdef CONFIG_SMP + /* Signal the other CPUs to enter kgdb_wait() */ + if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + kgdb_roundup_cpus(flags); +#endif + + /* + * Wait for the other CPUs to be notified and be waiting for us: + */ + for_each_online_cpu(i) { + while (!atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + + /* + * At this point the primary processor is completely + * in the debugger and all secondary CPUs are quiescent + */ + kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); + kgdb_deactivate_sw_breakpoints(); + kgdb_single_step = 0; + kgdb_contthread = NULL; + exception_level = 0; + + /* Talk to debugger with gdbserial protocol */ + error = gdb_serial_stub(ks); + + /* Call the I/O driver's post_exception routine */ + if (kgdb_io_ops->post_exception) + kgdb_io_ops->post_exception(); + + kgdb_info[ks->cpu].debuggerinfo = NULL; + kgdb_info[ks->cpu].task = NULL; + atomic_set(&cpu_in_kgdb[ks->cpu], 0); + + if (!kgdb_single_step || !kgdb_contthread) { + for (i = NR_CPUS-1; i >= 0; i--) + atomic_set(&passive_cpu_wait[i], 0); + /* + * Wait till all the CPUs have quit + * from the debugger. + */ + for_each_online_cpu(i) { + while (atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + } + +kgdb_restore: + /* Free kgdb_active */ + atomic_set(&kgdb_active, -1); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + return error; +} + +int kgdb_nmicallback(int cpu, void *regs) +{ +#ifdef CONFIG_SMP + if (!atomic_read(&cpu_in_kgdb[cpu]) && + atomic_read(&kgdb_active) != cpu && + atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { + kgdb_wait((struct pt_regs *)regs); + return 0; + } +#endif + return 1; +} + +void kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + unsigned long flags; + + /* If we're debugging, or KGDB has not connected, don't try + * and print. */ + if (!kgdb_connected || atomic_read(&kgdb_active) != -1) + return; + + local_irq_save(flags); + kgdb_msg_write(s, count); + local_irq_restore(flags); +} + +static struct console kgdbcons = { + .name = "kgdb", + .write = kgdb_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED, + .index = -1, +}; + +#ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handle_gdb(int key, struct tty_struct *tty) +{ + if (!kgdb_io_ops) { + printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + return; + } + if (!kgdb_connected) + printk(KERN_CRIT "Entering KGDB\n"); + + kgdb_breakpoint(); +} + +static struct sysrq_key_op sysrq_gdb_op = { + .handler = sysrq_handle_gdb, + .help_msg = "Gdb", + .action_msg = "GDB", +}; +#endif + +static void kgdb_register_callbacks(void) +{ + if (!kgdb_io_module_registered) { + kgdb_io_module_registered = 1; + kgdb_arch_init(); +#ifdef CONFIG_MAGIC_SYSRQ + register_sysrq_key('g', &sysrq_gdb_op); +#endif + if (kgdb_use_con && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + } +} + +static void kgdb_unregister_callbacks(void) +{ + /* + * When this routine is called KGDB should unregister from the + * panic handler and clean up, making sure it is not handling any + * break exceptions at the time. + */ + if (kgdb_io_module_registered) { + kgdb_io_module_registered = 0; + kgdb_arch_exit(); +#ifdef CONFIG_MAGIC_SYSRQ + unregister_sysrq_key('g', &sysrq_gdb_op); +#endif + if (kgdb_con_registered) { + unregister_console(&kgdbcons); + kgdb_con_registered = 0; + } + } +} + +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + +/** + * kgdb_register_io_module - register KGDB IO module + * @new_kgdb_io_ops: the io ops vector + * + * Register it with the KGDB core. + */ +int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) +{ + int err; + + spin_lock(&kgdb_registration_lock); + + if (kgdb_io_ops) { + spin_unlock(&kgdb_registration_lock); + + printk(KERN_ERR "kgdb: Another I/O driver is already " + "registered with KGDB.\n"); + return -EBUSY; + } + + if (new_kgdb_io_ops->init) { + err = new_kgdb_io_ops->init(); + if (err) { + spin_unlock(&kgdb_registration_lock); + return err; + } + } + + kgdb_io_ops = new_kgdb_io_ops; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", + new_kgdb_io_ops->name); + + /* Arm KGDB now. */ + kgdb_register_callbacks(); + + if (kgdb_break_asap) + kgdb_initial_breakpoint(); + + return 0; +} +EXPORT_SYMBOL_GPL(kgdb_register_io_module); + +/** + * kkgdb_unregister_io_module - unregister KGDB IO module + * @old_kgdb_io_ops: the io ops vector + * + * Unregister it with the KGDB core. + */ +void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) +{ + BUG_ON(kgdb_connected); + + /* + * KGDB is no longer able to communicate out, so + * unregister our callbacks and reset state. + */ + kgdb_unregister_callbacks(); + + spin_lock(&kgdb_registration_lock); + + WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); + kgdb_io_ops = NULL; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO + "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + old_kgdb_io_ops->name); +} +EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); + +/** + * kgdb_breakpoint - generate breakpoint exception + * + * This function will generate a breakpoint exception. It is used at the + * beginning of a program to sync up with a debugger and can be used + * otherwise as a quick means to stop program execution and "break" into + * the debugger. + */ +void kgdb_breakpoint(void) +{ + atomic_set(&kgdb_setting_breakpoint, 1); + wmb(); /* Sync point before breakpoint */ + arch_kgdb_breakpoint(); + wmb(); /* Sync point after breakpoint */ + atomic_set(&kgdb_setting_breakpoint, 0); +} +EXPORT_SYMBOL_GPL(kgdb_breakpoint); + +static int __init opt_kgdb_wait(char *str) +{ + kgdb_break_asap = 1; + + if (kgdb_io_module_registered) + kgdb_initial_breakpoint(); + + return 0; +} + +early_param("kgdbwait", opt_kgdb_wait); diff --git a/kernel/kmod.c b/kernel/kmod.c index 22be3ff3f36..e2764047ec0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -165,7 +165,7 @@ static int ____call_usermodehelper(void *data) } /* We can run anywhere, unlike our parent keventd(). */ - set_cpus_allowed(current, CPU_MASK_ALL); + set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); /* * Our parent is keventd, which runs with elevated scheduling priority. diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7a86e643233..1e0250cb948 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -72,6 +72,18 @@ DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +/* + * Normally, functions that we'd want to prohibit kprobes in, are marked + * __kprobes. But, there are cases where such functions already belong to + * a different section (__sched for preempt_schedule) + * + * For such cases, we now have a blacklist + */ +struct kprobe_blackpoint kprobe_blacklist[] = { + {"preempt_schedule",}, + {NULL} /* Terminator */ +}; + #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* * kprobe->ainsn.insn points to the copy of the instruction to be @@ -417,6 +429,21 @@ static inline void free_rp_inst(struct kretprobe *rp) } } +static void __kprobes cleanup_rp_inst(struct kretprobe *rp) +{ + unsigned long flags; + struct kretprobe_instance *ri; + struct hlist_node *pos, *next; + /* No race here */ + spin_lock_irqsave(&kretprobe_lock, flags); + hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { + ri->rp = NULL; + hlist_del(&ri->uflist); + } + spin_unlock_irqrestore(&kretprobe_lock, flags); + free_rp_inst(rp); +} + /* * Keep all fields in the kprobe consistent */ @@ -492,33 +519,55 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, static int __kprobes in_kprobes_functions(unsigned long addr) { + struct kprobe_blackpoint *kb; + if (addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end) return -EINVAL; + /* + * If there exists a kprobe_blacklist, verify and + * fail any probe registration in the prohibited area + */ + for (kb = kprobe_blacklist; kb->name != NULL; kb++) { + if (kb->start_addr) { + if (addr >= kb->start_addr && + addr < (kb->start_addr + kb->range)) + return -EINVAL; + } + } return 0; } +/* + * If we have a symbol_name argument, look it up and add the offset field + * to it. This way, we can specify a relative address to a symbol. + */ +static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) +{ + kprobe_opcode_t *addr = p->addr; + if (p->symbol_name) { + if (addr) + return NULL; + kprobe_lookup_name(p->symbol_name, addr); + } + + if (!addr) + return NULL; + return (kprobe_opcode_t *)(((char *)addr) + p->offset); +} + static int __kprobes __register_kprobe(struct kprobe *p, unsigned long called_from) { int ret = 0; struct kprobe *old_p; struct module *probed_mod; + kprobe_opcode_t *addr; - /* - * If we have a symbol_name argument look it up, - * and add it to the address. That way the addr - * field can either be global or relative to a symbol. - */ - if (p->symbol_name) { - if (p->addr) - return -EINVAL; - kprobe_lookup_name(p->symbol_name, p->addr); - } - - if (!p->addr) + addr = kprobe_addr(p); + if (!addr) return -EINVAL; - p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); + p->addr = addr; if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) @@ -546,6 +595,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, } p->nmissed = 0; + INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); if (old_p) { @@ -572,35 +622,28 @@ out: return ret; } -int __kprobes register_kprobe(struct kprobe *p) -{ - return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); -} - -void __kprobes unregister_kprobe(struct kprobe *p) +/* + * Unregister a kprobe without a scheduler synchronization. + */ +static int __kprobes __unregister_kprobe_top(struct kprobe *p) { - struct module *mod; struct kprobe *old_p, *list_p; - int cleanup_p; - mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) { - mutex_unlock(&kprobe_mutex); - return; - } + if (unlikely(!old_p)) + return -EINVAL; + if (p != old_p) { list_for_each_entry_rcu(list_p, &old_p->list, list) if (list_p == p) /* kprobe p is a valid probe */ goto valid_p; - mutex_unlock(&kprobe_mutex); - return; + return -EINVAL; } valid_p: if (old_p == p || (old_p->pre_handler == aggr_pre_handler && - p->list.next == &old_p->list && p->list.prev == &old_p->list)) { + list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are * enabled - otherwise, the breakpoint would already have @@ -609,43 +652,97 @@ valid_p: if (kprobe_enabled) arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); - cleanup_p = 1; } else { + if (p->break_handler) + old_p->break_handler = NULL; + if (p->post_handler) { + list_for_each_entry_rcu(list_p, &old_p->list, list) { + if ((list_p != p) && (list_p->post_handler)) + goto noclean; + } + old_p->post_handler = NULL; + } +noclean: list_del_rcu(&p->list); - cleanup_p = 0; } + return 0; +} - mutex_unlock(&kprobe_mutex); +static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) +{ + struct module *mod; + struct kprobe *old_p; - synchronize_sched(); if (p->mod_refcounted) { mod = module_text_address((unsigned long)p->addr); if (mod) module_put(mod); } - if (cleanup_p) { - if (p != old_p) { - list_del_rcu(&p->list); + if (list_empty(&p->list) || list_is_singular(&p->list)) { + if (!list_empty(&p->list)) { + /* "p" is the last child of an aggr_kprobe */ + old_p = list_entry(p->list.next, struct kprobe, list); + list_del(&p->list); kfree(old_p); } arch_remove_kprobe(p); - } else { - mutex_lock(&kprobe_mutex); - if (p->break_handler) - old_p->break_handler = NULL; - if (p->post_handler){ - list_for_each_entry_rcu(list_p, &old_p->list, list){ - if (list_p->post_handler){ - cleanup_p = 2; - break; - } - } - if (cleanup_p == 0) - old_p->post_handler = NULL; + } +} + +static int __register_kprobes(struct kprobe **kps, int num, + unsigned long called_from) +{ + int i, ret = 0; + + if (num <= 0) + return -EINVAL; + for (i = 0; i < num; i++) { + ret = __register_kprobe(kps[i], called_from); + if (ret < 0 && i > 0) { + unregister_kprobes(kps, i); + break; } - mutex_unlock(&kprobe_mutex); } + return ret; +} + +/* + * Registration and unregistration functions for kprobe. + */ +int __kprobes register_kprobe(struct kprobe *p) +{ + return __register_kprobes(&p, 1, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kprobe(struct kprobe *p) +{ + unregister_kprobes(&p, 1); +} + +int __kprobes register_kprobes(struct kprobe **kps, int num) +{ + return __register_kprobes(kps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kprobes(struct kprobe **kps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(kps[i]) < 0) + kps[i]->addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) + if (kps[i]->addr) + __unregister_kprobe_bottom(kps[i]); } static struct notifier_block kprobe_exceptions_nb = { @@ -658,28 +755,72 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -int __kprobes register_jprobe(struct jprobe *jp) +static int __register_jprobes(struct jprobe **jps, int num, + unsigned long called_from) { - unsigned long addr = arch_deref_entry_point(jp->entry); + struct jprobe *jp; + int ret = 0, i; - if (!kernel_text_address(addr)) + if (num <= 0) return -EINVAL; + for (i = 0; i < num; i++) { + unsigned long addr; + jp = jps[i]; + addr = arch_deref_entry_point(jp->entry); + + if (!kernel_text_address(addr)) + ret = -EINVAL; + else { + /* Todo: Verify probepoint is a function entry point */ + jp->kp.pre_handler = setjmp_pre_handler; + jp->kp.break_handler = longjmp_break_handler; + ret = __register_kprobe(&jp->kp, called_from); + } + if (ret < 0 && i > 0) { + unregister_jprobes(jps, i); + break; + } + } + return ret; +} - /* Todo: Verify probepoint is a function entry point */ - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - - return __register_kprobe(&jp->kp, +int __kprobes register_jprobe(struct jprobe *jp) +{ + return __register_jprobes(&jp, 1, (unsigned long)__builtin_return_address(0)); } void __kprobes unregister_jprobe(struct jprobe *jp) { - unregister_kprobe(&jp->kp); + unregister_jprobes(&jp, 1); } -#ifdef ARCH_SUPPORTS_KRETPROBES +int __kprobes register_jprobes(struct jprobe **jps, int num) +{ + return __register_jprobes(jps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_jprobes(struct jprobe **jps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(&jps[i]->kp) < 0) + jps[i]->kp.addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) { + if (jps[i]->kp.addr) + __unregister_kprobe_bottom(&jps[i]->kp); + } +} +#ifdef CONFIG_KRETPROBES /* * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. @@ -717,17 +858,18 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, return 0; } -int __kprobes register_kretprobe(struct kretprobe *rp) +static int __kprobes __register_kretprobe(struct kretprobe *rp, + unsigned long called_from) { int ret = 0; struct kretprobe_instance *inst; int i; - void *addr = rp->kp.addr; + void *addr; if (kretprobe_blacklist_size) { - if (addr == NULL) - kprobe_lookup_name(rp->kp.symbol_name, addr); - addr += rp->kp.offset; + addr = kprobe_addr(&rp->kp); + if (!addr) + return -EINVAL; for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { if (kretprobe_blacklist[i].addr == addr) @@ -763,48 +905,101 @@ int __kprobes register_kretprobe(struct kretprobe *rp) rp->nmissed = 0; /* Establish function entry probe point */ - if ((ret = __register_kprobe(&rp->kp, - (unsigned long)__builtin_return_address(0))) != 0) + ret = __register_kprobe(&rp->kp, called_from); + if (ret != 0) free_rp_inst(rp); return ret; } -#else /* ARCH_SUPPORTS_KRETPROBES */ +static int __register_kretprobes(struct kretprobe **rps, int num, + unsigned long called_from) +{ + int ret = 0, i; + + if (num <= 0) + return -EINVAL; + for (i = 0; i < num; i++) { + ret = __register_kretprobe(rps[i], called_from); + if (ret < 0 && i > 0) { + unregister_kretprobes(rps, i); + break; + } + } + return ret; +} int __kprobes register_kretprobe(struct kretprobe *rp) { - return -ENOSYS; + return __register_kretprobes(&rp, 1, + (unsigned long)__builtin_return_address(0)); } -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) +void __kprobes unregister_kretprobe(struct kretprobe *rp) { - return 0; + unregister_kretprobes(&rp, 1); } -#endif /* ARCH_SUPPORTS_KRETPROBES */ +int __kprobes register_kretprobes(struct kretprobe **rps, int num) +{ + return __register_kretprobes(rps, num, + (unsigned long)__builtin_return_address(0)); +} -void __kprobes unregister_kretprobe(struct kretprobe *rp) +void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) { - unsigned long flags; - struct kretprobe_instance *ri; - struct hlist_node *pos, *next; + int i; - unregister_kprobe(&rp->kp); + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(&rps[i]->kp) < 0) + rps[i]->kp.addr = NULL; + mutex_unlock(&kprobe_mutex); - /* No race here */ - spin_lock_irqsave(&kretprobe_lock, flags); - hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { - ri->rp = NULL; - hlist_del(&ri->uflist); + synchronize_sched(); + for (i = 0; i < num; i++) { + if (rps[i]->kp.addr) { + __unregister_kprobe_bottom(&rps[i]->kp); + cleanup_rp_inst(rps[i]); + } } - spin_unlock_irqrestore(&kretprobe_lock, flags); - free_rp_inst(rp); } +#else /* CONFIG_KRETPROBES */ +int __kprobes register_kretprobe(struct kretprobe *rp) +{ + return -ENOSYS; +} + +int __kprobes register_kretprobes(struct kretprobe **rps, int num) +{ + return -ENOSYS; +} +void __kprobes unregister_kretprobe(struct kretprobe *rp) +{ +} + +void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +{ +} + +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) +{ + return 0; +} + +#endif /* CONFIG_KRETPROBES */ + static int __init init_kprobes(void) { int i, err = 0; + unsigned long offset = 0, size = 0; + char *modname, namebuf[128]; + const char *symbol_name; + void *addr; + struct kprobe_blackpoint *kb; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ @@ -813,6 +1008,28 @@ static int __init init_kprobes(void) INIT_HLIST_HEAD(&kretprobe_inst_table[i]); } + /* + * Lookup and populate the kprobe_blacklist. + * + * Unlike the kretprobe blacklist, we'll need to determine + * the range of addresses that belong to the said functions, + * since a kprobe need not necessarily be at the beginning + * of a function. + */ + for (kb = kprobe_blacklist; kb->name != NULL; kb++) { + kprobe_lookup_name(kb->name, addr); + if (!addr) + continue; + + kb->start_addr = (unsigned long)addr; + symbol_name = kallsyms_lookup(kb->start_addr, + &size, &offset, &modname, namebuf); + if (!symbol_name) + kb->range = 0; + else + kb->range = size; + } + if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { @@ -1060,8 +1277,12 @@ module_init(init_kprobes); EXPORT_SYMBOL_GPL(register_kprobe); EXPORT_SYMBOL_GPL(unregister_kprobe); +EXPORT_SYMBOL_GPL(register_kprobes); +EXPORT_SYMBOL_GPL(unregister_kprobes); EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); +EXPORT_SYMBOL_GPL(register_jprobes); +EXPORT_SYMBOL_GPL(unregister_jprobes); #ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(jprobe_return); #endif @@ -1069,4 +1290,6 @@ EXPORT_SYMBOL_GPL(jprobe_return); #ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(register_kretprobe); EXPORT_SYMBOL_GPL(unregister_kretprobe); +EXPORT_SYMBOL_GPL(register_kretprobes); +EXPORT_SYMBOL_GPL(unregister_kretprobes); #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index 0ac887882f9..92cf6930ab5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,6 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> -#include <asm/semaphore.h> #define KTHREAD_NICE_LEVEL (-5) @@ -180,6 +179,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) wait_task_inactive(k); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); + k->rt.nr_cpus_allowed = 1; } EXPORT_SYMBOL(kthread_bind); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index b4e3c85abe7..7c74dab0d21 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record return; for (i = 0; i < MAXLR; i++) { - int q; - int same = 1; + int q, same = 1; + /* Nothing stored: */ if (!latency_record[i].backtrace[0]) { if (firstnonnull > i) @@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record continue; } for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { - if (latency_record[i].backtrace[q] != - lat->backtrace[q]) + unsigned long record = lat->backtrace[q]; + + if (latency_record[i].backtrace[q] != record) { same = 0; - if (same && lat->backtrace[q] == 0) break; - if (same && lat->backtrace[q] == ULONG_MAX) + } + + /* 0 and ULONG_MAX entries mean end of backtrace: */ + if (record == 0 || record == ULONG_MAX) break; } if (same) { @@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) for (i = 0; i < LT_SAVECOUNT ; i++) { struct latency_record *mylat; int same = 1; + mylat = &tsk->latency_record[i]; for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { - if (mylat->backtrace[q] != - lat.backtrace[q]) + unsigned long record = lat.backtrace[q]; + + if (mylat->backtrace[q] != record) { same = 0; - if (same && lat.backtrace[q] == 0) break; - if (same && lat.backtrace[q] == ULONG_MAX) + } + + /* 0 and ULONG_MAX entries mean end of backtrace: */ + if (record == 0 || record == ULONG_MAX) break; } if (same) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3574379f4d6..81a4e4a3f08 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * parallel walking of the hash-list safe: */ list_add_tail_rcu(&class->hash_entry, hash_head); + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; break; case LOCK_USED: - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); debug_atomic_dec(&nr_unused_locks); break; default: diff --git a/kernel/marker.c b/kernel/marker.c index c4c2cd8b61f..005b9595459 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -61,8 +61,8 @@ struct marker_entry { int refcount; /* Number of times armed. 0 if disarmed. */ struct rcu_head rcu; void *oldptr; - char rcu_pending:1; - char ptype:1; + unsigned char rcu_pending:1; + unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -104,18 +104,18 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, char ptype; /* - * disabling preemption to make sure the teardown of the callbacks can - * be done correctly when they are in modules and they insure RCU read - * coherency. + * preempt_disable does two things : disabling preemption to make sure + * the teardown of the callbacks can be done correctly when they are in + * modules and they insure RCU read coherency. */ preempt_disable(); - ptype = ACCESS_ONCE(mdata->ptype); + ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; /* Must read the ptype before ptr. They are not data dependant, * so we put an explicit smp_rmb() here. */ smp_rmb(); - func = ACCESS_ONCE(mdata->single.func); + func = mdata->single.func; /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); @@ -133,7 +133,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = ACCESS_ONCE(mdata->multi); + multi = mdata->multi; for (i = 0; multi[i].func; i++) { va_start(args, fmt); multi[i].func(multi[i].probe_private, call_private, fmt, @@ -161,13 +161,13 @@ void marker_probe_cb_noarg(const struct marker *mdata, char ptype; preempt_disable(); - ptype = ACCESS_ONCE(mdata->ptype); + ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; /* Must read the ptype before ptr. They are not data dependant, * so we put an explicit smp_rmb() here. */ smp_rmb(); - func = ACCESS_ONCE(mdata->single.func); + func = mdata->single.func; /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); @@ -183,7 +183,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = ACCESS_ONCE(mdata->multi); + multi = mdata->multi; for (i = 0; multi[i].func; i++) multi[i].func(multi[i].probe_private, call_private, fmt, &args); @@ -551,9 +551,9 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, /* * Disable a marker and its probe callback. - * Note: only after a synchronize_sched() issued after setting elem->call to the - * empty function insures that the original callback is not used anymore. This - * insured by preemption disabling around the call site. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. */ static void disable_marker(struct marker *elem) { @@ -565,8 +565,8 @@ static void disable_marker(struct marker *elem) elem->ptype = 0; /* single probe */ /* * Leave the private data and id there, because removal is racy and - * should be done only after a synchronize_sched(). These are never used - * until the next initialization anyway. + * should be done only after an RCU period. These are never used until + * the next initialization anyway. */ } @@ -601,9 +601,6 @@ void marker_update_probe_range(struct marker *begin, /* * Update probes, removing the faulty probes. - * Issues a synchronize_sched() when no reference to the module passed - * as parameter is found in the probes so the probe module can be - * safely unloaded from now on. * * Internal callback only changed before the first probe is connected to it. * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 @@ -674,6 +671,9 @@ int marker_probe_register(const char *name, const char *format, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif call_rcu(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); @@ -698,14 +698,12 @@ int marker_probe_unregister(const char *name, { struct marker_entry *entry; struct marker_probe_closure *old; - int ret = 0; + int ret = -ENOENT; mutex_lock(&markers_mutex); entry = get_marker(name); - if (!entry) { - ret = -ENOENT; + if (!entry) goto end; - } if (entry->rcu_pending) rcu_barrier(); old = marker_entry_remove_probe(entry, probe, probe_private); @@ -713,12 +711,18 @@ int marker_probe_unregister(const char *name, marker_update_probes(); /* may update entry */ mutex_lock(&markers_mutex); entry = get_marker(name); + if (!entry) + goto end; entry->oldptr = old; entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif call_rcu(&entry->rcu, free_old_closure); remove_marker(name); /* Ignore busy error message */ + ret = 0; end: mutex_unlock(&markers_mutex); return ret; @@ -794,6 +798,9 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif call_rcu(&entry->rcu, free_old_closure); remove_marker(entry->name); /* Ignore busy error message */ end: diff --git a/kernel/module.c b/kernel/module.c index 901cd6ac2f1..8d6cccc6c3c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -43,7 +43,6 @@ #include <linux/mutex.h> #include <linux/unwind.h> #include <asm/uaccess.h> -#include <asm/semaphore.h> #include <asm/cacheflush.h> #include <linux/license.h> #include <asm/sections.h> @@ -664,7 +663,7 @@ static void free_module(struct module *mod); static void wait_for_zero_refcount(struct module *mod) { - /* Since we might sleep for some time, drop the semaphore first */ + /* Since we might sleep for some time, release the mutex first */ mutex_unlock(&module_mutex); for (;;) { DEBUGP("Looking at refcount...\n"); @@ -1933,8 +1932,15 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + /* + * ndiswrapper is under GPL by itself, but loads proprietary modules. + * Don't use add_taint_module(), as it would prevent ndiswrapper from + * using GPL-only symbols it needs. + */ if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint(TAINT_PROPRIETARY_MODULE); + + /* driverloader was caught wrongly pretending to be under GPL */ if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); @@ -2171,10 +2177,20 @@ sys_init_module(void __user *umod, wake_up(&module_wq); return ret; } + if (ret > 0) { + printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " + "it should follow 0/-E convention\n" + KERN_WARNING "%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); + dump_stack(); + } - /* Now it's a first class citizen! */ - mutex_lock(&module_mutex); + /* Now it's a first class citizen! Wake up anyone waiting for it. */ mod->state = MODULE_STATE_LIVE; + wake_up(&module_wq); + + mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); unwind_remove_table(mod->unwind_info, 1); @@ -2183,7 +2199,6 @@ sys_init_module(void __user *umod, mod->init_size = 0; mod->init_text_size = 0; mutex_unlock(&module_mutex); - wake_up(&module_wq); return 0; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6d792b66d85..5ca37fa50be 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level) atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); for (i = 1; i < PIDMAP_ENTRIES; i++) { - ns->pidmap[i].page = 0; + ns->pidmap[i].page = NULL; atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 2eae91f954c..ae5c6c147c4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1087,45 +1087,45 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { - prof_expires = t->expires.cpu; + if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { + prof_expires = tl->expires.cpu; break; } - t->firing = 1; - list_move_tail(&t->entry, firing); + tl->firing = 1; + list_move_tail(&tl->entry, firing); } ++timers; maxfire = 20; virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { - virt_expires = t->expires.cpu; + if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { + virt_expires = tl->expires.cpu; break; } - t->firing = 1; - list_move_tail(&t->entry, firing); + tl->firing = 1; + list_move_tail(&tl->entry, firing); } ++timers; maxfire = 20; sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sum_sched_runtime < t->expires.sched) { - sched_expires = t->expires.sched; + if (!--maxfire || sum_sched_runtime < tl->expires.sched) { + sched_expires = tl->expires.sched; break; } - t->firing = 1; - list_move_tail(&t->entry, firing); + tl->firing = 1; + list_move_tail(&tl->entry, firing); } /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a9b04203a66..8476956ffd9 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -37,7 +37,6 @@ #include <linux/mutex.h> #include <asm/uaccess.h> -#include <asm/semaphore.h> #include <linux/list.h> #include <linux/init.h> #include <linux/compiler.h> diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 79833170bb9..b45da40e8d2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -19,16 +19,6 @@ config PM will issue the hlt instruction if nothing is to be done, thereby sending the processor to sleep and saving power. -config PM_LEGACY - bool "Legacy Power Management API (DEPRECATED)" - depends on PM - default n - ---help--- - Support for pm_register() and friends. This old API is obsoleted - by the driver model. - - If unsure, say N. - config PM_DEBUG bool "Power Management Debug Support" depends on PM @@ -190,7 +180,7 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read <file:Documentation/pm.txt> and the + and more information, read <file:Documentation/power/pm.txt> and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f7dfff28ecd..597823b5b70 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -4,7 +4,6 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := main.o -obj-$(CONFIG_PM_LEGACY) += pm.o obj-$(CONFIG_PM_SLEEP) += process.o console.o obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o diff --git a/kernel/power/console.c b/kernel/power/console.c index 89bcf4973ee..b8628be2a46 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -7,17 +7,39 @@ #include <linux/vt_kern.h> #include <linux/kbd_kern.h> #include <linux/console.h> +#include <linux/module.h> #include "power.h" #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static int disable_vt_switch; + +/* + * Normally during a suspend, we allocate a new console and switch to it. + * When we resume, we switch back to the original console. This switch + * can be slow, so on systems where the framebuffer can handle restoration + * of video registers anyways, there's little point in doing the console + * switch. This function allows you to disable it by passing it '0'. + */ +void pm_set_vt_switch(int do_switch) +{ + acquire_console_sem(); + disable_vt_switch = !do_switch; + release_console_sem(); +} +EXPORT_SYMBOL(pm_set_vt_switch); int pm_prepare_console(void) { acquire_console_sem(); + if (disable_vt_switch) { + release_console_sem(); + return 0; + } + orig_fgconsole = fg_console; if (vc_allocate(SUSPEND_CONSOLE)) { @@ -50,9 +72,12 @@ int pm_prepare_console(void) void pm_restore_console(void) { acquire_console_sem(); + if (disable_vt_switch) { + release_console_sem(); + return; + } set_console(orig_fgconsole); release_console_sem(); kmsg_redirect = orig_kmsg; - return; } #endif diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 859a8e59773..14a656cdc65 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -391,7 +391,7 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); - error = device_suspend(PMSG_SUSPEND); + error = device_suspend(PMSG_HIBERNATE); if (error) goto Resume_console; @@ -404,7 +404,7 @@ int hibernation_platform_enter(void) goto Finish; local_irq_disable(); - error = device_power_down(PMSG_SUSPEND); + error = device_power_down(PMSG_HIBERNATE); if (!error) { hibernation_ops->enter(); /* We should never get here */ diff --git a/kernel/power/pm.c b/kernel/power/pm.c deleted file mode 100644 index 60c73fa670d..00000000000 --- a/kernel/power/pm.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * pm.c - Power management interface - * - * Copyright (C) 2000 Andrew Henroid - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/pm.h> -#include <linux/pm_legacy.h> -#include <linux/interrupt.h> -#include <linux/mutex.h> - -/* - * Locking notes: - * pm_devs_lock can be a semaphore providing pm ops are not called - * from an interrupt handler (already a bad idea so no change here). Each - * change must be protected so that an unlink of an entry doesn't clash - * with a pm send - which is permitted to sleep in the current architecture - * - * Module unloads clashing with pm events now work out safely, the module - * unload path will block until the event has been sent. It may well block - * until a resume but that will be fine. - */ - -static DEFINE_MUTEX(pm_devs_lock); -static LIST_HEAD(pm_devs); - -/** - * pm_register - register a device with power management - * @type: device type - * @id: device ID - * @callback: callback function - * - * Add a device to the list of devices that wish to be notified about - * power management events. A &pm_dev structure is returned on success, - * on failure the return is %NULL. - * - * The callback function will be called in process context and - * it may sleep. - */ - -struct pm_dev *pm_register(pm_dev_t type, - unsigned long id, - pm_callback callback) -{ - struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); - if (dev) { - dev->type = type; - dev->id = id; - dev->callback = callback; - - mutex_lock(&pm_devs_lock); - list_add(&dev->entry, &pm_devs); - mutex_unlock(&pm_devs_lock); - } - return dev; -} - -/** - * pm_send - send request to a single device - * @dev: device to send to - * @rqst: power management request - * @data: data for the callback - * - * Issue a power management request to a given device. The - * %PM_SUSPEND and %PM_RESUME events are handled specially. The - * data field must hold the intended next state. No call is made - * if the state matches. - * - * BUGS: what stops two power management requests occurring in parallel - * and conflicting. - * - * WARNING: Calling pm_send directly is not generally recommended, in - * particular there is no locking against the pm_dev going away. The - * caller must maintain all needed locking or have 'inside knowledge' - * on the safety. Also remember that this function is not locked against - * pm_unregister. This means that you must handle SMP races on callback - * execution and unload yourself. - */ - -static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data) -{ - int status = 0; - unsigned long prev_state, next_state; - - if (in_interrupt()) - BUG(); - - switch (rqst) { - case PM_SUSPEND: - case PM_RESUME: - prev_state = dev->state; - next_state = (unsigned long) data; - if (prev_state != next_state) { - if (dev->callback) - status = (*dev->callback)(dev, rqst, data); - if (!status) { - dev->state = next_state; - dev->prev_state = prev_state; - } - } - else { - dev->prev_state = prev_state; - } - break; - default: - if (dev->callback) - status = (*dev->callback)(dev, rqst, data); - break; - } - return status; -} - -/* - * Undo incomplete request - */ -static void pm_undo_all(struct pm_dev *last) -{ - struct list_head *entry = last->entry.prev; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - if (dev->state != dev->prev_state) { - /* previous state was zero (running) resume or - * previous state was non-zero (suspended) suspend - */ - pm_request_t undo = (dev->prev_state - ? PM_SUSPEND:PM_RESUME); - pm_send(dev, undo, (void*) dev->prev_state); - } - entry = entry->prev; - } -} - -/** - * pm_send_all - send request to all managed devices - * @rqst: power management request - * @data: data for the callback - * - * Issue a power management request to a all devices. The - * %PM_SUSPEND events are handled specially. Any device is - * permitted to fail a suspend by returning a non zero (error) - * value from its callback function. If any device vetoes a - * suspend request then all other devices that have suspended - * during the processing of this request are restored to their - * previous state. - * - * WARNING: This function takes the pm_devs_lock. The lock is not dropped until - * the callbacks have completed. This prevents races against pm locking - * functions, races against module unload pm_unregister code. It does - * mean however that you must not issue pm_ functions within the callback - * or you will deadlock and users will hate you. - * - * Zero is returned on success. If a suspend fails then the status - * from the device that vetoes the suspend is returned. - * - * BUGS: what stops two power management requests occurring in parallel - * and conflicting. - */ - -int pm_send_all(pm_request_t rqst, void *data) -{ - struct list_head *entry; - - mutex_lock(&pm_devs_lock); - entry = pm_devs.next; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - if (dev->callback) { - int status = pm_send(dev, rqst, data); - if (status) { - /* return devices to previous state on - * failed suspend request - */ - if (rqst == PM_SUSPEND) - pm_undo_all(dev); - mutex_unlock(&pm_devs_lock); - return status; - } - } - entry = entry->next; - } - mutex_unlock(&pm_devs_lock); - return 0; -} - -EXPORT_SYMBOL(pm_register); -EXPORT_SYMBOL(pm_send_all); - diff --git a/kernel/power/process.c b/kernel/power/process.c index 7c2118f9597..f1d0b345c9b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -75,22 +75,15 @@ void refrigerator(void) __set_current_state(save); } -static void fake_signal_wake_up(struct task_struct *p, int resume) +static void fake_signal_wake_up(struct task_struct *p) { unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, resume); + signal_wake_up(p, 0); spin_unlock_irqrestore(&p->sighand->siglock, flags); } -static void send_fake_signal(struct task_struct *p) -{ - if (task_is_stopped(p)) - force_sig_specific(SIGSTOP, p); - fake_signal_wake_up(p, task_is_stopped(p)); -} - static int has_mm(struct task_struct *p) { return (p->mm && !(p->flags & PF_BORROWED_MM)); @@ -121,7 +114,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only) if (freezing(p)) { if (has_mm(p)) { if (!signal_pending(p)) - fake_signal_wake_up(p, 0); + fake_signal_wake_up(p); } else { if (with_mm_only) ret = 0; @@ -135,7 +128,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only) } else { if (has_mm(p)) { set_freeze_flag(p); - send_fake_signal(p); + fake_signal_wake_up(p); } else { if (with_mm_only) { ret = 0; @@ -182,15 +175,17 @@ static int try_to_freeze_tasks(int freeze_user_space) if (frozen(p) || !freezeable(p)) continue; - if (task_is_traced(p) && frozen(p->parent)) { - cancel_freezing(p); - continue; - } - if (!freeze_task(p, freeze_user_space)) continue; - if (!freezer_should_skip(p)) + /* + * Now that we've done set_freeze_flag, don't + * perturb a task in TASK_STOPPED or TASK_TRACED. + * It is "frozen enough". If the task does wake + * up, it will immediately call try_to_freeze. + */ + if (!task_is_stopped_or_traced(p) && + !freezer_should_skip(p)) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 72a020cabb4..5f91a07c4ea 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * of @bm->cur_zone_bm are updated. */ -static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { struct zone_bitmap *zone_bm; @@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { zone_bm = zone_bm->next; - BUG_ON(!zone_bm); + if (!zone_bm) + return -EFAULT; } bm->cur.zone_bm = zone_bm; } @@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, pfn -= bb->start_pfn; *bit_nr = pfn % BM_BITS_PER_CHUNK; *addr = bb->data + pfn / BM_BITS_PER_CHUNK; + return 0; } static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); set_bit(bit, addr); } +static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + if (!error) + set_bit(bit, addr); + return error; +} + static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); clear_bit(bit, addr); } @@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); return test_bit(bit, addr); } @@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm) region->end_pfn << PAGE_SHIFT); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) - memory_bm_set_bit(bm, pfn); + if (pfn_valid(pfn)) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } } } diff --git a/kernel/printk.c b/kernel/printk.c index bee36100f11..bdd4ea8c3f2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -616,6 +616,53 @@ asmlinkage int printk(const char *fmt, ...) /* cpu currently holding logbuf_lock */ static volatile unsigned int printk_cpu = UINT_MAX; +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. + */ +static inline int can_use_console(unsigned int cpu) +{ + return cpu_online(cpu) || have_callable_console(); +} + +/* + * Try to get console ownership to actually show the kernel + * messages from a 'printk'. Return true (and with the + * console_semaphore held, and 'console_locked' set) if it + * is successful, false otherwise. + * + * This gets called with the 'logbuf_lock' spinlock held and + * interrupts disabled. It should return with 'lockbuf_lock' + * released but interrupts still disabled. + */ +static int acquire_console_semaphore_for_printk(unsigned int cpu) +{ + int retval = 0; + + if (!try_acquire_console_sem()) { + retval = 1; + + /* + * If we can't use the console, we need to release + * the console semaphore by hand to avoid flushing + * the buffer. We need to hold the console semaphore + * in order to do this test safely. + */ + if (!can_use_console(cpu)) { + console_locked = 0; + up(&console_sem); + retval = 0; + } + } + printk_cpu = UINT_MAX; + spin_unlock(&logbuf_lock); + return retval; +} + const char printk_recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int printk_recursion_bug; @@ -666,7 +713,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) } /* Emit the output into the temporary buffer */ printed_len += vscnprintf(printk_buf + printed_len, - sizeof(printk_buf), fmt, args); + sizeof(printk_buf) - printed_len, fmt, args); /* * Copy the output into log_buf. If the caller didn't provide @@ -725,43 +772,22 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!down_trylock(&console_sem)) { - /* - * We own the drivers. We can drop the spinlock and - * let release_console_sem() print the text, maybe ... - */ - console_locked = 1; - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); + /* + * Try to acquire and then immediately release the + * console semaphore. The release will do all the + * actual magic (print out buffers, wake up klogd, + * etc). + * + * The acquire_console_semaphore_for_printk() function + * will release 'logbuf_lock' regardless of whether it + * actually gets the semaphore or not. + */ + if (acquire_console_semaphore_for_printk(this_cpu)) + release_console_sem(); - /* - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. - */ - if (cpu_online(smp_processor_id()) || have_callable_console()) { - console_may_schedule = 0; - release_console_sem(); - } else { - /* Release by hand to avoid flushing the buffer. */ - console_locked = 0; - up(&console_sem); - } - lockdep_on(); - raw_local_irq_restore(flags); - } else { - /* - * Someone else owns the drivers. We drop the spinlock, which - * allows the semaphore holder to proceed and to call the - * console drivers with the output which we just produced. - */ - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); - lockdep_on(); + lockdep_on(); out_restore_irqs: - raw_local_irq_restore(flags); - } + raw_local_irq_restore(flags); preempt_enable(); return printed_len; diff --git a/kernel/profile.c b/kernel/profile.c index 3b7a1b05512..606d7387265 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -23,7 +23,6 @@ #include <linux/highmem.h> #include <linux/mutex.h> #include <asm/sections.h> -#include <asm/semaphore.h> #include <asm/irq_regs.h> #include <asm/ptrace.h> diff --git a/kernel/ptrace.c b/kernel/ptrace.c index fdb34e86f92..dac4b4e5729 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -323,9 +323,8 @@ static int ptrace_setoptions(struct task_struct *child, long data) return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; } -static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) +static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) { - siginfo_t lastinfo; int error = -ESRCH; read_lock(&tasklist_lock); @@ -333,31 +332,25 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) error = -EINVAL; spin_lock_irq(&child->sighand->siglock); if (likely(child->last_siginfo != NULL)) { - lastinfo = *child->last_siginfo; + *info = *child->last_siginfo; error = 0; } spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!error) - return copy_siginfo_to_user(data, &lastinfo); return error; } -static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) +static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) { - siginfo_t newinfo; int error = -ESRCH; - if (copy_from_user(&newinfo, data, sizeof (siginfo_t))) - return -EFAULT; - read_lock(&tasklist_lock); if (likely(child->sighand != NULL)) { error = -EINVAL; spin_lock_irq(&child->sighand->siglock); if (likely(child->last_siginfo != NULL)) { - *child->last_siginfo = newinfo; + *child->last_siginfo = *info; error = 0; } spin_unlock_irq(&child->sighand->siglock); @@ -424,6 +417,7 @@ int ptrace_request(struct task_struct *child, long request, long addr, long data) { int ret = -EIO; + siginfo_t siginfo; switch (request) { case PTRACE_PEEKTEXT: @@ -442,12 +436,22 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_GETEVENTMSG: ret = put_user(child->ptrace_message, (unsigned long __user *) data); break; + case PTRACE_GETSIGINFO: - ret = ptrace_getsiginfo(child, (siginfo_t __user *) data); + ret = ptrace_getsiginfo(child, &siginfo); + if (!ret) + ret = copy_siginfo_to_user((siginfo_t __user *) data, + &siginfo); break; + case PTRACE_SETSIGINFO: - ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); + if (copy_from_user(&siginfo, (siginfo_t __user *) data, + sizeof siginfo)) + ret = -EFAULT; + else + ret = ptrace_setsiginfo(child, &siginfo); break; + case PTRACE_DETACH: /* detach a process that was attached. */ ret = ptrace_detach(child, data); break; @@ -608,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) return (copied == sizeof(data)) ? 0 : -EIO; } -#ifdef CONFIG_COMPAT +#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE #include <linux/compat.h> int compat_ptrace_request(struct task_struct *child, compat_long_t request, @@ -616,6 +620,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, { compat_ulong_t __user *datap = compat_ptr(data); compat_ulong_t word; + siginfo_t siginfo; int ret; switch (request) { @@ -638,6 +643,23 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, ret = put_user((compat_ulong_t) child->ptrace_message, datap); break; + case PTRACE_GETSIGINFO: + ret = ptrace_getsiginfo(child, &siginfo); + if (!ret) + ret = copy_siginfo_to_user32( + (struct compat_siginfo __user *) datap, + &siginfo); + break; + + case PTRACE_SETSIGINFO: + memset(&siginfo, 0, sizeof siginfo); + if (copy_siginfo_from_user32( + &siginfo, (struct compat_siginfo __user *) datap)) + ret = -EFAULT; + else + ret = ptrace_setsiginfo(child, &siginfo); + break; + default: ret = ptrace_request(child, request, addr, data); } @@ -645,7 +667,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, return ret; } -#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data) { @@ -688,6 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, unlock_kernel(); return ret; } -#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ - -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 987cfb7ade8..e1cdf196a51 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -23,6 +23,10 @@ * to Suparna Bhattacharya for pushing me completely away * from atomic instructions on the read side. * + * - Added handling of Dynamic Ticks + * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com> + * - Steven Rostedt <srostedt@redhat.com> + * * Papers: http://www.rdrop.com/users/paulmck/RCU * * Design Document: http://lwn.net/Articles/253651/ @@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } +#ifdef CONFIG_NO_HZ + +DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; +static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); +static DEFINE_PER_CPU(int, rcu_update_flag); + +/** + * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. + * + * If the CPU was idle with dynamic ticks active, this updates the + * dynticks_progress_counter to let the RCU handling know that the + * CPU is active. + */ +void rcu_irq_enter(void) +{ + int cpu = smp_processor_id(); + + if (per_cpu(rcu_update_flag, cpu)) + per_cpu(rcu_update_flag, cpu)++; + + /* + * Only update if we are coming from a stopped ticks mode + * (dynticks_progress_counter is even). + */ + if (!in_interrupt() && + (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { + /* + * The following might seem like we could have a race + * with NMI/SMIs. But this really isn't a problem. + * Here we do a read/modify/write, and the race happens + * when an NMI/SMI comes in after the read and before + * the write. But NMI/SMIs will increment this counter + * twice before returning, so the zero bit will not + * be corrupted by the NMI/SMI which is the most important + * part. + * + * The only thing is that we would bring back the counter + * to a postion that it was in during the NMI/SMI. + * But the zero bit would be set, so the rest of the + * counter would again be ignored. + * + * On return from the IRQ, the counter may have the zero + * bit be 0 and the counter the same as the return from + * the NMI/SMI. If the state machine was so unlucky to + * see that, it still doesn't matter, since all + * RCU read-side critical sections on this CPU would + * have already completed. + */ + per_cpu(dynticks_progress_counter, cpu)++; + /* + * The following memory barrier ensures that any + * rcu_read_lock() primitives in the irq handler + * are seen by other CPUs to follow the above + * increment to dynticks_progress_counter. This is + * required in order for other CPUs to correctly + * determine when it is safe to advance the RCU + * grace-period state machine. + */ + smp_mb(); /* see above block comment. */ + /* + * Since we can't determine the dynamic tick mode from + * the dynticks_progress_counter after this routine, + * we use a second flag to acknowledge that we came + * from an idle state with ticks stopped. + */ + per_cpu(rcu_update_flag, cpu)++; + /* + * If we take an NMI/SMI now, they will also increment + * the rcu_update_flag, and will not update the + * dynticks_progress_counter on exit. That is for + * this IRQ to do. + */ + } +} + +/** + * rcu_irq_exit - Called from exiting Hard irq context. + * + * If the CPU was idle with dynamic ticks active, update the + * dynticks_progress_counter to put let the RCU handling be + * aware that the CPU is going back to idle with no ticks. + */ +void rcu_irq_exit(void) +{ + int cpu = smp_processor_id(); + + /* + * rcu_update_flag is set if we interrupted the CPU + * when it was idle with ticks stopped. + * Once this occurs, we keep track of interrupt nesting + * because a NMI/SMI could also come in, and we still + * only want the IRQ that started the increment of the + * dynticks_progress_counter to be the one that modifies + * it on exit. + */ + if (per_cpu(rcu_update_flag, cpu)) { + if (--per_cpu(rcu_update_flag, cpu)) + return; + + /* This must match the interrupt nesting */ + WARN_ON(in_interrupt()); + + /* + * If an NMI/SMI happens now we are still + * protected by the dynticks_progress_counter being odd. + */ + + /* + * The following memory barrier ensures that any + * rcu_read_unlock() primitives in the irq handler + * are seen by other CPUs to preceed the following + * increment to dynticks_progress_counter. This + * is required in order for other CPUs to determine + * when it is safe to advance the RCU grace-period + * state machine. + */ + smp_mb(); /* see above block comment. */ + per_cpu(dynticks_progress_counter, cpu)++; + WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); + } +} + +static void dyntick_save_progress_counter(int cpu) +{ + per_cpu(rcu_dyntick_snapshot, cpu) = + per_cpu(dynticks_progress_counter, cpu); +} + +static inline int +rcu_try_flip_waitack_needed(int cpu) +{ + long curr; + long snap; + + curr = per_cpu(dynticks_progress_counter, cpu); + snap = per_cpu(rcu_dyntick_snapshot, cpu); + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot be in the middle of an rcu_read_lock(), so + * the next rcu_read_lock() it executes must use the new value + * of the counter. So we can safely pretend that this CPU + * already acknowledged the counter. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq handlers, then, as above, we can safely pretend + * that this CPU already acknowledged the counter. + */ + + if ((curr - snap) > 2 || (snap & 0x1) == 0) + return 0; + + /* We need this CPU to explicitly acknowledge the counter flip. */ + + return 1; +} + +static inline int +rcu_try_flip_waitmb_needed(int cpu) +{ + long curr; + long snap; + + curr = per_cpu(dynticks_progress_counter, cpu); + snap = per_cpu(rcu_dyntick_snapshot, cpu); + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot have executed an RCU read-side critical section + * during that time, so there is no need for it to execute a + * memory barrier. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU either entered or exited an outermost interrupt, + * SMI, NMI, or whatever handler, then we know that it executed + * a memory barrier when doing so. So we don't need another one. + */ + if (curr != snap) + return 0; + + /* We need the CPU to execute a memory barrier. */ + + return 1; +} + +#else /* !CONFIG_NO_HZ */ + +# define dyntick_save_progress_counter(cpu) do { } while (0) +# define rcu_try_flip_waitack_needed(cpu) (1) +# define rcu_try_flip_waitmb_needed(cpu) (1) + +#endif /* CONFIG_NO_HZ */ + /* * Get here when RCU is idle. Decide whether we need to * move out of idle state, and return non-zero if so. @@ -447,8 +657,10 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. */ - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask(cpu, rcu_cpu_online_map) { per_cpu(rcu_flip_flag, cpu) = rcu_flipped; + dyntick_save_progress_counter(cpu); + } return 1; } @@ -464,7 +676,8 @@ rcu_try_flip_waitack(void) RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); for_each_cpu_mask(cpu, rcu_cpu_online_map) - if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { + if (rcu_try_flip_waitack_needed(cpu) && + per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); return 0; } @@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. */ - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask(cpu, rcu_cpu_online_map) { per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; + dyntick_save_progress_counter(cpu); + } RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); return 1; @@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void) RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); for_each_cpu_mask(cpu, rcu_cpu_online_map) - if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { + if (rcu_try_flip_waitmb_needed(cpu) && + per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); return 0; } @@ -702,8 +918,9 @@ void rcu_offline_cpu(int cpu) * fix. */ + local_irq_save(flags); rdp = RCU_DATA_ME(); - spin_lock_irqsave(&rdp->lock, flags); + spin_lock(&rdp->lock); *rdp->nexttail = list; if (list) rdp->nexttail = tail; @@ -735,9 +952,11 @@ static void rcu_process_callbacks(struct softirq_action *unused) { unsigned long flags; struct rcu_head *next, *list; - struct rcu_data *rdp = RCU_DATA_ME(); + struct rcu_data *rdp; - spin_lock_irqsave(&rdp->lock, flags); + local_irq_save(flags); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); list = rdp->donelist; if (list == NULL) { spin_unlock_irqrestore(&rdp->lock, flags); @@ -788,10 +1007,10 @@ void __synchronize_sched(void) if (sched_getaffinity(0, &oldmask) < 0) oldmask = cpu_possible_map; for_each_online_cpu(cpu) { - sched_setaffinity(0, cpumask_of_cpu(cpu)); + sched_setaffinity(0, &cpumask_of_cpu(cpu)); schedule(); } - sched_setaffinity(0, oldmask); + sched_setaffinity(0, &oldmask); } EXPORT_SYMBOL_GPL(__synchronize_sched); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index fd599829e72..47894f919d4 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -723,9 +723,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ */ static void rcu_torture_shuffle_tasks(void) { - cpumask_t tmp_mask = CPU_MASK_ALL; + cpumask_t tmp_mask; int i; + cpus_setall(tmp_mask); get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ @@ -737,25 +738,27 @@ static void rcu_torture_shuffle_tasks(void) if (rcu_idle_cpu != -1) cpu_clear(rcu_idle_cpu, tmp_mask); - set_cpus_allowed(current, tmp_mask); + set_cpus_allowed_ptr(current, &tmp_mask); if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) - set_cpus_allowed(reader_tasks[i], tmp_mask); + set_cpus_allowed_ptr(reader_tasks[i], + &tmp_mask); } if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) - set_cpus_allowed(fakewriter_tasks[i], tmp_mask); + set_cpus_allowed_ptr(fakewriter_tasks[i], + &tmp_mask); } if (writer_task) - set_cpus_allowed(writer_task, tmp_mask); + set_cpus_allowed_ptr(writer_task, &tmp_mask); if (stats_task) - set_cpus_allowed(stats_task, tmp_mask); + set_cpus_allowed_ptr(stats_task, &tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; diff --git a/kernel/relay.c b/kernel/relay.c index d080b9d161a..d6204a48581 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -736,7 +736,7 @@ static int relay_file_open(struct inode *inode, struct file *filp) kref_get(&buf->kref); filp->private_data = buf; - return 0; + return nonseekable_open(inode, filp); } /** @@ -1056,6 +1056,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) +{ +} + /* * subbuf_splice_actor - splice up to one subbuf's worth of data */ @@ -1066,7 +1070,7 @@ static int subbuf_splice_actor(struct file *in, unsigned int flags, int *nonpad_ret) { - unsigned int pidx, poff, total_len, subbuf_pages, ret; + unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; struct rchan_buf *rbuf = in->private_data; unsigned int subbuf_size = rbuf->chan->subbuf_size; uint64_t pos = (uint64_t) *ppos; @@ -1083,6 +1087,7 @@ static int subbuf_splice_actor(struct file *in, .partial = partial, .flags = flags, .ops = &relay_pipe_buf_ops, + .spd_release = relay_page_release, }; if (rbuf->subbufs_produced == rbuf->subbufs_consumed) @@ -1097,8 +1102,9 @@ static int subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; + nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); - for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { + for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; unsigned int cur_pos = read_start + total_len; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 16cbec2d5d6..efbfc0fc232 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -113,6 +113,7 @@ ssize_t res_counter_write(struct res_counter *counter, int member, ret = -EINVAL; + strstrip(buf); if (write_strategy) { if (write_strategy(buf, &tmp)) { goto out_free; diff --git a/kernel/resource.c b/kernel/resource.c index 82aea814d40..cee12cc47ca 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -486,6 +486,24 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t EXPORT_SYMBOL(adjust_resource); +/** + * resource_alignment - calculate resource's alignment + * @res: resource pointer + * + * Returns alignment on success, 0 (invalid alignment) on failure. + */ +resource_size_t resource_alignment(struct resource *res) +{ + switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { + case IORESOURCE_SIZEALIGN: + return res->end - res->start + 1; + case IORESOURCE_STARTALIGN: + return res->start; + default: + return 0; + } +} + /* * This is compatibility stuff for IO resources. * diff --git a/kernel/sched.c b/kernel/sched.c index f28f19e65b5..740fb409e5b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -66,6 +66,10 @@ #include <linux/unistd.h> #include <linux/pagemap.h> #include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/bootmem.h> +#include <linux/debugfs.h> +#include <linux/ctype.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -114,6 +118,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define DEF_TIMESLICE (100 * HZ / 1000) +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) @@ -155,6 +164,84 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; +struct rt_bandwidth { + /* nests inside the rq lock: */ + spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + ktime_t now; + + if (rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + spin_lock(&rt_b->rt_runtime_lock); + for (;;) { + if (hrtimer_active(&rt_b->rt_period_timer)) + break; + + now = hrtimer_cb_get_time(&rt_b->rt_period_timer); + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + hrtimer_start(&rt_b->rt_period_timer, + rt_b->rt_period_timer.expires, + HRTIMER_MODE_ABS); + } + spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + #ifdef CONFIG_GROUP_SCHED #include <linux/cgroup.h> @@ -174,41 +261,6 @@ struct task_group { struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; - - /* - * shares assigned to a task group governs how much of cpu bandwidth - * is allocated to the group. The more shares a group has, the more is - * the cpu bandwidth allocated to it. - * - * For ex, lets say that there are three task groups, A, B and C which - * have been assigned shares 1000, 2000 and 3000 respectively. Then, - * cpu bandwidth allocated by the scheduler to task groups A, B and C - * should be: - * - * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% - * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% - * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% - * - * The weight assigned to a task group's schedulable entities on every - * cpu (task_group.se[a_cpu]->load.weight) is derived from the task - * group's shares. For ex: lets say that task group A has been - * assigned shares of 1000 and there are two CPUs in a system. Then, - * - * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; - * - * Note: It's not necessary that each of a task's group schedulable - * entity have the same weight on all CPUs. If the group - * has 2 of its tasks on CPU0 and 1 task on CPU1, then a - * better distribution of weight could be: - * - * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 - * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 - * - * rebalance_shares() is responsible for distributing the shares of a - * task groups like this among the group's schedulable entities across - * cpus. - * - */ unsigned long shares; #endif @@ -216,29 +268,39 @@ struct task_group { struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - u64 rt_runtime; + struct rt_bandwidth rt_bandwidth; #endif struct rcu_head rcu; struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; }; +#ifdef CONFIG_USER_SCHED + +/* + * Root task group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. + */ +struct task_group root_task_group; + #ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; - -static struct sched_entity *init_sched_entity_p[NR_CPUS]; -static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; #endif #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; - -static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; -static struct rt_rq *init_rt_rq_p[NR_CPUS]; +#endif +#else +#define root_task_group init_task_group #endif /* task_group_lock serializes add/remove of task groups and also changes to @@ -250,21 +312,13 @@ static DEFINE_SPINLOCK(task_group_lock); static DEFINE_MUTEX(doms_cur_mutex); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP -/* kernel thread that runs rebalance_shares() periodically */ -static struct task_struct *lb_monitor_task; -static int load_balance_monitor(void *unused); -#endif - -static void set_se_shares(struct sched_entity *se, unsigned long shares); - #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif -#define MIN_GROUP_SHARES 2 +#define MIN_SHARES 2 static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -272,17 +326,7 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD; /* Default task group. * Every task in system belong to this group at bootup. */ -struct task_group init_task_group = { -#ifdef CONFIG_FAIR_GROUP_SCHED - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - .rt_se = init_sched_rt_entity_p, - .rt_rq = init_rt_rq_p, -#endif -}; +struct task_group init_task_group; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) @@ -342,11 +386,15 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct rb_node *rb_load_balance_curr; - /* 'curr' points to currently running entity on this cfs_rq. + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr; + struct sched_entity *curr, *next; unsigned long nr_spread_over; @@ -363,6 +411,43 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + unsigned long task_weight; + unsigned long shares; + /* + * We need space to build a sched_domain wide view of the full task + * group tree, in order to avoid depending on dynamic memory allocation + * during the load balancing we place this in the per cpu task group + * hierarchy. This limits the load balancing to one instance per cpu, + * but more should not be needed anyway. + */ + struct aggregate_struct { + /* + * load = weight(cpus) * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long load; + + /* + * part of the group weight distributed to this span. + */ + unsigned long shares; + + /* + * The sum of all runqueue weights within this span. + */ + unsigned long rq_weight; + + /* + * Weight contributed by tasks; this is the part we can + * influence by moving tasks around. + */ + unsigned long task_weight; + } aggregate; +#endif #endif }; @@ -379,6 +464,9 @@ struct rt_rq { #endif int rt_throttled; u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -441,6 +529,7 @@ struct rq { unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ + unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ @@ -450,8 +539,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; - u64 rt_period_expire; - int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -544,6 +631,32 @@ static inline int cpu_of(struct rq *rq) #endif } +#ifdef CONFIG_NO_HZ +static inline bool nohz_on(int cpu) +{ + return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; +} + +static inline u64 max_skipped_ticks(struct rq *rq) +{ + return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; +} + +static inline void update_last_tick_seen(struct rq *rq) +{ + rq->last_tick_seen = jiffies; +} +#else +static inline u64 max_skipped_ticks(struct rq *rq) +{ + return 1; +} + +static inline void update_last_tick_seen(struct rq *rq) +{ +} +#endif + /* * Update the per-runqueue clock, as finegrained as the platform can give * us, but without assuming monotonicity, etc.: @@ -568,9 +681,12 @@ static void __update_rq_clock(struct rq *rq) /* * Catch too large forward jumps too: */ - if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { - if (clock < rq->tick_timestamp + TICK_NSEC) - clock = rq->tick_timestamp + TICK_NSEC; + u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; + u64 max_time = rq->tick_timestamp + max_jump; + + if (unlikely(clock + delta > max_time)) { + if (clock < max_time) + clock = max_time; else clock++; rq->clock_overflows++; @@ -606,23 +722,6 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -unsigned long rt_needs_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 delta; - - if (!rq->rt_throttled) - return 0; - - if (rq->clock > rq->rt_period_expire) - return 1; - - delta = rq->rt_period_expire - rq->clock; - do_div(delta, NSEC_PER_SEC / HZ); - - return (unsigned long)delta; -} - /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -635,26 +734,137 @@ unsigned long rt_needs_cpu(int cpu) /* * Debugging: various feature bits */ + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + enum { - SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, - SCHED_FEAT_WAKEUP_PREEMPT = 2, - SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_TREE_AVG = 8, - SCHED_FEAT_APPROX_AVG = 16, - SCHED_FEAT_HRTICK = 32, - SCHED_FEAT_DOUBLE_TICK = 64, +#include "sched_features.h" }; +#undef SCHED_FEAT + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | - SCHED_FEAT_WAKEUP_PREEMPT * 1 | - SCHED_FEAT_START_DEBIT * 1 | - SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0 | - SCHED_FEAT_HRTICK * 1 | - SCHED_FEAT_DOUBLE_TICK * 0; +#include "sched_features.h" + 0; -#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +__read_mostly char *sched_feat_names[] = { +#include "sched_features.h" + NULL +}; + +#undef SCHED_FEAT + +int sched_feat_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + return 0; +} + +static ssize_t +sched_feat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; + int r = 0; + int len = 0; + int i; + + for (i = 0; sched_feat_names[i]; i++) { + len += strlen(sched_feat_names[i]); + len += 4; + } + + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; sched_feat_names[i]; i++) { + if (sysctl_sched_features & (1UL << i)) + r += sprintf(buf + r, "%s ", sched_feat_names[i]); + else + r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; sched_feat_names[i]; i++) { + int len = strlen(sched_feat_names[i]); + + if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (neg) + sysctl_sched_features &= ~(1UL << i); + else + sysctl_sched_features |= (1UL << i); + break; + } + } + + if (!sched_feat_names[i]) + return -EINVAL; + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .read = sched_feat_read, + .write = sched_feat_write, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#endif + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) /* * Number of tasks to iterate in a single balance run. @@ -668,40 +878,98 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; */ unsigned int sysctl_sched_rt_period = 1000000; +static __read_mostly int scheduler_running; + /* * part of the period that we allow rt tasks to run in us. * default: 0.95s */ int sysctl_sched_rt_runtime = 950000; -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_period < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +static const unsigned long long time_sync_thresh = 100000; + +static DEFINE_PER_CPU(unsigned long long, time_offset); +static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); /* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): + * Global lock which we take every now and then to synchronize + * the CPUs time. This method is not warp-safe, but it's good + * enough to synchronize slowly diverging time sources and thus + * it's good enough for tracing: */ -unsigned long long cpu_clock(int cpu) +static DEFINE_SPINLOCK(time_sync_lock); +static unsigned long long prev_global_time; + +static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) +{ + unsigned long flags; + + spin_lock_irqsave(&time_sync_lock, flags); + + if (time < prev_global_time) { + per_cpu(time_offset, cpu) += prev_global_time - time; + time = prev_global_time; + } else { + prev_global_time = time; + } + + spin_unlock_irqrestore(&time_sync_lock, flags); + + return time; +} + +static unsigned long long __cpu_clock(int cpu) { unsigned long long now; unsigned long flags; struct rq *rq; - local_irq_save(flags); - rq = cpu_rq(cpu); /* * Only call sched_clock() if the scheduler has already been * initialized (some code might call cpu_clock() very early): */ - if (rq->idle) - update_rq_clock(rq); + if (unlikely(!scheduler_running)) + return 0; + + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); now = rq->clock; local_irq_restore(flags); return now; } + +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +unsigned long long cpu_clock(int cpu) +{ + unsigned long long prev_cpu_time, time, delta_time; + + prev_cpu_time = per_cpu(prev_cpu_time, cpu); + time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); + delta_time = time-prev_cpu_time; + + if (unlikely(delta_time > time_sync_thresh)) + time = __sync_cpu_clock(time, cpu); + + return time; +} EXPORT_SYMBOL_GPL(cpu_clock); #ifndef prepare_arch_switch @@ -1097,6 +1365,49 @@ static void resched_cpu(int cpu) resched_task(cpu_curr(cpu)); spin_unlock_irqrestore(&rq->lock, flags); } + +#ifdef CONFIG_NO_HZ +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} +#endif + #else static void __resched_task(struct task_struct *p, int tif_bit) { @@ -1118,6 +1429,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) */ #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +/* + * delta *= weight / lw + */ static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) @@ -1125,7 +1439,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, u64 tmp; if (unlikely(!lw->inv_weight)) - lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; + lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); tmp = (u64)delta_exec * weight; /* @@ -1140,20 +1454,16 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -static inline unsigned long -calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) -{ - return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); -} - static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; + lw->inv_weight = 0; } static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; + lw->inv_weight = 0; } /* @@ -1256,6 +1566,332 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* + * Group load balancing. + * + * We calculate a few balance domain wide aggregate numbers; load and weight. + * Given the pictures below, and assuming each item has equal weight: + * + * root 1 - thread + * / | \ A - group + * A 1 B + * /|\ / \ + * C 2 D 3 4 + * | | + * 5 6 + * + * load: + * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, + * which equals 1/9-th of the total load. + * + * shares: + * The weight of this group on the selected cpus. + * + * rq_weight: + * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while + * B would get 2. + * + * task_weight: + * Part of the rq_weight contributed by tasks; all groups except B would + * get 1, B gets 2. + */ + +static inline struct aggregate_struct * +aggregate(struct task_group *tg, struct sched_domain *sd) +{ + return &tg->cfs_rq[sd->first_cpu]->aggregate; +} + +typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static +void aggregate_walk_tree(aggregate_func down, aggregate_func up, + struct sched_domain *sd) +{ + struct task_group *parent, *child; + + rcu_read_lock(); + parent = &root_task_group; +down: + (*down)(parent, sd); + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + (*up)(parent, sd); + + child = parent; + parent = parent->parent; + if (parent) + goto up; + rcu_read_unlock(); +} + +/* + * Calculate the aggregate runqueue weight. + */ +static +void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long rq_weight = 0; + unsigned long task_weight = 0; + int i; + + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + task_weight += tg->cfs_rq[i]->task_weight; + } + + aggregate(tg, sd)->rq_weight = rq_weight; + aggregate(tg, sd)->task_weight = task_weight; +} + +/* + * Compute the weight of this group on the given cpus. + */ +static +void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = 0; + int i; + + for_each_cpu_mask(i, sd->span) + shares += tg->cfs_rq[i]->shares; + + if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + shares = tg->shares; + + aggregate(tg, sd)->shares = shares; +} + +/* + * Compute the load fraction assigned to this group, relies on the aggregate + * weight and this group's parent's load, i.e. top-down. + */ +static +void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long load; + + if (!tg->parent) { + int i; + + load = 0; + for_each_cpu_mask(i, sd->span) + load += cpu_rq(i)->load.weight; + + } else { + load = aggregate(tg->parent, sd)->load; + + /* + * shares is our weight in the parent's rq so + * shares/parent->rq_weight gives our fraction of the load + */ + load *= aggregate(tg, sd)->shares; + load /= aggregate(tg->parent, sd)->rq_weight + 1; + } + + aggregate(tg, sd)->load = load; +} + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. + */ +static void +__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, + int tcpu) +{ + int boost = 0; + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[tcpu]) + return; + + rq_weight = tg->cfs_rq[tcpu]->load.weight; + + /* + * If there are currently no tasks on the cpu pretend there is one of + * average load so that when a new task gets to run here it will not + * get delayed by group starvation. + */ + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = aggregate(tg, sd)->shares * rq_weight; + shares /= aggregate(tg, sd)->rq_weight + 1; + + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + + __set_se_shares(tg->se[tcpu], shares); +} + +/* + * Re-adjust the weights on the cpu the task came from and on the cpu the + * task went to. + */ +static void +__move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + unsigned long shares; + + shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + + __update_group_shares_cpu(tg, sd, scpu); + __update_group_shares_cpu(tg, sd, dcpu); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + if (shares) + tg->cfs_rq[dcpu]->shares += shares; +} + +/* + * Because changing a group's shares changes the weight of the super-group + * we need to walk up the tree and change all shares until we hit the root. + */ +static void +move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + while (tg) { + __move_group_shares(tg, sd, scpu, dcpu); + tg = tg->parent; + } +} + +static +void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = aggregate(tg, sd)->shares; + int i; + + for_each_cpu_mask(i, sd->span) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __update_group_shares_cpu(tg, sd, i); + spin_unlock_irqrestore(&rq->lock, flags); + } + + aggregate_group_shares(tg, sd); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= aggregate(tg, sd)->shares; + if (shares) { + tg->cfs_rq[sd->first_cpu]->shares += shares; + aggregate(tg, sd)->shares += shares; + } +} + +/* + * Calculate the accumulative weight and recursive load of each task group + * while walking down the tree. + */ +static +void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_weight(tg, sd); + aggregate_group_shares(tg, sd); + aggregate_group_load(tg, sd); +} + +/* + * Rebalance the cpu shares while walking back up the tree. + */ +static +void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_set_shares(tg, sd); +} + +static DEFINE_PER_CPU(spinlock_t, aggregate_lock); + +static void __init init_aggregate(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(aggregate_lock, i)); +} + +static int get_aggregate(struct sched_domain *sd) +{ + if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + return 0; + + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + return 1; +} + +static void put_aggregate(struct sched_domain *sd) +{ + spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); +} + +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ + cfs_rq->shares = shares; +} + +#else + +static inline void init_aggregate(void) +{ +} + +static inline int get_aggregate(struct sched_domain *sd) +{ + return 0; +} + +static inline void put_aggregate(struct sched_domain *sd) +{ +} +#endif + +#else /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +} +#endif + #endif /* CONFIG_SMP */ #include "sched_stats.h" @@ -1433,6 +2069,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + /* + * Buddy candidates are cache hot: + */ + if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) + return 1; + if (p->sched_class != &fair_sched_class) return 0; @@ -1720,17 +2362,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, + cpumask_t *tmp) { - cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(tmp, group->cpumask, p->cpus_allowed); + cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, *tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1769,7 +2411,7 @@ static int sched_balance_self(int cpu, int flag) } while (sd) { - cpumask_t span; + cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -1785,7 +2427,7 @@ static int sched_balance_self(int cpu, int flag) continue; } - new_cpu = find_idlest_cpu(group, t, cpu); + new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -1831,6 +2473,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) long old_state; struct rq *rq; + if (!sched_feat(SYNC_WAKEUPS)) + sync = 0; + + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) @@ -1891,10 +2537,11 @@ out_activate: schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); activate_task(rq, p, 1); - check_preempt_curr(rq, p); success = 1; out_running: + check_preempt_curr(rq, p); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -1928,6 +2575,8 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.last_wakeup = 0; + p->se.avg_overlap = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -1943,6 +2592,7 @@ static void __sched_fork(struct task_struct *p) INIT_LIST_HEAD(&p->rt.run_list); p->se.on_rq = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2662,7 +3312,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, cpumask_t *cpus, int *balance) + int *sd_idle, const cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2963,7 +3613,7 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, cpumask_t *cpus) + unsigned long imbalance, const cpumask_t *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -3002,14 +3652,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) + int *balance, cpumask_t *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; - cpumask_t cpus = CPU_MASK_ALL; unsigned long flags; + int unlock_aggregate; + + cpus_setall(*cpus); + + unlock_aggregate = get_aggregate(sd); /* * When power savings policy is enabled for the parent domain, idle @@ -3025,7 +3679,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus, balance); + cpus, balance); if (*balance == 0) goto out_balanced; @@ -3035,7 +3689,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, &cpus); + busiest = find_busiest_queue(group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -3068,8 +3722,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; goto out_balanced; } @@ -3126,8 +3780,9 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return ld_moved; + ld_moved = -1; + + goto out; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3142,8 +3797,13 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return 0; + ld_moved = -1; + else + ld_moved = 0; +out: + if (unlock_aggregate) + put_aggregate(sd); + return ld_moved; } /* @@ -3154,7 +3814,8 @@ out_one_pinned: * this_rq is locked. */ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, + cpumask_t *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3162,7 +3823,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; - cpumask_t cpus = CPU_MASK_ALL; + + cpus_setall(*cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3177,14 +3839,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, - &sd_idle, &cpus, NULL); + &sd_idle, cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, - &cpus); + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); goto out_balanced; @@ -3206,8 +3867,8 @@ redo: spin_unlock(&busiest->lock); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; } } @@ -3241,6 +3902,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = -1; unsigned long next_balance = jiffies + HZ; + cpumask_t tmpmask; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3250,8 +3912,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance_newidle(this_cpu, - this_rq, sd); + pulled_task = load_balance_newidle(this_cpu, this_rq, + sd, &tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3410,6 +4072,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + cpumask_t tmp; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3433,7 +4096,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -3549,7 +4212,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) */ int ilb = first_cpu(nohz.cpu_mask); - if (ilb != NR_CPUS) + if (ilb < nr_cpu_ids) resched_cpu(ilb); } } @@ -3753,9 +4416,9 @@ void scheduler_tick(void) rq->clock_underflows++; } rq->tick_timestamp = rq->clock; + update_last_tick_seen(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -3766,7 +4429,7 @@ void scheduler_tick(void) #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) -void add_preempt_count(int val) +void __kprobes add_preempt_count(int val) { /* * Underflow? @@ -3782,7 +4445,7 @@ void add_preempt_count(int val) } EXPORT_SYMBOL(add_preempt_count); -void sub_preempt_count(int val) +void __kprobes sub_preempt_count(int val) { /* * Underflow? @@ -3884,7 +4547,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev) asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; - long *switch_count; + unsigned long *switch_count; struct rq *rq; int cpu; @@ -3913,7 +4576,7 @@ need_resched_nonpreemptible: if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) { + signal_pending(prev))) { prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, 1); @@ -4306,11 +4969,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; running = task_current(rq, p); - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } + if (running) + p->sched_class->put_prev_task(rq, p); if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -4319,10 +4981,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; + if (running) + p->sched_class->set_curr_task(rq); if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); - enqueue_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio, running); @@ -4457,7 +5118,7 @@ int task_nice(const struct task_struct *p) { return TASK_NICE(p); } -EXPORT_SYMBOL_GPL(task_nice); +EXPORT_SYMBOL(task_nice); /** * idle_cpu - is a given cpu idle currently? @@ -4589,7 +5250,7 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -4616,19 +5277,17 @@ recheck: update_rq_clock(rq); on_rq = p->se.on_rq; running = task_current(rq, p); - if (on_rq) { + if (on_rq) deactivate_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } + if (running) + p->sched_class->put_prev_task(rq, p); oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (running) + p->sched_class->set_curr_task(rq); if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); - activate_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio, running); @@ -4753,9 +5412,10 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, cpumask_t new_mask) +long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) { cpumask_t cpus_allowed; + cpumask_t new_mask = *in_mask; struct task_struct *p; int retval; @@ -4786,13 +5446,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) if (retval) goto out_unlock; - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); cpus_and(new_mask, new_mask, cpus_allowed); again: - retval = set_cpus_allowed(p, new_mask); + retval = set_cpus_allowed_ptr(p, &new_mask); if (!retval) { - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); if (!cpus_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset @@ -4836,7 +5496,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } /* @@ -5135,7 +5795,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) time_slice = 0; if (p->policy == SCHED_RR) { time_slice = DEF_TIMESLICE; - } else { + } else if (p->policy != SCHED_FIFO) { struct sched_entity *se = &p->se; unsigned long flags; struct rq *rq; @@ -5298,7 +5958,6 @@ static inline void sched_init_granularity(void) sysctl_sched_latency = limit; sysctl_sched_wakeup_granularity *= factor; - sysctl_sched_batch_wakeup_granularity *= factor; } #ifdef CONFIG_SMP @@ -5327,7 +5986,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) { struct migration_req req; unsigned long flags; @@ -5335,23 +5994,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(new_mask, cpu_online_map)) { + if (!cpus_intersects(*new_mask, cpu_online_map)) { ret = -EINVAL; goto out; } if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, &new_mask); + p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = new_mask; - p->rt.nr_cpus_allowed = cpus_weight(new_mask); + p->cpus_allowed = *new_mask; + p->rt.nr_cpus_allowed = cpus_weight(*new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), new_mask)) + if (cpu_isset(task_cpu(p), *new_mask)) goto out; - if (migrate_task(p, any_online_cpu(new_mask), &req)) { + if (migrate_task(p, any_online_cpu(*new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -5364,7 +6023,7 @@ out: return ret; } -EXPORT_SYMBOL_GPL(set_cpus_allowed); +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); /* * Move (not current) task off this cpu, onto dest cpu. We're doing @@ -5502,12 +6161,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) dest_cpu = any_online_cpu(mask); /* On any allowed CPU? */ - if (dest_cpu == NR_CPUS) + if (dest_cpu >= nr_cpu_ids) dest_cpu = any_online_cpu(p->cpus_allowed); /* No more Mr. Nice Guy. */ - if (dest_cpu == NR_CPUS) { - cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); + if (dest_cpu >= nr_cpu_ids) { + cpumask_t cpus_allowed; + + cpuset_cpus_allowed_locked(p, &cpus_allowed); /* * Try to stay on the same cpuset, where the * current cpuset may be a subset of all cpus. @@ -5543,7 +6204,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); unsigned long flags; local_irq_save(flags); @@ -5916,7 +6577,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_unlock_irq(&rq->lock); break; - case CPU_DOWN_PREPARE: + case CPU_DYING: + case CPU_DYING_FROZEN: /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); @@ -5954,20 +6616,16 @@ void __init migration_init(void) #ifdef CONFIG_SMP -/* Number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; -EXPORT_SYMBOL(nr_cpu_ids); - #ifdef CONFIG_SCHED_DEBUG -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + cpumask_t *groupmask) { struct sched_group *group = sd->groups; - cpumask_t groupmask; - char str[NR_CPUS]; + char str[256]; - cpumask_scnprintf(str, NR_CPUS, sd->span); - cpus_clear(groupmask); + cpulist_scnprintf(str, sizeof(str), sd->span); + cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6011,25 +6669,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) break; } - if (cpus_intersects(groupmask, group->cpumask)) { + if (cpus_intersects(*groupmask, group->cpumask)) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(groupmask, groupmask, group->cpumask); + cpus_or(*groupmask, *groupmask, group->cpumask); - cpumask_scnprintf(str, NR_CPUS, group->cpumask); + cpulist_scnprintf(str, sizeof(str), group->cpumask); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, groupmask)) + if (!cpus_equal(sd->span, *groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) + if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6037,6 +6695,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) static void sched_domain_debug(struct sched_domain *sd, int cpu) { + cpumask_t *groupmask; int level = 0; if (!sd) { @@ -6046,14 +6705,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!groupmask) { + printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); + return; + } + for (;;) { - if (sched_domain_debug_one(sd, cpu, level)) + if (sched_domain_debug_one(sd, cpu, level, groupmask)) break; level++; sd = sd->parent; if (!sd) break; } + kfree(groupmask); } #else # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6241,30 +6907,33 @@ __setup("isolcpus=", isolated_cpu_setup); * and ->cpu_power to 0. */ static void -init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, +init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, int (*group_fn)(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg)) + struct sched_group **sg, + cpumask_t *tmpmask), + cpumask_t *covered, cpumask_t *tmpmask) { struct sched_group *first = NULL, *last = NULL; - cpumask_t covered = CPU_MASK_NONE; int i; - for_each_cpu_mask(i, span) { + cpus_clear(*covered); + + for_each_cpu_mask(i, *span) { struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg); + int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, covered)) + if (cpu_isset(i, *covered)) continue; - sg->cpumask = CPU_MASK_NONE; + cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask(j, span) { - if (group_fn(j, cpu_map, NULL) != group) + for_each_cpu_mask(j, *span) { + if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, covered); + cpu_set(j, *covered); cpu_set(j, sg->cpumask); } if (!first) @@ -6290,7 +6959,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, * * Should use nodemask_t. */ -static int find_next_best_node(int node, unsigned long *used_nodes) +static int find_next_best_node(int node, nodemask_t *used_nodes) { int i, n, val, min_val, best_node = 0; @@ -6304,7 +6973,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes) continue; /* Skip already used nodes */ - if (test_bit(n, used_nodes)) + if (node_isset(n, *used_nodes)) continue; /* Simple min distance search */ @@ -6316,40 +6985,37 @@ static int find_next_best_node(int node, unsigned long *used_nodes) } } - set_bit(best_node, used_nodes); + node_set(best_node, *used_nodes); return best_node; } /** * sched_domain_node_span - get a cpumask for a node's sched_domain * @node: node whose cpumask we're constructing - * @size: number of nodes to include in this span + * @span: resulting cpumask * * Given a node, construct a good cpumask for its sched_domain to span. It * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ -static cpumask_t sched_domain_node_span(int node) +static void sched_domain_node_span(int node, cpumask_t *span) { - DECLARE_BITMAP(used_nodes, MAX_NUMNODES); - cpumask_t span, nodemask; + nodemask_t used_nodes; + node_to_cpumask_ptr(nodemask, node); int i; - cpus_clear(span); - bitmap_zero(used_nodes, MAX_NUMNODES); + cpus_clear(*span); + nodes_clear(used_nodes); - nodemask = node_to_cpumask(node); - cpus_or(span, span, nodemask); - set_bit(node, used_nodes); + cpus_or(*span, *span, *nodemask); + node_set(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, used_nodes); + int next_node = find_next_best_node(node, &used_nodes); - nodemask = node_to_cpumask(next_node); - cpus_or(span, span, nodemask); + node_to_cpumask_ptr_next(nodemask, next_node); + cpus_or(*span, *span, *nodemask); } - - return span; } #endif @@ -6363,7 +7029,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_cpus, cpu); @@ -6381,19 +7048,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); if (sg) *sg = &per_cpu(sched_group_core, group); return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_core, cpu); @@ -6405,17 +7075,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; #ifdef CONFIG_SCHED_MC - cpumask_t mask = cpu_coregroup_map(cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = cpu_coregroup_map(cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #else group = cpu; #endif @@ -6431,19 +7102,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) * gets dynamically allocated. */ static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; +static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg) + struct sched_group **sg, cpumask_t *nodemask) { - cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); int group; - cpus_and(nodemask, nodemask, *cpu_map); - group = first_cpu(nodemask); + *nodemask = node_to_cpumask(cpu_to_node(cpu)); + cpus_and(*nodemask, *nodemask, *cpu_map); + group = first_cpu(*nodemask); if (sg) *sg = &per_cpu(sched_group_allnodes, group); @@ -6479,7 +7150,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; @@ -6491,11 +7162,11 @@ static void free_sched_groups(const cpumask_t *cpu_map) continue; for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); struct sched_group *oldsg, *sg = sched_group_nodes[i]; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; if (sg == NULL) @@ -6513,7 +7184,7 @@ next_sg: } } #else -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { } #endif @@ -6571,13 +7242,106 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) } /* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#define SD_INIT(sd, type) sd_init_##type(sd) +#define SD_INIT_FUNC(type) \ +static noinline void sd_init_##type(struct sched_domain *sd) \ +{ \ + memset(sd, 0, sizeof(*sd)); \ + *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif + +/* + * To minimize stack usage kmalloc room for cpumasks and share the + * space as the usage in build_sched_domains() dictates. Used only + * if the amount of space is significant. + */ +struct allmasks { + cpumask_t tmpmask; /* make this one first */ + union { + cpumask_t nodemask; + cpumask_t this_sibling_map; + cpumask_t this_core_map; + }; + cpumask_t send_covered; + +#ifdef CONFIG_NUMA + cpumask_t domainspan; + cpumask_t covered; + cpumask_t notcovered; +#endif +}; + +#if NR_CPUS > 128 +#define SCHED_CPUMASK_ALLOC 1 +#define SCHED_CPUMASK_FREE(v) kfree(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#else +#define SCHED_CPUMASK_ALLOC 0 +#define SCHED_CPUMASK_FREE(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#endif + +#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ + ((unsigned long)(a) + offsetof(struct allmasks, v)) + +static int default_relax_domain_level = -1; + +static int __init setup_relax_domain_level(char *str) +{ + default_relax_domain_level = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + } +} + +/* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int build_sched_domains(const cpumask_t *cpu_map) +static int __build_sched_domains(const cpumask_t *cpu_map, + struct sched_domain_attr *attr) { int i; struct root_domain *rd; + SCHED_CPUMASK_DECLARE(allmasks); + cpumask_t *tmpmask; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6591,39 +7355,65 @@ static int build_sched_domains(const cpumask_t *cpu_map) printk(KERN_WARNING "Can not alloc sched group node list\n"); return -ENOMEM; } - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + return -ENOMEM; + } + +#if SCHED_CPUMASK_ALLOC + /* get space for all scratch cpumask variables */ + allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + if (!allmasks) { + printk(KERN_WARNING "Cannot alloc cpumask array\n"); + kfree(rd); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif return -ENOMEM; } +#endif + tmpmask = (cpumask_t *)allmasks; + + +#ifdef CONFIG_NUMA + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif /* * Set up domains for cpus specified by the cpu_map. */ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd = NULL, *p; - cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + SCHED_CPUMASK_VAR(nodemask, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); + *nodemask = node_to_cpumask(cpu_to_node(i)); + cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { sd = &per_cpu(allnodes_domains, i); - *sd = SD_ALLNODES_INIT; + SD_INIT(sd, ALLNODES); + set_domain_attribute(sd, attr); sd->span = *cpu_map; - cpu_to_allnodes_group(i, cpu_map, &sd->groups); + sd->first_cpu = first_cpu(sd->span); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; } else p = NULL; sd = &per_cpu(node_domains, i); - *sd = SD_NODE_INIT; - sd->span = sched_domain_node_span(cpu_to_node(i)); + SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); + sched_domain_node_span(cpu_to_node(i), &sd->span); + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -6632,94 +7422,120 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); - *sd = SD_CPU_INIT; - sd->span = nodemask; + SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); + sd->span = *nodemask; + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups); + cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); #ifdef CONFIG_SCHED_MC p = sd; sd = &per_cpu(core_domains, i); - *sd = SD_MC_INIT; + SD_INIT(sd, MC); + set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups); + cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - *sd = SD_SIBLING_INIT; + SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups); + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); #endif } #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(this_sibling_map, this_sibling_map, *cpu_map); - if (i != first_cpu(this_sibling_map)) + SCHED_CPUMASK_VAR(this_sibling_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_sibling_map = per_cpu(cpu_sibling_map, i); + cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); + if (i != first_cpu(*this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group); + &cpu_to_cpu_group, + send_covered, tmpmask); } #endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_core_map = cpu_coregroup_map(i); - cpus_and(this_core_map, this_core_map, *cpu_map); - if (i != first_cpu(this_core_map)) + SCHED_CPUMASK_VAR(this_core_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_core_map = cpu_coregroup_map(i); + cpus_and(*this_core_map, *this_core_map, *cpu_map); + if (i != first_cpu(*this_core_map)) continue; + init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group); + &cpu_to_core_group, + send_covered, tmpmask); } #endif /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; - init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); + init_sched_build_groups(nodemask, cpu_map, + &cpu_to_phys_group, + send_covered, tmpmask); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sd_allnodes) - init_sched_build_groups(*cpu_map, cpu_map, - &cpu_to_allnodes_group); + if (sd_allnodes) { + SCHED_CPUMASK_VAR(send_covered, allmasks); + + init_sched_build_groups(cpu_map, cpu_map, + &cpu_to_allnodes_group, + send_covered, tmpmask); + } for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - cpumask_t nodemask = node_to_cpumask(i); - cpumask_t domainspan; - cpumask_t covered = CPU_MASK_NONE; + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(domainspan, allmasks); + SCHED_CPUMASK_VAR(covered, allmasks); int j; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) { + *nodemask = node_to_cpumask(i); + cpus_clear(*covered); + + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) { sched_group_nodes[i] = NULL; continue; } - domainspan = sched_domain_node_span(i); - cpus_and(domainspan, domainspan, *cpu_map); + sched_domain_node_span(i, domainspan); + cpus_and(*domainspan, *domainspan, *cpu_map); sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); if (!sg) { @@ -6728,31 +7544,31 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask(j, nodemask) { + for_each_cpu_mask(j, *nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = nodemask; + sg->cpumask = *nodemask; sg->next = sg; - cpus_or(covered, covered, nodemask); + cpus_or(*covered, *covered, *nodemask); prev = sg; for (j = 0; j < MAX_NUMNODES; j++) { - cpumask_t tmp, notcovered; + SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % MAX_NUMNODES; + node_to_cpumask_ptr(pnodemask, n); - cpus_complement(notcovered, covered); - cpus_and(tmp, notcovered, *cpu_map); - cpus_and(tmp, tmp, domainspan); - if (cpus_empty(tmp)) + cpus_complement(*notcovered, *covered); + cpus_and(*tmpmask, *notcovered, *cpu_map); + cpus_and(*tmpmask, *tmpmask, *domainspan); + if (cpus_empty(*tmpmask)) break; - nodemask = node_to_cpumask(n); - cpus_and(tmp, tmp, nodemask); - if (cpus_empty(tmp)) + cpus_and(*tmpmask, *tmpmask, *pnodemask); + if (cpus_empty(*tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group), @@ -6763,9 +7579,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sg->__cpu_power = 0; - sg->cpumask = tmp; + sg->cpumask = *tmpmask; sg->next = prev->next; - cpus_or(covered, covered, tmp); + cpus_or(*covered, *covered, *tmpmask); prev->next = sg; prev = sg; } @@ -6801,7 +7617,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + tmpmask); init_numa_sched_groups_power(sg); } #endif @@ -6819,17 +7636,26 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpu_attach_domain(sd, rd, i); } + SCHED_CPUMASK_FREE((void *)allmasks); return 0; #ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); + SCHED_CPUMASK_FREE((void *)allmasks); return -ENOMEM; #endif } +static int build_sched_domains(const cpumask_t *cpu_map) +{ + return __build_sched_domains(cpu_map, NULL); +} + static cpumask_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; /* attribues of custom domains + in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of @@ -6838,6 +7664,10 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ */ static cpumask_t fallback_doms; +void __attribute__((weak)) arch_update_cpu_topology(void) +{ +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -6847,20 +7677,23 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) { int err; + arch_update_cpu_topology(); ndoms_cur = 1; doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) doms_cur = &fallback_doms; cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map) +static void arch_destroy_sched_domains(const cpumask_t *cpu_map, + cpumask_t *tmpmask) { - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); } /* @@ -6869,6 +7702,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map) */ static void detach_destroy_domains(const cpumask_t *cpu_map) { + cpumask_t tmpmask; int i; unregister_sched_domain_sysctl(); @@ -6876,7 +7710,23 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) for_each_cpu_mask(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map); + arch_destroy_sched_domains(cpu_map, &tmpmask); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); } /* @@ -6900,7 +7750,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) * * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + struct sched_domain_attr *dattr_new) { int i, j; @@ -6913,12 +7764,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + dattr_new = NULL; } /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < ndoms_new; j++) { - if (cpus_equal(doms_cur[i], doms_new[j])) + if (cpus_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ @@ -6930,11 +7783,13 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { - if (cpus_equal(doms_new[i], doms_cur[j])) + if (cpus_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - build_sched_domains(doms_new + i); + __build_sched_domains(doms_new + i, + dattr_new ? dattr_new + i : NULL); match2: ; } @@ -6942,7 +7797,9 @@ match2: /* Remember the new sched domains */ if (doms_cur != &fallback_doms) kfree(doms_cur); + kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; + dattr_cur = dattr_new; ndoms_cur = ndoms_new; register_sched_domain_sysctl(); @@ -6951,7 +7808,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static int arch_reinit_sched_domains(void) +int arch_reinit_sched_domains(void) { int err; @@ -7069,6 +7926,11 @@ void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; +#if defined(CONFIG_NUMA) + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), + GFP_KERNEL); + BUG_ON(sched_group_nodes_bycpu == NULL); +#endif get_online_cpus(); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); @@ -7079,24 +7941,9 @@ void __init sched_init_smp(void) hotcpu_notifier(update_sched_domains, 0); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) BUG(); sched_init_granularity(); - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (nr_cpu_ids == 1) - return; - - lb_monitor_task = kthread_create(load_balance_monitor, NULL, - "group_balance"); - if (!IS_ERR(lb_monitor_task)) { - lb_monitor_task->flags |= PF_NOFREEZE; - wake_up_process(lb_monitor_task); - } else { - printk(KERN_ERR "Could not create load balance monitor thread" - "(error = %ld) \n", PTR_ERR(lb_monitor_task)); - } -#endif } #else void __init sched_init_smp(void) @@ -7115,6 +7962,7 @@ int in_sched_functions(unsigned long addr) static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif @@ -7144,6 +7992,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + spin_lock_init(&rt_rq->rt_runtime_lock); #ifdef CONFIG_RT_GROUP_SCHED rt_rq->rt_nr_boosted = 0; @@ -7152,10 +8002,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, - struct cfs_rq *cfs_rq, struct sched_entity *se, - int cpu, int add) +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, int add, + struct sched_entity *parent) { + struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; @@ -7163,45 +8014,132 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; - se->cfs_rq = &rq->cfs; + /* se could be NULL for init_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + se->my_q = cfs_rq; se->load.weight = tg->shares; se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); - se->parent = NULL; + se->parent = parent; } #endif #ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, - struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, - int cpu, int add) +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *parent) { + struct rq *rq = cpu_rq(cpu); + tg->rt_rq[cpu] = rt_rq; init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_se = rt_se; + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; if (add) list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; + if (!rt_se) + return; + + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + rt_se->rt_rq = &rq->rt; rt_se->my_q = rt_rq; - rt_se->parent = NULL; + rt_se->parent = parent; INIT_LIST_HEAD(&rt_se->run_list); } #endif void __init sched_init(void) { - int highest_cpu = 0; int i, j; + unsigned long alloc_size = 0, ptr; + +#ifdef CONFIG_FAIR_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_USER_SCHED + alloc_size *= 2; +#endif + /* + * As sched_init() is called before page_alloc is setup, + * we use alloc_bootmem(). + */ + if (alloc_size) { + ptr = (unsigned long)alloc_bootmem(alloc_size); + +#ifdef CONFIG_FAIR_GROUP_SCHED + init_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif +#endif +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif +#endif + } #ifdef CONFIG_SMP + init_aggregate(); init_defrootdomain(); #endif + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + init_rt_bandwidth(&init_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#ifdef CONFIG_USER_SCHED + init_rt_bandwidth(&root_task_group.rt_bandwidth, + global_rt_period(), RUNTIME_INF); +#endif +#endif + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); + INIT_LIST_HEAD(&init_task_group.children); + +#ifdef CONFIG_USER_SCHED + INIT_LIST_HEAD(&root_task_group.children); + init_task_group.parent = &root_task_group; + list_add(&init_task_group.siblings, &root_task_group.children); +#endif #endif for_each_possible_cpu(i) { @@ -7212,26 +8150,68 @@ void __init sched_init(void) lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; rq->clock = 1; + update_last_tick_seen(rq); init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - init_tg_cfs_entry(rq, &init_task_group, +#ifdef CONFIG_CGROUP_SCHED + /* + * How much cpu bandwidth does init_task_group get? + * + * In case of task-groups formed thr' the cgroup filesystem, it + * gets 100% of the cpu resources in the system. This overall + * system cpu resource is divided among the tasks of + * init_task_group and its child task-groups in a fair manner, + * based on each entity's (task or task-group's) weight + * (se->load.weight). + * + * In other words, if init_task_group has 10 tasks of weight + * 1024) and two child groups A0 and A1 (of weight 1024 each), + * then A0's share of the cpu resource is: + * + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * + * We achieve this by letting init_task_group's tasks sit + * directly in rq->cfs (i.e init_task_group->se[] = NULL). + */ + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + root_task_group.shares = NICE_0_LOAD; + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); + /* + * In case of task-groups formed thr' the user id of tasks, + * init_task_group represents tasks belonging to root user. + * Hence it forms a sibling of all subsequent groups formed. + * In this case, init_task_group gets only a fraction of overall + * system cpu resource, based on the weight assigned to root + * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished + * by letting tasks of init_task_group sit in a separate cfs_rq + * (init_cfs_rq) and having one entity represent this group of + * tasks in rq->cfs (i.e init_task_group->se[] != NULL). + */ + init_tg_cfs_entry(&init_task_group, &per_cpu(init_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1); + &per_cpu(init_sched_entity, i), i, 1, + root_task_group.se[i]); #endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_runtime = - sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); - init_tg_rt_entry(rq, &init_task_group, +#ifdef CONFIG_CGROUP_SCHED + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); + init_tg_rt_entry(&init_task_group, &per_cpu(init_rt_rq, i), - &per_cpu(init_sched_rt_entity, i), i, 1); + &per_cpu(init_sched_rt_entity, i), i, 1, + root_task_group.rt_se[i]); +#endif #endif - rq->rt_period_expire = 0; - rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -7248,7 +8228,6 @@ void __init sched_init(void) #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); - highest_cpu = i; } set_load_weight(&init_task); @@ -7258,7 +8237,6 @@ void __init sched_init(void) #endif #ifdef CONFIG_SMP - nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); #endif @@ -7283,6 +8261,8 @@ void __init sched_init(void) * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; + + scheduler_running = 1; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -7415,159 +8395,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_GROUP_SCHED - -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP -/* - * distribute shares of all task groups among their schedulable entities, - * to reflect load distribution across cpus. - */ -static int rebalance_shares(struct sched_domain *sd, int this_cpu) -{ - struct cfs_rq *cfs_rq; - struct rq *rq = cpu_rq(this_cpu); - cpumask_t sdspan = sd->span; - int balanced = 1; - - /* Walk thr' all the task groups that we have */ - for_each_leaf_cfs_rq(rq, cfs_rq) { - int i; - unsigned long total_load = 0, total_shares; - struct task_group *tg = cfs_rq->tg; - - /* Gather total task load of this group across cpus */ - for_each_cpu_mask(i, sdspan) - total_load += tg->cfs_rq[i]->load.weight; - - /* Nothing to do if this group has no load */ - if (!total_load) - continue; - - /* - * tg->shares represents the number of cpu shares the task group - * is eligible to hold on a single cpu. On N cpus, it is - * eligible to hold (N * tg->shares) number of cpu shares. - */ - total_shares = tg->shares * cpus_weight(sdspan); - - /* - * redistribute total_shares across cpus as per the task load - * distribution. - */ - for_each_cpu_mask(i, sdspan) { - unsigned long local_load, local_shares; - - local_load = tg->cfs_rq[i]->load.weight; - local_shares = (local_load * total_shares) / total_load; - if (!local_shares) - local_shares = MIN_GROUP_SHARES; - if (local_shares == tg->se[i]->load.weight) - continue; - - spin_lock_irq(&cpu_rq(i)->lock); - set_se_shares(tg->se[i], local_shares); - spin_unlock_irq(&cpu_rq(i)->lock); - balanced = 0; - } - } - - return balanced; -} - -/* - * How frequently should we rebalance_shares() across cpus? - * - * The more frequently we rebalance shares, the more accurate is the fairness - * of cpu bandwidth distribution between task groups. However higher frequency - * also implies increased scheduling overhead. - * - * sysctl_sched_min_bal_int_shares represents the minimum interval between - * consecutive calls to rebalance_shares() in the same sched domain. - * - * sysctl_sched_max_bal_int_shares represents the maximum interval between - * consecutive calls to rebalance_shares() in the same sched domain. - * - * These settings allows for the appropriate trade-off between accuracy of - * fairness and the associated overhead. - * - */ - -/* default: 8ms, units: milliseconds */ -const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; - -/* default: 128ms, units: milliseconds */ -const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; - -/* kernel thread that runs rebalance_shares() periodically */ -static int load_balance_monitor(void *unused) -{ - unsigned int timeout = sysctl_sched_min_bal_int_shares; - struct sched_param schedparm; - int ret; - - /* - * We don't want this thread's execution to be limited by the shares - * assigned to default group (init_task_group). Hence make it run - * as a SCHED_RR RT task at the lowest priority. - */ - schedparm.sched_priority = 1; - ret = sched_setscheduler(current, SCHED_RR, &schedparm); - if (ret) - printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" - " monitor thread (error = %d) \n", ret); - - while (!kthread_should_stop()) { - int i, cpu, balanced = 1; - - /* Prevent cpus going down or coming up */ - get_online_cpus(); - /* lockout changes to doms_cur[] array */ - lock_doms_cur(); - /* - * Enter a rcu read-side critical section to safely walk rq->sd - * chain on various cpus and to walk task group list - * (rq->leaf_cfs_rq_list) in rebalance_shares(). - */ - rcu_read_lock(); - - for (i = 0; i < ndoms_cur; i++) { - cpumask_t cpumap = doms_cur[i]; - struct sched_domain *sd = NULL, *sd_prev = NULL; - - cpu = first_cpu(cpumap); - - /* Find the highest domain at which to balance shares */ - for_each_domain(cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - sd_prev = sd; - } - - sd = sd_prev; - /* sd == NULL? No load balance reqd in this domain */ - if (!sd) - continue; - - balanced &= rebalance_shares(sd, cpu); - } - - rcu_read_unlock(); - - unlock_doms_cur(); - put_online_cpus(); - - if (!balanced) - timeout = sysctl_sched_min_bal_int_shares; - else if (timeout < sysctl_sched_max_bal_int_shares) - timeout *= 2; - - msleep_interruptible(timeout); - } - - return 0; -} -#endif /* CONFIG_SMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static void free_fair_sched_group(struct task_group *tg) { @@ -7584,17 +8411,18 @@ static void free_fair_sched_group(struct task_group *tg) kfree(tg->se); } -static int alloc_fair_sched_group(struct task_group *tg) +static +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se; + struct sched_entity *se, *parent_se; struct rq *rq; int i; - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); if (!tg->cfs_rq) goto err; - tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); if (!tg->se) goto err; @@ -7613,7 +8441,8 @@ static int alloc_fair_sched_group(struct task_group *tg) if (!se) goto err; - init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + parent_se = parent ? parent->se[i] : NULL; + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); } return 1; @@ -7637,7 +8466,8 @@ static inline void free_fair_sched_group(struct task_group *tg) { } -static inline int alloc_fair_sched_group(struct task_group *tg) +static inline +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } @@ -7656,6 +8486,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; + destroy_rt_bandwidth(&tg->rt_bandwidth); + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -7667,21 +8499,23 @@ static void free_rt_sched_group(struct task_group *tg) kfree(tg->rt_se); } -static int alloc_rt_sched_group(struct task_group *tg) +static +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; + struct sched_rt_entity *rt_se, *parent_se; struct rq *rq; int i; - tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); if (!tg->rt_rq) goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); if (!tg->rt_se) goto err; - tg->rt_runtime = 0; + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); for_each_possible_cpu(i) { rq = cpu_rq(i); @@ -7696,7 +8530,8 @@ static int alloc_rt_sched_group(struct task_group *tg) if (!rt_se) goto err; - init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); + parent_se = parent ? parent->rt_se[i] : NULL; + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); } return 1; @@ -7720,7 +8555,8 @@ static inline void free_rt_sched_group(struct task_group *tg) { } -static inline int alloc_rt_sched_group(struct task_group *tg) +static inline +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } @@ -7734,6 +8570,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) } #endif +#ifdef CONFIG_GROUP_SCHED static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -7742,7 +8579,7 @@ static void free_sched_group(struct task_group *tg) } /* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(void) +struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; @@ -7752,10 +8589,10 @@ struct task_group *sched_create_group(void) if (!tg) return ERR_PTR(-ENOMEM); - if (!alloc_fair_sched_group(tg)) + if (!alloc_fair_sched_group(tg, parent)) goto err; - if (!alloc_rt_sched_group(tg)) + if (!alloc_rt_sched_group(tg, parent)) goto err; spin_lock_irqsave(&task_group_lock, flags); @@ -7764,6 +8601,12 @@ struct task_group *sched_create_group(void) register_rt_sched_group(tg, i); } list_add_rcu(&tg->list, &task_groups); + + WARN_ON(!parent); /* root should already exist */ + + tg->parent = parent; + list_add_rcu(&tg->siblings, &parent->children); + INIT_LIST_HEAD(&tg->children); spin_unlock_irqrestore(&task_group_lock, flags); return tg; @@ -7792,6 +8635,7 @@ void sched_destroy_group(struct task_group *tg) unregister_rt_sched_group(tg, i); } list_del_rcu(&tg->list); + list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); /* wait for possible concurrent references to cfs_rqs complete */ @@ -7816,47 +8660,53 @@ void sched_move_task(struct task_struct *tsk) running = task_current(rq, tsk); on_rq = tsk->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, tsk, 0); - if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); - } + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); set_task_rq(tsk, task_cpu(tsk)); - if (on_rq) { - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tsk->sched_class->moved_group) + tsk->sched_class->moved_group(tsk); +#endif + + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + if (on_rq) enqueue_task(rq, tsk, 0); - } task_rq_unlock(rq, &flags); } +#endif #ifdef CONFIG_FAIR_GROUP_SCHED -/* rq->lock to be locked by caller */ -static void set_se_shares(struct sched_entity *se, unsigned long shares) +static void __set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; int on_rq; - if (!shares) - shares = MIN_GROUP_SHARES; - on_rq = se->on_rq; - if (on_rq) { + if (on_rq) dequeue_entity(cfs_rq, se, 0); - dec_cpu_load(rq, se->load.weight); - } se->load.weight = shares; se->load.inv_weight = div64_64((1ULL<<32), shares); - if (on_rq) { + if (on_rq) enqueue_entity(cfs_rq, se, 0); - inc_cpu_load(rq, se->load.weight); - } +} + +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __set_se_shares(se, shares); + spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -7866,21 +8716,28 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) int i; unsigned long flags; + /* + * We can't change the weight of the root cgroup. + */ + if (!tg->se[0]) + return -EINVAL; + + /* + * A weight of 0 or 1 can cause arithmetics problems. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ + if (shares < MIN_SHARES) + shares = MIN_SHARES; + mutex_lock(&shares_mutex); if (tg->shares == shares) goto done; - if (shares < MIN_GROUP_SHARES) - shares = MIN_GROUP_SHARES; - - /* - * Prevent any load balance activity (rebalance_shares, - * load_balance_fair) from referring to this group first, - * by taking it off the rq->leaf_cfs_rq_list on each cpu. - */ spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); + list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); /* wait for any ongoing reference to this group to finish */ @@ -7892,9 +8749,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) */ tg->shares = shares; for_each_possible_cpu(i) { - spin_lock_irq(&cpu_rq(i)->lock); - set_se_shares(tg->se[i], shares); - spin_unlock_irq(&cpu_rq(i)->lock); + /* + * force a rebalance + */ + cfs_rq_set_shares(tg->cfs_rq[i], 0); + set_se_shares(tg->se[i], shares/nr_cpu_ids); } /* @@ -7904,6 +8763,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) register_fair_sched_group(tg, i); + list_add_rcu(&tg->siblings, &tg->parent->children); spin_unlock_irqrestore(&task_group_lock, flags); done: mutex_unlock(&shares_mutex); @@ -7927,69 +8787,211 @@ static unsigned long to_ratio(u64 period, u64 runtime) if (runtime == RUNTIME_INF) return 1ULL << 16; - runtime *= (1ULL << 16); - div64_64(runtime, period); - return runtime; + return div64_64(runtime << 16, period); } +#ifdef CONFIG_CGROUP_SCHED +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + struct task_group *tgi, *parent = tg->parent; + unsigned long total = 0; + + if (!parent) { + if (global_rt_period() < period) + return 0; + + return to_ratio(period, runtime) < + to_ratio(global_rt_period(), global_rt_runtime()); + } + + if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(tgi, &parent->children, siblings) { + if (tgi == tg) + continue; + + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); + } + rcu_read_unlock(); + + return total + to_ratio(period, runtime) < + to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), + parent->rt_bandwidth.rt_runtime); +} +#elif defined CONFIG_USER_SCHED static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { struct task_group *tgi; unsigned long total = 0; unsigned long global_ratio = - to_ratio(sysctl_sched_rt_period, - sysctl_sched_rt_runtime < 0 ? - RUNTIME_INF : sysctl_sched_rt_runtime); + to_ratio(global_rt_period(), global_rt_runtime()); rcu_read_lock(); list_for_each_entry_rcu(tgi, &task_groups, list) { if (tgi == tg) continue; - total += to_ratio(period, tgi->rt_runtime); + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); } rcu_read_unlock(); return total + to_ratio(period, runtime) < global_ratio; } +#endif -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - u64 rt_runtime, rt_period; - int err = 0; + struct task_struct *g, *p; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); + return 0; +} - rt_period = sysctl_sched_rt_period * NSEC_PER_USEC; - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us == -1) - rt_runtime = rt_period; +static int tg_set_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) +{ + int i, err = 0; mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { + err = -EBUSY; + goto unlock; + } if (!__rt_schedulable(tg, rt_period, rt_runtime)) { err = -EINVAL; goto unlock; } - if (rt_runtime_us == -1) - rt_runtime = RUNTIME_INF; - tg->rt_runtime = rt_runtime; + + spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return err; } +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; - if (tg->rt_runtime == RUNTIME_INF) + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) return -1; - rt_runtime_us = tg->rt_runtime; + rt_runtime_us = tg->rt_bandwidth.rt_runtime; do_div(rt_runtime_us, NSEC_PER_USEC); return rt_runtime_us; } + +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + int ret = 0; + + mutex_lock(&rt_constraints_mutex); + if (!__rt_schedulable(NULL, 1, 0)) + ret = -EINVAL; + mutex_unlock(&rt_constraints_mutex); + + return ret; +} +#else +static int sched_rt_global_constraints(void) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + + return 0; +} #endif -#endif /* CONFIG_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_constraints(); + if (ret) { + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } else { + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = + ns_to_ktime(global_rt_period()); + } + } + mutex_unlock(&mutex); + + return ret; +} #ifdef CONFIG_CGROUP_SCHED @@ -8003,7 +9005,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) static struct cgroup_subsys_state * cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct task_group *tg; + struct task_group *tg, *parent; if (!cgrp->parent) { /* This is early initialization for the top cgroup */ @@ -8011,11 +9013,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &init_task_group.css; } - /* we support only 1-level deep hierarchical scheduler atm */ - if (cgrp->parent->parent) - return ERR_PTR(-EINVAL); - - tg = sched_create_group(); + parent = cgroup_tg(cgrp->parent); + tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); @@ -8039,7 +9038,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, { #ifdef CONFIG_RT_GROUP_SCHED /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -8073,7 +9072,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) #endif #ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, +static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) @@ -8117,6 +9116,17 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_period_us) +{ + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_period(cgroup_tg(cgrp)); +} #endif static struct cftype cpu_files[] = { @@ -8133,6 +9143,11 @@ static struct cftype cpu_files[] = { .read = cpu_rt_runtime_read, .write = cpu_rt_runtime_write, }, + { + .name = "rt_period_us", + .read_uint = cpu_rt_period_read_uint, + .write_uint = cpu_rt_period_write_uint, + }, #endif }; @@ -8173,9 +9188,9 @@ struct cpuacct { struct cgroup_subsys cpuacct_subsys; /* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cont) +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), struct cpuacct, css); } @@ -8188,7 +9203,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cont) + struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -8206,18 +9221,18 @@ static struct cgroup_subsys_state *cpuacct_create( /* destroy an existing cpu accounting group */ static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cpuacct *ca = cgroup_ca(cont); + struct cpuacct *ca = cgroup_ca(cgrp); free_percpu(ca->cpuusage); kfree(ca); } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { - struct cpuacct *ca = cgroup_ca(cont); + struct cpuacct *ca = cgroup_ca(cgrp); u64 totalcpuusage = 0; int i; @@ -8236,16 +9251,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) return totalcpuusage; } +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_possible_cpu(i) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + + spin_lock_irq(&cpu_rq(i)->lock); + *cpuusage = 0; + spin_unlock_irq(&cpu_rq(i)->lock); + } +out: + return err; +} + static struct cftype files[] = { { .name = "usage", .read_uint = cpuusage_read, + .write_uint = cpuusage_write, }, }; -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) { - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); + return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); } /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4b5e24cf2f4..f3f4af4b8b0 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -67,14 +67,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(p->se.sum_sleep_runtime)); #else - SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif + +#ifdef CONFIG_CGROUP_SCHED + { + char path[64]; + + cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); + SEQ_printf(m, " %s", path); + } +#endif + SEQ_printf(m, "\n"); } static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) @@ -109,7 +119,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; - SEQ_printf(m, "\ncfs_rq\n"); +#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) + SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#else + char path[128] = ""; + struct cgroup *cgroup = NULL; + struct task_group *tg = cfs_rq->tg; + + if (tg) + cgroup = tg->css.cgroup; + + if (cgroup) + cgroup_path(cgroup, path, sizeof(path)); + + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -143,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); +#endif +#endif } static void print_cpu(struct seq_file *m, int cpu) @@ -214,7 +243,6 @@ static int sched_debug_show(struct seq_file *m, void *v) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); - PN(sysctl_sched_batch_wakeup_granularity); PN(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN @@ -288,6 +316,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); + PN(se.avg_overlap); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c091d6e159..89fa32b4edf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -62,16 +62,6 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1; unsigned int __read_mostly sysctl_sched_compat_yield; /* - * SCHED_BATCH wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - */ -unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; - -/* * SCHED_OTHER wake-up granularity. * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) * @@ -87,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; * CFS operations on generic schedulable entities: */ +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* cpu runqueue to which this cfs_rq is attached */ @@ -98,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) +/* Walk up scheduling entities hierarchy */ +#define for_each_sched_entity(se) \ + for (; se; se = se->parent) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on + * another cpu ('this_cpu') + */ +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + return cfs_rq->tg->cfs_rq[this_cpu]; +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + +/* Do the two (enqueued) entities belong to the same group ? */ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + if (se->cfs_rq == pse->cfs_rq) + return 1; + + return 0; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) @@ -107,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) #define entity_is_task(se) 1 -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#define for_each_sched_entity(se) \ + for (; se; se = NULL) -static inline struct task_struct *task_of(struct sched_entity *se) +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) { - return container_of(se, struct task_struct, se); + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} + +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + return &cpu_rq(this_cpu)->cfs; +} + +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) + +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + return 1; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; } +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -175,8 +254,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * Maintain a cache of leftmost tree entries (it is frequently * used): */ - if (leftmost) + if (leftmost) { cfs_rq->rb_leftmost = &se->run_node; + /* + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. + */ + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, se->vruntime); + } rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -184,8 +270,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); + if (cfs_rq->rb_leftmost == &se->run_node) { + struct rb_node *next_node; + struct sched_entity *next; + + next_node = rb_next(&se->run_node); + cfs_rq->rb_leftmost = next_node; + + if (next_node) { + next = rb_entry(next_node, + struct sched_entity, run_node); + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, + next->vruntime); + } + } + + if (cfs_rq->next == se) + cfs_rq->next = NULL; rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } @@ -202,17 +304,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *se = NULL; - struct rb_node *parent; + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); - while (*link) { - parent = *link; - se = rb_entry(parent, struct sched_entity, run_node); - link = &parent->rb_right; - } + if (!last) + return NULL; - return se; + return rb_entry(last, struct sched_entity, run_node); } /************************************************************** @@ -237,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, #endif /* + * delta *= w / rw + */ +static inline unsigned long +calc_delta_weight(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + se->load.weight, &cfs_rq_of(se)->load); + } + + return delta; +} + +/* + * delta *= rw / w + */ +static inline unsigned long +calc_delta_fair(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, &se->load); + } + + return delta; +} + +/* * The idea is to set a period in which each task runs once. * * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch @@ -265,38 +390,54 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running); - - slice *= se->load.weight; - do_div(slice, cfs_rq->load.weight); - - return slice; + return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); } /* - * We calculate the vruntime slice. + * We calculate the vruntime slice of a to be inserted task * - * vs = s/w = p/rw + * vs = s*rw/w = p */ -static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) +static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 vslice = __sched_period(nr_running); + unsigned long nr_running = cfs_rq->nr_running; - vslice *= NICE_0_LOAD; - do_div(vslice, rq_weight); + if (!se->on_rq) + nr_running++; - return vslice; + return __sched_period(nr_running); } -static u64 sched_vslice(struct cfs_rq *cfs_rq) +/* + * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in + * that it favours >=0 over <0. + * + * -20 | + * | + * 0 --------+------- + * .' + * 19 .' + * + */ +static unsigned long +calc_delta_asym(unsigned long delta, struct sched_entity *se) { - return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); -} + struct load_weight lw = { + .weight = NICE_0_LOAD, + .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) + }; -static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return __sched_vslice(cfs_rq->load.weight + se->load.weight, - cfs_rq->nr_running + 1); + for_each_sched_entity(se) { + struct load_weight *se_lw = &se->load; + + if (se->load.weight < NICE_0_LOAD) + se_lw = &lw; + + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, se_lw); + } + + return delta; } /* @@ -308,31 +449,13 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; - u64 vruntime; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = delta_exec; - if (unlikely(curr->load.weight != NICE_0_LOAD)) { - delta_exec_weighted = calc_delta_fair(delta_exec_weighted, - &curr->load); - } + delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; - - /* - * maintain cfs_rq->min_vruntime to be a monotonic increasing - * value tracking the leftmost vruntime in the tree. - */ - if (first_fair(cfs_rq)) { - vruntime = min_vruntime(curr->vruntime, - __pick_next_entity(cfs_rq)->vruntime); - } else - vruntime = curr->vruntime; - - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, vruntime); } static void update_curr(struct cfs_rq *cfs_rq) @@ -418,20 +541,43 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + inc_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; + list_add(&se->group_node, &cfs_rq->tasks); } static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + dec_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; + list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -498,16 +644,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime; - vruntime = cfs_rq->min_vruntime; - - if (sched_feat(TREE_AVG)) { - struct sched_entity *last = __pick_last_entity(cfs_rq); - if (last) { - vruntime += last->vruntime; - vruntime >>= 1; - } - } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) - vruntime += sched_vslice(cfs_rq)/2; + if (first_fair(cfs_rq)) { + vruntime = min_vruntime(cfs_rq->min_vruntime, + __pick_next_entity(cfs_rq)->vruntime); + } else + vruntime = cfs_rq->min_vruntime; /* * The 'current' period is already promised to the current tasks, @@ -520,8 +661,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) - vruntime -= sysctl_sched_latency; + if (sched_feat(NEW_FAIR_SLEEPERS)) { + if (sched_feat(NORMALIZED_SLEEPER)) + vruntime -= calc_delta_weight(sysctl_sched_latency, se); + else + vruntime -= sysctl_sched_latency; + } /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); @@ -550,6 +695,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) account_entity_enqueue(cfs_rq, se); } +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + +static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (!se->last_wakeup) + return; + + update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); + se->last_wakeup = 0; +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -560,6 +720,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { + update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -621,12 +782,28 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + +static struct sched_entity * +pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (!cfs_rq->next) + return se; + + if (wakeup_preempt_entity(cfs_rq->next, se) != 0) + return se; + + return cfs_rq->next; +} + static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = NULL; if (first_fair(cfs_rq)) { se = __pick_next_entity(cfs_rq); + se = pick_next(cfs_rq, se); set_next_entity(cfs_rq, se); } @@ -682,103 +859,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * CFS operations on tasks: */ -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* Walk up scheduling entities hierarchy */ -#define for_each_sched_entity(se) \ - for (; se; se = se->parent) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on - * another cpu ('this_cpu') - */ -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return cfs_rq->tg->cfs_rq[this_cpu]; -} - -/* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) - -/* Do the two (enqueued) entities belong to the same group ? */ -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - if (se->cfs_rq == pse->cfs_rq) - return 1; - - return 0; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return se->parent; -} - -#define GROUP_IMBALANCE_PCT 20 - -#else /* CONFIG_FAIR_GROUP_SCHED */ - -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return &cpu_rq(this_cpu)->cfs; -} - -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) - -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return NULL; -} - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - #ifdef CONFIG_SCHED_HRTICK static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -824,26 +904,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, - *topse = NULL; /* Highest schedulable entity */ - int incload = 1; + struct sched_entity *se = &p->se; for_each_sched_entity(se) { - topse = se; - if (se->on_rq) { - incload = 0; + if (se->on_rq) break; - } cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); wakeup = 1; } - /* Increment cpu load if we just enqueued the first task of a group on - * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs - * at the highest grouping level. - */ - if (incload) - inc_cpu_load(rq, topse->load.weight); hrtick_start_fair(rq, rq->curr); } @@ -856,28 +925,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, - *topse = NULL; /* Highest schedulable entity */ - int decload = 1; + struct sched_entity *se = &p->se; for_each_sched_entity(se) { - topse = se; cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, sleep); /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) { - if (parent_entity(se)) - decload = 0; + if (cfs_rq->load.weight) break; - } sleep = 1; } - /* Decrement cpu load if we just dequeued the last task of a group on - * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs - * at the highest grouping level. - */ - if (decload) - dec_cpu_load(rq, topse->load.weight); hrtick_start_fair(rq, rq->curr); } @@ -915,7 +972,7 @@ static void yield_task_fair(struct rq *rq) /* * Already in the rightmost position? */ - if (unlikely(rightmost->vruntime < se->vruntime)) + if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) return; /* @@ -954,7 +1011,9 @@ static int wake_idle(int cpu, struct task_struct *p) return cpu; for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_IDLE) { + if ((sd->flags & SD_WAKE_IDLE) + || ((sd->flags & SD_WAKE_IDLE_FAR) + && !task_hot(p, task_rq(p)->clock, sd))) { cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) { @@ -979,100 +1038,177 @@ static inline int wake_idle(int cpu, struct task_struct *p) #endif #ifdef CONFIG_SMP -static int select_task_rq_fair(struct task_struct *p, int sync) + +static const struct sched_class fair_sched_class; + +static int +wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, + struct task_struct *p, int prev_cpu, int this_cpu, int sync, + int idx, unsigned long load, unsigned long this_load, + unsigned int imbalance) { - int cpu, this_cpu; - struct rq *rq; - struct sched_domain *sd, *this_sd = NULL; - int new_cpu; + struct task_struct *curr = this_rq->curr; + unsigned long tl = this_load; + unsigned long tl_per_task; + + if (!(this_sd->flags & SD_WAKE_AFFINE)) + return 0; - cpu = task_cpu(p); - rq = task_rq(p); - this_cpu = smp_processor_id(); - new_cpu = cpu; + /* + * If the currently running task will sleep within + * a reasonable amount of time then attract this newly + * woken task: + */ + if (sync && curr->sched_class == &fair_sched_class) { + if (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + return 1; + } + + schedstat_inc(p, se.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || + 100*(tl + p->se.load.weight) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); + + return 1; + } + return 0; +} - if (cpu == this_cpu) - goto out_set_cpu; +static int select_task_rq_fair(struct task_struct *p, int sync) +{ + struct sched_domain *sd, *this_sd = NULL; + int prev_cpu, this_cpu, new_cpu; + unsigned long load, this_load; + struct rq *rq, *this_rq; + unsigned int imbalance; + int idx; + + prev_cpu = task_cpu(p); + rq = task_rq(p); + this_cpu = smp_processor_id(); + this_rq = cpu_rq(this_cpu); + new_cpu = prev_cpu; + /* + * 'this_sd' is the first domain that both + * this_cpu and prev_cpu are present in: + */ for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpu_isset(prev_cpu, sd->span)) { this_sd = sd; break; } } if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) - goto out_set_cpu; + goto out; /* * Check for affine wakeup and passive balancing possibilities. */ - if (this_sd) { - int idx = this_sd->wake_idx; - unsigned int imbalance; - unsigned long load, this_load; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(cpu, idx); - this_load = target_load(this_cpu, idx); - - new_cpu = this_cpu; /* Wake to this CPU if we can */ - - if (this_sd->flags & SD_WAKE_AFFINE) { - unsigned long tl = this_load; - unsigned long tl_per_task; - - /* - * Attract cache-cold tasks on sync wakeups: - */ - if (sync && !task_hot(p, rq->clock, this_sd)) - goto out_set_cpu; - - schedstat_inc(p, se.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - - if ((tl <= load && - tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(this_sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); - goto out_set_cpu; - } - } + if (!this_sd) + goto out; - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - goto out_set_cpu; - } + idx = this_sd->wake_idx; + + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); + + if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + load, this_load, imbalance)) + return this_cpu; + + if (prev_cpu == this_cpu) + goto out; + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); + return this_cpu; } } - new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ -out_set_cpu: +out: return wake_idle(new_cpu, p); } #endif /* CONFIG_SMP */ +static unsigned long wakeup_gran(struct sched_entity *se) +{ + unsigned long gran = sysctl_sched_wakeup_granularity; + + /* + * More easily preempt - nice tasks, while not making it harder for + * + nice tasks. + */ + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + + return gran; +} + +/* + * Should 'se' preempt 'curr'. + * + * |s1 + * |s2 + * |s3 + * g + * |<--->|c + * + * w(c, s1) = -1 + * w(c, s2) = 0 + * w(c, s3) = 1 + * + */ +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +{ + s64 gran, vdiff = curr->vruntime - se->vruntime; + + if (vdiff < 0) + return -1; + + gran = wakeup_gran(curr); + if (vdiff > gran) + return 1; + + return 0; +} + +/* return depth at which a sched entity is present in the hierarchy */ +static inline int depth_se(struct sched_entity *se) +{ + int depth = 0; + + for_each_sched_entity(se) + depth++; + + return depth; +} /* * Preempt the current task with a newly woken task if needed: @@ -1082,7 +1218,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - unsigned long gran; + int se_depth, pse_depth; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1090,6 +1226,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) resched_task(curr); return; } + + se->last_wakeup = se->sum_exec_runtime; + if (unlikely(se == pse)) + return; + + cfs_rq_of(pse)->next = pse; + /* * Batch tasks do not preempt (their preemption is driven by * the tick): @@ -1100,20 +1243,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; - while (!is_same_group(se, pse)) { + /* + * preemption test can be made between sibling entities who are in the + * same cfs_rq i.e who have a common parent. Walk up the hierarchy of + * both tasks until we find their ancestors who are siblings of common + * parent. + */ + + /* First walk up until both entities are at same depth */ + se_depth = depth_se(se); + pse_depth = depth_se(pse); + + while (se_depth > pse_depth) { + se_depth--; se = parent_entity(se); + } + + while (pse_depth > se_depth) { + pse_depth--; pse = parent_entity(pse); } - gran = sysctl_sched_wakeup_granularity; - /* - * More easily preempt - nice tasks, while not making - * it harder for + nice tasks. - */ - if (unlikely(se->load.weight > NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); + } - if (pse->vruntime + gran < se->vruntime) + if (wakeup_preempt_entity(se, pse) == 1) resched_task(curr); } @@ -1164,15 +1320,27 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) * the current task: */ static struct task_struct * -__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) +__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) { - struct task_struct *p; + struct task_struct *p = NULL; + struct sched_entity *se; - if (!curr) + if (next == &cfs_rq->tasks) return NULL; - p = rb_entry(curr, struct task_struct, se.run_node); - cfs_rq->rb_load_balance_curr = rb_next(curr); + /* Skip over entities that are not tasks */ + do { + se = list_entry(next, struct sched_entity, group_node); + next = next->next; + } while (next != &cfs_rq->tasks && !entity_is_task(se)); + + if (next == &cfs_rq->tasks) + return NULL; + + cfs_rq->balance_iterator = next; + + if (entity_is_task(se)) + p = task_of(se); return p; } @@ -1181,105 +1349,100 @@ static struct task_struct *load_balance_start_fair(void *arg) { struct cfs_rq *cfs_rq = arg; - return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); + return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); } static struct task_struct *load_balance_next_fair(void *arg) { struct cfs_rq *cfs_rq = arg; - return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); + return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, int *this_best_prio) +__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, + struct cfs_rq *cfs_rq) { - struct cfs_rq *busy_cfs_rq; - long rem_load_move = max_load_move; struct rq_iterator cfs_rq_iterator; - unsigned long load_moved; cfs_rq_iterator.start = load_balance_start_fair; cfs_rq_iterator.next = load_balance_next_fair; + cfs_rq_iterator.arg = cfs_rq; + + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &cfs_rq_iterator); +} - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { #ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; - unsigned long maxload, task_load, group_weight; - unsigned long thisload, per_task_load; - struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + long rem_load_move = max_load_move; + int busiest_cpu = cpu_of(busiest); + struct task_group *tg; - task_load = busy_cfs_rq->load.weight; - group_weight = se->load.weight; + rcu_read_lock(); + list_for_each_entry(tg, &task_groups, list) { + long imbalance; + unsigned long this_weight, busiest_weight; + long rem_load, max_load, moved_load; /* - * 'group_weight' is contributed by tasks of total weight - * 'task_load'. To move 'rem_load_move' worth of weight only, - * we need to move a maximum task load of: - * - * maxload = (remload / group_weight) * task_load; + * empty group */ - maxload = (rem_load_move * task_load) / group_weight; - - if (!maxload || !task_load) + if (!aggregate(tg, sd)->task_weight) continue; - per_task_load = task_load / busy_cfs_rq->nr_running; - /* - * balance_tasks will try to forcibly move atleast one task if - * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if - * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load. - */ - if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) - continue; + rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; + rem_load /= aggregate(tg, sd)->load + 1; - /* Disable priority-based load balance */ - *this_best_prio = 0; - thisload = this_cfs_rq->load.weight; -#else -# define maxload rem_load_move -#endif - /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - load_moved = balance_tasks(this_rq, this_cpu, busiest, - maxload, sd, idle, all_pinned, - this_best_prio, - &cfs_rq_iterator); + this_weight = tg->cfs_rq[this_cpu]->task_weight; + busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * load_moved holds the task load that was moved. The - * effective (group) weight moved would be: - * load_moved_eff = load_moved/task_load * group_weight; - */ - load_moved = (group_weight * load_moved) / task_load; - - /* Adjust shares on both cpus to reflect load_moved */ - group_weight -= load_moved; - set_se_shares(se, group_weight); - - se = busy_cfs_rq->tg->se[this_cpu]; - if (!thisload) - group_weight = load_moved; - else - group_weight = se->load.weight + load_moved; - set_se_shares(se, group_weight); -#endif + imbalance = (busiest_weight - this_weight) / 2; + + if (imbalance < 0) + imbalance = busiest_weight; + + max_load = max(rem_load, imbalance); + moved_load = __load_balance_fair(this_rq, this_cpu, busiest, + max_load, sd, idle, all_pinned, this_best_prio, + tg->cfs_rq[busiest_cpu]); + + if (!moved_load) + continue; + + move_group_shares(tg, sd, busiest_cpu, this_cpu); - rem_load_move -= load_moved; + moved_load *= aggregate(tg, sd)->load; + moved_load /= aggregate(tg, sd)->rq_weight + 1; - if (rem_load_move <= 0) + rem_load_move -= moved_load; + if (rem_load_move < 0) break; } + rcu_read_unlock(); return max_load_move - rem_load_move; } +#else +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return __load_balance_fair(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &busiest->cfs); +} +#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, @@ -1403,6 +1566,16 @@ static void set_curr_task_fair(struct rq *rq) set_next_entity(cfs_rq_of(se), se); } +#ifdef CONFIG_FAIR_GROUP_SCHED +static void moved_group_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + update_curr(cfs_rq); + place_entity(cfs_rq, &p->se, 1); +} +#endif + /* * All the scheduling class methods: */ @@ -1431,19 +1604,47 @@ static const struct sched_class fair_sched_class = { .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, + +#ifdef CONFIG_FAIR_GROUP_SCHED + .moved_group = moved_group_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG +static void +print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth) +{ + struct sched_entity *se; + + if (!cfs_rq) + return; + + list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) { + int i; + + for (i = depth; i; i--) + seq_puts(m, " "); + + seq_printf(m, "%lu %s %lu\n", + se->load.weight, + entity_is_task(se) ? "T" : "G", + calc_delta_weight(SCHED_LOAD_SCALE, se) + ); + if (!entity_is_task(se)) + print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1); + } +} + static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; -#ifdef CONFIG_FAIR_GROUP_SCHED - print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); -#endif rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); + + seq_printf(m, "\nWeight tree:\n"); + print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1); rcu_read_unlock(); } #endif diff --git a/kernel/sched_features.h b/kernel/sched_features.h new file mode 100644 index 00000000000..1c7283cb958 --- /dev/null +++ b/kernel/sched_features.h @@ -0,0 +1,10 @@ +SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) +SCHED_FEAT(WAKEUP_PREEMPT, 1) +SCHED_FEAT(START_DEBIT, 1) +SCHED_FEAT(AFFINE_WAKEUPS, 1) +SCHED_FEAT(CACHE_HOT_BUDDY, 1) +SCHED_FEAT(SYNC_WAKEUPS, 1) +SCHED_FEAT(HRTICK, 1) +SCHED_FEAT(DOUBLE_TICK, 0) +SCHED_FEAT(NORMALIZED_SLEEPER, 1) +SCHED_FEAT(DEADLINE, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f54792b175b..c2730a5a4f0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) if (!rt_rq->tg) return RUNTIME_INF; - return rt_rq->tg->rt_runtime; + return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -127,14 +132,39 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } +#ifdef CONFIG_SMP +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} +#else +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_online_map; +} +#endif + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; +} + +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &rt_rq->tg->rt_bandwidth; +} + #else static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - if (sysctl_sched_rt_runtime == -1) - return RUNTIME_INF; + return rt_rq->rt_runtime; +} - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(def_rt_bandwidth.rt_period); } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -173,6 +203,102 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) { return rt_rq->rt_throttled; } + +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_online_map; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &def_rt_bandwidth; +} + +#endif + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ + int i, idle = 1; + cpumask_t span; + + if (rt_b->rt_runtime == RUNTIME_INF) + return 1; + + span = sched_rt_period_mask(); + for_each_cpu_mask(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); + struct rq *rq = rq_of_rt_rq(rt_rq); + + spin_lock(&rq->lock); + if (rt_rq->rt_time) { + u64 runtime; + + spin_lock(&rt_rq->rt_runtime_lock); + runtime = rt_rq->rt_runtime; + rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { + rt_rq->rt_throttled = 0; + enqueue = 1; + } + if (rt_rq->rt_time || rt_rq->rt_nr_running) + idle = 0; + spin_unlock(&rt_rq->rt_runtime_lock); + } + + if (enqueue) + sched_rt_rq_enqueue(rt_rq); + spin_unlock(&rq->lock); + } + + return idle; +} + +#ifdef CONFIG_SMP +static int balance_runtime(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int i, weight, more = 0; + u64 rt_period; + + weight = cpus_weight(rd->span); + + spin_lock(&rt_b->rt_runtime_lock); + rt_period = ktime_to_ns(rt_b->rt_period); + for_each_cpu_mask(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + if (iter == rt_rq) + continue; + + spin_lock(&iter->rt_runtime_lock); + diff = iter->rt_runtime - iter->rt_time; + if (diff > 0) { + do_div(diff, weight); + if (rt_rq->rt_runtime + diff > rt_period) + diff = rt_period - rt_rq->rt_runtime; + iter->rt_runtime -= diff; + rt_rq->rt_runtime += diff; + more = 1; + if (rt_rq->rt_runtime == rt_period) { + spin_unlock(&iter->rt_runtime_lock); + break; + } + } + spin_unlock(&iter->rt_runtime_lock); + } + spin_unlock(&rt_b->rt_runtime_lock); + + return more; +} #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) @@ -197,12 +323,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); + if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) + return 0; + +#ifdef CONFIG_SMP if (rt_rq->rt_time > runtime) { - struct rq *rq = rq_of_rt_rq(rt_rq); + int more; - rq->rt_throttled = 1; - rt_rq->rt_throttled = 1; + spin_unlock(&rt_rq->rt_runtime_lock); + more = balance_runtime(rt_rq); + spin_lock(&rt_rq->rt_runtime_lock); + + if (more) + runtime = sched_rt_runtime(rt_rq); + } +#endif + if (rt_rq->rt_time > runtime) { + rt_rq->rt_throttled = 1; if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -212,29 +350,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -static void update_sched_rt_period(struct rq *rq) -{ - struct rt_rq *rt_rq; - u64 period; - - while (rq->clock > rq->rt_period_expire) { - period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; - rq->rt_period_expire += period; - - for_each_leaf_rt_rq(rt_rq, rq) { - u64 runtime = sched_rt_runtime(rt_rq); - - rt_rq->rt_time -= min(rt_rq->rt_time, runtime); - if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { - rt_rq->rt_throttled = 0; - sched_rt_rq_enqueue(rt_rq); - } - } - - rq->rt_throttled = 0; - } -} - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -259,9 +374,15 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + for_each_sched_rt_entity(rt_se) { + rt_rq = rt_rq_of_se(rt_se); + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); + } } static inline @@ -284,6 +405,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; + + if (rt_rq->tg) + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +#else + start_rt_bandwidth(&def_rt_bandwidth); #endif } @@ -353,27 +479,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) /* * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. - * - * XXX: O(1/2 h^2) because we can only walk up, not down the chain. - * doesn't matter much for now, as h=2 for GROUP_SCHED. */ static void dequeue_rt_stack(struct task_struct *p) { - struct sched_rt_entity *rt_se, *top_se; + struct sched_rt_entity *rt_se, *back = NULL; - /* - * dequeue all, top - down. - */ - do { - rt_se = &p->rt; - top_se = NULL; - for_each_sched_rt_entity(rt_se) { - if (on_rt_rq(rt_se)) - top_se = rt_se; - } - if (top_se) - dequeue_rt_entity(top_se); - } while (top_se); + rt_se = &p->rt; + for_each_sched_rt_entity(rt_se) { + rt_se->back = back; + back = rt_se; + } + + for (rt_se = back; rt_se; rt_se = rt_se->back) { + if (on_rt_rq(rt_se)) + dequeue_rt_entity(rt_se); + } } /* @@ -1005,7 +1125,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) +static void set_cpus_allowed_rt(struct task_struct *p, + const cpumask_t *new_mask) { int weight = cpus_weight(*new_mask); @@ -1111,9 +1232,11 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p, pull_rt_task(rq); /* * If there's a higher priority task waiting to run - * then reschedule. + * then reschedule. Note, the above pull_rt_task + * can release the rq lock and p could migrate. + * Only reschedule if p is still on the same runqueue. */ - if (p->prio > rq->rt.highest_prio) + if (p->prio > rq->rt.highest_prio && rq->curr == p) resched_task(p); #else /* For UP simply resched on drop of prio */ diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 5b32433e7ee..5bae2e0c3ff 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -9,6 +9,11 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; + int mask_len = NR_CPUS/32 * 9; + char *mask_str = kmalloc(mask_len, GFP_KERNEL); + + if (mask_str == NULL) + return -ENOMEM; seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); seq_printf(seq, "timestamp %lu\n", jiffies); @@ -36,9 +41,8 @@ static int show_schedstat(struct seq_file *seq, void *v) preempt_disable(); for_each_domain(cpu, sd) { enum cpu_idle_type itype; - char mask_str[NR_CPUS]; - cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + cpumask_scnprintf(mask_str, mask_len, sd->span); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { diff --git a/kernel/semaphore.c b/kernel/semaphore.c new file mode 100644 index 00000000000..5c2942e768c --- /dev/null +++ b/kernel/semaphore.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2008 Intel Corporation + * Author: Matthew Wilcox <willy@linux.intel.com> + * + * Distributed under the terms of the GNU GPL, version 2 + * + * This file implements counting semaphores. + * A counting semaphore may be acquired 'n' times before sleeping. + * See mutex.c for single-acquisition sleeping locks which enforce + * rules which allow code to be debugged more easily. + */ + +/* + * Some notes on the implementation: + * + * The spinlock controls access to the other members of the semaphore. + * down_trylock() and up() can be called from interrupt context, so we + * have to disable interrupts when taking the lock. It turns out various + * parts of the kernel expect to be able to use down() on a semaphore in + * interrupt context when they know it will succeed, so we have to use + * irqsave variants for down(), down_interruptible() and down_killable() + * too. + * + * The ->count variable represents how many more tasks can acquire this + * semaphore. If it's zero, there may be tasks waiting on the wait_list. + */ + +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/spinlock.h> + +static noinline void __down(struct semaphore *sem); +static noinline int __down_interruptible(struct semaphore *sem); +static noinline int __down_killable(struct semaphore *sem); +static noinline int __down_timeout(struct semaphore *sem, long jiffies); +static noinline void __up(struct semaphore *sem); + +/** + * down - acquire the semaphore + * @sem: the semaphore to be acquired + * + * Acquires the semaphore. If no more tasks are allowed to acquire the + * semaphore, calling this function will put the task to sleep until the + * semaphore is released. + * + * Use of this function is deprecated, please use down_interruptible() or + * down_killable() instead. + */ +void down(struct semaphore *sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + __down(sem); + spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(down); + +/** + * down_interruptible - acquire the semaphore unless interrupted + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a signal, this function will return -EINTR. + * If the semaphore is successfully acquired, this function returns 0. + */ +int down_interruptible(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_interruptible(sem); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_interruptible); + +/** + * down_killable - acquire the semaphore unless killed + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a fatal signal, this function will return + * -EINTR. If the semaphore is successfully acquired, this function returns + * 0. + */ +int down_killable(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_killable(sem); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_killable); + +/** + * down_trylock - try to acquire the semaphore, without waiting + * @sem: the semaphore to be acquired + * + * Try to acquire the semaphore atomically. Returns 0 if the mutex has + * been acquired successfully or 1 if it it cannot be acquired. + * + * NOTE: This return value is inverted from both spin_trylock and + * mutex_trylock! Be careful about this when converting code. + * + * Unlike mutex_trylock, this function can be used from interrupt context, + * and the semaphore can be released by any task or interrupt. + */ +int down_trylock(struct semaphore *sem) +{ + unsigned long flags; + int count; + + spin_lock_irqsave(&sem->lock, flags); + count = sem->count - 1; + if (likely(count >= 0)) + sem->count = count; + spin_unlock_irqrestore(&sem->lock, flags); + + return (count < 0); +} +EXPORT_SYMBOL(down_trylock); + +/** + * down_timeout - acquire the semaphore within a specified time + * @sem: the semaphore to be acquired + * @jiffies: how long to wait before failing + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the semaphore is not released within the specified number of jiffies, + * this function returns -ETIME. It returns 0 if the semaphore was acquired. + */ +int down_timeout(struct semaphore *sem, long jiffies) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_timeout(sem, jiffies); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_timeout); + +/** + * up - release the semaphore + * @sem: the semaphore to release + * + * Release the semaphore. Unlike mutexes, up() may be called from any + * context and even by tasks which have never called down(). + */ +void up(struct semaphore *sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(list_empty(&sem->wait_list))) + sem->count++; + else + __up(sem); + spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(up); + +/* Functions for the contended case */ + +struct semaphore_waiter { + struct list_head list; + struct task_struct *task; + int up; +}; + +/* + * Because this function is inlined, the 'state' parameter will be + * constant, and thus optimised away by the compiler. Likewise the + * 'timeout' parameter for the cases without timeouts. + */ +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + struct task_struct *task = current; + struct semaphore_waiter waiter; + + list_add_tail(&waiter.list, &sem->wait_list); + waiter.task = task; + waiter.up = 0; + + for (;;) { + if (state == TASK_INTERRUPTIBLE && signal_pending(task)) + goto interrupted; + if (state == TASK_KILLABLE && fatal_signal_pending(task)) + goto interrupted; + if (timeout <= 0) + goto timed_out; + __set_task_state(task, state); + spin_unlock_irq(&sem->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&sem->lock); + if (waiter.up) + return 0; + } + + timed_out: + list_del(&waiter.list); + return -ETIME; + + interrupted: + list_del(&waiter.list); + return -EINTR; +} + +static noinline void __sched __down(struct semaphore *sem) +{ + __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_interruptible(struct semaphore *sem) +{ + return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_killable(struct semaphore *sem) +{ + return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +{ + return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); +} + +static noinline void __sched __up(struct semaphore *sem) +{ + struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, + struct semaphore_waiter, list); + list_del(&waiter->list); + waiter->up = 1; + wake_up_process(waiter->task); +} diff --git a/kernel/signal.c b/kernel/signal.c index 84917fe507f..64ad0ed1599 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -220,7 +220,7 @@ void flush_signals(struct task_struct *t) unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); - clear_tsk_thread_flag(t,TIF_SIGPENDING); + clear_tsk_thread_flag(t, TIF_SIGPENDING); flush_sigqueue(&t->pending); flush_sigqueue(&t->signal->shared_pending); spin_unlock_irqrestore(&t->sighand->siglock, flags); @@ -424,7 +424,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) } if (signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && - info->si_sys_private){ + info->si_sys_private) { /* * Release the siglock to ensure proper locking order * of timer locks outside of siglocks. Note, we leave @@ -1623,7 +1623,6 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) /* Let the debugger run. */ __set_current_state(TASK_TRACED); spin_unlock_irq(¤t->sighand->siglock); - try_to_freeze(); read_lock(&tasklist_lock); if (!unlikely(killed) && may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); @@ -1641,6 +1640,13 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) } /* + * While in TASK_TRACED, we were considered "frozen enough". + * Now that we woke up, it's crucial if we're supposed to be + * frozen that we freeze now before running anything substantial. + */ + try_to_freeze(); + + /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with * any signal-sending on another CPU that wants to examine it. @@ -1751,15 +1757,60 @@ static int do_signal_stop(int signr) return 1; } +static int ptrace_signal(int signr, siginfo_t *info, + struct pt_regs *regs, void *cookie) +{ + if (!(current->ptrace & PT_PTRACED)) + return signr; + + ptrace_signal_deliver(regs, cookie); + + /* Let the debugger run. */ + ptrace_stop(signr, 0, info); + + /* We're back. Did the debugger cancel the sig? */ + signr = current->exit_code; + if (signr == 0) + return signr; + + current->exit_code = 0; + + /* Update the siginfo structure if the signal has + changed. If the debugger wanted something + specific in the siginfo structure then it should + have updated *info via PTRACE_SETSIGINFO. */ + if (signr != info->si_signo) { + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_pid_vnr(current->parent); + info->si_uid = current->parent->uid; + } + + /* If the (new) signal is now blocked, requeue it. */ + if (sigismember(¤t->blocked, signr)) { + specific_send_sig_info(signr, info, current); + signr = 0; + } + + return signr; +} + int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie) { sigset_t *mask = ¤t->blocked; int signr = 0; +relock: + /* + * We'll jump back here after any time we were stopped in TASK_STOPPED. + * While in TASK_STOPPED, we were considered "frozen enough". + * Now that we woke up, it's crucial if we're supposed to be + * frozen that we freeze now before running anything substantial. + */ try_to_freeze(); -relock: spin_lock_irq(¤t->sighand->siglock); for (;;) { struct k_sigaction *ka; @@ -1773,36 +1824,10 @@ relock: if (!signr) break; /* will return 0 */ - if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { - ptrace_signal_deliver(regs, cookie); - - /* Let the debugger run. */ - ptrace_stop(signr, 0, info); - - /* We're back. Did the debugger cancel the sig? */ - signr = current->exit_code; - if (signr == 0) - continue; - - current->exit_code = 0; - - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ - if (signr != info->si_signo) { - info->si_signo = signr; - info->si_errno = 0; - info->si_code = SI_USER; - info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->uid; - } - - /* If the (new) signal is now blocked, requeue it. */ - if (sigismember(¤t->blocked, signr)) { - specific_send_sig_info(signr, info, current); + if (signr != SIGKILL) { + signr = ptrace_signal(signr, info, regs, cookie); + if (!signr) continue; - } } ka = ¤t->sighand->action[signr-1]; diff --git a/kernel/softirq.c b/kernel/softirq.c index 5b3aea5f471..3c44956ee7e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -313,6 +313,7 @@ void irq_exit(void) /* Make sure that timer wheel updates are propagated */ if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) tick_nohz_stop_sched_tick(); + rcu_irq_exit(); #endif preempt_enable_no_resched(); } @@ -355,7 +356,8 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) /* Tasklets */ struct tasklet_head { - struct tasklet_struct *list; + struct tasklet_struct *head; + struct tasklet_struct **tail; }; /* Some compilers disobey section attribute on statics when not @@ -368,8 +370,9 @@ void __tasklet_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_vec).tail = t; + __get_cpu_var(tasklet_vec).tail = &(t->next); raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -381,8 +384,9 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_hi_vec).tail = t; + __get_cpu_var(tasklet_hi_vec).tail = &(t->next); raise_softirq_irqoff(HI_SOFTIRQ); local_irq_restore(flags); } @@ -394,8 +398,9 @@ static void tasklet_action(struct softirq_action *a) struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = NULL; + list = __get_cpu_var(tasklet_vec).head; + __get_cpu_var(tasklet_vec).head = NULL; + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; local_irq_enable(); while (list) { @@ -415,8 +420,9 @@ static void tasklet_action(struct softirq_action *a) } local_irq_disable(); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_vec).tail = t; + __get_cpu_var(tasklet_vec).tail = &(t->next); __raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_enable(); } @@ -427,8 +433,9 @@ static void tasklet_hi_action(struct softirq_action *a) struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = NULL; + list = __get_cpu_var(tasklet_hi_vec).head; + __get_cpu_var(tasklet_hi_vec).head = NULL; + __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; local_irq_enable(); while (list) { @@ -448,8 +455,9 @@ static void tasklet_hi_action(struct softirq_action *a) } local_irq_disable(); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_hi_vec).tail = t; + __get_cpu_var(tasklet_hi_vec).tail = &(t->next); __raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); } @@ -486,6 +494,15 @@ EXPORT_SYMBOL(tasklet_kill); void __init softirq_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + per_cpu(tasklet_vec, cpu).tail = + &per_cpu(tasklet_vec, cpu).head; + per_cpu(tasklet_hi_vec, cpu).tail = + &per_cpu(tasklet_hi_vec, cpu).head; + } + open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); } @@ -554,9 +571,12 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) return; /* CPU is dead, so no lock needed. */ - for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { + for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { if (*i == t) { *i = t->next; + /* If this was the tail element, move the tail ptr */ + if (*i == NULL) + per_cpu(tasklet_vec, cpu).tail = i; return; } } @@ -565,20 +585,20 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) static void takeover_tasklets(unsigned int cpu) { - struct tasklet_struct **i; - /* CPU is dead, so no lock needed. */ local_irq_disable(); /* Find end, append list for that CPU. */ - for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); - *i = per_cpu(tasklet_vec, cpu).list; - per_cpu(tasklet_vec, cpu).list = NULL; + *__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).head; + __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; + per_cpu(tasklet_vec, cpu).head = NULL; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; raise_softirq_irqoff(TASKLET_SOFTIRQ); - for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); - *i = per_cpu(tasklet_hi_vec, cpu).list; - per_cpu(tasklet_hi_vec, cpu).list = NULL; + *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; + __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; + per_cpu(tasklet_hi_vec, cpu).head = NULL; + per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 7c2da88db4e..01b6522fd92 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -216,26 +216,27 @@ static int watchdog(void *__bind_cpu) /* initialize timestamp */ touch_softlockup_watchdog(); + set_current_state(TASK_INTERRUPTIBLE); /* * Run briefly once per second to reset the softlockup timestamp. * If this gets delayed for more than 60 seconds then the * debug-printout triggers in softlockup_tick(). */ while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); touch_softlockup_watchdog(); schedule(); if (kthread_should_stop()) break; - if (this_cpu != check_cpu) - continue; - - if (sysctl_hung_task_timeout_secs) - check_hung_uninterruptible_tasks(this_cpu); + if (this_cpu == check_cpu) { + if (sysctl_hung_task_timeout_secs) + check_hung_uninterruptible_tasks(this_cpu); + } + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); return 0; } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6f4e0e13f70..0101aeef7ed 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -11,7 +11,6 @@ #include <linux/interrupt.h> #include <asm/atomic.h> -#include <asm/semaphore.h> #include <asm/uaccess.h> /* Since we effect priority and affinity (both of which are visible @@ -35,7 +34,7 @@ static int stopmachine(void *cpu) int irqs_disabled = 0; int prepared = 0; - set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); + set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ @@ -135,8 +134,7 @@ static void restart_machine(void) preempt_enable_no_resched(); } -struct stop_machine_data -{ +struct stop_machine_data { int (*fn)(void *); void *data; struct completion done; diff --git a/kernel/sys.c b/kernel/sys.c index a626116af5d..f2a45136695 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -67,6 +67,12 @@ #ifndef SET_ENDIAN # define SET_ENDIAN(a,b) (-EINVAL) #endif +#ifndef GET_TSC_CTL +# define GET_TSC_CTL(a) (-EINVAL) +#endif +#ifndef SET_TSC_CTL +# define SET_TSC_CTL(a) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -1626,10 +1632,9 @@ asmlinkage long sys_umask(int mask) asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { - long error; + long uninitialized_var(error); - error = security_task_prctl(option, arg2, arg3, arg4, arg5); - if (error) + if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) return error; switch (option) { @@ -1682,17 +1687,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = -EINVAL; break; - case PR_GET_KEEPCAPS: - if (current->keep_capabilities) - error = 1; - break; - case PR_SET_KEEPCAPS: - if (arg2 != 0 && arg2 != 1) { - error = -EINVAL; - break; - } - current->keep_capabilities = arg2; - break; case PR_SET_NAME: { struct task_struct *me = current; unsigned char ncomm[sizeof(me->comm)]; @@ -1726,18 +1720,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_SECCOMP: error = prctl_set_seccomp(arg2); break; - - case PR_CAPBSET_READ: - if (!cap_valid(arg2)) - return -EINVAL; - return !!cap_raised(current->cap_bset, arg2); - case PR_CAPBSET_DROP: -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES - return cap_prctl_drop(arg2); -#else - return -EINVAL; -#endif - + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); + break; + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8b7e9541179..fd3364827cc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -270,17 +270,6 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_batch_wakeup_granularity_ns", - .data = &sysctl_sched_batch_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), @@ -311,24 +300,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_min_bal_int_shares", - .data = &sysctl_sched_min_bal_int_shares, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_max_bal_int_shares", - .data = &sysctl_sched_max_bal_int_shares, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif #endif { .ctl_name = CTL_UNNUMBERED, @@ -336,7 +307,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_rt_period, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_rt_handler, }, { .ctl_name = CTL_UNNUMBERED, @@ -344,7 +315,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_rt_runtime, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_rt_handler, }, { .ctl_name = CTL_UNNUMBERED, diff --git a/kernel/time.c b/kernel/time.c index a5ec013b6c8..35d373a9878 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -379,6 +379,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) ts->tv_sec = sec; ts->tv_nsec = nsec; } +EXPORT_SYMBOL(set_normalized_timespec); /** * ns_to_timespec - Convert nanoseconds to timespec diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 548c436a776..73961f35fdc 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,9 +141,12 @@ static void clocksource_watchdog(unsigned long data) } if (!list_empty(&watchdog_list)) { - /* Cycle through CPUs to check if the CPUs stay synchronized to - * each other. */ + /* + * Cycle through CPUs to check if the CPUs stay + * synchronized to each other. + */ int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); + if (next_cpu >= NR_CPUS) next_cpu = first_cpu(cpu_online_map); watchdog_timer.expires += WATCHDOG_INTERVAL; @@ -169,7 +172,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (!started && watchdog) { watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, first_cpu(cpu_online_map)); + add_timer_on(&watchdog_timer, + first_cpu(cpu_online_map)); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -179,7 +183,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (watchdog) del_timer(&watchdog_timer); watchdog = cs; - init_timer_deferrable(&watchdog_timer); + init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; /* Reset watchdog cycles */ @@ -191,7 +195,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + first_cpu(cpu_online_map)); } } } @@ -228,6 +232,18 @@ void clocksource_resume(void) } /** + * clocksource_touch_watchdog - Update watchdog + * + * Update the watchdog after exception contexts such as kgdb so as not + * to incorrectly trip the watchdog. + * + */ +void clocksource_touch_watchdog(void) +{ + clocksource_resume_watchdog(); +} + +/** * clocksource_get_next - Returns the selected clocksource * */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c88b5910e7a..5fd9b946977 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ long time_freq; /* frequency offset (scaled ppm)*/ static long time_reftime; /* time at last adjustment (s) */ long time_adjust; +static long ntp_tick_adj; static void ntp_update_frequency(void) { u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; - second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT; second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); tick_length_base = second_length; @@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc) freq_adj = shift_right(freq_adj, time_constant * 2 + (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { + u64 utemp64; temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL); if (time_offset < 0) { - temp64 = -temp64; - do_div(temp64, mtemp); - freq_adj -= temp64; + utemp64 = -temp64; + do_div(utemp64, mtemp); + freq_adj -= utemp64; } else { - do_div(temp64, mtemp); - freq_adj += temp64; + utemp64 = temp64; + do_div(utemp64, mtemp); + freq_adj += utemp64; } } freq_adj += time_freq; @@ -400,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) notify_cmos_timer(); return(result); } + +static int __init ntp_tick_adj_setup(char *str) +{ + ntp_tick_adj = simple_strtol(str, NULL, 0); + return 1; +} + +__setup("ntp_tick_adj=", ntp_tick_adj_setup); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e1bd50cbbf5..57a1f02e5ec 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -14,7 +14,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> -#include <linux/irq.h> +#include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> @@ -262,7 +262,7 @@ out: void tick_broadcast_on_off(unsigned long reason, int *oncpu) { if (!cpu_isset(*oncpu, cpu_online_map)) - printk(KERN_ERR "tick-braodcast: ignoring broadcast for " + printk(KERN_ERR "tick-broadcast: ignoring broadcast for " "offline CPU #%d\n", *oncpu); else smp_call_function_single(*oncpu, tick_do_broadcast_on_off, diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 1bea399a9ef..4f3886562b8 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -14,12 +14,14 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> -#include <linux/irq.h> +#include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> #include <linux/tick.h> +#include <asm/irq_regs.h> + #include "tick-internal.h" /* diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0258d3115d5..450c04935b6 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -14,7 +14,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> -#include <linux/irq.h> +#include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa9bb73dbdb..b854a895591 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -158,9 +158,8 @@ void tick_nohz_stop_idle(int cpu) } } -static ktime_t tick_nohz_start_idle(int cpu) +static ktime_t tick_nohz_start_idle(struct tick_sched *ts) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, delta; now = ktime_get(); @@ -192,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) void tick_nohz_stop_sched_tick(void) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - unsigned long rt_jiffies; struct tick_sched *ts; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; @@ -201,8 +199,8 @@ void tick_nohz_stop_sched_tick(void) local_irq_save(flags); cpu = smp_processor_id(); - now = tick_nohz_start_idle(cpu); ts = &per_cpu(tick_cpu_sched, cpu); + now = tick_nohz_start_idle(ts); /* * If this cpu is offline and it is the one which updates @@ -222,7 +220,6 @@ void tick_nohz_stop_sched_tick(void) if (need_resched()) goto end; - cpu = smp_processor_id(); if (unlikely(local_softirq_pending())) { static int ratelimit; @@ -245,10 +242,6 @@ void tick_nohz_stop_sched_tick(void) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - rt_jiffies = rt_needs_cpu(cpu); - if (rt_jiffies && rt_jiffies < delta_jiffies) - delta_jiffies = rt_jiffies; - if (rcu_needs_cpu(cpu)) delta_jiffies = 1; /* @@ -282,6 +275,7 @@ void tick_nohz_stop_sched_tick(void) ts->idle_tick = ts->sched_timer.expires; ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; + rcu_enter_nohz(); } /* @@ -375,6 +369,8 @@ void tick_nohz_restart_sched_tick(void) return; } + rcu_exit_nohz(); + /* Update jiffies first */ select_nohz_load_balancer(0); now = ktime_get(); @@ -397,6 +393,7 @@ void tick_nohz_restart_sched_tick(void) sub_preempt_count(HARDIRQ_OFFSET); } + touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick */ @@ -637,7 +634,7 @@ void tick_cancel_sched_timer(int cpu) if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); - ts->tick_stopped = 0; + ts->nohz_mode = NOHZ_MODE_INACTIVE; } #endif /* HIGH_RES_TIMERS */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1af9fb050fe..2d6087c7cf9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -178,6 +178,7 @@ static void change_clocksource(void) if (clock == new) return; + new->cycle_last = 0; now = clocksource_read(new); nsec = __get_nsec_offset(); timespec_add_ns(&xtime, nsec); @@ -187,13 +188,16 @@ static void change_clocksource(void) clock->error = 0; clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, - (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); tick_clock_notify(); + /* + * We're holding xtime lock and waking up klogd would deadlock + * us on enqueue. So no printing! printk(KERN_INFO "Time: %s clocksource has been installed.\n", clock->name); + */ } #else static inline void change_clocksource(void) { } @@ -245,8 +249,7 @@ void __init timekeeping_init(void) ntp_clear(); clock = clocksource_get_next(); - clocksource_calculate_interval(clock, - (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); clock->cycle_last = clocksource_read(clock); xtime.tv_sec = sec; @@ -293,6 +296,7 @@ static int timekeeping_resume(struct sys_device *dev) timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ + clock->cycle_last = 0; clock->cycle_last = clocksource_read(clock); clock->error = 0; timekeeping_suspended = 0; diff --git a/kernel/timer.c b/kernel/timer.c index 99b00a25f88..f3d35d4ea42 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } - /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -1220,13 +1228,6 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) return 0; } -/* - * lockdep: we want to track each per-CPU base as a separate lock-class, - * but timer-bases are kmalloc()-ed, so we need to attach separate - * keys to them: - */ -static struct lock_class_key base_lock_keys[NR_CPUS]; - static int __cpuinit init_timers_cpu(int cpu) { int j; @@ -1269,7 +1270,6 @@ static int __cpuinit init_timers_cpu(int cpu) } spin_lock_init(&base->lock); - lockdep_set_class(&base->lock, base_lock_keys + cpu); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); @@ -1308,8 +1308,8 @@ static void __cpuinit migrate_timers(int cpu) new_base = get_cpu_var(tvec_bases); local_irq_disable(); - double_spin_lock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_lock(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1322,8 +1322,8 @@ static void __cpuinit migrate_timers(int cpu) migrate_timer_list(new_base, old_base->tv5.vec + i); } - double_spin_unlock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(tvec_bases); } diff --git a/kernel/uid16.c b/kernel/uid16.c index dd308ba4e03..3e41c1673e2 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -21,7 +21,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi { long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, filename, user, group); return ret; } @@ -29,7 +29,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g { long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, filename, user, group); return ret; } @@ -37,7 +37,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) { long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, fd, user, group); return ret; } @@ -45,7 +45,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) { long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(2, ret, rgid, egid); return ret; } @@ -53,7 +53,7 @@ asmlinkage long sys_setgid16(old_gid_t gid) { long ret = sys_setgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, gid); return ret; } @@ -61,7 +61,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) { long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(2, ret, ruid, euid); return ret; } @@ -69,7 +69,7 @@ asmlinkage long sys_setuid16(old_uid_t uid) { long ret = sys_setuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, uid); return ret; } @@ -78,7 +78,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, ruid, euid, suid); return ret; } @@ -98,7 +98,7 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, rgid, egid, sgid); return ret; } @@ -117,7 +117,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid) { long ret = sys_setfsuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, uid); return ret; } @@ -125,7 +125,7 @@ asmlinkage long sys_setfsgid16(old_gid_t gid) { long ret = sys_setfsgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, gid); return ret; } diff --git a/kernel/user.c b/kernel/user.c index 7132022a040..debce602bfd 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,7 +101,7 @@ static int sched_create_user(struct user_struct *up) { int rc = 0; - up->tg = sched_create_group(); + up->tg = sched_create_group(&root_task_group); if (IS_ERR(up->tg)) rc = -ENOMEM; @@ -193,6 +193,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, static struct kobj_attribute cpu_rt_runtime_attr = __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); + +static ssize_t cpu_rt_period_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + + return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); +} + +static ssize_t cpu_rt_period_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + unsigned long rt_period; + int rc; + + sscanf(buf, "%lu", &rt_period); + + rc = sched_group_set_rt_period(up->tg, rt_period); + + return (rc ? rc : size); +} + +static struct kobj_attribute cpu_rt_period_attr = + __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); #endif /* default attributes per uid directory */ @@ -202,6 +229,7 @@ static struct attribute *uids_attributes[] = { #endif #ifdef CONFIG_RT_GROUP_SCHED &cpu_rt_runtime_attr.attr, + &cpu_rt_period_attr.attr, #endif NULL }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ff06611655a..00ff4d08e37 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -219,6 +219,7 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + timer_stats_timer_set_start_info(&dwork->timer); if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); @@ -580,6 +581,7 @@ EXPORT_SYMBOL(schedule_delayed_work); int schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { + timer_stats_timer_set_start_info(&dwork->timer); return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); |