aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile | 2
-rw-r--r-- kernel/audit.c | 17
-rw-r--r-- kernel/audit_tree.c | 5
-rw-r--r-- kernel/auditfilter.c | 3
-rw-r--r-- kernel/capability.c | 132
-rw-r--r-- kernel/cgroup.c | 2
-rw-r--r-- kernel/cpuset.c | 70
-rw-r--r-- kernel/exit.c | 8
-rw-r--r-- kernel/fork.c | 131
-rw-r--r-- kernel/futex.c | 269
-rw-r--r-- kernel/hrtimer.c | 17
-rw-r--r-- kernel/irq/manage.c | 49
-rw-r--r-- kernel/irq/spurious.c | 4
-rw-r--r-- kernel/kgdb.c | 35
-rw-r--r-- kernel/kmod.c | 1
-rw-r--r-- kernel/kprobes.c | 17
-rw-r--r-- kernel/module.c | 83
-rw-r--r-- kernel/printk.c | 2
-rw-r--r-- kernel/rcuclassic.c | 16
-rw-r--r-- kernel/rcupreempt.c | 22
-rw-r--r-- kernel/sched.c | 870
-rw-r--r-- kernel/sched_clock.c | 246
-rw-r--r-- kernel/sched_debug.c | 12
-rw-r--r-- kernel/sched_fair.c | 288
-rw-r--r-- kernel/sched_idletask.c | 2
-rw-r--r-- kernel/sched_rt.c | 79
-rw-r--r-- kernel/sched_stats.h | 7
-rw-r--r-- kernel/signal.c | 51
-rw-r--r-- kernel/softlockup.c | 16
-rw-r--r-- kernel/stop_machine.c | 7
-rw-r--r-- kernel/sys.c | 6
-rw-r--r-- kernel/sysctl.c | 5
-rw-r--r-- kernel/time.c | 8
-rw-r--r-- kernel/time/clocksource.c | 4
-rw-r--r-- kernel/timeconst.pl | 120
-rw-r--r-- kernel/workqueue.c | 2
36 files changed, 1143 insertions, 1465 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 188c43223f5..1c9938addb9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
- notifier.o ksysfs.o pm_qos_params.o
+ notifier.o ksysfs.o pm_qos_params.o sched_clock.o
obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
diff --git a/kernel/audit.c b/kernel/audit.c
index b7d3709cc45..e092f1c0ce3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -572,16 +572,17 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
if (!skb)
- return;
+ goto out;
reply->pid = pid;
reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
- if (IS_ERR(tsk)) {
- kfree(reply);
- kfree_skb(skb);
- }
+ if (!IS_ERR(tsk))
+ return;
+ kfree_skb(skb);
+out:
+ kfree(reply);
}
/*
@@ -737,7 +738,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
- err = audit_filter_user(&NETLINK_CB(skb), msg_type);
+ err = audit_filter_user(&NETLINK_CB(skb));
if (err == 1) {
err = 0;
if (msg_type == AUDIT_USER_TTY) {
@@ -778,7 +779,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
/* fallthrough */
case AUDIT_LIST:
- err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+ err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
uid, seq, data, nlmsg_len(nlh),
loginuid, sessionid, sid);
break;
@@ -797,7 +798,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
/* fallthrough */
case AUDIT_LIST_RULES:
- err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+ err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
uid, seq, data, nlmsg_len(nlh),
loginuid, sessionid, sid);
break;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 9ef5e0aacc3..f7921a2ecf1 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -172,10 +172,9 @@ static void insert_hash(struct audit_chunk *chunk)
struct audit_chunk *audit_tree_lookup(const struct inode *inode)
{
struct list_head *list = chunk_hash(inode);
- struct list_head *pos;
+ struct audit_chunk *p;
- list_for_each_rcu(pos, list) {
- struct audit_chunk *p = container_of(pos, struct audit_chunk, hash);
+ list_for_each_entry_rcu(p, list, hash) {
if (p->watch.inode == inode) {
get_inotify_watch(&p->watch);
return p;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0e0bd27e651..98c50cc671b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1544,6 +1544,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
* @data: payload data
* @datasz: size of payload data
* @loginuid: loginuid of sender
+ * @sessionid: sessionid for netlink audit message
* @sid: SE Linux Security ID of sender
*/
int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
@@ -1720,7 +1721,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
return 1;
}
-int audit_filter_user(struct netlink_skb_parms *cb, int type)
+int audit_filter_user(struct netlink_skb_parms *cb)
{
enum audit_state state = AUDIT_DISABLED;
struct audit_entry *e;
diff --git a/kernel/capability.c b/kernel/capability.c
index 39e8193b41e..901e0fdc3ff 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -53,11 +53,95 @@ static void warn_legacy_capability_use(void)
}
/*
+ * Version 2 capabilities worked fine, but the linux/capability.h file
+ * that accompanied their introduction encouraged their use without
+ * the necessary user-space source code changes. As such, we have
+ * created a version 3 with equivalent functionality to version 2, but
+ * with a header change to protect legacy source code from using
+ * version 2 when it wanted to use version 1. If your system has code
+ * that trips the following warning, it is using version 2 specific
+ * capabilities and may be doing so insecurely.
+ *
+ * The remedy is to either upgrade your version of libcap (to 2.10+,
+ * if the application is linked against it), or recompile your
+ * application with modern kernel headers and this warning will go
+ * away.
+ */
+
+static void warn_deprecated_v2(void)
+{
+ static int warned;
+
+ if (!warned) {
+ char name[sizeof(current->comm)];
+
+ printk(KERN_INFO "warning: `%s' uses deprecated v2"
+ " capabilities in a way that may be insecure.\n",
+ get_task_comm(name, current));
+ warned = 1;
+ }
+}
+
+/*
+ * Version check. Return the number of u32s in each capability flag
+ * array, or a negative value on error.
+ */
+static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
+{
+ __u32 version;
+
+ if (get_user(version, &header->version))
+ return -EFAULT;
+
+ switch (version) {
+ case _LINUX_CAPABILITY_VERSION_1:
+ warn_legacy_capability_use();
+ *tocopy = _LINUX_CAPABILITY_U32S_1;
+ break;
+ case _LINUX_CAPABILITY_VERSION_2:
+ warn_deprecated_v2();
+ /*
+ * fall through - v3 is otherwise equivalent to v2.
+ */
+ case _LINUX_CAPABILITY_VERSION_3:
+ *tocopy = _LINUX_CAPABILITY_U32S_3;
+ break;
+ default:
+ if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
+ return -EFAULT;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
* For sys_getproccap() and sys_setproccap(), any of the three
* capability set pointers may be NULL -- indicating that that set is
* uninteresting and/or not to be changed.
*/
+/*
+ * Atomically modify the effective capabilities returning the original
+ * value. No permission check is performed here - it is assumed that the
+ * caller is permitted to set the desired effective capabilities.
+ */
+kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
+{
+ kernel_cap_t pE_old;
+
+ spin_lock(&task_capability_lock);
+
+ pE_old = current->cap_effective;
+ current->cap_effective = pE_new;
+
+ spin_unlock(&task_capability_lock);
+
+ return pE_old;
+}
+
+EXPORT_SYMBOL(cap_set_effective);
+
/**
* sys_capget - get the capabilities of a given process.
* @header: pointer to struct that contains capability version and
@@ -71,27 +155,13 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
{
int ret = 0;
pid_t pid;
- __u32 version;
struct task_struct *target;
unsigned tocopy;
kernel_cap_t pE, pI, pP;
- if (get_user(version, &header->version))
- return -EFAULT;
-
- switch (version) {
- case _LINUX_CAPABILITY_VERSION_1:
- warn_legacy_capability_use();
- tocopy = _LINUX_CAPABILITY_U32S_1;
- break;
- case _LINUX_CAPABILITY_VERSION_2:
- tocopy = _LINUX_CAPABILITY_U32S_2;
- break;
- default:
- if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
- return -EFAULT;
- return -EINVAL;
- }
+ ret = cap_validate_magic(header, &tocopy);
+ if (ret != 0)
+ return ret;
if (get_user(pid, &header->pid))
return -EFAULT;
@@ -118,7 +188,7 @@ out:
spin_unlock(&task_capability_lock);
if (!ret) {
- struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+ struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i;
for (i = 0; i < tocopy; i++) {
@@ -128,7 +198,7 @@ out:
}
/*
- * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S,
+ * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
* we silently drop the upper capabilities here. This
* has the effect of making older libcap
* implementations implicitly drop upper capability
@@ -240,30 +310,16 @@ static inline int cap_set_all(kernel_cap_t *effective,
*/
asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
{
- struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+ struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
unsigned i, tocopy;
kernel_cap_t inheritable, permitted, effective;
- __u32 version;
struct task_struct *target;
int ret;
pid_t pid;
- if (get_user(version, &header->version))
- return -EFAULT;
-
- switch (version) {
- case _LINUX_CAPABILITY_VERSION_1:
- warn_legacy_capability_use();
- tocopy = _LINUX_CAPABILITY_U32S_1;
- break;
- case _LINUX_CAPABILITY_VERSION_2:
- tocopy = _LINUX_CAPABILITY_U32S_2;
- break;
- default:
- if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
- return -EFAULT;
- return -EINVAL;
- }
+ ret = cap_validate_magic(header, &tocopy);
+ if (ret != 0)
+ return ret;
if (get_user(pid, &header->pid))
return -EFAULT;
@@ -281,7 +337,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
permitted.cap[i] = kdata[i].permitted;
inheritable.cap[i] = kdata[i].inheritable;
}
- while (i < _LINUX_CAPABILITY_U32S) {
+ while (i < _KERNEL_CAPABILITY_U32S) {
effective.cap[i] = 0;
permitted.cap[i] = 0;
inheritable.cap[i] = 0;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fbc6fc8949b..15ac0e1e4f4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2903,7 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
cg = tsk->cgroups;
parent = task_cgroup(tsk, subsys->subsys_id);
- snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
+ snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
/* Pin the hierarchy */
atomic_inc(&parent->root->sb->s_active);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8da627d3380..9fceb97e989 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -797,8 +797,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
retval = cpulist_parse(buf, trialcs.cpus_allowed);
if (retval < 0)
return retval;
+
+ if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
+ return -EINVAL;
}
- cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
retval = validate_change(cs, &trialcs);
if (retval < 0)
return retval;
@@ -932,9 +934,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
retval = nodelist_parse(buf, trialcs.mems_allowed);
if (retval < 0)
goto done;
+
+ if (!nodes_subset(trialcs.mems_allowed,
+ node_states[N_HIGH_MEMORY]))
+ return -EINVAL;
}
- nodes_and(trialcs.mems_allowed, trialcs.mems_allowed,
- node_states[N_HIGH_MEMORY]);
oldmem = cs->mems_allowed;
if (nodes_equal(oldmem, trialcs.mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
@@ -1031,12 +1035,10 @@ int current_cpuset_is_being_rebound(void)
return task_cs(current) == cpuset_being_rebound;
}
-static int update_relax_domain_level(struct cpuset *cs, char *buf)
+static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
- int val = simple_strtol(buf, NULL, 10);
-
- if (val < 0)
- val = -1;
+ if (val < -1 || val >= SD_LV_MAX)
+ return -EINVAL;
if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
@@ -1280,9 +1282,6 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
case FILE_MEMLIST:
retval = update_nodemask(cs, buffer);
break;
- case FILE_SCHED_RELAX_DOMAIN_LEVEL:
- retval = update_relax_domain_level(cs, buffer);
- break;
default:
retval = -EINVAL;
goto out2;
@@ -1348,6 +1347,30 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
return retval;
}
+static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
+{
+ int retval = 0;
+ struct cpuset *cs = cgroup_cs(cgrp);
+ cpuset_filetype_t type = cft->private;
+
+ cgroup_lock();
+
+ if (cgroup_is_removed(cgrp)) {
+ cgroup_unlock();
+ return -ENODEV;
+ }
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ retval = update_relax_domain_level(cs, val);
+ break;
+ default:
+ retval = -EINVAL;
+ break;
+ }
+ cgroup_unlock();
+ return retval;
+}
+
/*
* These ascii lists should be read in a single call, by using a user
* buffer large enough to hold the entire map. If read in smaller
@@ -1406,9 +1429,6 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
case FILE_MEMLIST:
s += cpuset_sprintf_memlist(s, cs);
break;
- case FILE_SCHED_RELAX_DOMAIN_LEVEL:
- s += sprintf(s, "%d", cs->relax_domain_level);
- break;
default:
retval = -EINVAL;
goto out;
@@ -1449,6 +1469,18 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
}
}
+static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+ cpuset_filetype_t type = cft->private;
+ switch (type) {
+ case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ return cs->relax_domain_level;
+ default:
+ BUG();
+ }
+}
+
/*
* for the common functions, 'private' gives the type of file
@@ -1499,8 +1531,8 @@ static struct cftype files[] = {
{
.name = "sched_relax_domain_level",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
+ .read_s64 = cpuset_read_s64,
+ .write_s64 = cpuset_write_s64,
.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
},
@@ -1858,6 +1890,12 @@ static void common_cpu_mem_hotplug_unplug(void)
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
scan_for_empty_cpusets(&top_cpuset);
+ /*
+ * Scheduler destroys domains on hotplug events.
+ * Rebuild them based on the current settings.
+ */
+ rebuild_sched_domains();
+
cgroup_unlock();
}
diff --git a/kernel/exit.c b/kernel/exit.c
index d3ad54677f9..8f6185e69b6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -19,6 +19,7 @@
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
+#include <linux/fdtable.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
@@ -125,6 +126,12 @@ static void __exit_signal(struct task_struct *tsk)
__unhash_process(tsk);
+ /*
+ * Do this under ->siglock, we can race with another thread
+ * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
+ */
+ flush_sigqueue(&tsk->pending);
+
tsk->signal = NULL;
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
@@ -132,7 +139,6 @@ static void __exit_signal(struct task_struct *tsk)
__cleanup_sighand(sighand);
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
- flush_sigqueue(&tsk->pending);
if (sig) {
flush_sigqueue(&sig->shared_pending);
taskstats_tgid_free(sig);
diff --git a/kernel/fork.c b/kernel/fork.c
index 2bb675af4de..19908b26cf8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -22,6 +22,7 @@
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
+#include <linux/fdtable.h>
#include <linux/key.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
@@ -659,136 +660,6 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int count_open_files(struct fdtable *fdt)
-{
- int size = fdt->max_fds;
- int i;
-
- /* Find the last open fd */
- for (i = size/(8*sizeof(long)); i > 0; ) {
- if (fdt->open_fds->fds_bits[--i])
- break;
- }
- i = (i+1) * 8 * sizeof(long);
- return i;
-}
-
-static struct files_struct *alloc_files(void)
-{
- struct files_struct *newf;
- struct fdtable *fdt;
-
- newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
- if (!newf)
- goto out;
-
- atomic_set(&newf->count, 1);
-
- spin_lock_init(&newf->file_lock);
- newf->next_fd = 0;
- fdt = &newf->fdtab;
- fdt->max_fds = NR_OPEN_DEFAULT;
- fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
- fdt->open_fds = (fd_set *)&newf->open_fds_init;
- fdt->fd = &newf->fd_array[0];
- INIT_RCU_HEAD(&fdt->rcu);
- fdt->next = NULL;
- rcu_assign_pointer(newf->fdt, fdt);
-out:
- return newf;
-}
-
-/*
- * Allocate a new files structure and copy contents from the
- * passed in files structure.
- * errorp will be valid only when the returned files_struct is NULL.
- */
-static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
-{
- struct files_struct *newf;
- struct file **old_fds, **new_fds;
- int open_files, size, i;
- struct fdtable *old_fdt, *new_fdt;
-
- *errorp = -ENOMEM;
- newf = alloc_files();
- if (!newf)
- goto out;
-
- spin_lock(&oldf->file_lock);
- old_fdt = files_fdtable(oldf);
- new_fdt = files_fdtable(newf);
- open_files = count_open_files(old_fdt);
-
- /*
- * Check whether we need to allocate a larger fd array and fd set.
- * Note: we're not a clone task, so the open count won't change.
- */
- if (open_files > new_fdt->max_fds) {
- new_fdt->max_fds = 0;
- spin_unlock(&oldf->file_lock);
- spin_lock(&newf->file_lock);
- *errorp = expand_files(newf, open_files-1);
- spin_unlock(&newf->file_lock);
- if (*errorp < 0)
- goto out_release;
- new_fdt = files_fdtable(newf);
- /*
- * Reacquire the oldf lock and a pointer to its fd table
- * who knows it may have a new bigger fd table. We need
- * the latest pointer.
- */
- spin_lock(&oldf->file_lock);
- old_fdt = files_fdtable(oldf);
- }
-
- old_fds = old_fdt->fd;
- new_fds = new_fdt->fd;
-
- memcpy(new_fdt->open_fds->fds_bits,
- old_fdt->open_fds->fds_bits, open_files/8);
- memcpy(new_fdt->close_on_exec->fds_bits,
- old_fdt->close_on_exec->fds_bits, open_files/8);
-
- for (i = open_files; i != 0; i--) {
- struct file *f = *old_fds++;
- if (f) {
- get_file(f);
- } else {
- /*
- * The fd may be claimed in the fd bitmap but not yet
- * instantiated in the files array if a sibling thread
- * is partway through open(). So make sure that this
- * fd is available to the new process.
- */
- FD_CLR(open_files - i, new_fdt->open_fds);
- }
- rcu_assign_pointer(*new_fds++, f);
- }
- spin_unlock(&oldf->file_lock);
-
- /* compute the remainder to be cleared */
- size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
-
- /* This is long word aligned thus could use a optimized version */
- memset(new_fds, 0, size);
-
- if (new_fdt->max_fds > open_files) {
- int left = (new_fdt->max_fds-open_files)/8;
- int start = open_files / (8 * sizeof(unsigned long));
-
- memset(&new_fdt->open_fds->fds_bits[start], 0, left);
- memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
- }
-
- return newf;
-
-out_release:
- kmem_cache_free(files_cachep, newf);
-out:
- return NULL;
-}
-
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
struct files_struct *oldf, *newf;
diff --git a/kernel/futex.c b/kernel/futex.c
index 98092c9817f..7d1136e97c1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -104,10 +104,6 @@ struct futex_q {
/* Key which the futex is hashed on: */
union futex_key key;
- /* For fd, sigio sent using these: */
- int fd;
- struct file *filp;
-
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
struct task_struct *task;
@@ -126,9 +122,6 @@ struct futex_hash_bucket {
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
-/* Futex-fs vfsmount entry: */
-static struct vfsmount *futex_mnt;
-
/*
* Take mm->mmap_sem, when futex is shared
*/
@@ -610,8 +603,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
static void wake_futex(struct futex_q *q)
{
plist_del(&q->list, &q->list.plist);
- if (q->filp)
- send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
/*
* The lock in wake_up_all() is a crucial memory barrier after the
* plist_del() and also before assigning to q->lock_ptr.
@@ -988,14 +979,10 @@ out:
}
/* The key must be already stored in q->key. */
-static inline struct futex_hash_bucket *
-queue_lock(struct futex_q *q, int fd, struct file *filp)
+static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
{
struct futex_hash_bucket *hb;
- q->fd = fd;
- q->filp = filp;
-
init_waitqueue_head(&q->waiters);
get_futex_key_refs(&q->key);
@@ -1006,7 +993,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
return hb;
}
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
@@ -1041,15 +1028,6 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
* exactly once. They are called with the hashed spinlock held.
*/
-/* The key must be already stored in q->key. */
-static void queue_me(struct futex_q *q, int fd, struct file *filp)
-{
- struct futex_hash_bucket *hb;
-
- hb = queue_lock(q, fd, filp);
- __queue_me(q, hb);
-}
-
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static int unqueue_me(struct futex_q *q)
{
@@ -1118,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q)
* private futexes.
*/
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner)
+ struct task_struct *newowner,
+ struct rw_semaphore *fshared)
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
+ struct task_struct *oldowner = pi_state->owner;
u32 uval, curval, newval;
- int ret;
+ int ret, attempt = 0;
/* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
+ /*
+ * We are here either because we stole the rtmutex from the
+ * pending owner or we are the pending owner which failed to
+ * get the rtmutex. We have to replace the pending owner TID
+ * in the user space variable. This must be atomic as we have
+ * to preserve the owner died bit here.
+ *
+ * Note: We write the user space value _before_ changing the
+ * pi_state because we can fault here. Imagine swapped out
+ * pages or a fork, which was running right before we acquired
+ * mmap_sem, that marked all the anonymous memory readonly for
+ * cow.
+ *
+ * Modifying pi_state _before_ the user space value would
+ * leave the pi_state in an inconsistent state when we fault
+ * here, because we need to drop the hash bucket lock to
+ * handle the fault. This might be observed in the PID check
+ * in lookup_pi_state.
+ */
+retry:
+ if (get_futex_value_locked(&uval, uaddr))
+ goto handle_fault;
+
+ while (1) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+ if (curval == -EFAULT)
+ goto handle_fault;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+
+ /*
+ * We fixed up user space. Now we need to fix the pi_state
+ * itself.
+ */
if (pi_state->owner != NULL) {
spin_lock_irq(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
spin_unlock_irq(&pi_state->owner->pi_lock);
- } else
- newtid |= FUTEX_OWNER_DIED;
+ }
pi_state->owner = newowner;
@@ -1140,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list);
spin_unlock_irq(&newowner->pi_lock);
+ return 0;
/*
- * We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
- * owner died bit here.
+ * To handle the page fault we need to drop the hash bucket
+ * lock here. That gives the other task (either the pending
+ * owner itself or the task which stole the rtmutex) the
+ * chance to try the fixup of the pi_state. So once we are
+ * back from handling the fault we need to check the pi_state
+ * after reacquiring the hash bucket lock and before trying to
+ * do another fixup. When the fixup has been done already we
+ * simply return.
*/
- ret = get_futex_value_locked(&uval, uaddr);
+handle_fault:
+ spin_unlock(q->lock_ptr);
- while (!ret) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+ spin_lock(q->lock_ptr);
- if (curval == -EFAULT)
- ret = -EFAULT;
- if (curval == uval)
- break;
- uval = curval;
- }
- return ret;
+ /*
+ * Check if someone else fixed it for us:
+ */
+ if (pi_state->owner != oldowner)
+ return 0;
+
+ if (ret)
+ return ret;
+
+ goto retry;
}
/*
@@ -1194,7 +1224,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
if (unlikely(ret != 0))
goto out_release_sem;
- hb = queue_lock(&q, -1, NULL);
+ hb = queue_lock(&q);
/*
* Access the page AFTER the futex is queued.
@@ -1238,7 +1268,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
goto out_unlock_release_sem;
/* Only actually queue if *uaddr contained val. */
- __queue_me(&q, hb);
+ queue_me(&q, hb);
/*
* Now the futex is queued and we have checked the data, we
@@ -1386,7 +1416,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
goto out_release_sem;
retry_unlocked:
- hb = queue_lock(&q, -1, NULL);
+ hb = queue_lock(&q);
retry_locked:
ret = lock_taken = 0;
@@ -1499,7 +1529,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
/*
* Only actually queue now that the atomic ops are done:
*/
- __queue_me(&q, hb);
+ queue_me(&q, hb);
/*
* Now the futex is queued and we have checked the data, we
@@ -1529,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
* that case:
*/
if (q.pi_state->owner != curr)
- ret = fixup_pi_state_owner(uaddr, &q, curr);
+ ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
} else {
/*
* Catch the rare case, where the lock was released
@@ -1561,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
int res;
owner = rt_mutex_owner(&q.pi_state->pi_mutex);
- res = fixup_pi_state_owner(uaddr, &q, owner);
+ res = fixup_pi_state_owner(uaddr, &q, owner,
+ fshared);
/* propagate -EFAULT, if the fixup failed */
if (res)
@@ -1746,121 +1777,6 @@ pi_faulted:
return ret;
}
-static int futex_close(struct inode *inode, struct file *filp)
-{
- struct futex_q *q = filp->private_data;
-
- unqueue_me(q);
- kfree(q);
-
- return 0;
-}
-
-/* This is one-shot: once it's gone off you need a new fd */
-static unsigned int futex_poll(struct file *filp,
- struct poll_table_struct *wait)
-{
- struct futex_q *q = filp->private_data;
- int ret = 0;
-
- poll_wait(filp, &q->waiters, wait);
-
- /*
- * plist_node_empty() is safe here without any lock.
- * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
- */
- if (plist_node_empty(&q->list))
- ret = POLLIN | POLLRDNORM;
-
- return ret;
-}
-
-static const struct file_operations futex_fops = {
- .release = futex_close,
- .poll = futex_poll,
-};
-
-/*
- * Signal allows caller to avoid the race which would occur if they
- * set the sigio stuff up afterwards.
- */
-static int futex_fd(u32 __user *uaddr, int signal)
-{
- struct futex_q *q;
- struct file *filp;
- int ret, err;
- struct rw_semaphore *fshared;
- static unsigned long printk_interval;
-
- if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
- printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
- "will be removed from the kernel in June 2007\n",
- current->comm);
- }
-
- ret = -EINVAL;
- if (!valid_signal(signal))
- goto out;
-
- ret = get_unused_fd();
- if (ret < 0)
- goto out;
- filp = get_empty_filp();
- if (!filp) {
- put_unused_fd(ret);
- ret = -ENFILE;
- goto out;
- }
- filp->f_op = &futex_fops;
- filp->f_path.mnt = mntget(futex_mnt);
- filp->f_path.dentry = dget(futex_mnt->mnt_root);
- filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
-
- if (signal) {
- err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
- if (err < 0) {
- goto error;
- }
- filp->f_owner.signum = signal;
- }
-
- q = kmalloc(sizeof(*q), GFP_KERNEL);
- if (!q) {
- err = -ENOMEM;
- goto error;
- }
- q->pi_state = NULL;
-
- fshared = &current->mm->mmap_sem;
- down_read(fshared);
- err = get_futex_key(uaddr, fshared, &q->key);
-
- if (unlikely(err != 0)) {
- up_read(fshared);
- kfree(q);
- goto error;
- }
-
- /*
- * queue_me() must be called before releasing mmap_sem, because
- * key->shared.inode needs to be referenced while holding it.
- */
- filp->private_data = q;
-
- queue_me(q, ret, filp);
- up_read(fshared);
-
- /* Now we map fd to filp, so userspace can access it */
- fd_install(ret, filp);
-out:
- return ret;
-error:
- put_unused_fd(ret);
- put_filp(filp);
- ret = err;
- goto out;
-}
-
/*
* Support for robust futexes: the kernel cleans up held futexes at
* thread exit time.
@@ -2092,10 +2008,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
case FUTEX_WAKE_BITSET:
ret = futex_wake(uaddr, fshared, val, val3);
break;
- case FUTEX_FD:
- /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
- ret = futex_fd(uaddr, val);
- break;
case FUTEX_REQUEUE:
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
break;
@@ -2156,19 +2068,6 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
-static int futexfs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data,
- struct vfsmount *mnt)
-{
- return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt);
-}
-
-static struct file_system_type futex_fs_type = {
- .name = "futexfs",
- .get_sb = futexfs_get_sb,
- .kill_sb = kill_anon_super,
-};
-
static int __init futex_init(void)
{
u32 curval;
@@ -2193,16 +2092,6 @@ static int __init futex_init(void)
spin_lock_init(&futex_queues[i].lock);
}
- i = register_filesystem(&futex_fs_type);
- if (i)
- return i;
-
- futex_mnt = kern_mount(&futex_fs_type);
- if (IS_ERR(futex_mnt)) {
- unregister_filesystem(&futex_fs_type);
- return PTR_ERR(futex_mnt);
- }
-
return 0;
}
__initcall(futex_init);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9af1d6a8095..ab80515008f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -154,15 +154,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
}
/*
- * Helper function to check, whether the timer is running the callback
- * function
- */
-static inline int hrtimer_callback_running(struct hrtimer *timer)
-{
- return timer->state & HRTIMER_STATE_CALLBACK;
-}
-
-/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
*/
@@ -1012,10 +1003,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
*/
raise = timer->state == HRTIMER_STATE_PENDING;
+ /*
+ * We use preempt_disable to prevent this task from migrating after
+ * setting up the softirq and raising it. Otherwise, if we migrate
+ * we will raise the softirq on the wrong CPU.
+ */
+ preempt_disable();
+
unlock_hrtimer_base(timer, &flags);
if (raise)
hrtimer_raise_softirq();
+ preempt_enable();
return ret;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 46e4ad1723f..46d6611a33b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -150,6 +150,26 @@ void disable_irq(unsigned int irq)
}
EXPORT_SYMBOL(disable_irq);
+static void __enable_irq(struct irq_desc *desc, unsigned int irq)
+{
+ switch (desc->depth) {
+ case 0:
+ printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
+ WARN_ON(1);
+ break;
+ case 1: {
+ unsigned int status = desc->status & ~IRQ_DISABLED;
+
+ /* Prevent probing on this irq: */
+ desc->status = status | IRQ_NOPROBE;
+ check_irq_resend(desc, irq);
+ /* fall-through */
+ }
+ default:
+ desc->depth--;
+ }
+}
+
/**
* enable_irq - enable handling of an irq
* @irq: Interrupt to enable
@@ -169,22 +189,7 @@ void enable_irq(unsigned int irq)
return;
spin_lock_irqsave(&desc->lock, flags);
- switch (desc->depth) {
- case 0:
- printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
- WARN_ON(1);
- break;
- case 1: {
- unsigned int status = desc->status & ~IRQ_DISABLED;
-
- /* Prevent probing on this irq: */
- desc->status = status | IRQ_NOPROBE;
- check_irq_resend(desc, irq);
- /* fall-through */
- }
- default:
- desc->depth--;
- }
+ __enable_irq(desc, irq);
spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL(enable_irq);
@@ -365,7 +370,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
compat_irq_chip_set_default_handler(desc);
desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
- IRQ_INPROGRESS);
+ IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
if (!(desc->status & IRQ_NOAUTOEN)) {
desc->depth = 0;
@@ -381,6 +386,16 @@ int setup_irq(unsigned int irq, struct irqaction *new)
/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
desc->irqs_unhandled = 0;
+
+ /*
+ * Check whether we disabled the irq via the spurious handler
+ * before. Reenable it and give it another chance.
+ */
+ if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
+ desc->status &= ~IRQ_SPURIOUS_DISABLED;
+ __enable_irq(desc, irq);
+ }
+
spin_unlock_irqrestore(&desc->lock, flags);
new->irq = irq;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 088dabbf2d6..c66d3f10e85 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -209,8 +209,8 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
* Now kill the IRQ
*/
printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
- desc->status |= IRQ_DISABLED;
- desc->depth = 1;
+ desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
+ desc->depth++;
desc->chip->disable(irq);
}
desc->irqs_unhandled = 0;
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 1bd0ec1c80b..3ec23c3ec97 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -52,6 +52,7 @@
#include <asm/byteorder.h>
#include <asm/atomic.h>
#include <asm/system.h>
+#include <asm/unaligned.h>
static int kgdb_break_asap;
@@ -61,7 +62,7 @@ struct kgdb_state {
int err_code;
int cpu;
int pass_exception;
- long threadid;
+ unsigned long threadid;
long kgdb_usethreadid;
struct pt_regs *linux_regs;
};
@@ -146,7 +147,7 @@ atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
* the other CPUs might interfere with your debugging context, so
* use this with care:
*/
-int kgdb_do_roundup = 1;
+static int kgdb_do_roundup = 1;
static int __init opt_nokgdbroundup(char *str)
{
@@ -227,8 +228,6 @@ void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
* GDB remote protocol parser:
*/
-static const char hexchars[] = "0123456789abcdef";
-
static int hex(char ch)
{
if ((ch >= 'a') && (ch <= 'f'))
@@ -316,8 +315,8 @@ static void put_packet(char *buffer)
}
kgdb_io_ops->write_char('#');
- kgdb_io_ops->write_char(hexchars[checksum >> 4]);
- kgdb_io_ops->write_char(hexchars[checksum & 0xf]);
+ kgdb_io_ops->write_char(hex_asc_hi(checksum));
+ kgdb_io_ops->write_char(hex_asc_lo(checksum));
if (kgdb_io_ops->flush)
kgdb_io_ops->flush();
@@ -346,14 +345,6 @@ static void put_packet(char *buffer)
}
}
-static char *pack_hex_byte(char *pkt, u8 byte)
-{
- *pkt++ = hexchars[byte >> 4];
- *pkt++ = hexchars[byte & 0xf];
-
- return pkt;
-}
-
/*
* Convert the memory pointed to by mem into hex, placing result in buf.
* Return a pointer to the last char put in buf (null). May return an error.
@@ -438,7 +429,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
* While we find nice hex chars, build a long_val.
* Return number of chars processed.
*/
-int kgdb_hex2long(char **ptr, long *long_val)
+int kgdb_hex2long(char **ptr, unsigned long *long_val)
{
int hex_val;
int num = 0;
@@ -486,8 +477,8 @@ static void error_packet(char *pkt, int error)
{
error = -error;
pkt[0] = 'E';
- pkt[1] = hexchars[(error / 10)];
- pkt[2] = hexchars[(error % 10)];
+ pkt[1] = hex_asc[(error / 10)];
+ pkt[2] = hex_asc[(error % 10)];
pkt[3] = '\0';
}
@@ -518,10 +509,7 @@ static void int_to_threadref(unsigned char *id, int value)
scan = (unsigned char *)id;
while (i--)
*scan++ = 0;
- *scan++ = (value >> 24) & 0xff;
- *scan++ = (value >> 16) & 0xff;
- *scan++ = (value >> 8) & 0xff;
- *scan++ = (value & 0xff);
+ put_unaligned_be32(value, scan);
}
static struct task_struct *getthread(struct pt_regs *regs, int tid)
@@ -709,7 +697,7 @@ int kgdb_isremovedbreak(unsigned long addr)
return 0;
}
-int remove_all_break(void)
+static int remove_all_break(void)
{
unsigned long addr;
int error;
@@ -1511,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs)
return 1;
}
-void kgdb_console_write(struct console *co, const char *s, unsigned count)
+static void kgdb_console_write(struct console *co, const char *s,
+ unsigned count)
{
unsigned long flags;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index e2764047ec0..8df97d3dfda 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -27,6 +27,7 @@
#include <linux/mnt_namespace.h>
#include <linux/completion.h>
#include <linux/file.h>
+#include <linux/fdtable.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1e0250cb948..1485ca8d0e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -79,7 +79,7 @@ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
*
* For such cases, we now have a blacklist
*/
-struct kprobe_blackpoint kprobe_blacklist[] = {
+static struct kprobe_blackpoint kprobe_blacklist[] = {
{"preempt_schedule",},
{NULL} /* Terminator */
};
@@ -699,8 +699,9 @@ static int __register_kprobes(struct kprobe **kps, int num,
return -EINVAL;
for (i = 0; i < num; i++) {
ret = __register_kprobe(kps[i], called_from);
- if (ret < 0 && i > 0) {
- unregister_kprobes(kps, i);
+ if (ret < 0) {
+ if (i > 0)
+ unregister_kprobes(kps, i);
break;
}
}
@@ -776,8 +777,9 @@ static int __register_jprobes(struct jprobe **jps, int num,
jp->kp.break_handler = longjmp_break_handler;
ret = __register_kprobe(&jp->kp, called_from);
}
- if (ret < 0 && i > 0) {
- unregister_jprobes(jps, i);
+ if (ret < 0) {
+ if (i > 0)
+ unregister_jprobes(jps, i);
break;
}
}
@@ -920,8 +922,9 @@ static int __register_kretprobes(struct kretprobe **rps, int num,
return -EINVAL;
for (i = 0; i < num; i++) {
ret = __register_kretprobe(rps[i], called_from);
- if (ret < 0 && i > 0) {
- unregister_kretprobes(rps, i);
+ if (ret < 0) {
+ if (i > 0)
+ unregister_kretprobes(rps, i);
break;
}
}
diff --git a/kernel/module.c b/kernel/module.c
index 8674a390a2e..5f80478b746 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -890,6 +890,19 @@ static struct module_attribute *modinfo_attrs[] = {
static const char vermagic[] = VERMAGIC_STRING;
+static int try_to_force_load(struct module *mod, const char *symname)
+{
+#ifdef CONFIG_MODULE_FORCE_LOAD
+ if (!(tainted & TAINT_FORCED_MODULE))
+ printk("%s: no version for \"%s\" found: kernel tainted.\n",
+ mod->name, symname);
+ add_taint_module(mod, TAINT_FORCED_MODULE);
+ return 0;
+#else
+ return -ENOEXEC;
+#endif
+}
+
#ifdef CONFIG_MODVERSIONS
static int check_version(Elf_Shdr *sechdrs,
unsigned int versindex,
@@ -904,6 +917,10 @@ static int check_version(Elf_Shdr *sechdrs,
if (!crc)
return 1;
+ /* No versions at all? modprobe --force does this. */
+ if (versindex == 0)
+ return try_to_force_load(mod, symname) == 0;
+
versions = (void *) sechdrs[versindex].sh_addr;
num_versions = sechdrs[versindex].sh_size
/ sizeof(struct modversion_info);
@@ -914,18 +931,19 @@ static int check_version(Elf_Shdr *sechdrs,
if (versions[i].crc == *crc)
return 1;
- printk("%s: disagrees about version of symbol %s\n",
- mod->name, symname);
DEBUGP("Found checksum %lX vs module %lX\n",
*crc, versions[i].crc);
- return 0;
+ goto bad_version;
}
- /* Not in module's version table. OK, but that taints the kernel. */
- if (!(tainted & TAINT_FORCED_MODULE))
- printk("%s: no version for \"%s\" found: kernel tainted.\n",
- mod->name, symname);
- add_taint_module(mod, TAINT_FORCED_MODULE);
- return 1;
+
+ printk(KERN_WARNING "%s: no symbol version for %s\n",
+ mod->name, symname);
+ return 0;
+
+bad_version:
+ printk("%s: disagrees about version of symbol %s\n",
+ mod->name, symname);
+ return 0;
}
static inline int check_modstruct_version(Elf_Shdr *sechdrs,
@@ -939,11 +957,14 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
return check_version(sechdrs, versindex, "struct_module", mod, crc);
}
-/* First part is kernel version, which we ignore. */
-static inline int same_magic(const char *amagic, const char *bmagic)
+/* First part is kernel version, which we ignore if module has crcs. */
+static inline int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
{
- amagic += strcspn(amagic, " ");
- bmagic += strcspn(bmagic, " ");
+ if (has_crcs) {
+ amagic += strcspn(amagic, " ");
+ bmagic += strcspn(bmagic, " ");
+ }
return strcmp(amagic, bmagic) == 0;
}
#else
@@ -963,7 +984,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
return 1;
}
-static inline int same_magic(const char *amagic, const char *bmagic)
+static inline int same_magic(const char *amagic, const char *bmagic,
+ bool has_crcs)
{
return strcmp(amagic, bmagic) == 0;
}
@@ -1315,7 +1337,19 @@ out_unreg:
kobject_put(&mod->mkobj.kobj);
return err;
}
-#endif
+
+static void mod_sysfs_fini(struct module *mod)
+{
+ kobject_put(&mod->mkobj.kobj);
+}
+
+#else /* CONFIG_SYSFS */
+
+static void mod_sysfs_fini(struct module *mod)
+{
+}
+
+#endif /* CONFIG_SYSFS */
static void mod_kobject_remove(struct module *mod)
{
@@ -1323,7 +1357,7 @@ static void mod_kobject_remove(struct module *mod)
module_param_sysfs_remove(mod);
kobject_put(mod->mkobj.drivers_dir);
kobject_put(mod->holders_dir);
- kobject_put(&mod->mkobj.kobj);
+ mod_sysfs_fini(mod);
}
/*
@@ -1758,7 +1792,7 @@ static struct module *load_module(void __user *umod,
/* Sanity checks against insmoding binaries or wrong arch,
weird elf version */
- if (memcmp(hdr->e_ident, ELFMAG, 4) != 0
+ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
|| hdr->e_type != ET_REL
|| !elf_check_arch(hdr)
|| hdr->e_shentsize != sizeof(*sechdrs)) {
@@ -1853,10 +1887,10 @@ static struct module *load_module(void __user *umod,
modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
- add_taint_module(mod, TAINT_FORCED_MODULE);
- printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
- mod->name);
- } else if (!same_magic(modmagic, vermagic)) {
+ err = try_to_force_load(mod, "magic");
+ if (err)
+ goto free_hdr;
+ } else if (!same_magic(modmagic, vermagic, versindex)) {
printk(KERN_ERR "%s: version magic '%s' should be '%s'\n",
mod->name, modmagic, vermagic);
err = -ENOEXEC;
@@ -2006,9 +2040,10 @@ static struct module *load_module(void __user *umod,
(mod->num_gpl_future_syms && !gplfuturecrcindex) ||
(mod->num_unused_syms && !unusedcrcindex) ||
(mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
- printk(KERN_WARNING "%s: No versions for exported symbols."
- " Tainting kernel.\n", mod->name);
- add_taint_module(mod, TAINT_FORCED_MODULE);
+ printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
+ err = try_to_force_load(mod, "nocrc");
+ if (err)
+ goto cleanup;
}
#endif
markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fb01c32aa3..e2129e83fd7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -666,7 +666,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
return retval;
}
-const char printk_recursion_bug_msg [] =
+static const char printk_recursion_bug_msg [] =
KERN_CRIT "BUG: recent printk recursion!\n";
static int printk_recursion_bug;
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306..a38895a5b8e 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -89,8 +89,22 @@ static void force_quiescent_state(struct rcu_data *rdp,
/*
* Don't send IPI to itself. With irqs disabled,
* rdp->cpu is the current cpu.
+ *
+ * cpu_online_map is updated by the _cpu_down()
+ * using stop_machine_run(). Since we're in irqs disabled
+ * section, stop_machine_run() is not executing, hence
+ * the cpu_online_map is stable.
+ *
+ * However, a cpu might have been offlined _just_ before
+ * we disabled irqs while entering here.
+ * And rcu subsystem might not yet have handled the CPU_DEAD
+ * notification, leading to the offlined cpu's bit
+ * being set in the rcp->cpumask.
+ *
+ * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
+ * calling smp_send_reschedule() on an offlined CPU.
*/
- cpumask = rcp->cpumask;
+ cpus_and(cpumask, rcp->cpumask, cpu_online_map);
cpu_clear(rdp->cpu, cpumask);
for_each_cpu_mask(cpu, cpumask)
smp_send_reschedule(cpu);
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a51..41d275a81df 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -217,8 +217,6 @@ long rcu_batches_completed(void)
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-
void __rcu_read_lock(void)
{
int idx;
@@ -927,26 +925,22 @@ void rcu_offline_cpu(int cpu)
spin_unlock_irqrestore(&rdp->lock, flags);
}
-void __devinit rcu_online_cpu(int cpu)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
- cpu_set(cpu, rcu_cpu_online_map);
- spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
-}
-
#else /* #ifdef CONFIG_HOTPLUG_CPU */
void rcu_offline_cpu(int cpu)
{
}
-void __devinit rcu_online_cpu(int cpu)
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+void __cpuinit rcu_online_cpu(int cpu)
{
-}
+ unsigned long flags;
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+ spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
+ cpu_set(cpu, rcu_cpu_online_map);
+ spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+}
static void rcu_process_callbacks(struct softirq_action *unused)
{
diff --git a/kernel/sched.c b/kernel/sched.c
index 34bcc5bc120..4e2f6033565 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,16 +75,6 @@
#include <asm/irq_regs.h>
/*
- * Scheduler clock - returns current time in nanosec units.
- * This is default implementation.
- * Architectures and sub-architectures can override this.
- */
-unsigned long long __attribute__((weak)) sched_clock(void)
-{
- return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
-}
-
-/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
@@ -146,7 +136,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
static inline int rt_policy(int policy)
{
- if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+ if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
return 1;
return 0;
}
@@ -242,6 +232,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
}
#endif
+/*
+ * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * detach_destroy_domains and partition_sched_domains.
+ */
+static DEFINE_MUTEX(sched_domains_mutex);
+
#ifdef CONFIG_GROUP_SCHED
#include <linux/cgroup.h>
@@ -308,9 +304,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
*/
static DEFINE_SPINLOCK(task_group_lock);
-/* doms_cur_mutex serializes access to doms_cur[] array */
-static DEFINE_MUTEX(doms_cur_mutex);
-
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -318,7 +311,16 @@ static DEFINE_MUTEX(doms_cur_mutex);
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif
+/*
+ * A weight of 0 or 1 can cause arithmetic problems.
+ * The weight of a cfs_rq is the sum of the weights of the entities
+ * queued on this cfs_rq, so the weight of an entity should not be
+ * too large, and neither should the shares value of a task group.
+ * (The default weight is 1024 - so there's no practical
+ * limitation from this.)
+ */
#define MIN_SHARES 2
+#define MAX_SHARES (1UL << 18)
static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif
@@ -358,21 +360,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
}
-static inline void lock_doms_cur(void)
-{
- mutex_lock(&doms_cur_mutex);
-}
-
-static inline void unlock_doms_cur(void)
-{
- mutex_unlock(&doms_cur_mutex);
-}
-
#else
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline void lock_doms_cur(void) { }
-static inline void unlock_doms_cur(void) { }
#endif /* CONFIG_GROUP_SCHED */
@@ -411,43 +401,6 @@ struct cfs_rq {
*/
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
- unsigned long task_weight;
- unsigned long shares;
- /*
- * We need space to build a sched_domain wide view of the full task
- * group tree, in order to avoid depending on dynamic memory allocation
- * during the load balancing we place this in the per cpu task group
- * hierarchy. This limits the load balancing to one instance per cpu,
- * but more should not be needed anyway.
- */
- struct aggregate_struct {
- /*
- * load = weight(cpus) * f(tg)
- *
- * Where f(tg) is the recursive weight fraction assigned to
- * this group.
- */
- unsigned long load;
-
- /*
- * part of the group weight distributed to this span.
- */
- unsigned long shares;
-
- /*
- * The sum of all runqueue weights within this span.
- */
- unsigned long rq_weight;
-
- /*
- * Weight contributed by tasks; this is the part we can
- * influence by moving tasks around.
- */
- unsigned long task_weight;
- } aggregate;
-#endif
#endif
};
@@ -560,13 +513,7 @@ struct rq {
unsigned long next_balance;
struct mm_struct *prev_mm;
- u64 clock, prev_clock_raw;
- s64 clock_max_delta;
-
- unsigned int clock_warps, clock_overflows, clock_underflows;
- u64 idle_clock;
- unsigned int clock_deep_idle_events;
- u64 tick_timestamp;
+ u64 clock;
atomic_t nr_iowait;
@@ -631,82 +578,6 @@ static inline int cpu_of(struct rq *rq)
#endif
}
-#ifdef CONFIG_NO_HZ
-static inline bool nohz_on(int cpu)
-{
- return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE;
-}
-
-static inline u64 max_skipped_ticks(struct rq *rq)
-{
- return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1;
-}
-
-static inline void update_last_tick_seen(struct rq *rq)
-{
- rq->last_tick_seen = jiffies;
-}
-#else
-static inline u64 max_skipped_ticks(struct rq *rq)
-{
- return 1;
-}
-
-static inline void update_last_tick_seen(struct rq *rq)
-{
-}
-#endif
-
-/*
- * Update the per-runqueue clock, as finegrained as the platform can give
- * us, but without assuming monotonicity, etc.:
- */
-static void __update_rq_clock(struct rq *rq)
-{
- u64 prev_raw = rq->prev_clock_raw;
- u64 now = sched_clock();
- s64 delta = now - prev_raw;
- u64 clock = rq->clock;
-
-#ifdef CONFIG_SCHED_DEBUG
- WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-#endif
- /*
- * Protect against sched_clock() occasionally going backwards:
- */
- if (unlikely(delta < 0)) {
- clock++;
- rq->clock_warps++;
- } else {
- /*
- * Catch too large forward jumps too:
- */
- u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC;
- u64 max_time = rq->tick_timestamp + max_jump;
-
- if (unlikely(clock + delta > max_time)) {
- if (clock < max_time)
- clock = max_time;
- else
- clock++;
- rq->clock_overflows++;
- } else {
- if (unlikely(delta > rq->clock_max_delta))
- rq->clock_max_delta = delta;
- clock += delta;
- }
- }
-
- rq->prev_clock_raw = now;
- rq->clock = clock;
-}
-
-static void update_rq_clock(struct rq *rq)
-{
- if (likely(smp_processor_id() == cpu_of(rq)))
- __update_rq_clock(rq);
-}
-
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
@@ -722,6 +593,11 @@ static void update_rq_clock(struct rq *rq)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+static inline void update_rq_clock(struct rq *rq)
+{
+ rq->clock = sched_clock_cpu(cpu_of(rq));
+}
+
/*
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
@@ -757,14 +633,14 @@ const_debug unsigned int sysctl_sched_features =
#define SCHED_FEAT(name, enabled) \
#name ,
-__read_mostly char *sched_feat_names[] = {
+static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
NULL
};
#undef SCHED_FEAT
-int sched_feat_open(struct inode *inode, struct file *filp)
+static int sched_feat_open(struct inode *inode, struct file *filp)
{
filp->private_data = inode->i_private;
return 0;
@@ -899,7 +775,7 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
-static const unsigned long long time_sync_thresh = 100000;
+unsigned long long time_sync_thresh = 100000;
static DEFINE_PER_CPU(unsigned long long, time_offset);
static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
@@ -913,11 +789,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
static DEFINE_SPINLOCK(time_sync_lock);
static unsigned long long prev_global_time;
-static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
+static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
{
- unsigned long flags;
-
- spin_lock_irqsave(&time_sync_lock, flags);
+ /*
+ * We want this inlined, to not get tracer function calls
+ * in this critical section:
+ */
+ spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
+ __raw_spin_lock(&time_sync_lock.raw_lock);
if (time < prev_global_time) {
per_cpu(time_offset, cpu) += prev_global_time - time;
@@ -926,7 +805,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
prev_global_time = time;
}
- spin_unlock_irqrestore(&time_sync_lock, flags);
+ __raw_spin_unlock(&time_sync_lock.raw_lock);
+ spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
return time;
}
@@ -934,8 +814,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu)
static unsigned long long __cpu_clock(int cpu)
{
unsigned long long now;
- unsigned long flags;
- struct rq *rq;
/*
* Only call sched_clock() if the scheduler has already been
@@ -944,11 +822,7 @@ static unsigned long long __cpu_clock(int cpu)
if (unlikely(!scheduler_running))
return 0;
- local_irq_save(flags);
- rq = cpu_rq(cpu);
- update_rq_clock(rq);
- now = rq->clock;
- local_irq_restore(flags);
+ now = sched_clock_cpu(cpu);
return now;
}
@@ -960,13 +834,18 @@ static unsigned long long __cpu_clock(int cpu)
unsigned long long cpu_clock(int cpu)
{
unsigned long long prev_cpu_time, time, delta_time;
+ unsigned long flags;
+ local_irq_save(flags);
prev_cpu_time = per_cpu(prev_cpu_time, cpu);
time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
delta_time = time-prev_cpu_time;
- if (unlikely(delta_time > time_sync_thresh))
+ if (unlikely(delta_time > time_sync_thresh)) {
time = __sync_cpu_clock(time, cpu);
+ per_cpu(prev_cpu_time, cpu) = time;
+ }
+ local_irq_restore(flags);
return time;
}
@@ -1117,43 +996,6 @@ static struct rq *this_rq_lock(void)
return rq;
}
-/*
- * We are going deep-idle (irqs are disabled):
- */
-void sched_clock_idle_sleep_event(void)
-{
- struct rq *rq = cpu_rq(smp_processor_id());
-
- spin_lock(&rq->lock);
- __update_rq_clock(rq);
- spin_unlock(&rq->lock);
- rq->clock_deep_idle_events++;
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
-
-/*
- * We just idled delta nanoseconds (called with irqs disabled):
- */
-void sched_clock_idle_wakeup_event(u64 delta_ns)
-{
- struct rq *rq = cpu_rq(smp_processor_id());
- u64 now = sched_clock();
-
- rq->idle_clock += delta_ns;
- /*
- * Override the previous timestamp and ignore all
- * sched_clock() deltas that occured while we idled,
- * and use the PM-provided delta_ns to advance the
- * rq clock:
- */
- spin_lock(&rq->lock);
- rq->prev_clock_raw = now;
- rq->clock += delta_ns;
- spin_unlock(&rq->lock);
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-
static void __resched_task(struct task_struct *p, int tif_bit);
static inline void resched_task(struct task_struct *p)
@@ -1189,6 +1031,7 @@ static inline void resched_rq(struct rq *rq)
enum {
HRTICK_SET, /* re-programm hrtick_timer */
HRTICK_RESET, /* not a new slice */
+ HRTICK_BLOCK, /* stop hrtick operations */
};
/*
@@ -1200,6 +1043,8 @@ static inline int hrtick_enabled(struct rq *rq)
{
if (!sched_feat(HRTICK))
return 0;
+ if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags)))
+ return 0;
return hrtimer_is_hres_active(&rq->hrtick_timer);
}
@@ -1275,14 +1120,72 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
spin_lock(&rq->lock);
- __update_rq_clock(rq);
+ update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
spin_unlock(&rq->lock);
return HRTIMER_NORESTART;
}
-static inline void init_rq_hrtick(struct rq *rq)
+#ifdef CONFIG_SMP
+static void hotplug_hrtick_disable(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ rq->hrtick_flags = 0;
+ __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ hrtick_clear(rq);
+}
+
+static void hotplug_hrtick_enable(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags);
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static int
+hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int cpu = (int)(long)hcpu;
+
+ switch (action) {
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ hotplug_hrtick_disable(cpu);
+ return NOTIFY_OK;
+
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ hotplug_hrtick_enable(cpu);
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static void init_hrtick(void)
+{
+ hotcpu_notifier(hotplug_hrtick, 0);
+}
+#endif /* CONFIG_SMP */
+
+static void init_rq_hrtick(struct rq *rq)
{
rq->hrtick_flags = 0;
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -1319,6 +1222,10 @@ static inline void init_rq_hrtick(struct rq *rq)
void hrtick_resched(void)
{
}
+
+static inline void init_hrtick(void)
+{
+}
#endif
/*
@@ -1429,17 +1336,19 @@ static void __resched_task(struct task_struct *p, int tif_bit)
*/
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
-/*
- * delta *= weight / lw
- */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
{
u64 tmp;
- if (unlikely(!lw->inv_weight))
- lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
+ if (!lw->inv_weight) {
+ if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else
+ lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
+ / (lw->weight+1);
+ }
tmp = (u64)delta_exec * weight;
/*
@@ -1454,6 +1363,12 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+ return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -1566,324 +1481,6 @@ static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long cpu_avg_load_per_task(int cpu);
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- * root 1 - thread
- * / | \ A - group
- * A 1 B
- * /|\ / \
- * C 2 D 3 4
- * | |
- * 5 6
- *
- * load:
- * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- * which equals 1/9-th of the total load.
- *
- * shares:
- * The weight of this group on the selected cpus.
- *
- * rq_weight:
- * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- * B would get 2.
- *
- * task_weight:
- * Part of the rq_weight contributed by tasks; all groups except B would
- * get 1, B gets 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
-{
- return &tg->cfs_rq[sd->first_cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
-
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
- struct sched_domain *sd)
-{
- struct task_group *parent, *child;
-
- rcu_read_lock();
- parent = &root_task_group;
-down:
- (*down)(parent, sd);
- list_for_each_entry_rcu(child, &parent->children, siblings) {
- parent = child;
- goto down;
-
-up:
- continue;
- }
- (*up)(parent, sd);
-
- child = parent;
- parent = parent->parent;
- if (parent)
- goto up;
- rcu_read_unlock();
-}
-
-/*
- * Calculate the aggregate runqueue weight.
- */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long rq_weight = 0;
- unsigned long task_weight = 0;
- int i;
-
- for_each_cpu_mask(i, sd->span) {
- rq_weight += tg->cfs_rq[i]->load.weight;
- task_weight += tg->cfs_rq[i]->task_weight;
- }
-
- aggregate(tg, sd)->rq_weight = rq_weight;
- aggregate(tg, sd)->task_weight = task_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long shares = 0;
- int i;
-
- for_each_cpu_mask(i, sd->span)
- shares += tg->cfs_rq[i]->shares;
-
- if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
- shares = tg->shares;
-
- aggregate(tg, sd)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long load;
-
- if (!tg->parent) {
- int i;
-
- load = 0;
- for_each_cpu_mask(i, sd->span)
- load += cpu_rq(i)->load.weight;
-
- } else {
- load = aggregate(tg->parent, sd)->load;
-
- /*
- * shares is our weight in the parent's rq so
- * shares/parent->rq_weight gives our fraction of the load
- */
- load *= aggregate(tg, sd)->shares;
- load /= aggregate(tg->parent, sd)->rq_weight + 1;
- }
-
- aggregate(tg, sd)->load = load;
-}
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
- int tcpu)
-{
- int boost = 0;
- unsigned long shares;
- unsigned long rq_weight;
-
- if (!tg->se[tcpu])
- return;
-
- rq_weight = tg->cfs_rq[tcpu]->load.weight;
-
- /*
- * If there are currently no tasks on the cpu pretend there is one of
- * average load so that when a new task gets to run here it will not
- * get delayed by group starvation.
- */
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- /*
- * \Sum shares * rq_weight
- * shares = -----------------------
- * \Sum rq_weight
- *
- */
- shares = aggregate(tg, sd)->shares * rq_weight;
- shares /= aggregate(tg, sd)->rq_weight + 1;
-
- /*
- * record the actual number of shares, not the boosted amount.
- */
- tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
-
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
-
- __set_se_shares(tg->se[tcpu], shares);
-}
-
-/*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
- */
-static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
- int scpu, int dcpu)
-{
- unsigned long shares;
-
- shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
- __update_group_shares_cpu(tg, sd, scpu);
- __update_group_shares_cpu(tg, sd, dcpu);
-
- /*
- * ensure we never loose shares due to rounding errors in the
- * above redistribution.
- */
- shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
- if (shares)
- tg->cfs_rq[dcpu]->shares += shares;
-}
-
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
- int scpu, int dcpu)
-{
- while (tg) {
- __move_group_shares(tg, sd, scpu, dcpu);
- tg = tg->parent;
- }
-}
-
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long shares = aggregate(tg, sd)->shares;
- int i;
-
- for_each_cpu_mask(i, sd->span) {
- struct rq *rq = cpu_rq(i);
- unsigned long flags;
-
- spin_lock_irqsave(&rq->lock, flags);
- __update_group_shares_cpu(tg, sd, i);
- spin_unlock_irqrestore(&rq->lock, flags);
- }
-
- aggregate_group_shares(tg, sd);
-
- /*
- * ensure we never loose shares due to rounding errors in the
- * above redistribution.
- */
- shares -= aggregate(tg, sd)->shares;
- if (shares) {
- tg->cfs_rq[sd->first_cpu]->shares += shares;
- aggregate(tg, sd)->shares += shares;
- }
-}
-
-/*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
- */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
-{
- aggregate_group_weight(tg, sd);
- aggregate_group_shares(tg, sd);
- aggregate_group_load(tg, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
-{
- aggregate_group_set_shares(tg, sd);
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-
-static void __init init_aggregate(void)
-{
- int i;
-
- for_each_possible_cpu(i)
- spin_lock_init(&per_cpu(aggregate_lock, i));
-}
-
-static int get_aggregate(struct sched_domain *sd)
-{
- if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
- return 0;
-
- aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
- return 1;
-}
-
-static void put_aggregate(struct sched_domain *sd)
-{
- spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
-}
-
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
- cfs_rq->shares = shares;
-}
-
-#else
-
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(struct sched_domain *sd)
-{
- return 0;
-}
-
-static inline void put_aggregate(struct sched_domain *sd)
-{
-}
-#endif
-
#else /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1904,14 +1501,26 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
#define sched_class_highest (&rt_sched_class)
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+ update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+ update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running++;
+ inc_load(rq, p);
}
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running--;
+ dec_load(rq, p);
}
static void set_load_weight(struct task_struct *p)
@@ -2003,7 +1612,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
rq->nr_uninterruptible--;
enqueue_task(rq, p, wakeup);
- inc_nr_running(rq);
+ inc_nr_running(p, rq);
}
/*
@@ -2015,7 +1624,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
rq->nr_uninterruptible++;
dequeue_task(rq, p, sleep);
- dec_nr_running(rq);
+ dec_nr_running(p, rq);
}
/**
@@ -2668,7 +2277,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* management (if any):
*/
p->sched_class->task_new(rq, p);
- inc_nr_running(rq);
+ inc_nr_running(p, rq);
}
check_preempt_curr(rq, p);
#ifdef CONFIG_SMP
@@ -3659,12 +3268,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
- int unlock_aggregate;
cpus_setall(*cpus);
- unlock_aggregate = get_aggregate(sd);
-
/*
* When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case,
@@ -3780,9 +3386,8 @@ redo:
if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
-
- goto out;
+ return -1;
+ return ld_moved;
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
@@ -3797,13 +3402,8 @@ out_one_pinned:
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
- else
- ld_moved = 0;
-out:
- if (unlock_aggregate)
- put_aggregate(sd);
- return ld_moved;
+ return -1;
+ return 0;
}
/*
@@ -4339,8 +3939,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
struct rq *rq = this_rq();
cputime64_t tmp;
- if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
- return account_guest_time(p, cputime);
+ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
+ account_guest_time(p, cputime);
+ return;
+ }
p->stime = cputime_add(p->stime, cputime);
@@ -4404,19 +4006,11 @@ void scheduler_tick(void)
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
- u64 next_tick = rq->tick_timestamp + TICK_NSEC;
+
+ sched_clock_tick();
spin_lock(&rq->lock);
- __update_rq_clock(rq);
- /*
- * Let rq->clock advance by at least TICK_NSEC:
- */
- if (unlikely(rq->clock < next_tick)) {
- rq->clock = next_tick;
- rq->clock_underflows++;
- }
- rq->tick_timestamp = rq->clock;
- update_last_tick_seen(rq);
+ update_rq_clock(rq);
update_cpu_load(rq);
curr->sched_class->task_tick(rq, curr, 0);
spin_unlock(&rq->lock);
@@ -4495,7 +4089,7 @@ static inline void schedule_debug(struct task_struct *prev)
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
- if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
+ if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
__schedule_bug(prev);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4570,17 +4164,15 @@ need_resched_nonpreemptible:
* Do the rq-clock update outside the rq lock:
*/
local_irq_disable();
- __update_rq_clock(rq);
+ update_rq_clock(rq);
spin_lock(&rq->lock);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
- if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
- signal_pending(prev))) {
+ if (unlikely(signal_pending_state(prev->state, prev)))
prev->state = TASK_RUNNING;
- } else {
+ else
deactivate_task(rq, prev, 1);
- }
switch_count = &prev->nvcsw;
}
@@ -4595,9 +4187,9 @@ need_resched_nonpreemptible:
prev->sched_class->put_prev_task(rq, prev);
next = pick_next_task(rq, prev);
- sched_info_switch(prev, next);
-
if (likely(prev != next)) {
+ sched_info_switch(prev, next);
+
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -4632,8 +4224,6 @@ EXPORT_SYMBOL(schedule);
asmlinkage void __sched preempt_schedule(void)
{
struct thread_info *ti = current_thread_info();
- struct task_struct *task = current;
- int saved_lock_depth;
/*
* If there is a non-zero preempt_count or interrupts are disabled,
@@ -4644,16 +4234,7 @@ asmlinkage void __sched preempt_schedule(void)
do {
add_preempt_count(PREEMPT_ACTIVE);
-
- /*
- * We keep the big kernel semaphore locked, but we
- * clear ->lock_depth so that schedule() doesnt
- * auto-release the semaphore:
- */
- saved_lock_depth = task->lock_depth;
- task->lock_depth = -1;
schedule();
- task->lock_depth = saved_lock_depth;
sub_preempt_count(PREEMPT_ACTIVE);
/*
@@ -4674,26 +4255,15 @@ EXPORT_SYMBOL(preempt_schedule);
asmlinkage void __sched preempt_schedule_irq(void)
{
struct thread_info *ti = current_thread_info();
- struct task_struct *task = current;
- int saved_lock_depth;
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
do {
add_preempt_count(PREEMPT_ACTIVE);
-
- /*
- * We keep the big kernel semaphore locked, but we
- * clear ->lock_depth so that schedule() doesnt
- * auto-release the semaphore:
- */
- saved_lock_depth = task->lock_depth;
- task->lock_depth = -1;
local_irq_enable();
schedule();
local_irq_disable();
- task->lock_depth = saved_lock_depth;
sub_preempt_count(PREEMPT_ACTIVE);
/*
@@ -4828,22 +4398,20 @@ do_wait_for_common(struct completion *x, long timeout, int state)
signal_pending(current)) ||
(state == TASK_KILLABLE &&
fatal_signal_pending(current))) {
- __remove_wait_queue(&x->wait, &wait);
- return -ERESTARTSYS;
+ timeout = -ERESTARTSYS;
+ break;
}
__set_current_state(state);
spin_unlock_irq(&x->wait.lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&x->wait.lock);
- if (!timeout) {
- __remove_wait_queue(&x->wait, &wait);
- return timeout;
- }
- } while (!x->done);
+ } while (!x->done && timeout);
__remove_wait_queue(&x->wait, &wait);
+ if (!x->done)
+ return timeout;
}
x->done--;
- return timeout;
+ return timeout ?: 1;
}
static long __sched
@@ -5018,8 +4586,10 @@ void set_user_nice(struct task_struct *p, long nice)
goto out_unlock;
}
on_rq = p->se.on_rq;
- if (on_rq)
+ if (on_rq) {
dequeue_task(rq, p, 0);
+ dec_load(rq, p);
+ }
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -5029,6 +4599,7 @@ void set_user_nice(struct task_struct *p, long nice)
if (on_rq) {
enqueue_task(rq, p, 0);
+ inc_load(rq, p);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -5612,7 +5183,6 @@ static void __cond_resched(void)
} while (need_resched());
}
-#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
int __sched _cond_resched(void)
{
if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
@@ -5623,7 +5193,6 @@ int __sched _cond_resched(void)
return 0;
}
EXPORT_SYMBOL(_cond_resched);
-#endif
/*
* cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -5918,8 +5487,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
+#if defined(CONFIG_PREEMPT)
+ task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
+#else
task_thread_info(idle)->preempt_count = 0;
-
+#endif
/*
* The idle tasks have their own, simple scheduling class:
*/
@@ -6050,10 +5622,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
double_rq_lock(rq_src, rq_dest);
/* Already moved. */
if (task_cpu(p) != src_cpu)
- goto out;
+ goto done;
/* Affinity changed (again). */
if (!cpu_isset(dest_cpu, p->cpus_allowed))
- goto out;
+ goto fail;
on_rq = p->se.on_rq;
if (on_rq)
@@ -6064,8 +5636,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
activate_task(rq_dest, p, 0);
check_preempt_curr(rq_dest, p);
}
+done:
ret = 1;
-out:
+fail:
double_rq_unlock(rq_src, rq_dest);
return ret;
}
@@ -6315,6 +5888,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
next = pick_next_task(rq, rq->curr);
if (!next)
break;
+ next->sched_class->put_prev_task(rq, next);
migrate_dead(dead_cpu, next);
}
@@ -7305,7 +6879,12 @@ static int default_relax_domain_level = -1;
static int __init setup_relax_domain_level(char *str)
{
- default_relax_domain_level = simple_strtoul(str, NULL, 0);
+ unsigned long val;
+
+ val = simple_strtoul(str, NULL, 0);
+ if (val < SD_LV_MAX)
+ default_relax_domain_level = val;
+
return 1;
}
__setup("relax_domain_level=", setup_relax_domain_level);
@@ -7402,7 +6981,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
sd->span = *cpu_map;
- sd->first_cpu = first_cpu(sd->span);
cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
p = sd;
sd_allnodes = 1;
@@ -7413,7 +6991,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, NODE);
set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), &sd->span);
- sd->first_cpu = first_cpu(sd->span);
sd->parent = p;
if (p)
p->child = sd;
@@ -7425,7 +7002,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, CPU);
set_domain_attribute(sd, attr);
sd->span = *nodemask;
- sd->first_cpu = first_cpu(sd->span);
sd->parent = p;
if (p)
p->child = sd;
@@ -7437,7 +7013,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, MC);
set_domain_attribute(sd, attr);
sd->span = cpu_coregroup_map(i);
- sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
@@ -7450,7 +7025,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, SIBLING);
set_domain_attribute(sd, attr);
sd->span = per_cpu(cpu_sibling_map, i);
- sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
@@ -7654,8 +7228,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
static cpumask_t *doms_cur; /* current sched domains */
static int ndoms_cur; /* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur; /* attribues of custom domains
- in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+ /* attributes of custom domains in 'doms_cur' */
/*
* Special case: If a kmalloc of a doms_cur partition (array of
@@ -7669,6 +7243,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
}
/*
+ * Free current domain masks.
+ * Called after all cpus are attached to NULL domain.
+ */
+static void free_sched_domains(void)
+{
+ ndoms_cur = 0;
+ if (doms_cur != &fallback_doms)
+ kfree(doms_cur);
+ doms_cur = &fallback_doms;
+}
+
+/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
@@ -7755,7 +7341,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
{
int i, j;
- lock_doms_cur();
+ mutex_lock(&sched_domains_mutex);
/* always unregister in case we don't destroy any domains */
unregister_sched_domain_sysctl();
@@ -7804,7 +7390,7 @@ match2:
register_sched_domain_sysctl();
- unlock_doms_cur();
+ mutex_unlock(&sched_domains_mutex);
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -7813,8 +7399,11 @@ int arch_reinit_sched_domains(void)
int err;
get_online_cpus();
+ mutex_lock(&sched_domains_mutex);
detach_destroy_domains(&cpu_online_map);
+ free_sched_domains();
err = arch_init_sched_domains(&cpu_online_map);
+ mutex_unlock(&sched_domains_mutex);
put_online_cpus();
return err;
@@ -7898,6 +7487,7 @@ static int update_sched_domains(struct notifier_block *nfb,
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
detach_destroy_domains(&cpu_online_map);
+ free_sched_domains();
return NOTIFY_OK;
case CPU_UP_CANCELED:
@@ -7916,8 +7506,16 @@ static int update_sched_domains(struct notifier_block *nfb,
return NOTIFY_DONE;
}
+#ifndef CONFIG_CPUSETS
+ /*
+ * Create default domain partitioning if cpusets are disabled.
+ * Otherwise we let cpusets rebuild the domains based on the
+ * current setup.
+ */
+
/* The hotplug lock is already held by cpu_up/cpu_down */
arch_init_sched_domains(&cpu_online_map);
+#endif
return NOTIFY_OK;
}
@@ -7932,13 +7530,16 @@ void __init sched_init_smp(void)
BUG_ON(sched_group_nodes_bycpu == NULL);
#endif
get_online_cpus();
+ mutex_lock(&sched_domains_mutex);
arch_init_sched_domains(&cpu_online_map);
cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
if (cpus_empty(non_isolated_cpus))
cpu_set(smp_processor_id(), non_isolated_cpus);
+ mutex_unlock(&sched_domains_mutex);
put_online_cpus();
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
+ init_hrtick();
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
@@ -8025,7 +7626,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
se->my_q = cfs_rq;
se->load.weight = tg->shares;
- se->load.inv_weight = div64_u64(1ULL<<32, se->load.weight);
+ se->load.inv_weight = 0;
se->parent = parent;
}
#endif
@@ -8054,7 +7655,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
else
rt_se->rt_rq = parent->my_q;
- rt_se->rt_rq = &rq->rt;
rt_se->my_q = rt_rq;
rt_se->parent = parent;
INIT_LIST_HEAD(&rt_se->run_list);
@@ -8115,7 +7715,6 @@ void __init sched_init(void)
}
#ifdef CONFIG_SMP
- init_aggregate();
init_defrootdomain();
#endif
@@ -8149,8 +7748,6 @@ void __init sched_init(void)
spin_lock_init(&rq->lock);
lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
- rq->clock = 1;
- update_last_tick_seen(rq);
init_cfs_rq(&rq->cfs, rq);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8294,6 +7891,7 @@ EXPORT_SYMBOL(__might_sleep);
static void normalize_task(struct rq *rq, struct task_struct *p)
{
int on_rq;
+
update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
@@ -8325,7 +7923,6 @@ void normalize_rt_tasks(void)
p->se.sleep_start = 0;
p->se.block_start = 0;
#endif
- task_rq(p)->clock = 0;
if (!rt_task(p)) {
/*
@@ -8682,31 +8279,25 @@ void sched_move_task(struct task_struct *tsk)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
+ struct rq *rq = cfs_rq->rq;
int on_rq;
+ spin_lock_irq(&rq->lock);
+
on_rq = se->on_rq;
if (on_rq)
dequeue_entity(cfs_rq, se, 0);
se->load.weight = shares;
- se->load.inv_weight = div64_u64((1ULL<<32), shares);
+ se->load.inv_weight = 0;
if (on_rq)
enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- struct rq *rq = cfs_rq->rq;
- unsigned long flags;
- spin_lock_irqsave(&rq->lock, flags);
- __set_se_shares(se, shares);
- spin_unlock_irqrestore(&rq->lock, flags);
+ spin_unlock_irq(&rq->lock);
}
static DEFINE_MUTEX(shares_mutex);
@@ -8722,13 +8313,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
if (!tg->se[0])
return -EINVAL;
- /*
- * A weight of 0 or 1 can cause arithmetics problems.
- * (The default weight is 1024 - so there's no practical
- * limitation from this.)
- */
if (shares < MIN_SHARES)
shares = MIN_SHARES;
+ else if (shares > MAX_SHARES)
+ shares = MAX_SHARES;
mutex_lock(&shares_mutex);
if (tg->shares == shares)
@@ -8748,13 +8336,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* w/o tripping rebalance_share or load_balance_fair.
*/
tg->shares = shares;
- for_each_possible_cpu(i) {
- /*
- * force a rebalance
- */
- cfs_rq_set_shares(tg->cfs_rq[i], 0);
- set_se_shares(tg->se[i], shares/nr_cpu_ids);
- }
+ for_each_possible_cpu(i)
+ set_se_shares(tg->se[i], shares);
/*
* Enable load balance activity on this group, by inserting it back on
@@ -8793,7 +8376,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
#ifdef CONFIG_CGROUP_SCHED
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
- struct task_group *tgi, *parent = tg->parent;
+ struct task_group *tgi, *parent = tg ? tg->parent : NULL;
unsigned long total = 0;
if (!parent) {
@@ -8920,6 +8503,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
rt_period = (u64)rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
+ if (rt_period == 0)
+ return -EINVAL;
+
return tg_set_bandwidth(tg, rt_period, rt_runtime);
}
@@ -9072,7 +8658,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
#endif
#ifdef CONFIG_RT_GROUP_SCHED
-static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
s64 val)
{
return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
new file mode 100644
index 00000000000..ce05271219a
--- /dev/null
+++ b/kernel/sched_clock.c
@@ -0,0 +1,246 @@
+/*
+ * sched_clock for unstable cpu clocks
+ *
+ * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * Based on code by:
+ * Ingo Molnar <mingo@redhat.com>
+ * Guillaume Chazarain <guichaz@gmail.com>
+ *
+ * Create a semi stable clock from a mixture of other events, including:
+ * - gtod
+ * - jiffies
+ * - sched_clock()
+ * - explicit idle events
+ *
+ * We use gtod as base and the unstable clock deltas. The deltas are filtered,
+ * making it monotonic and keeping it within an expected window. This window
+ * is set up using jiffies.
+ *
+ * Furthermore, explicit sleep and wakeup hooks allow us to account for time
+ * that is otherwise invisible (TSC gets stopped).
+ *
+ * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
+ * consistent between cpus (never more than 1 jiffies difference).
+ */
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/ktime.h>
+#include <linux/module.h>
+
+
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+
+struct sched_clock_data {
+ /*
+ * Raw spinlock - this is a special case: this might be called
+ * from within instrumentation code so we dont want to do any
+ * instrumentation ourselves.
+ */
+ raw_spinlock_t lock;
+
+ unsigned long prev_jiffies;
+ u64 prev_raw;
+ u64 tick_raw;
+ u64 tick_gtod;
+ u64 clock;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
+{
+ return &__get_cpu_var(sched_clock_data);
+}
+
+static inline struct sched_clock_data *cpu_sdc(int cpu)
+{
+ return &per_cpu(sched_clock_data, cpu);
+}
+
+static __read_mostly int sched_clock_running;
+
+void sched_clock_init(void)
+{
+ u64 ktime_now = ktime_to_ns(ktime_get());
+ unsigned long now_jiffies = jiffies;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct sched_clock_data *scd = cpu_sdc(cpu);
+
+ scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ scd->prev_jiffies = now_jiffies;
+ scd->prev_raw = 0;
+ scd->tick_raw = 0;
+ scd->tick_gtod = ktime_now;
+ scd->clock = ktime_now;
+ }
+
+ sched_clock_running = 1;
+}
+
+/*
+ * update the percpu scd from the raw @now value
+ *
+ * - filter out backward motion
+ * - use jiffies to generate a min,max window to clip the raw values
+ */
+static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
+{
+ unsigned long now_jiffies = jiffies;
+ long delta_jiffies = now_jiffies - scd->prev_jiffies;
+ u64 clock = scd->clock;
+ u64 min_clock, max_clock;
+ s64 delta = now - scd->prev_raw;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
+
+ if (unlikely(delta < 0)) {
+ clock++;
+ goto out;
+ }
+
+ max_clock = min_clock + TICK_NSEC;
+
+ if (unlikely(clock + delta > max_clock)) {
+ if (clock < max_clock)
+ clock = max_clock;
+ else
+ clock++;
+ } else {
+ clock += delta;
+ }
+
+ out:
+ if (unlikely(clock < min_clock))
+ clock = min_clock;
+
+ scd->prev_raw = now;
+ scd->prev_jiffies = now_jiffies;
+ scd->clock = clock;
+}
+
+static void lock_double_clock(struct sched_clock_data *data1,
+ struct sched_clock_data *data2)
+{
+ if (data1 < data2) {
+ __raw_spin_lock(&data1->lock);
+ __raw_spin_lock(&data2->lock);
+ } else {
+ __raw_spin_lock(&data2->lock);
+ __raw_spin_lock(&data1->lock);
+ }
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+ struct sched_clock_data *scd = cpu_sdc(cpu);
+ u64 now, clock;
+
+ if (unlikely(!sched_clock_running))
+ return 0ull;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ now = sched_clock();
+
+ if (cpu != raw_smp_processor_id()) {
+ /*
+ * in order to update a remote cpu's clock based on our
+ * unstable raw time rebase it against:
+ * tick_raw (offset between raw counters)
+ * tick_gtod (tick offset between cpus)
+ */
+ struct sched_clock_data *my_scd = this_scd();
+
+ lock_double_clock(scd, my_scd);
+
+ now -= my_scd->tick_raw;
+ now += scd->tick_raw;
+
+ now -= my_scd->tick_gtod;
+ now += scd->tick_gtod;
+
+ __raw_spin_unlock(&my_scd->lock);
+ } else {
+ __raw_spin_lock(&scd->lock);
+ }
+
+ __update_sched_clock(scd, now);
+ clock = scd->clock;
+
+ __raw_spin_unlock(&scd->lock);
+
+ return clock;
+}
+
+void sched_clock_tick(void)
+{
+ struct sched_clock_data *scd = this_scd();
+ u64 now, now_gtod;
+
+ if (unlikely(!sched_clock_running))
+ return;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ now = sched_clock();
+ now_gtod = ktime_to_ns(ktime_get());
+
+ __raw_spin_lock(&scd->lock);
+ __update_sched_clock(scd, now);
+ /*
+ * update tick_gtod after __update_sched_clock() because that will
+ * already observe 1 new jiffy; adding a new tick_gtod to that would
+ * increase the clock by 2 jiffies.
+ */
+ scd->tick_raw = now;
+ scd->tick_gtod = now_gtod;
+ __raw_spin_unlock(&scd->lock);
+}
+
+/*
+ * We are going deep-idle (irqs are disabled):
+ */
+void sched_clock_idle_sleep_event(void)
+{
+ sched_clock_cpu(smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+ struct sched_clock_data *scd = this_scd();
+ u64 now = sched_clock();
+
+ /*
+ * Override the previous timestamp and ignore all
+ * sched_clock() deltas that occurred while we idled,
+ * and use the PM-provided delta_ns to advance the
+ * rq clock:
+ */
+ __raw_spin_lock(&scd->lock);
+ scd->prev_raw = now;
+ scd->clock += delta_ns;
+ __raw_spin_unlock(&scd->lock);
+
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+
+#endif
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ * This is default implementation.
+ * Architectures and sub-architectures can override this.
+ */
+unsigned long long __attribute__((weak)) sched_clock(void)
+{
+ return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6b4a12558e8..8bb713040ac 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,11 +167,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#endif
SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
cfs_rq->nr_spread_over);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
-#endif
-#endif
}
static void print_cpu(struct seq_file *m, int cpu)
@@ -204,13 +199,6 @@ static void print_cpu(struct seq_file *m, int cpu)
PN(next_balance);
P(curr->pid);
PN(clock);
- PN(idle_clock);
- PN(prev_clock_raw);
- P(clock_warps);
- P(clock_overflows);
- P(clock_underflows);
- P(clock_deep_idle_events);
- PN(clock_max_delta);
P(cpu_load[0]);
P(cpu_load[1]);
P(cpu_load[2]);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 89fa32b4edf..08ae848b71d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
#endif
/*
- * delta *= w / rw
- */
-static inline unsigned long
-calc_delta_weight(unsigned long delta, struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- delta = calc_delta_mine(delta,
- se->load.weight, &cfs_rq_of(se)->load);
- }
-
- return delta;
-}
-
-/*
- * delta *= rw / w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- delta = calc_delta_mine(delta,
- cfs_rq_of(se)->load.weight, &se->load);
- }
-
- return delta;
-}
-
-/*
* The idea is to set a period in which each task runs once.
*
* When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long nr_running)
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+ u64 slice = __sched_period(cfs_rq->nr_running);
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ slice *= se->load.weight;
+ do_div(slice, cfs_rq->load.weight);
+ }
+
+
+ return slice;
}
/*
* We calculate the vruntime slice of a to be inserted task
*
- * vs = s*rw/w = p
+ * vs = s/w = p/rw
*/
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long nr_running = cfs_rq->nr_running;
+ unsigned long weight;
+ u64 vslice;
if (!se->on_rq)
nr_running++;
- return __sched_period(nr_running);
-}
-
-/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- * -20 |
- * |
- * 0 --------+-------
- * .'
- * 19 .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
- struct load_weight lw = {
- .weight = NICE_0_LOAD,
- .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
- };
+ vslice = __sched_period(nr_running);
for_each_sched_entity(se) {
- struct load_weight *se_lw = &se->load;
+ cfs_rq = cfs_rq_of(se);
- if (se->load.weight < NICE_0_LOAD)
- se_lw = &lw;
+ weight = cfs_rq->load.weight;
+ if (!se->on_rq)
+ weight += se->load.weight;
- delta = calc_delta_mine(delta,
- cfs_rq_of(se)->load.weight, se_lw);
+ vslice *= NICE_0_LOAD;
+ do_div(vslice, weight);
}
- return delta;
+ return vslice;
}
/*
@@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
- delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+ delta_exec_weighted = delta_exec;
+ if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+ delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+ &curr->load);
+ }
curr->vruntime += delta_exec_weighted;
}
@@ -541,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
- cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- inc_cpu_load(rq_of(cfs_rq), se->load.weight);
- if (entity_is_task(se))
- add_cfs_task_weight(cfs_rq, se->load.weight);
cfs_rq->nr_running++;
se->on_rq = 1;
list_add(&se->group_node, &cfs_rq->tasks);
@@ -571,10 +523,6 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- dec_cpu_load(rq_of(cfs_rq), se->load.weight);
- if (entity_is_task(se))
- add_cfs_task_weight(cfs_rq, -se->load.weight);
cfs_rq->nr_running--;
se->on_rq = 0;
list_del_init(&se->group_node);
@@ -661,12 +609,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
if (!initial) {
/* sleeps upto a single latency don't count. */
- if (sched_feat(NEW_FAIR_SLEEPERS)) {
- if (sched_feat(NORMALIZED_SLEEPER))
- vruntime -= calc_delta_weight(sysctl_sched_latency, se);
- else
- vruntime -= sysctl_sched_latency;
- }
+ if (sched_feat(NEW_FAIR_SLEEPERS))
+ vruntime -= sysctl_sched_latency;
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
@@ -682,6 +626,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ account_entity_enqueue(cfs_rq, se);
if (wakeup) {
place_entity(cfs_rq, se, 0);
@@ -692,7 +637,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
- account_entity_enqueue(cfs_rq, se);
}
static void update_avg(u64 *avg, u64 sample)
@@ -841,8 +785,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
- if (queued)
- return resched_task(rq_of(cfs_rq)->curr);
+ if (queued) {
+ resched_task(rq_of(cfs_rq)->curr);
+ return;
+ }
/*
* don't let the period tick interfere with the hrtick preemption
*/
@@ -957,7 +903,7 @@ static void yield_task_fair(struct rq *rq)
return;
if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
- __update_rq_clock(rq);
+ update_rq_clock(rq);
/*
* Update run-time statistics of the 'current'.
*/
@@ -1007,7 +953,7 @@ static int wake_idle(int cpu, struct task_struct *p)
* sibling runqueue info. This will avoid the checks and cache miss
* penalities associated with that.
*/
- if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
+ if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
@@ -1050,16 +996,27 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
struct task_struct *curr = this_rq->curr;
unsigned long tl = this_load;
unsigned long tl_per_task;
+ int balanced;
- if (!(this_sd->flags & SD_WAKE_AFFINE))
+ if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
/*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync)
+ tl -= current->se.load.weight;
+
+ balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+
+ /*
* If the currently running task will sleep within
* a reasonable amount of time then attract this newly
* woken task:
*/
- if (sync && curr->sched_class == &fair_sched_class) {
+ if (sync && balanced && curr->sched_class == &fair_sched_class) {
if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
p->se.avg_overlap < sysctl_sched_migration_cost)
return 1;
@@ -1068,16 +1025,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync)
- tl -= current->se.load.weight;
-
if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
- 100*(tl + p->se.load.weight) <= imbalance*load) {
+ balanced) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
@@ -1162,10 +1111,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
unsigned long gran = sysctl_sched_wakeup_granularity;
/*
- * More easily preempt - nice tasks, while not making it harder for
- * + nice tasks.
+ * More easily preempt - nice tasks, while not making
+ * it harder for + nice tasks.
*/
- gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+ if (unlikely(se->load.weight > NICE_0_LOAD))
+ gran = calc_delta_fair(gran, &se->load);
return gran;
}
@@ -1359,90 +1309,75 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
}
-static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
- struct cfs_rq *cfs_rq)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
- struct rq_iterator cfs_rq_iterator;
+ struct sched_entity *curr;
+ struct task_struct *p;
- cfs_rq_iterator.start = load_balance_start_fair;
- cfs_rq_iterator.next = load_balance_next_fair;
- cfs_rq_iterator.arg = cfs_rq;
+ if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+ return MAX_PRIO;
- return balance_tasks(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, all_pinned,
- this_best_prio, &cfs_rq_iterator);
+ curr = cfs_rq->curr;
+ if (!curr)
+ curr = __pick_next_entity(cfs_rq);
+
+ p = task_of(curr);
+
+ return p->prio;
}
+#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
{
+ struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
- int busiest_cpu = cpu_of(busiest);
- struct task_group *tg;
-
- rcu_read_lock();
- list_for_each_entry(tg, &task_groups, list) {
- long imbalance;
- unsigned long this_weight, busiest_weight;
- long rem_load, max_load, moved_load;
-
- /*
- * empty group
- */
- if (!aggregate(tg, sd)->task_weight)
- continue;
-
- rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
- rem_load /= aggregate(tg, sd)->load + 1;
-
- this_weight = tg->cfs_rq[this_cpu]->task_weight;
- busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+ struct rq_iterator cfs_rq_iterator;
- imbalance = (busiest_weight - this_weight) / 2;
+ cfs_rq_iterator.start = load_balance_start_fair;
+ cfs_rq_iterator.next = load_balance_next_fair;
- if (imbalance < 0)
- imbalance = busiest_weight;
+ for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct cfs_rq *this_cfs_rq;
+ long imbalance;
+ unsigned long maxload;
- max_load = max(rem_load, imbalance);
- moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
- max_load, sd, idle, all_pinned, this_best_prio,
- tg->cfs_rq[busiest_cpu]);
+ this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
- if (!moved_load)
+ imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+ /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+ if (imbalance <= 0)
continue;
- move_group_shares(tg, sd, busiest_cpu, this_cpu);
+ /* Don't pull more than imbalance/2 */
+ imbalance /= 2;
+ maxload = min(rem_load_move, imbalance);
- moved_load *= aggregate(tg, sd)->load;
- moved_load /= aggregate(tg, sd)->rq_weight + 1;
+ *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+# define maxload rem_load_move
+#endif
+ /*
+ * pass busy_cfs_rq argument into
+ * load_balance_[start|next]_fair iterators
+ */
+ cfs_rq_iterator.arg = busy_cfs_rq;
+ rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+ maxload, sd, idle, all_pinned,
+ this_best_prio,
+ &cfs_rq_iterator);
- rem_load_move -= moved_load;
- if (rem_load_move < 0)
+ if (rem_load_move <= 0)
break;
}
- rcu_read_unlock();
return max_load_move - rem_load_move;
}
-#else
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
-{
- return __load_balance_fair(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, all_pinned,
- this_best_prio, &busiest->cfs);
-}
-#endif
static int
move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
@@ -1611,30 +1546,6 @@ static const struct sched_class fair_sched_class = {
};
#ifdef CONFIG_SCHED_DEBUG
-static void
-print_cfs_rq_tasks(struct seq_file *m, struct cfs_rq *cfs_rq, int depth)
-{
- struct sched_entity *se;
-
- if (!cfs_rq)
- return;
-
- list_for_each_entry_rcu(se, &cfs_rq->tasks, group_node) {
- int i;
-
- for (i = depth; i; i--)
- seq_puts(m, " ");
-
- seq_printf(m, "%lu %s %lu\n",
- se->load.weight,
- entity_is_task(se) ? "T" : "G",
- calc_delta_weight(SCHED_LOAD_SCALE, se)
- );
- if (!entity_is_task(se))
- print_cfs_rq_tasks(m, group_cfs_rq(se), depth + 1);
- }
-}
-
static void print_cfs_stats(struct seq_file *m, int cpu)
{
struct cfs_rq *cfs_rq;
@@ -1642,9 +1553,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
rcu_read_lock();
for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
print_cfs_rq(m, cpu, cfs_rq);
-
- seq_printf(m, "\nWeight tree:\n");
- print_cfs_rq_tasks(m, &cpu_rq(cpu)->cfs, 1);
rcu_read_unlock();
}
#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 2bcafa37563..3a4f92dbbe6 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
-const struct sched_class idle_sched_class = {
+static const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c2730a5a4f0..0f3c19197fa 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -250,7 +250,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
spin_unlock(&rt_rq->rt_runtime_lock);
- }
+ } else if (rt_rq->rt_nr_running)
+ idle = 0;
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
@@ -449,13 +450,19 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
#endif
}
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se);
- if (group_rq && rt_rq_throttled(group_rq))
+ /*
+ * Don't enqueue the group if it's throttled, or when empty.
+ * The latter is a consequence of the former when a child group
+ * gets throttled and the current group doesn't have any other
+ * active members.
+ */
+ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
return;
list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -464,7 +471,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
inc_rt_tasks(rt_se, rt_rq);
}
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
@@ -480,11 +487,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
* Because the prio of an upper entry depends on the lower
* entries, we must remove entries top - down.
*/
-static void dequeue_rt_stack(struct task_struct *p)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
- struct sched_rt_entity *rt_se, *back = NULL;
+ struct sched_rt_entity *back = NULL;
- rt_se = &p->rt;
for_each_sched_rt_entity(rt_se) {
rt_se->back = back;
back = rt_se;
@@ -492,7 +498,26 @@ static void dequeue_rt_stack(struct task_struct *p)
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
- dequeue_rt_entity(rt_se);
+ __dequeue_rt_entity(rt_se);
+ }
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+{
+ dequeue_rt_stack(rt_se);
+ for_each_sched_rt_entity(rt_se)
+ __enqueue_rt_entity(rt_se);
+}
+
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+{
+ dequeue_rt_stack(rt_se);
+
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq && rt_rq->rt_nr_running)
+ __enqueue_rt_entity(rt_se);
}
}
@@ -506,36 +531,15 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
if (wakeup)
rt_se->timeout = 0;
- dequeue_rt_stack(p);
-
- /*
- * enqueue everybody, bottom - up.
- */
- for_each_sched_rt_entity(rt_se)
- enqueue_rt_entity(rt_se);
-
- inc_cpu_load(rq, p->se.load.weight);
+ enqueue_rt_entity(rt_se);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
{
struct sched_rt_entity *rt_se = &p->rt;
- struct rt_rq *rt_rq;
update_curr_rt(rq);
-
- dequeue_rt_stack(p);
-
- /*
- * re-enqueue all non-empty rt_rq entities.
- */
- for_each_sched_rt_entity(rt_se) {
- rt_rq = group_rt_rq(rt_se);
- if (rt_rq && rt_rq->rt_nr_running)
- enqueue_rt_entity(rt_se);
- }
-
- dec_cpu_load(rq, p->se.load.weight);
+ dequeue_rt_entity(rt_se);
}
/*
@@ -546,8 +550,10 @@ static
void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
struct rt_prio_array *array = &rt_rq->active;
+ struct list_head *queue = array->queue + rt_se_prio(rt_se);
- list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+ if (on_rt_rq(rt_se))
+ list_move_tail(&rt_se->run_list, queue);
}
static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -1098,11 +1104,14 @@ static void post_schedule_rt(struct rq *rq)
}
}
-
+/*
+ * If we are not running and we are not going to reschedule soon, we should
+ * try to push tasks away now
+ */
static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
- (p->prio >= rq->rt.highest_prio) &&
+ !test_tsk_need_resched(rq->curr) &&
rq->rt.overloaded)
push_rt_tasks(rq);
}
@@ -1309,7 +1318,7 @@ static void set_curr_task_rt(struct rq *rq)
p->se.exec_start = rq->clock;
}
-const struct sched_class rt_sched_class = {
+static const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 5bae2e0c3ff..80179ef7450 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -67,6 +67,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
preempt_enable();
#endif
}
+ kfree(mask_str);
return 0;
}
@@ -197,6 +198,9 @@ static inline void sched_info_queued(struct task_struct *t)
/*
* Called when a process ceases being the active-running process, either
* voluntarily or involuntarily. Now we can calculate how long we ran.
+ * Also, if the process is still in the TASK_RUNNING state, call
+ * sched_info_queued() to mark that it has now again started waiting on
+ * the runqueue.
*/
static inline void sched_info_depart(struct task_struct *t)
{
@@ -205,6 +209,9 @@ static inline void sched_info_depart(struct task_struct *t)
t->sched_info.cpu_time += delta;
rq_sched_info_depart(task_rq(t), delta);
+
+ if (t->state == TASK_RUNNING)
+ sched_info_queued(t);
}
/*
diff --git a/kernel/signal.c b/kernel/signal.c
index 72bb4f51f96..6c0958e52ea 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -231,6 +231,40 @@ void flush_signals(struct task_struct *t)
spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
+static void __flush_itimer_signals(struct sigpending *pending)
+{
+ sigset_t signal, retain;
+ struct sigqueue *q, *n;
+
+ signal = pending->signal;
+ sigemptyset(&retain);
+
+ list_for_each_entry_safe(q, n, &pending->list, list) {
+ int sig = q->info.si_signo;
+
+ if (likely(q->info.si_code != SI_TIMER)) {
+ sigaddset(&retain, sig);
+ } else {
+ sigdelset(&signal, sig);
+ list_del_init(&q->list);
+ __sigqueue_free(q);
+ }
+ }
+
+ sigorsets(&pending->signal, &signal, &retain);
+}
+
+void flush_itimer_signals(void)
+{
+ struct task_struct *tsk = current;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->sighand->siglock, flags);
+ __flush_itimer_signals(&tsk->pending);
+ __flush_itimer_signals(&tsk->signal->shared_pending);
+ spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+}
+
void ignore_signals(struct task_struct *t)
{
int i;
@@ -1240,17 +1274,22 @@ void sigqueue_free(struct sigqueue *q)
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
/*
- * If the signal is still pending remove it from the
- * pending queue. We must hold ->siglock while testing
- * q->list to serialize with collect_signal().
+ * We must hold ->siglock while testing q->list
+ * to serialize with collect_signal() or with
+ * __exit_signal()->flush_sigqueue().
*/
spin_lock_irqsave(lock, flags);
+ q->flags &= ~SIGQUEUE_PREALLOC;
+ /*
+ * If it is queued it will be freed when dequeued,
+ * like the "regular" sigqueue.
+ */
if (!list_empty(&q->list))
- list_del_init(&q->list);
+ q = NULL;
spin_unlock_irqrestore(lock, flags);
- q->flags &= ~SIGQUEUE_PREALLOC;
- __sigqueue_free(q);
+ if (q)
+ __sigqueue_free(q);
}
int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 01b6522fd92..a272d78185e 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -49,12 +49,17 @@ static unsigned long get_timestamp(int this_cpu)
return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
}
-void touch_softlockup_watchdog(void)
+static void __touch_softlockup_watchdog(void)
{
int this_cpu = raw_smp_processor_id();
__raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu);
}
+
+void touch_softlockup_watchdog(void)
+{
+ __raw_get_cpu_var(touch_timestamp) = 0;
+}
EXPORT_SYMBOL(touch_softlockup_watchdog);
void touch_all_softlockup_watchdogs(void)
@@ -80,7 +85,7 @@ void softlockup_tick(void)
unsigned long now;
if (touch_timestamp == 0) {
- touch_softlockup_watchdog();
+ __touch_softlockup_watchdog();
return;
}
@@ -95,7 +100,7 @@ void softlockup_tick(void)
/* do not print during early bootup: */
if (unlikely(system_state != SYSTEM_RUNNING)) {
- touch_softlockup_watchdog();
+ __touch_softlockup_watchdog();
return;
}
@@ -115,6 +120,7 @@ void softlockup_tick(void)
printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
this_cpu, now - touch_timestamp,
current->comm, task_pid_nr(current));
+ print_modules();
if (regs)
show_regs(regs);
else
@@ -214,7 +220,7 @@ static int watchdog(void *__bind_cpu)
sched_setscheduler(current, SCHED_FIFO, &param);
/* initialize timestamp */
- touch_softlockup_watchdog();
+ __touch_softlockup_watchdog();
set_current_state(TASK_INTERRUPTIBLE);
/*
@@ -223,7 +229,7 @@ static int watchdog(void *__bind_cpu)
* debug-printout triggers in softlockup_tick().
*/
while (!kthread_should_stop()) {
- touch_softlockup_watchdog();
+ __touch_softlockup_watchdog();
schedule();
if (kthread_should_stop())
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0101aeef7ed..b7350bbfb07 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -62,8 +62,7 @@ static int stopmachine(void *cpu)
* help our sisters onto their CPUs. */
if (!prepared && !irqs_disabled)
yield();
- else
- cpu_relax();
+ cpu_relax();
}
/* Ack: we are exiting. */
@@ -106,8 +105,10 @@ static int stop_machine(void)
}
/* Wait for them all to come to life. */
- while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
+ while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) {
yield();
+ cpu_relax();
+ }
/* If some failed, kill them all. */
if (ret < 0) {
diff --git a/kernel/sys.c b/kernel/sys.c
index 895d2d4c949..14e97282eb6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1652,7 +1652,7 @@ asmlinkage long sys_umask(int mask)
asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
- long uninitialized_var(error);
+ long error = 0;
if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
return error;
@@ -1701,9 +1701,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
error = PR_TIMING_STATISTICAL;
break;
case PR_SET_TIMING:
- if (arg2 == PR_TIMING_STATISTICAL)
- error = 0;
- else
+ if (arg2 != PR_TIMING_STATISTICAL)
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d7ffdc59816..29116652dca 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -81,6 +81,7 @@ extern int compat_log;
extern int maps_protect;
extern int sysctl_stat_interval;
extern int latencytop_enabled;
+extern int sysctl_nr_open_min, sysctl_nr_open_max;
/* Constants used for minimum and maximum */
#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
@@ -1190,7 +1191,9 @@ static struct ctl_table fs_table[] = {
.data = &sysctl_nr_open,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &sysctl_nr_open_min,
+ .extra2 = &sysctl_nr_open_max,
},
{
.ctl_name = FS_DENTRY,
diff --git a/kernel/time.c b/kernel/time.c
index cbe0d5a222f..6a08660b4fa 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -246,7 +246,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
- return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
+ return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32;
# else
return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN;
# endif
@@ -262,7 +262,7 @@ unsigned int inline jiffies_to_usecs(const unsigned long j)
return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
- return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
+ return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
# else
return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
# endif
@@ -476,7 +476,7 @@ unsigned long msecs_to_jiffies(const unsigned int m)
if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
return MAX_JIFFY_OFFSET;
- return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
+ return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
>> MSEC_TO_HZ_SHR32;
#endif
}
@@ -491,7 +491,7 @@ unsigned long usecs_to_jiffies(const unsigned int u)
#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
return u * (HZ / USEC_PER_SEC);
#else
- return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
+ return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
>> USEC_TO_HZ_SHR32;
#endif
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 73961f35fdc..dadde5361f3 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -471,10 +471,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
/*
* Sysfs setup bits:
*/
-static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
+static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
sysfs_override_clocksource);
-static SYSDEV_ATTR(available_clocksource, 0600,
+static SYSDEV_ATTR(available_clocksource, 0444,
sysfs_show_available_clocksources, NULL);
static struct sysdev_class clocksource_sysclass = {
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index 41468035473..eb51d76e058 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -1,7 +1,7 @@
#!/usr/bin/perl
# -----------------------------------------------------------------------
#
-# Copyright 2007 rPath, Inc. - All Rights Reserved
+# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
#
# This file is part of the Linux kernel, and is made available under
# the terms of the GNU General Public License version 2 or (at your
@@ -20,198 +20,138 @@
%canned_values = (
24 => [
'0xa6aaaaab','0x2aaaaaa',26,
- '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58,
125,3,
'0xc49ba5e4','0x1fbe76c8b4',37,
- '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69,
3,125,
'0xa2c2aaab','0xaaaa',16,
- '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48,
125000,3,
'0xc9539b89','0x7fffbce4217d',47,
- '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79,
3,125000,
], 32 => [
'0xfa000000','0x6000000',27,
- '0xfa00000000000000','0x600000000000000',59,
125,4,
'0x83126e98','0xfdf3b645a',36,
- '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68,
4,125,
'0xf4240000','0x0',17,
- '0xf424000000000000','0x0',49,
31250,1,
'0x8637bd06','0x3fff79c842fa',46,
- '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78,
1,31250,
], 48 => [
'0xa6aaaaab','0x6aaaaaa',27,
- '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59,
125,6,
'0xc49ba5e4','0xfdf3b645a',36,
- '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68,
6,125,
'0xa2c2aaab','0x15555',17,
- '0xa2c2aaaaaaaaaaab','0x1555555555555',49,
62500,3,
'0xc9539b89','0x3fffbce4217d',46,
- '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78,
3,62500,
], 64 => [
'0xfa000000','0xe000000',28,
- '0xfa00000000000000','0xe00000000000000',60,
125,8,
'0x83126e98','0x7ef9db22d',35,
- '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67,
8,125,
'0xf4240000','0x0',18,
- '0xf424000000000000','0x0',50,
15625,1,
'0x8637bd06','0x1fff79c842fa',45,
- '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77,
1,15625,
], 100 => [
'0xa0000000','0x0',28,
- '0xa000000000000000','0x0',60,
10,1,
'0xcccccccd','0x733333333',35,
- '0xcccccccccccccccd','0x73333333333333333',67,
1,10,
'0x9c400000','0x0',18,
- '0x9c40000000000000','0x0',50,
10000,1,
'0xd1b71759','0x1fff2e48e8a7',45,
- '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77,
1,10000,
], 122 => [
'0x8325c53f','0xfbcda3a',28,
- '0x8325c53ef368eb05','0xfbcda3ac10c9714',60,
500,61,
'0xf9db22d1','0x7fbe76c8b',35,
- '0xf9db22d0e560418a','0x7fbe76c8b43958106',67,
61,500,
'0x8012e2a0','0x3ef36',18,
- '0x8012e29f79b47583','0x3ef368eb04325',50,
500000,61,
'0xffda4053','0x1ffffbce4217',45,
- '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77,
61,500000,
], 128 => [
'0xfa000000','0x1e000000',29,
- '0xfa00000000000000','0x1e00000000000000',61,
125,16,
'0x83126e98','0x3f7ced916',34,
- '0x83126e978d4fdf3c','0x3f7ced916872b020c',66,
16,125,
'0xf4240000','0x40000',19,
- '0xf424000000000000','0x4000000000000',51,
15625,2,
'0x8637bd06','0xfffbce4217d',44,
- '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76,
2,15625,
], 200 => [
'0xa0000000','0x0',29,
- '0xa000000000000000','0x0',61,
5,1,
'0xcccccccd','0x333333333',34,
- '0xcccccccccccccccd','0x33333333333333333',66,
1,5,
'0x9c400000','0x0',19,
- '0x9c40000000000000','0x0',51,
5000,1,
'0xd1b71759','0xfff2e48e8a7',44,
- '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76,
1,5000,
], 250 => [
'0x80000000','0x0',29,
- '0x8000000000000000','0x0',61,
4,1,
'0x80000000','0x180000000',33,
- '0x8000000000000000','0x18000000000000000',65,
1,4,
'0xfa000000','0x0',20,
- '0xfa00000000000000','0x0',52,
4000,1,
'0x83126e98','0x7ff7ced9168',43,
- '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75,
1,4000,
], 256 => [
'0xfa000000','0x3e000000',30,
- '0xfa00000000000000','0x3e00000000000000',62,
125,32,
'0x83126e98','0x1fbe76c8b',33,
- '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65,
32,125,
'0xf4240000','0xc0000',20,
- '0xf424000000000000','0xc000000000000',52,
15625,4,
'0x8637bd06','0x7ffde7210be',43,
- '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75,
4,15625,
], 300 => [
'0xd5555556','0x2aaaaaaa',30,
- '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62,
10,3,
'0x9999999a','0x1cccccccc',33,
- '0x999999999999999a','0x1cccccccccccccccc',65,
3,10,
'0xd0555556','0xaaaaa',20,
- '0xd055555555555556','0xaaaaaaaaaaaaa',52,
10000,3,
'0x9d495183','0x7ffcb923a29',43,
- '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75,
3,10000,
], 512 => [
'0xfa000000','0x7e000000',31,
- '0xfa00000000000000','0x7e00000000000000',63,
125,64,
'0x83126e98','0xfdf3b645',32,
- '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64,
64,125,
'0xf4240000','0x1c0000',21,
- '0xf424000000000000','0x1c000000000000',53,
15625,8,
'0x8637bd06','0x3ffef39085f',42,
- '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74,
8,15625,
], 1000 => [
'0x80000000','0x0',31,
- '0x8000000000000000','0x0',63,
1,1,
'0x80000000','0x0',31,
- '0x8000000000000000','0x0',63,
1,1,
'0xfa000000','0x0',22,
- '0xfa00000000000000','0x0',54,
1000,1,
'0x83126e98','0x1ff7ced9168',41,
- '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73,
1,1000,
], 1024 => [
'0xfa000000','0xfe000000',32,
- '0xfa00000000000000','0xfe00000000000000',64,
125,128,
'0x83126e98','0x7ef9db22',31,
- '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63,
128,125,
'0xf4240000','0x3c0000',22,
- '0xf424000000000000','0x3c000000000000',54,
15625,16,
'0x8637bd06','0x1fff79c842f',41,
- '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73,
16,15625,
], 1200 => [
'0xd5555556','0xd5555555',32,
- '0xd555555555555556','0xd555555555555555',64,
5,6,
'0x9999999a','0x66666666',31,
- '0x999999999999999a','0x6666666666666666',63,
6,5,
'0xd0555556','0x2aaaaa',22,
- '0xd055555555555556','0x2aaaaaaaaaaaaa',54,
2500,3,
'0x9d495183','0x1ffcb923a29',41,
- '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73,
3,2500,
]
);
@@ -264,6 +204,15 @@ sub fmuls($$$) {
return 0;
}
+# Generate a hex value if the result fits in 64 bits;
+# otherwise skip.
+sub bignum_hex($) {
+ my($x) = @_;
+ my $s = $x->as_hex();
+
+ return (length($s) > 18) ? undef : $s;
+}
+
# Provides mul, adj, and shr factors for a specific
# (bit, time, hz) combination
sub muladj($$$) {
@@ -271,7 +220,7 @@ sub muladj($$$) {
my $s = fmuls($b, $t, $hz);
my $m = fmul($s, $t, $hz);
my $a = fadj($s, $t, $hz);
- return ($m->as_hex(), $a->as_hex(), $s);
+ return (bignum_hex($m), bignum_hex($a), $s);
}
# Provides numerator, denominator values
@@ -288,12 +237,10 @@ sub conversions($$) {
# HZ_TO_xx
push(@val, muladj(32, $t, $hz));
- push(@val, muladj(64, $t, $hz));
push(@val, numden($t, $hz));
# xx_TO_HZ
push(@val, muladj(32, $hz, $t));
- push(@val, muladj(64, $hz, $t));
push(@val, numden($hz, $t));
return @val;
@@ -318,6 +265,19 @@ sub compute_values($) {
return @val;
}
+sub outputval($$)
+{
+ my($name, $val) = @_;
+ my $csuf;
+
+ if (defined($val)) {
+ if ($name !~ /SHR/) {
+ $val = "U64_C($val)";
+ }
+ printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
+ }
+}
+
sub output($@)
{
my($hz, @val) = @_;
@@ -331,6 +291,7 @@ sub output($@)
print "\n";
print "#include <linux/param.h>\n";
+ print "#include <linux/types.h>\n";
print "\n";
print "#if HZ != $hz\n";
@@ -340,15 +301,13 @@ sub output($@)
foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
'HZ_TO_USEC','USEC_TO_HZ') {
- foreach $bit (32, 64) {
+ foreach $bit (32) {
foreach $suf ('MUL', 'ADJ', 'SHR') {
- printf "#define %-23s %s\n",
- "${pfx}_$suf$bit", shift(@val);
+ outputval("${pfx}_$suf$bit", shift(@val));
}
}
foreach $suf ('NUM', 'DEN') {
- printf "#define %-23s %s\n",
- "${pfx}_$suf", shift(@val);
+ outputval("${pfx}_$suf", shift(@val));
}
}
@@ -356,6 +315,23 @@ sub output($@)
print "#endif /* KERNEL_TIMECONST_H */\n";
}
+# Pretty-print Perl values
+sub perlvals(@) {
+ my $v;
+ my @l = ();
+
+ foreach $v (@_) {
+ if (!defined($v)) {
+ push(@l, 'undef');
+ } elsif ($v =~ /^0x/) {
+ push(@l, "\'".$v."\'");
+ } else {
+ push(@l, $v.'');
+ }
+ }
+ return join(',', @l);
+}
+
($hz) = @ARGV;
# Use this to generate the %canned_values structure
@@ -373,15 +349,15 @@ if ($hz eq '--can') {
print "$pf$hz => [\n";
while (scalar(@values)) {
my $bit;
- foreach $bit (32, 64) {
+ foreach $bit (32) {
my $m = shift(@values);
my $a = shift(@values);
my $s = shift(@values);
- print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n";
+ print "\t\t", perlvals($m,$a,$s), ",\n";
}
my $n = shift(@values);
my $d = shift(@values);
- print "\t\t",$n,',',$d,",\n";
+ print "\t\t", perlvals($n,$d), ",\n";
}
print "\t]";
$pf = ', ';
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 29fc39f1029..ce7799540c9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -13,7 +13,7 @@
* Kai Petzke <wpp@marie.physik.tu-berlin.de>
* Theodore Ts'o <tytso@mit.edu>
*
- * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>.
+ * Made to use alloc_percpu by Christoph Lameter.
*/
#include <linux/module.h>