From d58e6576b0deec6f0b9ff8450fe282da18c50883 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Oct 2009 20:40:43 +0200 Subject: futex: Handle spurious wake up The futex code does not handle spurious wake up in futex_wait and futex_wait_requeue_pi. The code assumes that any wake up which was not caused by futex_wake / requeue or by a timeout was caused by a signal wake up and returns one of the syscall restart error codes. In case of a spurious wake up the signal delivery code which deals with the restart error codes is not invoked and we return that error code to user space. That causes applications which actually check the return codes to fail. Blaise reported that on preempt-rt a Python test program ran into an exception trap. -rt exposed that due to a built in spurious wake up accelerator :) Solve this by checking signal_pending(current) in the wake up path and handling the spurious wake up case w/o returning to user space. Reported-by: Blaise Gassend Debugged-by: Darren Hart Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: stable@kernel.org LKML-Reference: --- kernel/futex.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4949d336d88..5c88839bd99 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1791,6 +1791,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, current->timer_slack_ns); } +retry: /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) @@ -1808,9 +1809,14 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out_put_key; /* - * We expect signal_pending(current), but another thread may - * have handled it for us already. + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. 
*/ + if (!signal_pending(current)) { + put_futex_key(fshared, &q.key); + goto retry; + } + ret = -ERESTARTSYS; if (!abs_time) goto out_put_key; @@ -2118,9 +2124,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, */ plist_del(&q->list, &q->list.plist); + /* Handle spurious wakeups gracefully */ + ret = -EAGAIN; if (timeout && !timeout->task) ret = -ETIMEDOUT; - else + else if (signal_pending(current)) ret = -ERESTARTNOINTR; } return ret; @@ -2198,6 +2206,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; +retry: key2 = FUTEX_KEY_INIT; ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2292,6 +2301,9 @@ out_put_keys: out_key2: put_futex_key(fshared, &key2); + /* Spurious wakeup ? */ + if (ret == -EAGAIN) + goto retry; out: if (to) { hrtimer_cancel(&to->timer); -- cgit v1.2.3 From 2bc872036e1c5948b5b02942810bbdd8dbdb9812 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 14 Oct 2009 10:12:39 -0700 Subject: futex: Check for NULL keys in match_futex If userspace tries to perform a requeue_pi on a non-requeue_pi waiter, it will find the futex_q->requeue_pi_key to be NULL and OOPS. Check for NULL in match_futex() instead of doing explicit NULL pointer checks on all call sites. While match_futex(NULL, NULL) returning false is a little odd, it's still correct as we expect valid key references. 
Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Ingo Molnar CC: Eric Dumazet CC: Dinakar Guniguntala CC: John Stultz Cc: stable@kernel.org LKML-Reference: <4AD60687.10306@us.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5c88839bd99..06938e560ac 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) */ static inline int match_futex(union futex_key *key1, union futex_key *key2) { - return (key1->both.word == key2->both.word + return (key1 && key2 + && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } -- cgit v1.2.3 From 37c72e56f6b234ea7387ba530434a80abf2658d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Oct 2009 10:15:55 -0700 Subject: rcu: Prevent RCU IPI storms in presence of high call_rcu() load As the number of callbacks on a given CPU rises, invoke force_quiescent_state() only every blimit number of callbacks (defaults to 10,000), and even then only if no other CPU has invoked force_quiescent_state() in the meantime. This should fix the performance regression reported by Nick. Reported-by: Nick Piggin Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: jens.axboe@oracle.com LKML-Reference: <12555405592133-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 29 ++++++++++++++++++++++++----- kernel/rcutree.h | 4 ++++ 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 705f02ac743..ddbf111e9e1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -958,7 +958,7 @@ static void rcu_offline_cpu(int cpu) * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. */ -static void rcu_do_batch(struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; @@ -1011,6 +1011,13 @@ static void rcu_do_batch(struct rcu_data *rdp) if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; + /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ + if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = rdp->qlen; + local_irq_restore(flags); /* Re-raise the RCU softirq if there are callbacks remaining. */ @@ -1224,7 +1231,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* If there are callbacks ready, invoke them. */ - rcu_do_batch(rdp); + rcu_do_batch(rsp, rdp); } /* @@ -1288,10 +1295,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ } - /* Force the grace period if too many callbacks or too long waiting. 
*/ - if (unlikely(++rdp->qlen > qhimark)) { + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { rdp->blimit = LONG_MAX; - force_quiescent_state(rsp, 0); + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); local_irq_restore(flags); @@ -1523,6 +1540,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->beenonline = 1; /* We have now been online. */ rdp->preemptable = preemptable; rdp->passed_quiesc_completed = lastcomp - 1; + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; spin_unlock(&rnp->lock); /* irqs remain disabled. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b40ac570604..599161f309f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -167,6 +167,10 @@ struct rcu_data { struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; long qlen; /* # of queued callbacks */ + long qlen_last_fqs_check; + /* qlen at last check for QS forcing */ + unsigned long n_force_qs_snap; + /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ #ifdef CONFIG_NO_HZ -- cgit v1.2.3 From 019129d595caaa5bd0b41d128308da1be6a91869 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 14 Oct 2009 10:15:56 -0700 Subject: rcu: Stopgap fix for synchronize_rcu_expedited() for TREE_PREEMPT_RCU For the short term, map synchronize_rcu_expedited() to synchronize_rcu() for TREE_PREEMPT_RCU and to synchronize_sched_expedited() for TREE_RCU. Longer term, there needs to be a real expedited grace period for TREE_PREEMPT_RCU, but candidate patches to date are considerably more complex and intrusive. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: npiggin@suse.de Cc: jens.axboe@oracle.com LKML-Reference: <12555405592331-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree_plugin.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c0cb783aa16..ebd20ee7707 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -392,6 +392,17 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); +/* + * Wait for an rcu-preempt grace period. We are supposed to expedite the + * grace period, but this is the crude slow compatability hack, so just + * invoke synchronize_rcu(). + */ +void synchronize_rcu_expedited(void) +{ + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + /* * Check to see if there is any immediate preemptable-RCU-related work * to be done. @@ -564,6 +575,16 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); +/* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptable RCU does not exist, map to rcu-sched. 
+ */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + /* * Because preemptable RCU does not exist, it never has any work to do. */ -- cgit v1.2.3 From 237c80c5c8fb7ec128cf2a756b550dc41ad7eac7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Oct 2009 09:26:14 -0700 Subject: rcu: Fix TREE_PREEMPT_RCU CPU_HOTPLUG bad-luck hang If the following sequence of events occurs, then TREE_PREEMPT_RCU will hang waiting for a grace period to complete, eventually OOMing the system: o A TREE_PREEMPT_RCU build of the kernel is booted on a system with more than 64 physical CPUs present (32 on a 32-bit system). Alternatively, a TREE_PREEMPT_RCU build of the kernel is booted with RCU_FANOUT set to a sufficiently small value that the physical CPUs populate two or more leaf rcu_node structures. o A task is preempted in an RCU read-side critical section while running on a CPU corresponding to a given leaf rcu_node structure. o All CPUs corresponding to this same leaf rcu_node structure record quiescent states for the current grace period. o All of these same CPUs go offline (hence the need for enough physical CPUs to populate more than one leaf rcu_node structure). This causes the preempted task to be moved to the root rcu_node structure. At this point, there is nothing left to cause the quiescent state to be propagated up the rcu_node tree, so the current grace period never completes. The simplest fix, especially after considering the deadlock possibilities, is to detect this situation when the last CPU is offlined, and to set that CPU's ->qsmask bit in its leaf rcu_node structure. This will cause the next invocation of force_quiescent_state() to end the grace period. Without this fix, this hang can be triggered in an hour or so on some machines with rcutorture and random CPU onlining/offlining. With this fix, these same machines pass a full 10 hours of this sort of abuse. Signed-off-by: Paul E. 
McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <20091015162614.GA19131@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 15 ++++++++++++++- kernel/rcutree.h | 6 +++--- kernel/rcutree_plugin.h | 25 +++++++++++++++++-------- 3 files changed, 34 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ddbf111e9e1..0536125b049 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -913,7 +913,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp, rdp); + + /* + * If there was a task blocking the current grace period, + * and if all CPUs have checked in, we need to propagate + * the quiescent state up the rcu_node hierarchy. But that + * is inconvenient at the moment due to deadlock issues if + * this should end the current grace period. So set the + * offlined CPU's bit in ->qsmask in order to force the + * next force_quiescent_state() invocation to clean up this + * mess in a deadlock-free manner. + */ + if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask) + rnp->qsmask |= mask; + mask = rnp->grpmask; spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ rnp = rnp->parent; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 599161f309f..1823c6e2060 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -306,9 +306,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ebd20ee7707..ef2a58c2b9d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) * parent is to remove the need for rcu_read_unlock_special() to * make more than two attempts to acquire the target rcu_node's lock. * + * Returns 1 if there was previously a task blocking the current grace + * period on the specified rcu_node structure. + * * The caller must hold rnp->lock with irqs disabled. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { int i; struct list_head *lp; struct list_head *lp_root; + int retval = rcu_preempted_readers(rnp); struct rcu_node *rnp_root = rcu_get_root(rsp); struct task_struct *tp; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); - return; /* Shouldn't happen: at least one CPU online. */ + return 0; /* Shouldn't happen: at least one CPU online. 
*/ } WARN_ON_ONCE(rnp != rdp->mynode && (!list_empty(&rnp->blocked_tasks[0]) || @@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, spin_unlock(&rnp_root->lock); /* irqs remain disabled */ } } + + return retval; } /* @@ -532,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) /* * Because preemptable RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections. + * tasks that were blocked within RCU read-side critical sections, and + * such non-existent tasks cannot possibly have been blocking the current + * grace period. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { + return 0; } /* -- cgit v1.2.3 From 89061d3d58e1f0742139605dc6a7950aa1ecc019 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 15 Oct 2009 15:30:48 -0700 Subject: futex: Move drop_futex_key_refs out of spinlock'ed region When requeuing tasks from one futex to another, the reference held by the requeued task to the original futex location needs to be dropped eventually. Dropping the reference may ultimately lead to a call to "iput_final" and subsequently call into filesystem-specific code - which may be non-atomic. It is therefore safer to defer this drop operation until after the futex_hash_bucket spinlock has been dropped. 
Originally-From: Helge Bahmann Signed-off-by: Darren Hart Cc: Cc: Peter Zijlstra Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz Cc: Sven-Thorsten Dietrich Cc: John Kacur LKML-Reference: <4AD7A298.5040802@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 06938e560ac..642f3bbaacc 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1029,7 +1029,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - drop_futex_key_refs(&q->key); get_futex_key_refs(key); q->key = *key; @@ -1227,6 +1226,7 @@ retry_private: */ if (ret == 1) { WARN_ON(pi_state); + drop_count++; task_count++; ret = get_futex_value_locked(&curval2, uaddr2); if (!ret) @@ -1305,6 +1305,7 @@ retry_private: if (ret == 1) { /* We got the lock. */ requeue_pi_wake_futex(this, &key2, hb2); + drop_count++; continue; } else if (ret) { /* -EDEADLK */ -- cgit v1.2.3