From 8b09dee67f484e9b42114b1a1f068e080fd7aa56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcupreempt: remove duplicate prototypes rcu_batches_completed and rcu_patches_completed_bh are both declared in rcuclassic.h and rcupreempt.h. This patch removes the extra prototypes for them from rcupdate.h. rcu_batches_completed_bh is defined as a static inline in the rcupreempt.h header file. Trying to export this as EXPORT_SYMBOL_GPL causes linking problems with the powerpc linker. There's no need to export a static inlined function. Modules must be compiled with the same type of RCU implementation as the kernel they are for. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e1cdf196a51..5e02b774070 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -217,8 +217,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - void __rcu_read_lock(void) { int idx; -- cgit v1.2.3 From 4446a36ff8c74ac3b32feb009b651048e129c6af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: add call_rcu_sched() Fourth cut of patch to provide the call_rcu_sched(). This is again to synchronize_sched() as call_rcu() is to synchronize_rcu(). Should be fine for experimental and -rt use, but not ready for inclusion. With some luck, I will be able to tell Andrew to come out of hiding on the next round. Passes multi-day rcutorture sessions with concurrent CPU hotplugging. Fixes since the first version include a bug that could result in indefinite blocking (spotted by Gautham Shenoy), better resiliency against CPU-hotplug operations, and other minor fixes. Fixes since the second version include reworking grace-period detection to avoid deadlocks that could happen when running concurrently with CPU hotplug, adding Mathieu's fix to avoid the softlockup messages, as well as Mathieu's fix to allow use earlier in boot. Fixes since the third version include a wrong-CPU bug spotted by Andrew, getting rid of the obsolete synchronize_kernel API that somehow snuck back in, merging spin_unlock() and local_irq_restore() in a few places, commenting the code that checks for quiescent states based on interrupting from user-mode execution or the idle loop, removing some inline attributes, and some code-style changes. Known/suspected shortcomings: o I still do not entirely trust the sleep/wakeup logic. Next step will be to use a private snapshot of the CPU online mask in rcu_sched_grace_period() -- if the CPU wasn't there at the start of the grace period, we don't need to hear from it. And the bit about accounting for changes in online CPUs inside of rcu_sched_grace_period() is ugly anyway. o It might be good for rcu_sched_grace_period() to invoke resched_cpu() when a given CPU wasn't responding quickly, but resched_cpu() is declared static... This patch also fixes a long-standing bug in the earlier preemptable-RCU implementation of synchronize_rcu() that could result in loss of concurrent external changes to a task's CPU affinity mask. I still cannot remember who reported this... Signed-off-by: Paul E. McKenney Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/rcupdate.c | 20 +-- kernel/rcupreempt.c | 414 ++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 372 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c09605f8d16..a4e329d9288 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -39,18 +39,12 @@ #include #include #include -#include #include #include #include #include #include -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -60,7 +54,7 @@ static struct completion rcu_barrier_completion; * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. */ -static void wakeme_after_rcu(struct rcu_head *head) +void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; @@ -77,17 +71,7 @@ static void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished */ - call_rcu(&rcu.head, wakeme_after_rcu); - - /* Wait for it */ - wait_for_completion(&rcu.completion); -} +synchronize_rcu_xxx(synchronize_rcu, call_rcu) EXPORT_SYMBOL_GPL(synchronize_rcu); static void rcu_barrier_callback(struct rcu_head *notused) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 5e02b774070..aaa7976bd85 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -87,9 +88,14 @@ struct rcu_data { struct rcu_head **nexttail; struct rcu_head *waitlist[GP_STAGES]; struct rcu_head **waittail[GP_STAGES]; - struct rcu_head *donelist; + struct rcu_head *donelist; /* from waitlist & waitschedlist */ struct rcu_head **donetail; long rcu_flipctr[2]; + struct rcu_head *nextschedlist; + struct rcu_head **nextschedtail; + struct rcu_head *waitschedlist; + struct rcu_head **waitschedtail; + int rcu_sched_sleeping; #ifdef CONFIG_RCU_TRACE struct rcupreempt_trace trace; #endif /* #ifdef CONFIG_RCU_TRACE */ @@ -131,11 +137,24 @@ enum rcu_try_flip_states { rcu_try_flip_waitmb_state, }; +/* + * States for rcu_ctrlblk.rcu_sched_sleep. + */ + +enum rcu_sched_sleep_states { + rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */ + rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */ + rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */ +}; + struct rcu_ctrlblk { spinlock_t fliplock; /* Protect state-machine transitions. */ long completed; /* Number of last completed batch. */ enum rcu_try_flip_states rcu_try_flip_state; /* The current state of the rcu state machine */ + spinlock_t schedlock; /* Protect rcu_sched sleep state. */ + enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */ + wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ }; static DEFINE_PER_CPU(struct rcu_data, rcu_data); @@ -143,8 +162,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, .rcu_try_flip_state = rcu_try_flip_idle_state, + .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), + .sched_sleep = rcu_sched_not_sleeping, + .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq), }; +static struct task_struct *rcu_sched_grace_period_task; #ifdef CONFIG_RCU_TRACE static char *rcu_try_flip_state_names[] = @@ -207,6 +230,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) */ #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); +#define RCU_SCHED_BATCH_TIME (HZ / 50) + /* * Return the number of RCU batches processed thus far. Useful * for debug and statistics. @@ -411,32 +436,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } -#ifdef CONFIG_NO_HZ +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { + .dynticks = 1, +}; -DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; -static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); +#ifdef CONFIG_NO_HZ static DEFINE_PER_CPU(int, rcu_update_flag); /** * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. * * If the CPU was idle with dynamic ticks active, this updates the - * dynticks_progress_counter to let the RCU handling know that the + * rcu_dyntick_sched.dynticks to let the RCU handling know that the * CPU is active. */ void rcu_irq_enter(void) { int cpu = smp_processor_id(); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); if (per_cpu(rcu_update_flag, cpu)) per_cpu(rcu_update_flag, cpu)++; /* * Only update if we are coming from a stopped ticks mode - * (dynticks_progress_counter is even). + * (rcu_dyntick_sched.dynticks is even). */ if (!in_interrupt() && - (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { + (rdssp->dynticks & 0x1) == 0) { /* * The following might seem like we could have a race * with NMI/SMIs. But this really isn't a problem. @@ -459,12 +486,12 @@ void rcu_irq_enter(void) * RCU read-side critical sections on this CPU would * have already completed. */ - per_cpu(dynticks_progress_counter, cpu)++; + rdssp->dynticks++; /* * The following memory barrier ensures that any * rcu_read_lock() primitives in the irq handler * are seen by other CPUs to follow the above - * increment to dynticks_progress_counter. This is + * increment to rcu_dyntick_sched.dynticks. This is * required in order for other CPUs to correctly * determine when it is safe to advance the RCU * grace-period state machine. @@ -472,7 +499,7 @@ void rcu_irq_enter(void) smp_mb(); /* see above block comment. */ /* * Since we can't determine the dynamic tick mode from - * the dynticks_progress_counter after this routine, + * the rcu_dyntick_sched.dynticks after this routine, * we use a second flag to acknowledge that we came * from an idle state with ticks stopped. */ @@ -480,7 +507,7 @@ void rcu_irq_enter(void) /* * If we take an NMI/SMI now, they will also increment * the rcu_update_flag, and will not update the - * dynticks_progress_counter on exit. That is for + * rcu_dyntick_sched.dynticks on exit. That is for * this IRQ to do. */ } @@ -490,12 +517,13 @@ void rcu_irq_enter(void) * rcu_irq_exit - Called from exiting Hard irq context. * * If the CPU was idle with dynamic ticks active, update the - * dynticks_progress_counter to put let the RCU handling be + * rcu_dyntick_sched.dynticks to put let the RCU handling be * aware that the CPU is going back to idle with no ticks. */ void rcu_irq_exit(void) { int cpu = smp_processor_id(); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); /* * rcu_update_flag is set if we interrupted the CPU @@ -503,7 +531,7 @@ void rcu_irq_exit(void) * Once this occurs, we keep track of interrupt nesting * because a NMI/SMI could also come in, and we still * only want the IRQ that started the increment of the - * dynticks_progress_counter to be the one that modifies + * rcu_dyntick_sched.dynticks to be the one that modifies * it on exit. */ if (per_cpu(rcu_update_flag, cpu)) { @@ -515,28 +543,29 @@ void rcu_irq_exit(void) /* * If an NMI/SMI happens now we are still - * protected by the dynticks_progress_counter being odd. + * protected by the rcu_dyntick_sched.dynticks being odd. */ /* * The following memory barrier ensures that any * rcu_read_unlock() primitives in the irq handler * are seen by other CPUs to preceed the following - * increment to dynticks_progress_counter. This + * increment to rcu_dyntick_sched.dynticks. This * is required in order for other CPUs to determine * when it is safe to advance the RCU grace-period * state machine. */ smp_mb(); /* see above block comment. */ - per_cpu(dynticks_progress_counter, cpu)++; - WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); + rdssp->dynticks++; + WARN_ON(rdssp->dynticks & 0x1); } } static void dyntick_save_progress_counter(int cpu) { - per_cpu(rcu_dyntick_snapshot, cpu) = - per_cpu(dynticks_progress_counter, cpu); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->dynticks_snap = rdssp->dynticks; } static inline int @@ -544,9 +573,10 @@ rcu_try_flip_waitack_needed(int cpu) { long curr; long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - curr = per_cpu(dynticks_progress_counter, cpu); - snap = per_cpu(rcu_dyntick_snapshot, cpu); + curr = rdssp->dynticks; + snap = rdssp->dynticks_snap; smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* @@ -580,9 +610,10 @@ rcu_try_flip_waitmb_needed(int cpu) { long curr; long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - curr = per_cpu(dynticks_progress_counter, cpu); - snap = per_cpu(rcu_dyntick_snapshot, cpu); + curr = rdssp->dynticks; + snap = rdssp->dynticks_snap; smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* @@ -609,14 +640,86 @@ rcu_try_flip_waitmb_needed(int cpu) return 1; } +static void dyntick_save_progress_counter_sched(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_dynticks_snap = rdssp->dynticks; +} + +static int rcu_qsctr_inc_needed_dyntick(int cpu) +{ + long curr; + long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + curr = rdssp->dynticks; + snap = rdssp->sched_dynticks_snap; + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot be in the middle of an rcu_read_lock(), so + * the next rcu_read_lock() it executes must use the new value + * of the counter. Therefore, this CPU has been in a quiescent + * state the entire time, and we don't need to wait for it. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq handlers, then, as above, this CPU has already + * passed through a quiescent state. + */ + + if ((curr - snap) > 2 || (snap & 0x1) == 0) + return 0; + + /* We need this CPU to go through a quiescent state. */ + + return 1; +} + #else /* !CONFIG_NO_HZ */ -# define dyntick_save_progress_counter(cpu) do { } while (0) -# define rcu_try_flip_waitack_needed(cpu) (1) -# define rcu_try_flip_waitmb_needed(cpu) (1) +# define dyntick_save_progress_counter(cpu) do { } while (0) +# define rcu_try_flip_waitack_needed(cpu) (1) +# define rcu_try_flip_waitmb_needed(cpu) (1) + +# define dyntick_save_progress_counter_sched(cpu) do { } while (0) +# define rcu_qsctr_inc_needed_dyntick(cpu) (1) #endif /* CONFIG_NO_HZ */ +static void save_qsctr_sched(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_qs_snap = rdssp->sched_qs; +} + +static inline int rcu_qsctr_inc_needed(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + /* + * If there has been a quiescent state, no more need to wait + * on this CPU. + */ + + if (rdssp->sched_qs != rdssp->sched_qs_snap) { + smp_mb(); /* force ordering with cpu entering schedule(). */ + return 0; + } + + /* We need this CPU to go through a quiescent state. */ + + return 1; +} + /* * Get here when RCU is idle. Decide whether we need to * move out of idle state, and return non-zero if so. @@ -819,6 +922,26 @@ void rcu_check_callbacks(int cpu, int user) unsigned long flags; struct rcu_data *rdp = RCU_DATA_CPU(cpu); + /* + * If this CPU took its interrupt from user mode or from the + * idle loop, and this is not a nested interrupt, then + * this CPU has to have exited all prior preept-disable + * sections of code. So increment the counter to note this. + * + * The memory barrier is needed to handle the case where + * writes from a preempt-disable section of code get reordered + * into schedule() by this CPU's write buffer. So the memory + * barrier makes sure that the rcu_qsctr_inc() is seen by other + * CPUs to happen after any such write. + */ + + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + smp_mb(); /* Guard against aggressive schedule(). */ + rcu_qsctr_inc(cpu); + } + rcu_check_mb(cpu); if (rcu_ctrlblk.completed == rdp->completed) rcu_try_flip(); @@ -869,6 +992,8 @@ void rcu_offline_cpu(int cpu) struct rcu_head *list = NULL; unsigned long flags; struct rcu_data *rdp = RCU_DATA_CPU(cpu); + struct rcu_head *schedlist = NULL; + struct rcu_head **schedtail = &schedlist; struct rcu_head **tail = &list; /* @@ -882,6 +1007,11 @@ void rcu_offline_cpu(int cpu) rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], list, tail); rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); + rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail, + schedlist, schedtail); + rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail, + schedlist, schedtail); + rdp->rcu_sched_sleeping = 0; spin_unlock_irqrestore(&rdp->lock, flags); rdp->waitlistcount = 0; @@ -916,22 +1046,40 @@ void rcu_offline_cpu(int cpu) * fix. */ - local_irq_save(flags); + local_irq_save(flags); /* disable preempt till we know what lock. */ rdp = RCU_DATA_ME(); spin_lock(&rdp->lock); *rdp->nexttail = list; if (list) rdp->nexttail = tail; + *rdp->nextschedtail = schedlist; + if (schedlist) + rdp->nextschedtail = schedtail; spin_unlock_irqrestore(&rdp->lock, flags); } void __devinit rcu_online_cpu(int cpu) { unsigned long flags; + struct rcu_data *rdp; spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); cpu_set(cpu, rcu_cpu_online_map); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); + + /* + * The rcu_sched grace-period processing might have bypassed + * this CPU, given that it was not in the rcu_cpu_online_map + * when the grace-period scan started. This means that the + * grace-period task might sleep. So make sure that if this + * should happen, the first callback posted to this CPU will + * wake up the grace-period task if need be. + */ + + rdp = RCU_DATA_CPU(cpu); + spin_lock_irqsave(&rdp->lock, flags); + rdp->rcu_sched_sleeping = 1; + spin_unlock_irqrestore(&rdp->lock, flags); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -986,31 +1134,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) *rdp->nexttail = head; rdp->nexttail = &head->next; RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); - spin_unlock(&rdp->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&rdp->lock, flags); } EXPORT_SYMBOL_GPL(call_rcu); +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + int wake_gp = 0; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); + *rdp->nextschedtail = head; + rdp->nextschedtail = &head->next; + if (rdp->rcu_sched_sleeping) { + + /* Grace-period processing might be sleeping... */ + + rdp->rcu_sched_sleeping = 0; + wake_gp = 1; + } + spin_unlock_irqrestore(&rdp->lock, flags); + if (wake_gp) { + + /* Wake up grace-period processing, unless someone beat us. */ + + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping) + wake_gp = 0; + rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + if (wake_gp) + wake_up_interruptible(&rcu_ctrlblk.sched_wq); + } +} +EXPORT_SYMBOL_GPL(call_rcu_sched); + /* * Wait until all currently running preempt_disable() code segments * (including hardware-irq-disable segments) complete. Note that * in -rt this does -not- necessarily result in all currently executing * interrupt -handlers- having completed. */ -void __synchronize_sched(void) +synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) +EXPORT_SYMBOL_GPL(__synchronize_sched); + +/* + * kthread function that manages call_rcu_sched grace periods. + */ +static int rcu_sched_grace_period(void *arg) { - cpumask_t oldmask; + int couldsleep; /* might sleep after current pass. */ + int couldsleepnext = 0; /* might sleep after next pass. */ int cpu; + unsigned long flags; + struct rcu_data *rdp; + int ret; - if (sched_getaffinity(0, &oldmask) < 0) - oldmask = cpu_possible_map; - for_each_online_cpu(cpu) { - sched_setaffinity(0, &cpumask_of_cpu(cpu)); - schedule(); - } - sched_setaffinity(0, &oldmask); + /* + * Each pass through the following loop handles one + * rcu_sched grace period cycle. + */ + do { + /* Save each CPU's current state. */ + + for_each_online_cpu(cpu) { + dyntick_save_progress_counter_sched(cpu); + save_qsctr_sched(cpu); + } + + /* + * Sleep for about an RCU grace-period's worth to + * allow better batching and to consume less CPU. + */ + schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME); + + /* + * If there was nothing to do last time, prepare to + * sleep at the end of the current grace period cycle. + */ + couldsleep = couldsleepnext; + couldsleepnext = 1; + if (couldsleep) { + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + } + + /* + * Wait on each CPU in turn to have either visited + * a quiescent state or been in dynticks-idle mode. + */ + for_each_online_cpu(cpu) { + while (rcu_qsctr_inc_needed(cpu) && + rcu_qsctr_inc_needed_dyntick(cpu)) { + /* resched_cpu(cpu); @@@ */ + schedule_timeout_interruptible(1); + } + } + + /* Advance callbacks for each CPU. */ + + for_each_online_cpu(cpu) { + + rdp = RCU_DATA_CPU(cpu); + spin_lock_irqsave(&rdp->lock, flags); + + /* + * We are running on this CPU irq-disabled, so no + * CPU can go offline until we re-enable irqs. + * The current CPU might have already gone + * offline (between the for_each_offline_cpu and + * the spin_lock_irqsave), but in that case all its + * callback lists will be empty, so no harm done. + * + * Advance the callbacks! We share normal RCU's + * donelist, since callbacks are invoked the + * same way in either case. + */ + if (rdp->waitschedlist != NULL) { + *rdp->donetail = rdp->waitschedlist; + rdp->donetail = rdp->waitschedtail; + + /* + * Next rcu_check_callbacks() will + * do the required raise_softirq(). + */ + } + if (rdp->nextschedlist != NULL) { + rdp->waitschedlist = rdp->nextschedlist; + rdp->waitschedtail = rdp->nextschedtail; + couldsleep = 0; + couldsleepnext = 0; + } else { + rdp->waitschedlist = NULL; + rdp->waitschedtail = &rdp->waitschedlist; + } + rdp->nextschedlist = NULL; + rdp->nextschedtail = &rdp->nextschedlist; + + /* Mark sleep intention. */ + + rdp->rcu_sched_sleeping = couldsleep; + + spin_unlock_irqrestore(&rdp->lock, flags); + } + + /* If we saw callbacks on the last scan, go deal with them. */ + + if (!couldsleep) + continue; + + /* Attempt to block... */ + + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) { + + /* + * Someone posted a callback after we scanned. + * Go take care of it. + */ + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + couldsleepnext = 0; + continue; + } + + /* Block until the next person posts a callback. */ + + rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + ret = 0; + __wait_event_interruptible(rcu_ctrlblk.sched_wq, + rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, + ret); + + /* + * Signals would prevent us from sleeping, and we cannot + * do much with them in any case. So flush them. + */ + if (ret) + flush_signals(current); + couldsleepnext = 0; + + } while (!kthread_should_stop()); + + return (0); } -EXPORT_SYMBOL_GPL(__synchronize_sched); /* * Check to see if any future RCU-related work will need to be done @@ -1027,7 +1340,9 @@ int rcu_needs_cpu(int cpu) return (rdp->donelist != NULL || !!rdp->waitlistcount || - rdp->nextlist != NULL); + rdp->nextlist != NULL || + rdp->nextschedlist != NULL || + rdp->waitschedlist != NULL); } int rcu_pending(int cpu) @@ -1038,7 +1353,9 @@ int rcu_pending(int cpu) if (rdp->donelist != NULL || !!rdp->waitlistcount || - rdp->nextlist != NULL) + rdp->nextlist != NULL || + rdp->nextschedlist != NULL || + rdp->waitschedlist != NULL) return 1; /* The RCU core needs an acknowledgement from this CPU. */ @@ -1105,6 +1422,11 @@ void __init __rcu_init(void) rdp->donetail = &rdp->donelist; rdp->rcu_flipctr[0] = 0; rdp->rcu_flipctr[1] = 0; + rdp->nextschedlist = NULL; + rdp->nextschedtail = &rdp->nextschedlist; + rdp->waitschedlist = NULL; + rdp->waitschedtail = &rdp->waitschedlist; + rdp->rcu_sched_sleeping = 0; } register_cpu_notifier(&rcu_nb); @@ -1127,11 +1449,15 @@ void __init __rcu_init(void) } /* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. + * Late-boot-time RCU initialization that must wait until after scheduler + * has been initialized. */ -void synchronize_kernel(void) +void __init rcu_init_sched(void) { - synchronize_rcu(); + rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period, + NULL, + "rcu_sched_grace_period"); + WARN_ON(IS_ERR(rcu_sched_grace_period_task)); } #ifdef CONFIG_RCU_TRACE -- cgit v1.2.3 From 8db559b83009bed92e1b5dd13a651ff273d9ff62 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: add memory barriers and comments to rcu_check_callbacks() Add comments to the logic that infers quiescent states when interrupting from either user mode or the idle loop. Also add a memory barrier: it appears that James Huang was in fact onto something, as the scheduler is much less synchronization happy than it once was, so we can no longer rely on its memory barriers in all cases. Signed-off-by: Paul E. McKenney Reported-by: James Huang Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/rcuclassic.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index f4ffbd0f306..d8348792f9f 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -502,10 +502,38 @@ void rcu_check_callbacks(int cpu, int user) if (user || (idle_cpu(cpu) && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + + /* + * Get here if this CPU took its interrupt from user + * mode or from the idle loop, and if this is not a + * nested interrupt. In this case, the CPU is in + * a quiescent state, so count it. + * + * Also do a memory barrier. This is needed to handle + * the case where writes from a preempt-disable section + * of code get reordered into schedule() by this CPU's + * write buffer. The memory barrier makes sure that + * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see + * by other CPUs to happen after any such write. + */ + + smp_mb(); /* See above block comment. */ rcu_qsctr_inc(cpu); rcu_bh_qsctr_inc(cpu); - } else if (!in_softirq()) + + } else if (!in_softirq()) { + + /* + * Get here if this CPU did not take its interrupt from + * softirq, in other words, if it is not interrupting + * a rcu_bh read-side critical section. This is an _bh + * critical section, so count it. The memory barrier + * is needed for the same reason as is the above one. + */ + + smp_mb(); /* See above block comment. */ rcu_bh_qsctr_inc(cpu); + } raise_rcu_softirq(); } -- cgit v1.2.3 From 70f12f848d3e981479b4f6f751e73c14f7c13e5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: add rcu_barrier_sched() and rcu_barrier_bh() Add rcu_barrier_sched() and rcu_barrier_bh(). With these in place, rcutorture no longer gives the occasional oops when repeatedly starting and stopping torturing rcu_bh. Also adds the API needed to flush out pre-existing call_rcu_sched() callbacks. Signed-off-by: Paul E. McKenney Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/rcupdate.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a4e329d9288..4a74b8d48d9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,12 @@ #include #include +enum rcu_barrier { + RCU_BARRIER_STD, + RCU_BARRIER_BH, + RCU_BARRIER_SCHED, +}; + static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -83,19 +89,30 @@ static void rcu_barrier_callback(struct rcu_head *notused) /* * Called with preemption disabled, and from cross-cpu IRQ context. */ -static void rcu_barrier_func(void *notused) +static void rcu_barrier_func(void *type) { int cpu = smp_processor_id(); struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); atomic_inc(&rcu_barrier_cpu_count); - call_rcu(head, rcu_barrier_callback); + switch ((enum rcu_barrier)type) { + case RCU_BARRIER_STD: + call_rcu(head, rcu_barrier_callback); + break; + case RCU_BARRIER_BH: + call_rcu_bh(head, rcu_barrier_callback); + break; + case RCU_BARRIER_SCHED: + call_rcu_sched(head, rcu_barrier_callback); + break; + } } -/** - * rcu_barrier - Wait until all the in-flight RCUs are complete. +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. */ -void rcu_barrier(void) +static void _rcu_barrier(enum rcu_barrier type) { BUG_ON(in_interrupt()); /* Take cpucontrol mutex to protect against CPU hotplug */ @@ -111,13 +128,39 @@ void rcu_barrier(void) * until all the callbacks are queued. */ rcu_read_lock(); - on_each_cpu(rcu_barrier_func, NULL, 0, 1); + on_each_cpu(rcu_barrier_func, (void *)type, 0, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } + +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(RCU_BARRIER_STD); +} EXPORT_SYMBOL_GPL(rcu_barrier); +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(RCU_BARRIER_BH); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(RCU_BARRIER_SCHED); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + void __init rcu_init(void) { __rcu_init(); -- cgit v1.2.3 From 2326974df29988181b6b69ed6fbf42b17adf916f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: add call_rcu_sched() and friends to rcutorture Add entry to rcu_torture_ops allowing the correct barrier function to be used upon exit from rcutorture. Also add torture options for the new call_rcu_sched() API. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/rcutorture.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 33acc424667..0334b6a8bac 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -192,6 +192,7 @@ struct rcu_torture_ops { int (*completed)(void); void (*deferredfree)(struct rcu_torture *p); void (*sync)(void); + void (*cb_barrier)(void); int (*stats)(char *page); char *name; }; @@ -265,6 +266,7 @@ static struct rcu_torture_ops rcu_ops = { .completed = rcu_torture_completed, .deferredfree = rcu_torture_deferred_free, .sync = synchronize_rcu, + .cb_barrier = rcu_barrier, .stats = NULL, .name = "rcu" }; @@ -304,6 +306,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .completed = rcu_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, + .cb_barrier = NULL, .stats = NULL, .name = "rcu_sync" }; @@ -364,6 +367,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .completed = rcu_bh_torture_completed, .deferredfree = rcu_bh_torture_deferred_free, .sync = rcu_bh_torture_synchronize, + .cb_barrier = rcu_barrier_bh, .stats = NULL, .name = "rcu_bh" }; @@ -377,6 +381,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .completed = rcu_bh_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = rcu_bh_torture_synchronize, + .cb_barrier = NULL, .stats = NULL, .name = "rcu_bh_sync" }; @@ -458,6 +463,7 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferredfree = rcu_sync_torture_deferred_free, .sync = srcu_torture_synchronize, + .cb_barrier = NULL, .stats = srcu_torture_stats, .name = "srcu" }; @@ -482,6 +488,11 @@ static int sched_torture_completed(void) return 0; } +static void rcu_sched_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); +} + static void sched_torture_synchronize(void) { synchronize_sched(); @@ -494,12 +505,27 @@ static struct rcu_torture_ops sched_ops = { .readdelay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, .completed = sched_torture_completed, - .deferredfree = rcu_sync_torture_deferred_free, + .deferredfree = rcu_sched_torture_deferred_free, .sync = sched_torture_synchronize, + .cb_barrier = rcu_barrier_sched, .stats = NULL, .name = "sched" }; +static struct rcu_torture_ops sched_ops_sync = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .readdelay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = sched_torture_completed, + .deferredfree = rcu_sync_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = NULL, + .stats = NULL, + .name = "sched_sync" +}; + /* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure @@ -848,7 +874,9 @@ rcu_torture_cleanup(void) stats_task = NULL; /* Wait for all RCU callbacks to fire. */ - rcu_barrier(); + + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ @@ -868,7 +896,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, - &srcu_ops, &sched_ops, }; + &srcu_ops, &sched_ops, &sched_ops_sync, }; /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { -- cgit v1.2.3 From 82524746c27fa418c250a56dd7606b9d3fc79826 Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: split list.h and move rcu-protected lists into rculist.h Move rcu-protected lists from list.h into a new header file rculist.h. This is done because list are a very used primitive structure all over the kernel and it's currently impossible to include other header files in this list.h without creating some circular dependencies. For example, list.h implements rcu-protected list and uses rcu_dereference() without including rcupdate.h. It actually compiles because users of rcu_dereference() are macros. Others RCU functions could be used too but aren't probably because of this. Therefore this patch creates rculist.h which includes rcupdates without to many changes/troubles. Signed-off-by: Franck Bui-Huu Acked-by: Paul E. McKenney Acked-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/pid.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 20d59fa2d49..30bd5d4b2ac 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From d7c0651390b6a03ad53f99faec0ba88109d7191d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:06 +0200 Subject: rcu: fix rcu_try_flip_waitack_needed() to prevent grace-period stall The comment was correct -- need to make the code match the comment. Without this patch, if a CPU goes dynticks idle (and stays there forever) in just the right phase of preemptible-RCU grace-period processing, grace periods stall. The offending sequence of events (courtesy of Promela/spin, at least after I got the liveness criterion coded correctly...) is as follows: o CPU 0 is in dynticks-idle mode. Its dynticks_progress_counter is (say) 10. o CPU 0 takes an interrupt, so rcu_irq_enter() increments CPU 0's dynticks_progress_counter to 11. o CPU 1 is doing RCU grace-period processing in rcu_try_flip_idle(), sees rcu_pending(), so invokes dyntick_save_progress_counter(), which in turn takes a snapshot of CPU 0's dynticks_progress_counter into CPU 0's rcu_dyntick_snapshot -- now set to 11. CPU 1 then updates the RCU grace-period state to rcu_try_flip_waitack(). o CPU 0 returns from its interrupt, so rcu_irq_exit() increments CPU 0's dynticks_progress_counter to 12. o CPU 1 later invokes rcu_try_flip_waitack(), which notices that CPU 0 has not yet responded, and hence in turn invokes rcu_try_flip_waitack_needed(). This function examines the state of CPU 0's dynticks_progress_counter and rcu_dyntick_snapshot variables, which it copies to curr (== 12) and snap (== 11), respectively. Because curr!=snap, the first condition fails. Because curr-snap is only 1 and snap is odd, the second condition fails. rcu_try_flip_waitack_needed() therefore incorrectly concludes that it must wait for CPU 0 to explicitly acknowledge the counter flip. o CPU 0 remains forever in dynticks-idle mode, never taking any more hardware interrupts or any NMIs, and never running any more tasks. (Of course, -something- will usually eventually happen, which might be why we haven't seen this one in the wild. Still should be fixed!) Therefore the grace period never ends. Fix is to make the code match the comment, as shown below. With this fix, the above scenario would be satisfied with curr being even, and allow the grace period to proceed. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Josh Triplett Cc: Dipankar Sarma Cc: Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index aaa7976bd85..27d0748d8d3 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -597,7 +597,7 @@ rcu_try_flip_waitack_needed(int cpu) * that this CPU already acknowledged the counter. */ - if ((curr - snap) > 2 || (snap & 0x1) == 0) + if ((curr - snap) > 2 || (curr & 0x1) == 0) return 0; /* We need this CPU to explicitly acknowledge the counter flip. */ -- cgit v1.2.3 From e19a98967f49cb63352b3db4818983ea2cec24ba Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Wed, 14 May 2008 16:23:00 -0700 Subject: rcu: remove duplicated include in kernel/rcupreempt_trace.c Removed duplicated include file in kernel/rcupreempt_trace.c Signed-off-by: Huang Weiyi Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcupreempt_trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 49ac4947af2..5edf82c34bb 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 247ab1a80595100bd7ea61d174f8b7b7e5f8d4bb Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Wed, 14 May 2008 16:23:00 -0700 Subject: rcu: remove duplicated include in kernel/rcupreempt.c Removed duplicated include file in kernel/rcupreempt.c. Signed-off-by: Huang Weiyi Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 27d0748d8d3..b8e4cdae4e8 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 906d882cacecd37ad2fdd03ed2a9b232bcb9507e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 9 Jun 2008 16:35:25 -0700 Subject: rcu: remove unused field struct rcu_data::rcu_tasklet Since softirq works for rcu reclaimer, rcu_tasklet is unused now. Signed-off-by: Lai Jiangshan Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index b8e4cdae4e8..396b121edfe 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -82,7 +82,6 @@ struct rcu_data { spinlock_t lock; /* Protect rcu_data fields. */ long completed; /* Number of last completed batch. */ int waitlistcount; - struct tasklet_struct rcu_tasklet; struct rcu_head *nextlist; struct rcu_head **nexttail; struct rcu_head *waitlist[GP_STAGES]; -- cgit v1.2.3 From 5af970a48f3ba0dd96a036b196c79dc923f28231 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Jun 2008 10:09:48 +0200 Subject: rcutorture: WARN_ON_ONCE(1) when detecting an error this makes it easier for automated tests to pick up such failures. --- kernel/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 0334b6a8bac..0ca7e9b290b 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -687,6 +687,7 @@ rcu_torture_printk(char *page) if (i > 1) { cnt += sprintf(&page[cnt], "!!! "); atomic_inc(&n_rcu_torture_error); + WARN_ON_ONCE(1); } cnt += sprintf(&page[cnt], "Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) -- cgit v1.2.3 From d120f65f3aaf306c957bc4c82e510f5b0f1e9b27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Jun 2008 05:21:44 -0700 Subject: rcu: make rcutorture more vicious: add stutter feature This patch takes a step towards making rcutorture more brutal by allowing the test to be automatically periodically paused, with the default being to run the test for five seconds then pause for five seconds and repeat. This behavior can be controlled using a new "stutter" module parameter, so that "stutter=0" gives the old default behavior of running continuously. Starting and stopping rcutorture more heavily stresses RCU's interaction with the scheduler, as well as exercising more paths through the grace-period detection code. Note that the default to "shuffle_interval" has also been adjusted from 5 seconds to 3 seconds to provide varying overlap with the "stutter" interval. I am still unable to provoke the failures that Alexey has been seeing, even with this patch, but will be doing a few additional things to beef up rcutorture. Suggested-by: Ingo Molnar Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 0ca7e9b290b..98ae7d16822 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -57,7 +57,8 @@ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ -static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ +static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ +static int stutter = 5; /* Start/stop testing interval (in sec) */ static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0444); @@ -72,6 +73,8 @@ module_param(test_no_idle_hz, bool, 0444); MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); module_param(shuffle_interval, int, 0444); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -91,6 +94,7 @@ static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *shuffler_task; +static struct task_struct *stutter_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -119,6 +123,8 @@ static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; static struct list_head rcu_torture_removed; +static int stutter_pause_test = 0; + /* * Allocate an element from the rcu_tortures pool. */ @@ -179,6 +185,13 @@ rcu_random(struct rcu_random_state *rrsp) return swahw32(rrsp->rrs_state); } +static void +rcu_stutter_wait(void) +{ + while (stutter_pause_test) + schedule_timeout_interruptible(1); +} + /* * Operations vector for selecting different types of tests. */ @@ -563,6 +576,7 @@ rcu_torture_writer(void *arg) } rcu_torture_current_version++; oldbatch = cur_ops->completed(); + rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); while (!kthread_should_stop()) @@ -586,6 +600,7 @@ rcu_torture_fakewriter(void *arg) schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); cur_ops->sync(); + rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); @@ -641,6 +656,7 @@ rcu_torture_reader(void *arg) preempt_enable(); cur_ops->readunlock(idx); schedule(); + rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); while (!kthread_should_stop()) @@ -812,15 +828,34 @@ rcu_torture_shuffle(void *arg) return 0; } +/* Cause the rcutorture test to "stutter", starting and stopping all + * threads periodically. + */ +static int +rcu_torture_stutter(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); + do { + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 1; + if (!kthread_should_stop()) + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 0; + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + return 0; +} + static inline void rcu_torture_print_module_parms(char *tag) { printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval = %d\n", + "shuffle_interval=%d stutter=%d\n", torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval); + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + stutter); } static void @@ -829,6 +864,11 @@ rcu_torture_cleanup(void) int i; fullstop = 1; + if (stutter_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); + kthread_stop(stutter_task); + } + stutter_task = NULL; if (shuffler_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); kthread_stop(shuffler_task); @@ -1017,6 +1057,19 @@ rcu_torture_init(void) goto unwind; } } + if (stutter < 0) + stutter = 0; + if (stutter) { + /* Create the stutter thread */ + stutter_task = kthread_run(rcu_torture_stutter, NULL, + "rcu_torture_stutter"); + if (IS_ERR(stutter_task)) { + firsterr = PTR_ERR(stutter_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); + stutter_task = NULL; + goto unwind; + } + } return 0; unwind: -- cgit v1.2.3 From 31a72bce0bd6f3e0114009288bccbc96376eeeca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Jun 2008 09:26:49 -0700 Subject: rcu: make rcutorture more vicious: reinstate boot-time testing This patch re-institutes the ability to build rcutorture directly into the Linux kernel. The reason that this capability was removed was that this could result in your kernel being pretty much useless, as rcutorture would be running starting from early boot. This problem has been avoided by (1) making rcutorture run only three seconds of every six by default, (2) adding a CONFIG_RCU_TORTURE_TEST_RUNNABLE that permits rcutorture to be quiesced at boot time, and (3) adding a sysctl in /proc named /proc/sys/kernel/rcutorture_runnable that permits rcutorture to be quiesced and unquiesced when built into the kernel. Please note that this /proc file is -not- available when rcutorture is built as a module. Please also note that to get the earlier take-no-prisoners behavior, you must use the boot command line to set rcutorture's "stutter" parameter to zero. The rcutorture quiescing mechanism is currently quite crude: loops in each rcutorture process that poll a global variable once per tick. Suggestions for improvement are welcome. The default action will be to reduce the polling rate to a few times per second. Signed-off-by: Paul E. McKenney Suggested-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 9 ++++++++- kernel/sysctl.c | 13 +++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 98ae7d16822..27003e2421c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -125,6 +125,13 @@ static struct list_head rcu_torture_removed; static int stutter_pause_test = 0; +#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) +#define RCUTORTURE_RUNNABLE_INIT 1 +#else +#define RCUTORTURE_RUNNABLE_INIT 0 +#endif +int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; + /* * Allocate an element from the rcu_tortures pool. */ @@ -188,7 +195,7 @@ rcu_random(struct rcu_random_state *rrsp) static void rcu_stutter_wait(void) { - while (stutter_pause_test) + while (stutter_pause_test || !rcutorture_runnable) schedule_timeout_interruptible(1); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29116652dca..c6887cf135c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,6 +82,9 @@ extern int maps_protect; extern int sysctl_stat_interval; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifdef CONFIG_RCU_TORTURE_TEST +extern int rcutorture_runnable; +#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) @@ -813,6 +816,16 @@ static struct ctl_table kern_table[] = { .child = key_sysctls, }, #endif +#ifdef CONFIG_RCU_TORTURE_TEST + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rcutorture_runnable", + .data = &rcutorture_runnable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3 From e3d7be270c5b1be07ffadcc8b56599ad8e975c1d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Jun 2008 13:06:38 -0700 Subject: rcu, rcutorture: make quiescent rcutorture less power-hungry Signed-off-by: Paul E. McKenney Cc: josh@freedesktop.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: dino@in.ibm.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: vegard.nossum@gmail.com Cc: adobriyan@gmail.com Cc: oleg@tv-sign.ru Cc: bunk@kernel.org Cc: rjw@sisk.pl Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 27003e2421c..0c40c13c540 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -196,7 +196,10 @@ static void rcu_stutter_wait(void) { while (stutter_pause_test || !rcutorture_runnable) - schedule_timeout_interruptible(1); + if (rcutorture_runnable) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(HZ); } /* -- cgit v1.2.3 From 3ccf79f4570acacfefc51772e8f9207895b35ad7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 22 Jun 2008 14:02:55 -0700 Subject: rcu: make quiescent rcutorture less power-hungry This patch aligns the rcutorture wakeup times to align with all other multiple-of-a-second wakeups to further decrease power consumption. Suggested-by: Arjan van de Ven Signed-off-by: Paul E. McKenney Cc: josh@freedesktop.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: dino@in.ibm.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: vegard.nossum@gmail.com Cc: adobriyan@gmail.com Cc: oleg@tv-sign.ru Cc: bunk@kernel.org Cc: rjw@sisk.pl Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 0c40c13c540..5e954edf0ed 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -199,7 +199,7 @@ rcu_stutter_wait(void) if (rcutorture_runnable) schedule_timeout_interruptible(1); else - schedule_timeout_interruptible(HZ); + schedule_timeout_interruptible(round_jiffies_relative(HZ)); } /* -- cgit v1.2.3 From 0729fbf3bc70870370b4f43d652f05a468dc68b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Jun 2008 12:24:52 -0700 Subject: rcu: make rcutorture even more vicious: invoke RCU readers from irq handlers (timers) This patch allows torturing RCU from irq handlers (timers, in this case). A new module parameter irqreader enables such additional torturing, and is enabled by default. Variants of RCU that do not tolerate readers being called from irq handlers (e.g., SRCU) ignore irqreader. Signed-off-by: Paul E. McKenney Cc: josh@freedesktop.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: dino@in.ibm.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Cc: vegard.nossum@gmail.com Cc: adobriyan@gmail.com Cc: oleg@tv-sign.ru Cc: bunk@kernel.org Cc: rjw@sisk.pl Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 5e954edf0ed..90b5b123f7a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -59,6 +59,7 @@ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ +static int irqreader = 1; /* RCU readers from irq (timers). */ static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0444); @@ -75,6 +76,8 @@ module_param(shuffle_interval, int, 0444); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); module_param(stutter, int, 0444); MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -121,6 +124,7 @@ static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; static atomic_t n_rcu_torture_error; +static long n_rcu_torture_timers = 0; static struct list_head rcu_torture_removed; static int stutter_pause_test = 0; @@ -217,6 +221,7 @@ struct rcu_torture_ops { void (*sync)(void); void (*cb_barrier)(void); int (*stats)(char *page); + int irqcapable; char *name; }; static struct rcu_torture_ops *cur_ops = NULL; @@ -291,6 +296,7 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .cb_barrier = rcu_barrier, .stats = NULL, + .irqcapable = 1, .name = "rcu" }; @@ -331,6 +337,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .sync = synchronize_rcu, .cb_barrier = NULL, .stats = NULL, + .irqcapable = 1, .name = "rcu_sync" }; @@ -392,6 +399,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .sync = rcu_bh_torture_synchronize, .cb_barrier = rcu_barrier_bh, .stats = NULL, + .irqcapable = 1, .name = "rcu_bh" }; @@ -406,6 +414,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .sync = rcu_bh_torture_synchronize, .cb_barrier = NULL, .stats = NULL, + .irqcapable = 1, .name = "rcu_bh_sync" }; @@ -532,6 +541,7 @@ static struct rcu_torture_ops sched_ops = { .sync = sched_torture_synchronize, .cb_barrier = rcu_barrier_sched, .stats = NULL, + .irqcapable = 1, .name = "sched" }; @@ -619,6 +629,52 @@ rcu_torture_fakewriter(void *arg) return 0; } +/* + * RCU torture reader from timer handler. Dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static void rcu_torture_timer(unsigned long unused) +{ + int idx; + int completed; + static DEFINE_RCU_RANDOM(rand); + static DEFINE_SPINLOCK(rand_lock); + struct rcu_torture *p; + int pipe_count; + + idx = cur_ops->readlock(); + completed = cur_ops->completed(); + p = rcu_dereference(rcu_torture_current); + if (p == NULL) { + /* Leave because rcu_torture_writer is not yet underway */ + cur_ops->readunlock(idx); + return; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + spin_lock(&rand_lock); + cur_ops->readdelay(&rand); + n_rcu_torture_timers++; + spin_unlock(&rand_lock); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_count)[pipe_count]; + completed = cur_ops->completed() - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + ++__get_cpu_var(rcu_torture_batch)[completed]; + preempt_enable(); + cur_ops->readunlock(idx); +} + /* * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -633,11 +689,18 @@ rcu_torture_reader(void *arg) DEFINE_RCU_RANDOM(rand); struct rcu_torture *p; int pipe_count; + struct timer_list t; VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); + if (irqreader && cur_ops->irqcapable) + setup_timer_on_stack(&t, rcu_torture_timer, 0); do { + if (irqreader && cur_ops->irqcapable) { + if (!timer_pending(&t)) + mod_timer(&t, 1); + } idx = cur_ops->readlock(); completed = cur_ops->completed(); p = rcu_dereference(rcu_torture_current); @@ -669,6 +732,8 @@ rcu_torture_reader(void *arg) rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + if (irqreader && cur_ops->irqcapable) + del_timer_sync(&t); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -699,14 +764,15 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " - "rtmbe: %d", + "rtmbe: %d nt: %ld", rcu_torture_current, rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free), - atomic_read(&n_rcu_torture_mberror)); + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_timers); if (atomic_read(&n_rcu_torture_mberror) != 0) cnt += sprintf(&page[cnt], " !!!"); cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); @@ -862,10 +928,10 @@ rcu_torture_print_module_parms(char *tag) printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d\n", + "shuffle_interval=%d stutter=%d irqreader=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter); + stutter, irqreader); } static void -- cgit v1.2.3 From 199a952876adbfc2b6c13b8b07adabebf4ff54b2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 26 Jun 2008 10:06:43 +0800 Subject: rcu classic: update qlen when cpu offline When callbacks are moved from offline cpu to this cpu, the qlen field of this rdp should be updated. [ Paul E. McKenney: ] The effect of this bug would be for force_quiescent_state() to be invoked when it should not and vice versa -- wasting cycles in the first case and letting RCU callbacks remain piled up in the second case. The bug is thus "benign" in that it does not result in premature grace-period termination, but should of course be fixed nonetheless. Preemption is disabled by the caller's get_cpu_var(), so we are guaranteed to remain on the same CPU, as required. The local_irq_disable() is indeed needed, otherwise, an interrupt might invoke call_rcu() or call_rcu_bh(), which could cause that interrupt's increment of ->qlen to be lost. Signed-off-by: Lai Jiangshan Cc: Andrew Morton Reviewed-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 214e1cde981..529190c485f 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -387,6 +387,10 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp, rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + + local_irq_disable(); + this_rdp->qlen += rdp->qlen; + local_irq_enable(); } static void rcu_offline_cpu(int cpu) -- cgit v1.2.3