From 36be57ffe39e03aab9fbe857f70c7a6a15bd9e08 Mon Sep 17 00:00:00 2001
From: Paul Jackson
Date: Sat, 20 May 2006 15:00:10 -0700
Subject: [PATCH] cpuset: update cpuset_zones_allowed comment

Update the kernel/cpuset.c:cpuset_zone_allowed() comment.

The rule for when mm/page_alloc.c should call cpuset_zone_allowed()
was intended to be:

    Don't call cpuset_zone_allowed() if you can't sleep, unless you
    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
    the code that might scan up ancestor cpusets and sleep.

The explanation of this rule in the comment above cpuset_zone_allowed()
was stale, as a result of a restructuring of some __alloc_pages() code
in November 2005.

Rewrite that comment ...

Signed-off-by: Paul Jackson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/cpuset.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 72248d1b9e3..57fd88237af 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2231,19 +2231,25 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * So only GFP_KERNEL allocations, if all nodes in the cpuset are
  * short of memory, might require taking the callback_mutex mutex.
  *
- * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
- * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
- * hardwall cpusets - no allocation on a node outside the cpuset is
- * allowed (unless in interrupt, of course).
- *
- * The second loop doesn't even call here for GFP_ATOMIC requests
- * (if the __alloc_pages() local variable 'wait' is set).  That check
- * and the checks below have the combined affect in the second loop of
- * the __alloc_pages() routine that:
+ * The first call here from mm/page_alloc:get_page_from_freelist()
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
+ * no allocation on a node outside the cpuset is allowed (unless in
+ * interrupt, of course).
+ *
+ * The second pass through get_page_from_freelist() doesn't even call
+ * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
+ * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
+ * in alloc_flags.  That logic and the checks below have the combined
+ * affect that:
  *	in_interrupt - any node ok (current task context irrelevant)
  *	GFP_ATOMIC   - any node ok
  *	GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
+ *
+ * Rule:
+ *    Don't call cpuset_zone_allowed() if you can't sleep, unless you
+ *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
+ *    the code that might scan up ancestor cpusets and sleep.
  **/

 int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
--
cgit v1.2.3
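The sketch below is not part of any patch in this series; it is a hypothetical caller (the function zone_usable() and its can_sleep flag are invented for illustration) showing the rule the rewritten comment documents, using the 2.6.17-era cpuset_zone_allowed(struct zone *, gfp_t) wrapper from <linux/cpuset.h>: a path that cannot sleep ORs __GFP_HARDWALL into its gfp mask, so the check stays within current->mems_allowed and never scans up ancestor cpusets.

/*
 * Hypothetical caller sketch, not from the patches above; illustrates the
 * stated calling rule for cpuset_zone_allowed().
 */
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/mmzone.h>

static int zone_usable(struct zone *z, gfp_t gfp_mask, int can_sleep)
{
	/*
	 * A caller that cannot sleep must request hardwall behaviour, so the
	 * check consults only current->mems_allowed and never walks up
	 * ancestor cpusets (which may take callback_mutex and sleep).
	 */
	if (!can_sleep)
		gfp_mask |= __GFP_HARDWALL;

	return cpuset_zone_allowed(z, gfp_mask);
}

A GFP_KERNEL caller that can sleep may leave __GFP_HARDWALL clear and let the check fall back to the nearest mem_exclusive ancestor cpuset, as the table in the new comment describes.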
From 92d1dbd27417c54c23aac6a84c285e256f6118b6 Mon Sep 17 00:00:00 2001
From: Paul Jackson
Date: Sat, 20 May 2006 15:00:11 -0700
Subject: [PATCH] cpuset: might_sleep_if check in cpuset_zones_allowed

It's too easy to incorrectly call cpuset_zone_allowed() in an atomic
context without __GFP_HARDWALL set, and when done, it is not noticed until
a tight memory situation forces allocations to be tried outside the
current cpuset.

Add a 'might_sleep_if()' check to catch this earlier on, instead of
waiting for a similar check in the mutex_lock() code, which is only
rarely invoked.

Signed-off-by: Paul Jackson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/cpuset.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 57fd88237af..ab81fdd4572 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2261,6 +2261,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	if (in_interrupt())
 		return 1;
 	node = z->zone_pgdat->node_id;
+	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
--
cgit v1.2.3

From 0662b71322e211dba9a4bc0e6fbca7861a2b5a7d Mon Sep 17 00:00:00 2001
From: Zachary Amsden
Date: Sat, 20 May 2006 15:00:24 -0700
Subject: [PATCH] Fix a NO_IDLE_HZ timer bug

Under certain timing conditions, a race during boot occurs where timer
ticks are being processed on remote CPUs.  The remote timer ticks can
increment jiffies, and if this happens during a window when a timeout is
very close to expiring but a local tick has not yet been delivered, you
can end up with

  1) No softirq pending
  2) A local timer wheel which is not synced to jiffies
  3) No high resolution timer active
  4) A local timer which is supposed to fire before the current jiffies
     value.

In this circumstance, the comparison in next_timer_interrupt overflows,
because the base of the comparison for high resolution timers is jiffies,
but for the softirq timer wheel, it is relative to the current base of
the wheel (jiffies_base).

Signed-off-by: Zachary Amsden
Cc: Martin Schwidefsky
Cc: Oleg Nesterov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/kernel/timer.c b/kernel/timer.c
index 67eaf0f5409..9e49deed468 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -541,6 +541,22 @@ found:
 	}
 	spin_unlock(&base->lock);

+	/*
+	 * It can happen that other CPUs service timer IRQs and increment
+	 * jiffies, but we have not yet got a local timer tick to process
+	 * the timer wheels.  In that case, the expiry time can be before
+	 * jiffies, but since the high-resolution timer here is relative to
+	 * jiffies, the default expression when high-resolution timers are
+	 * not active,
+	 *
+	 *   time_before(MAX_JIFFY_OFFSET + jiffies, expires)
+	 *
+	 * would falsely evaluate to true.  If that is the case, just
+	 * return jiffies so that we can immediately fire the local timer
+	 */
+	if (time_before(expires, jiffies))
+		return jiffies;
+
 	if (time_before(hr_expires, expires))
 		return hr_expires;
--
cgit v1.2.3
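The program below is a small standalone model rather than kernel code: the time_before() macro mirrors the wrap-safe signed-difference comparison from include/linux/jiffies.h, and next_expiry() is an invented stand-in for the clamp the timer patch adds, returning "now" when the wheel's next expiry already lies in the past so the local timer fires immediately.

#include <stdio.h>

/* Simplified model of the kernel's wrap-safe jiffies comparison. */
#define time_before(a, b)	((long)((a) - (b)) < 0)

/* Invented helper modelling the check added to next_timer_interrupt(). */
static unsigned long next_expiry(unsigned long expires, unsigned long now)
{
	if (time_before(expires, now))	/* expiry already passed on this CPU */
		return now;		/* fire the local timer right away */
	return expires;
}

int main(void)
{
	unsigned long now = 1000;

	/* Remote CPUs pushed jiffies past a timer the local wheel hasn't run. */
	printf("%lu\n", next_expiry(995UL, now));	/* prints 1000: clamped */
	printf("%lu\n", next_expiry(1010UL, now));	/* prints 1010: future */
	return 0;
}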
From f1adad78dd2fc8edaa513e0bde92b4c64340245c Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 21 May 2006 18:54:09 -0700
Subject: Revert "[PATCH] sched: fix interactive task starvation"

This reverts commit 5ce74abe788a26698876e66b9c9ce7e7acc25413 (and its
dependent commit 8a5bc075b8d8cf7a87b3f08fad2fba0f5d13295e), because of
audio underruns.

Reported by Rene Herman, who also pinpointed the exact cause of the
underruns:

  "Audio underruns galore, with only ogg123 and firefox (browsing the
   GIT tree online is also a nice trigger by the way).

   If I back it out, everything is fine for me again."

Cc: Rene Herman
Cc: Mike Galbraith
Acked-by: Con Kolivas
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 62 ++++++++++++++++++----------------------------------------------
 1 file changed, 18 insertions(+), 44 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 4c64f85698a..c13f1bd2df7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -664,48 +664,6 @@ static int effective_prio(task_t *p)
 	return prio;
 }

-/*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired, and switch periodically
- * regardless, to ensure that highly interactive tasks do not starve
- * the less fortunate for unreasonably long periods.
- */
-static inline int expired_starving(runqueue_t *rq)
-{
-	int limit;
-
-	/*
-	 * Arrays were recently switched, all is well
-	 */
-	if (!rq->expired_timestamp)
-		return 0;
-
-	limit = STARVATION_LIMIT * rq->nr_running;
-
-	/*
-	 * It's time to switch arrays
-	 */
-	if (jiffies - rq->expired_timestamp >= limit)
-		return 1;
-
-	/*
-	 * There's a better selection in the expired array
-	 */
-	if (rq->curr->static_prio > rq->best_expired_prio)
-		return 1;
-
-	/*
-	 * All is well
-	 */
-	return 0;
-}
-
 /*
  * __activate_task - move a task to the runqueue.
  */
@@ -713,7 +671,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
 {
 	prio_array_t *target = rq->active;

-	if (unlikely(batch_task(p) || (expired_starving(rq) && !rt_task(p))))
+	if (batch_task(p))
 		target = rq->expired;
 	enqueue_task(p, target);
 	rq->nr_running++;
@@ -2531,6 +2489,22 @@ unsigned long long current_sched_time(const task_t *tsk)
 	return ns;
 }

+/*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switched decreases with
+ * increasing number of running tasks. We also ignore the interactivity
+ * if a better static_prio task has expired:
+ */
+#define EXPIRED_STARVING(rq) \
+	((STARVATION_LIMIT && ((rq)->expired_timestamp && \
+		(jiffies - (rq)->expired_timestamp >= \
+			STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
+			((rq)->curr->static_prio > (rq)->best_expired_prio))
+
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
@@ -2666,7 +2640,7 @@ void scheduler_tick(void)
 		if (!rq->expired_timestamp)
 			rq->expired_timestamp = jiffies;

-		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
+		if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
 			enqueue_task(p, rq->expired);
 			if (p->static_prio < rq->best_expired_prio)
 				rq->best_expired_prio = p->static_prio;
--
cgit v1.2.3
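In both forms shown in the diff above (the removed expired_starving() helper and the restored EXPIRED_STARVING() macro), the starvation deadline scales with run queue length: the time since the expired array was first populated is compared against STARVATION_LIMIT * nr_running. The standalone model below illustrates only that scaling; the function, its parameters, and the STARVATION_LIMIT value are illustrative stand-ins, and the static_prio comparison from the real check is omitted for brevity.

#include <stdio.h>

#define STARVATION_LIMIT	128UL	/* illustrative value, in ticks */

/* Standalone model of the load-scaled deadline; not the scheduler's code. */
static int expired_starving(unsigned long now, unsigned long expired_timestamp,
			    unsigned long nr_running)
{
	if (!expired_timestamp)		/* arrays were switched recently */
		return 0;
	/* Deadline grows with load: more runnable tasks, longer tolerated wait. */
	return now - expired_timestamp >= STARVATION_LIMIT * nr_running;
}

int main(void)
{
	/* Two runnable tasks: the expired array may wait up to 256 ticks. */
	printf("%d\n", expired_starving(1200, 1000, 2));	/* 0: 200 < 256 */
	printf("%d\n", expired_starving(1300, 1000, 2));	/* 1: 300 >= 256 */
	return 0;
}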