From 6f85a124d819e1cf33b16d064a6a656fd448a735 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 15 Aug 2008 14:55:02 -0700
Subject: net: Preserve netfilter attributes in skb_gso_segment using
 __copy_skb_header

skb_gso_segment didn't preserve some attributes in the original skb
such as the netfilter fields.  This was harmless until they were used
which is the case for packets going through lo.

This patch makes it call __copy_skb_header which also picks up some
other missing attributes.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 84640172d65..ca1ccdf1ef7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2256,14 +2256,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
 			segs = nskb;
 		tail = nskb;
 
-		nskb->dev = skb->dev;
-		skb_copy_queue_mapping(nskb, skb);
-		nskb->priority = skb->priority;
-		nskb->protocol = skb->protocol;
-		nskb->vlan_tci = skb->vlan_tci;
-		nskb->dst = dst_clone(skb->dst);
-		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
-		nskb->pkt_type = skb->pkt_type;
+		__copy_skb_header(nskb, skb);
 		nskb->mac_len = skb->mac_len;
 
 		skb_reserve(nskb, headroom);
@@ -2274,6 +2267,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
 		skb_copy_from_linear_data(skb, skb_put(nskb, doffset),
 					  doffset);
 		if (!sg) {
+			nskb->ip_summed = CHECKSUM_NONE;
 			nskb->csum = skb_copy_and_csum_bits(skb, offset,
 							    skb_put(nskb, len),
 							    len, 0);
@@ -2283,8 +2277,6 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
 		frag = skb_shinfo(nskb)->frags;
 		k = 0;
 
-		nskb->ip_summed = CHECKSUM_PARTIAL;
-		nskb->csum = skb->csum;
 		skb_copy_from_linear_data_offset(skb, offset,
 						 skb_put(nskb, hsize), hsize);
 
-- 
cgit v1.2.3


From db543c1f973cd1d557cc32ceee76737c1e4d2898 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 15 Aug 2008 15:13:53 -0700
Subject: net: skb_copy_datagram_from_iovec()

There's an skb_copy_datagram_iovec() to copy out of a paged skb, but
nothing the other way around (because we don't do that).

We want to allocate big skbs in tun.c, so let's add the function.
It's a carbon copy of skb_copy_datagram_iovec() with enough changes to
be annoying.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/datagram.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

(limited to 'net/core')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index dd61dcad601..52f577a0f54 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -339,6 +339,93 @@ fault:
 	return -EFAULT;
 }
 
+/**
+ *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying to
+ *	@from: io vector to copy to
+ *	@len: amount of data to copy to buffer from iovec
+ *
+ *	Returns 0 or -EFAULT.
+ *	Note: the iovec is modified during the copy.
+ */
+int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
+				 struct iovec *from, int len)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if (memcpy_fromiovec(skb->data + offset, from, copy))
+			goto fault;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_shinfo(skb)->frags[i].size;
+		if ((copy = end - offset) > 0) {
+			int err;
+			u8  *vaddr;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct page *page = frag->page;
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap(page);
+			err = memcpy_fromiovec(vaddr + frag->page_offset +
+					       offset - start, from, copy);
+			kunmap(page);
+			if (err)
+				goto fault;
+
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *list = skb_shinfo(skb)->frag_list;
+
+		for (; list; list = list->next) {
+			int end;
+
+			WARN_ON(start > offset + len);
+
+			end = start + list->len;
+			if ((copy = end - offset) > 0) {
+				if (copy > len)
+					copy = len;
+				if (skb_copy_datagram_from_iovec(list,
+								 offset - start,
+								 from, copy))
+					goto fault;
+				if ((len -= copy) == 0)
+					return 0;
+				offset += copy;
+			}
+			start = end;
+		}
+	}
+	if (!len)
+		return 0;
+
+fault:
+	return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
+
 static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 				      u8 __user *to, int len,
 				      __wsum *csump)
-- 
cgit v1.2.3


From a9312ae89324438b0edc554eb36c3ec6bf927d04 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 17 Aug 2008 21:51:03 -0700
Subject: pkt_sched: Add 'deactivated' state.

This new state lets dev_deactivate() mark a qdisc as having been
deactivated.

dev_queue_xmit() and ing_filter() check for this bit and do not
try to process the qdisc if the bit is set.

dev_deactivate() polls the qdisc after setting the bit, waiting
for both __QDISC_STATE_RUNNING and __QDISC_STATE_SCHED to clear.

This isn't perfect yet, but subsequent changesets will make it so.
This part is just one piece of the puzzle.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 600bb23c4c2..d9e31f63ade 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1800,6 +1800,12 @@ gso:
 
 		spin_lock(root_lock);
 
+		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+			spin_unlock(root_lock);
+			rc = NET_XMIT_DROP;
+			goto out_kfree_skb;
+		}
+
 		rc = qdisc_enqueue_root(skb, q);
 		qdisc_run(q);
 
@@ -2084,7 +2090,8 @@ static int ing_filter(struct sk_buff *skb)
 	q = rxq->qdisc;
 	if (q != &noop_qdisc) {
 		spin_lock(qdisc_lock(q));
-		result = qdisc_enqueue_root(skb, q);
+		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
+			result = qdisc_enqueue_root(skb, q);
 		spin_unlock(qdisc_lock(q));
 	}
 
-- 
cgit v1.2.3


From def82a1db1fdc4f861c77009e2ee86870c3743b0 Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sun, 17 Aug 2008 21:54:43 -0700
Subject: net: Change handling of the __QDISC_STATE_SCHED flag in
 net_tx_action().

Change handling of the __QDISC_STATE_SCHED flag in net_tx_action() to
enable proper control in dev_deactivate(). Now, if this flag is seen
as unset under root_lock means a qdisc can't be netif_scheduled.

Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index d9e31f63ade..819f0175bdc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1339,19 +1339,23 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 }
 
 
-void __netif_schedule(struct Qdisc *q)
+static inline void __netif_reschedule(struct Qdisc *q)
 {
-	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) {
-		struct softnet_data *sd;
-		unsigned long flags;
+	struct softnet_data *sd;
+	unsigned long flags;
 
-		local_irq_save(flags);
-		sd = &__get_cpu_var(softnet_data);
-		q->next_sched = sd->output_queue;
-		sd->output_queue = q;
-		raise_softirq_irqoff(NET_TX_SOFTIRQ);
-		local_irq_restore(flags);
-	}
+	local_irq_save(flags);
+	sd = &__get_cpu_var(softnet_data);
+	q->next_sched = sd->output_queue;
+	sd->output_queue = q;
+	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+	local_irq_restore(flags);
+}
+
+void __netif_schedule(struct Qdisc *q)
+{
+	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
+		__netif_reschedule(q);
 }
 EXPORT_SYMBOL(__netif_schedule);
 
@@ -1980,15 +1984,15 @@ static void net_tx_action(struct softirq_action *h)
 
 			head = head->next_sched;
 
-			smp_mb__before_clear_bit();
-			clear_bit(__QDISC_STATE_SCHED, &q->state);
-
 			root_lock = qdisc_lock(q);
 			if (spin_trylock(root_lock)) {
+				smp_mb__before_clear_bit();
+				clear_bit(__QDISC_STATE_SCHED,
+					  &q->state);
 				qdisc_run(q);
 				spin_unlock(root_lock);
 			} else {
-				__netif_schedule(q);
+				__netif_reschedule(q);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 96d203169d1d851ac1468f7d4459a09581be364c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 17 Aug 2008 23:37:16 -0700
Subject: pkt_sched: Fix missed RCU unlock in dev_queue_xmit()

Noticed by Jarek Poplawski.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 819f0175bdc..8d133802372 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1805,14 +1805,12 @@ gso:
 		spin_lock(root_lock);
 
 		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
-			spin_unlock(root_lock);
+			kfree_skb(skb);
 			rc = NET_XMIT_DROP;
-			goto out_kfree_skb;
+		} else {
+			rc = qdisc_enqueue_root(skb, q);
+			qdisc_run(q);
 		}
-
-		rc = qdisc_enqueue_root(skb, q);
-		qdisc_run(q);
-
 		spin_unlock(root_lock);
 
 		goto out;
-- 
cgit v1.2.3


From deb3abf15fb92a608fba630da2e8719862731714 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 18 Aug 2008 22:32:10 -0700
Subject: Revert "pkt_sched: Protect gen estimators under est_lock."

This reverts commit d4766692e72422f3b0f0e9ac6773d92baad07d51.

qdisc_destroy() now runs in RTNL fully again, so this
change is no longer needed.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/gen_estimator.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net/core')

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index a89f32fa94f..57abe8266be 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -99,7 +99,7 @@ struct gen_estimator_head
 
 static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
 
-/* Protects against NULL dereference and RCU write-side */
+/* Protects against NULL dereference */
 static DEFINE_RWLOCK(est_lock);
 
 static void est_timer(unsigned long arg)
@@ -185,7 +185,6 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
 	est->last_packets = bstats->packets;
 	est->avpps = rate_est->pps<<10;
 
-	write_lock_bh(&est_lock);
 	if (!elist[idx].timer.function) {
 		INIT_LIST_HEAD(&elist[idx].list);
 		setup_timer(&elist[idx].timer, est_timer, idx);
@@ -195,7 +194,6 @@ int gen_new_estimator(struct gnet_stats_basic *bstats,
 		mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
 
 	list_add_rcu(&est->list, &elist[idx].list);
-	write_unlock_bh(&est_lock);
 	return 0;
 }
 
@@ -214,6 +212,7 @@ static void __gen_kill_estimator(struct rcu_head *head)
  * Removes the rate estimator specified by &bstats and &rate_est
  * and deletes the timer.
  *
+ * NOTE: Called under rtnl_mutex
  */
 void gen_kill_estimator(struct gnet_stats_basic *bstats,
 	struct gnet_stats_rate_est *rate_est)
@@ -227,17 +226,17 @@ void gen_kill_estimator(struct gnet_stats_basic *bstats,
 		if (!elist[idx].timer.function)
 			continue;
 
-		write_lock_bh(&est_lock);
 		list_for_each_entry_safe(e, n, &elist[idx].list, list) {
 			if (e->rate_est != rate_est || e->bstats != bstats)
 				continue;
 
+			write_lock_bh(&est_lock);
 			e->bstats = NULL;
+			write_unlock_bh(&est_lock);
 
 			list_del_rcu(&e->list);
 			call_rcu(&e->e_rcu, __gen_kill_estimator);
 		}
-		write_unlock_bh(&est_lock);
 	}
 }
 
-- 
cgit v1.2.3


From 195648bbc5ae0848e82f771ecf4cd7497054c212 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 19 Aug 2008 04:00:36 -0700
Subject: pkt_sched: Prevent livelock in TX queue running.

If dev_deactivate() is trying to quiesce the queue, it
is theoretically possible for another cpu to livelock
trying to process that queue.  This happens because
dev_deactivate() grabs the queue spinlock as it checks
the queue state, whereas net_tx_action() does a trylock
and reschedules the qdisc if it hits the lock.

This breaks the livelock by adding a check on
__QDISC_STATE_DEACTIVATED to net_tx_action() when
the trylock fails.

Based upon feedback from Herbert Xu and Jarek Poplawski.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8d133802372..60c51f76588 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1990,7 +1990,9 @@ static void net_tx_action(struct softirq_action *h)
 				qdisc_run(q);
 				spin_unlock(root_lock);
 			} else {
-				__netif_reschedule(q);
+				if (!test_bit(__QDISC_STATE_DEACTIVATED,
+					      &q->state))
+					__netif_reschedule(q);
 			}
 		}
 	}
-- 
cgit v1.2.3