From 88f16db7a2fa63b9242e8a0fbc40d51722f2e2f9 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Wed, 13 May 2009 12:04:30 +0200
Subject: wext: verify buffer size for SIOCSIWENCODEEXT

Another design flaw in wireless extensions (is anybody surprised?) in the way it handles the iw_encode_ext structure: the structure is part of the 'extra' memory but contains the key length explicitly, instead of it just being the length of the extra buffer minus the size of the struct, with the explicit key length used only for the get operation (which only writes it).

Therefore, we have this layout:

extra:
+-------------------------+
| struct iw_encode_ext {  |
|     ...                 |
|     u16 key_len;        |
|     u8  key[0];         |
| };                      |
+-------------------------+
| key material            |
+-------------------------+

Now, all drivers I checked use ext->key_len without checking that both key_len and the struct fit into the extra buffer that has been copied from userspace. This leads to a buffer overrun while reading that buffer; depending on the driver, it may be possible to specify an arbitrary key_len, or it may need to be a proper length for the key algorithm specified.

Thankfully, this is only exploitable by root, but root can actually cause a segfault or use kernel memory as a key (which you can even get back with siocgiwencode or siocgiwencodeext from the key buffer).

Fix this by verifying that key_len fits into the buffer along with struct iw_encode_ext.

Signed-off-by: Johannes Berg
Signed-off-by: John W. Linville
---
 net/wireless/wext.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/wireless/wext.c b/net/wireless/wext.c
index cb6a5bb85d8..0e59f9ae9b8 100644
--- a/net/wireless/wext.c
+++ b/net/wireless/wext.c
@@ -786,6 +786,13 @@ static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd,
 			err = -EFAULT;
 			goto out;
 		}
+
+		if (cmd == SIOCSIWENCODEEXT) {
+			struct iw_encode_ext *ee = (void *) extra;
+
+			if (iwp->length < sizeof(*ee) + ee->key_len)
+				return -EFAULT;
+		}
 	}

 	err = handler(dev, info, (union iwreq_data *) iwp, extra);
-- cgit v1.2.3
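
The check added above generalizes to any interface where a userspace-supplied structure declares its own payload length: the fixed header must fit in the copied buffer before its length field can be trusted, and the declared payload must then fit in what remains. A minimal sketch of that pattern (hypothetical struct and helper names, not the kernel's wext code; the subtraction form is used so the comparison cannot overflow):

    #include <linux/types.h>
    #include <linux/errno.h>

    /* Hypothetical header that declares the size of its own payload. */
    struct user_blob {
            __u16 payload_len;
            __u8  payload[];
    };

    /* Return 0 if 'copied' bytes from userspace hold header + payload. */
    static int user_blob_check(const void *buf, size_t copied)
    {
            const struct user_blob *b = buf;

            if (copied < sizeof(*b))
                    return -EINVAL;         /* header itself must fit first */
            if (b->payload_len > copied - sizeof(*b))
                    return -EINVAL;         /* declared payload does not fit */
            return 0;
    }

The patch's own form, iwp->length < sizeof(*ee) + ee->key_len, is equally safe here because key_len is only 16 bits wide and the sum cannot wrap; the subtraction form is the defensive default when the length field is wider.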
From 5078b2e32ad4b1f753b1c837c15892202f753c97 Mon Sep 17 00:00:00 2001
From: "Luis R. Rodriguez"
Date: Wed, 13 May 2009 17:04:42 -0400
Subject: cfg80211: fix race between core hint and driver's custom apply

It's possible for cfg80211 to have scheduled the work and for the global workqueue not to have kicked in prior to a cfg80211 driver's regulatory hint or wiphy_apply_custom_regulatory(). Although this is very unlikely, it is possible, and this patch fixes the race. When the race hits, a null pointer dereference panic is expected.

Cc: stable@kernel.org
Signed-off-by: Luis R. Rodriguez
Tested-by: Alan Jenkins
Signed-off-by: John W. Linville
---
 net/wireless/reg.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 08265ca1578..487cb627ddb 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1551,6 +1551,13 @@ static int regulatory_hint_core(const char *alpha2)

 	queue_regulatory_request(request);

+	/*
+	 * This ensures last_request is populated once modules
+	 * come swinging in and calling regulatory hints and
+	 * wiphy_apply_custom_regulatory().
+	 */
+	flush_scheduled_work();
+
 	return 0;
 }
-- cgit v1.2.3

From cf8da764fc6959b7efb482f375dfef9830e98205 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 19 May 2009 18:54:22 +0000
Subject: net: fix length computation in rt_check_expire()

rt_check_expire() computes the average and standard deviation of chain lengths, but did not correctly reset length to 0 at the beginning of each chain. This probably gives overflows for sum2 (and sum) on loaded machines instead of meaningful results.

Signed-off-by: Eric Dumazet
Acked-by: Neil Horman
Signed-off-by: David S. Miller
---
 net/ipv4/route.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c4c60e9f068..869cf1c44b7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -785,7 +785,7 @@ static void rt_check_expire(void)
 	static unsigned int rover;
 	unsigned int i = rover, goal;
 	struct rtable *rth, **rthp;
-	unsigned long length = 0, samples = 0;
+	unsigned long samples = 0;
 	unsigned long sum = 0, sum2 = 0;
 	u64 mult;

@@ -795,9 +795,9 @@ static void rt_check_expire(void)
 	goal = (unsigned int)mult;
 	if (goal > rt_hash_mask)
 		goal = rt_hash_mask + 1;
-	length = 0;
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
+		unsigned long length;

 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
@@ -809,6 +809,7 @@ static void rt_check_expire(void)
 		if (*rthp == NULL)
 			continue;

+		length = 0;
 		spin_lock_bh(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
 			if (rt_is_expired(rth)) {
-- cgit v1.2.3
From 1ddbcb005c395518c2cd0df504cff3d4b5c85853 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 19 May 2009 20:14:28 +0000
Subject: net: fix rtable leak in net/ipv4/route.c

Alexander V. Lukyanov found a regression in 2.6.29 and made a complete analysis, found at http://bugzilla.kernel.org/show_bug.cgi?id=13339 and quoted here because it's a perfect one:

begin_of_quotation
2.6.29 patch has introduced flexible route cache rebuilding. Unfortunately the patch has at least one critical flaw, and another problem.

rt_intern_hash calculates rthi pointer, which is later used for new entry insertion. The same loop calculates cand pointer which is used to clean the list. If the pointers are the same, rtable leak occurs, as first the cand is removed then the new entry is appended to it.

This leak leads to unregister_netdevice problem (usage count > 0).

Another problem of the patch is that it tries to insert the entries in certain order, to facilitate counting of entries distinct by all but QoS parameters. Unfortunately, referencing an existing rtable entry moves it to list beginning, to speed up further lookups, so the carefully built order is destroyed.

For the first problem the simplest patch it to set rthi=0 when rthi==cand, but it will also destroy the ordering.
end_of_quotation

The problematic commit is 1080d709fb9d8cd4392f93476ee46a9d6ea05a5b (net: implement emergency route cache rebulds when gc_elasticity is exceeded).

Trying to keep dst_entries ordered is too complex and breaks the fact that order should depend on the frequency of use for garbage collection.

A possible fix is to make rt_intern_hash() simpler, and only make rt_check_expire() a little bit smarter, able to cope with an arbitrary entry order. The added loop runs on cache-hot data, while the cpu is prefetching the next object, so it should be unnoticed.

Reported-and-analyzed-by: Alexander V. Lukyanov
Signed-off-by: Eric Dumazet
Acked-by: Neil Horman
Signed-off-by: David S. Miller
---
 net/ipv4/route.c | 55 +++++++++++++++++--------------------------------------
 1 file changed, 17 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 869cf1c44b7..28205e5bfa9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -784,7 +784,7 @@ static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
-	struct rtable *rth, **rthp;
+	struct rtable *rth, *aux, **rthp;
 	unsigned long samples = 0;
 	unsigned long sum = 0, sum2 = 0;
 	u64 mult;
@@ -812,6 +812,7 @@ static void rt_check_expire(void)
 		length = 0;
 		spin_lock_bh(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
+			prefetch(rth->u.dst.rt_next);
 			if (rt_is_expired(rth)) {
 				*rthp = rth->u.dst.rt_next;
 				rt_free(rth);
@@ -820,33 +821,30 @@ static void rt_check_expire(void)
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
+nofree:
 					tmo >>= 1;
 					rthp = &rth->u.dst.rt_next;
 					/*
-					 * Only bump our length if the hash
-					 * inputs on entries n and n+1 are not
-					 * the same, we only count entries on
+					 * We only count entries on
 					 * a chain with equal hash inputs once
 					 * so that entries for different QOS
 					 * levels, and other non-hash input
 					 * attributes don't unfairly skew
 					 * the length computation
 					 */
-					if ((*rthp == NULL) ||
-					    !compare_hash_inputs(&(*rthp)->fl,
-								 &rth->fl))
-						length += ONE;
+					for (aux = rt_hash_table[i].chain;;) {
+						if (aux == rth) {
+							length += ONE;
+							break;
+						}
+						if (compare_hash_inputs(&aux->fl, &rth->fl))
+							break;
+						aux = aux->u.dst.rt_next;
+					}
 					continue;
 				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				tmo >>= 1;
-				rthp = &rth->u.dst.rt_next;
-				if ((*rthp == NULL) ||
-				    !compare_hash_inputs(&(*rthp)->fl,
-							 &rth->fl))
-					length += ONE;
-				continue;
-			}
+			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+				goto nofree;

 			/* Cleanup aged off entries. */
 			*rthp = rth->u.dst.rt_next;
@@ -1069,7 +1067,6 @@ out:	return 0;
 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 {
 	struct rtable	*rth, **rthp;
-	struct rtable	*rthi;
 	unsigned long	now;
 	struct rtable *cand, **candp;
 	u32 		min_score;
@@ -1089,7 +1086,6 @@ restart:
 	}

 	rthp = &rt_hash_table[hash].chain;
-	rthi = NULL;

 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
@@ -1135,17 +1131,6 @@ restart:

 		chain_length++;
 		rthp = &rth->u.dst.rt_next;
-
-		/*
-		 * check to see if the next entry in the chain
-		 * contains the same hash input values as rt.  If it does
-		 * This is where we will insert into the list, instead of
-		 * at the head.  This groups entries that differ by aspects not
-		 * relvant to the hash function together, which we use to adjust
-		 * our chain length
-		 */
-		if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
-			rthi = rth;
 	}

 	if (cand) {
@@ -1206,10 +1191,7 @@ restart:
 		}
 	}

-	if (rthi)
-		rt->u.dst.rt_next = rthi->u.dst.rt_next;
-	else
-		rt->u.dst.rt_next = rt_hash_table[hash].chain;
+	rt->u.dst.rt_next = rt_hash_table[hash].chain;

 #if RT_CACHE_DEBUG >= 2
 	if (rt->u.dst.rt_next) {
@@ -1225,10 +1207,7 @@ restart:
 	 * previous writes to rt are comitted to memory
 	 * before making rt visible to other CPUS.
 	 */
-	if (rthi)
-		rcu_assign_pointer(rthi->u.dst.rt_next, rt);
-	else
-		rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

 	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
-- cgit v1.2.3
From 4f72427998b105392e60bae7a6798a0c96fe4f0a Mon Sep 17 00:00:00 2001
From: Jean-Mickael Guerin
Date: Wed, 20 May 2009 17:38:59 -0700
Subject: IPv6: set RTPROT_KERNEL to initial route

The use of an unspecified protocol on the initial IPv6 routes prevents quagga from installing an IPv6 default route:

# show ipv6 route
S   ::/0 [1/0] via fe80::1, eth1_0
K>* ::/0 is directly connected, lo, rej
C>* ::1/128 is directly connected, lo
C>* fe80::/64 is directly connected, eth1_0

# ip -6 route
fe80::/64 dev eth1_0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit -1
ff00::/8 dev eth1_0 metric 256 mtu 1500 advmss 1440 hoplimit -1
unreachable default dev lo proto none metric -1 error -101 hoplimit 255

The attached patch sets RTPROT_KERNEL on the initial default routes and fixes the problem for quagga. This is similar to "ipv6: protocol for address routes" f410a1fba7afa79d2992620e874a343fdba28332.

# show ipv6 route
S>* ::/0 [1/0] via fe80::1, eth1_0
C>* ::1/128 is directly connected, lo
C>* fe80::/64 is directly connected, eth1_0

# ip -6 route
fe80::/64 dev eth1_0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit -1
fe80::/64 dev eth1_0 proto kernel metric 256 mtu 1500 advmss 1440 hoplimit -1
ff00::/8 dev eth1_0 metric 256 mtu 1500 advmss 1440 hoplimit -1
default via fe80::1 dev eth1_0 proto zebra metric 1024 mtu 1500 advmss 1440 hoplimit -1
unreachable default dev lo proto kernel metric -1 error -101 hoplimit 255

Signed-off-by: Jean-Mickael Guerin
Signed-off-by: Stephen Hemminger
Signed-off-by: David S. Miller
---
 net/ipv6/route.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1394ddb6e35..032a5ec391c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -137,6 +137,7 @@ static struct rt6_info ip6_null_entry_template = {
 		}
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_protocol	= RTPROT_KERNEL,
 	.rt6i_metric	= ~(u32) 0,
 	.rt6i_ref	= ATOMIC_INIT(1),
 };
@@ -159,6 +160,7 @@ static struct rt6_info ip6_prohibit_entry_template = {
 		}
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_protocol	= RTPROT_KERNEL,
 	.rt6i_metric	= ~(u32) 0,
 	.rt6i_ref	= ATOMIC_INIT(1),
 };
@@ -176,6 +178,7 @@ static struct rt6_info ip6_blk_hole_entry_template = {
 		}
 	},
 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
+	.rt6i_protocol	= RTPROT_KERNEL,
 	.rt6i_metric	= ~(u32) 0,
 	.rt6i_ref	= ATOMIC_INIT(1),
 };
-- cgit v1.2.3
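
A sketch of why the protocol field matters to userspace, following the quagga behaviour the commit describes (illustrative logic only, not quagga source): a routing daemon scanning the kernel table typically treats kernel- and boot-installed routes as safe to shadow or replace, but backs off from entries it cannot classify, such as the "proto none" (RTPROT_UNSPEC) route that the unmarked template produced before this patch:

    #include <linux/rtnetlink.h>
    #include <stdbool.h>

    static bool may_replace_route(unsigned char rtm_protocol)
    {
            switch (rtm_protocol) {
            case RTPROT_KERNEL:     /* installed by the kernel itself */
            case RTPROT_BOOT:       /* installed at boot, e.g. via "ip route" */
                    return true;
            default:                /* RTPROT_UNSPEC or another daemon's route */
                    return false;
            }
    }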
From 5b5f792a6a9a2f9ae812d151ed621f72e99b1725 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 21 May 2009 15:07:12 -0700
Subject: pktgen: do not access flows[] beyond its length

typo -- pkt_dev->nflows is for stats only; the number of concurrent flows is stored in cflows.

Reported-By: Vladimir Ivashchenko
Signed-off-by: Florian Westphal
Signed-off-by: David S. Miller
---
 net/core/pktgen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 3779c1438c1..0666a827bc6 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2447,7 +2447,7 @@ static inline void free_SAs(struct pktgen_dev *pkt_dev)
 	if (pkt_dev->cflows) {
 		/* let go of the SAs if we have them */
 		int i = 0;
-		for (;  i < pkt_dev->nflows; i++){
+		for (;  i < pkt_dev->cflows; i++) {
 			struct xfrm_state *x = pkt_dev->flows[i].x;
 			if (x) {
 				xfrm_state_put(x);
-- cgit v1.2.3

From 3ed18d76d959e5cbfa5d70c8f7ba95476582a556 Mon Sep 17 00:00:00 2001
From: Robert Olsson
Date: Thu, 21 May 2009 15:20:59 -0700
Subject: ipv4: Fix oops with FIB_TRIE

It seems we can fix this by disabling preemption while we re-balance the trie. This is with CONFIG_CLASSIC_RCU. It's been stress-tested at high loads, continuously taking a full BGP table up/down via iproute -batch.

Note: fib_trie is not updated for CONFIG_PREEMPT_RCU.

Reported-by: Andrei Popa
Signed-off-by: Robert Olsson
Signed-off-by: David S. Miller
---
 net/ipv4/fib_trie.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ec0ae490f0b..33c7c85dfe4 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -986,9 +986,12 @@ fib_find_node(struct trie *t, u32 key)
 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 {
 	int wasfull;
-	t_key cindex, key = tn->key;
+	t_key cindex, key;
 	struct tnode *tp;

+	preempt_disable();
+	key = tn->key;
+
 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
@@ -1007,6 +1010,7 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 	if (IS_TNODE(tn))
 		tn = (struct tnode *)resize(t, (struct tnode *)tn);

+	preempt_enable();
 	return (struct node *)tn;
 }
-- cgit v1.2.3
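
The shape of the fix in isolation, under the commit's stated assumption of classic (non-preemptible) RCU, where a preemption-disabled region acts as an implicit RCU read-side critical section, so nodes freed with call_rcu() elsewhere cannot be reclaimed while the rebalance still dereferences them. A sketch only; the real trie_rebalance() must also pair the calls on every exit path:

    #include <linux/preempt.h>

    static void rebalance_region(void)
    {
            preempt_disable();
            /* ... walk toward the root, resizing tnodes and
             * republishing children with rcu_assign_pointer() ... */
            preempt_enable();
    }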
From 0975ecba3b670df7c488a5e0e6fe9f1f370a8ad8 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Thu, 21 May 2009 15:22:02 -0700
Subject: RxRPC: Error handling for rxrpc_alloc_connection()

rxrpc_alloc_connection() doesn't return an error code on failure; it just returns NULL, and IS_ERR(NULL) is false.

Signed-off-by: Dan Carpenter
Signed-off-by: David Howells
Signed-off-by: David S. Miller
---
 net/rxrpc/ar-connection.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
index 0f1218b8d28..67e38a05624 100644
--- a/net/rxrpc/ar-connection.c
+++ b/net/rxrpc/ar-connection.c
@@ -343,9 +343,9 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx,
 	/* not yet present - create a candidate for a new connection
 	 * and then redo the check */
 	conn = rxrpc_alloc_connection(gfp);
-	if (IS_ERR(conn)) {
-		_leave(" = %ld", PTR_ERR(conn));
-		return PTR_ERR(conn);
+	if (!conn) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
 	}

 	conn->trans = trans;
@@ -508,9 +508,9 @@ int rxrpc_connect_call(struct rxrpc_sock *rx,
 	/* not yet present - create a candidate for a new connection and then
 	 * redo the check */
 	candidate = rxrpc_alloc_connection(gfp);
-	if (IS_ERR(candidate)) {
-		_leave(" = %ld", PTR_ERR(candidate));
-		return PTR_ERR(candidate);
+	if (!candidate) {
+		_leave(" = -ENOMEM");
+		return -ENOMEM;
 	}

 	candidate->trans = trans;
-- cgit v1.2.3
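
The convention at issue, in miniature: a kernel constructor signals failure either by returning NULL (as rxrpc_alloc_connection() does) or by encoding an errno with ERR_PTR(); since IS_ERR(NULL) is false, the two styles must not be mixed at the call site. A sketch with hypothetical names:

    #include <linux/err.h>
    #include <linux/slab.h>

    struct widget { int id; };

    /* NULL-on-failure style, like rxrpc_alloc_connection(). */
    static struct widget *widget_alloc(gfp_t gfp)
    {
            return kzalloc(sizeof(struct widget), gfp);
    }

    static int widget_create(gfp_t gfp)
    {
            struct widget *w = widget_alloc(gfp);

            if (!w)                 /* the correct check for this style */
                    return -ENOMEM;
            /* if (IS_ERR(w)) would never fire here: IS_ERR(NULL) == false */
            kfree(w);
            return 0;
    }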
From bfcaa50270e18f35220a11d46e98fc6232c24606 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik
Date: Mon, 25 May 2009 17:23:15 +0200
Subject: netfilter: nf_ct_tcp: fix accepting invalid RST segments

Robert L Mathews discovered that some clients send evil TCP RST segments which are accepted by netfilter conntrack but discarded by the destination. Thus the conntrack entry is destroyed but the destination retransmits data until timeout.

The same technique, i.e. sending properly crafted RST segments, can easily be used to bypass connlimit/connbytes based restrictions (the sample script written by Robert can be found in the netfilter mailing list archives).

The patch below adds a new flag and a new field to struct ip_ct_tcp_state so that checking RST segments can be made more strict and thus TCP conntrack can catch the invalid ones: the RST segment is accepted only if its sequence number is higher than or equal to the highest ACK we have seen from the other direction. (The last_ack field cannot be reused because it is used to catch resent packets.)

Signed-off-by: Jozsef Kadlecsik
Signed-off-by: Patrick McHardy
---
 net/netfilter/nf_conntrack_proto_tcp.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b5ccf2b4b2e..97a6e93d742 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -634,6 +634,14 @@ static bool tcp_in_window(const struct nf_conn *ct,
 			sender->td_end = end;
 		sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
 	}
+	if (tcph->ack) {
+		if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+			sender->td_maxack = ack;
+			sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+		} else if (after(ack, sender->td_maxack))
+			sender->td_maxack = ack;
+	}
+
 	/*
 	 * Update receiver data.
 	 */
@@ -918,6 +926,16 @@ static int tcp_packet(struct nf_conn *ct,
 				  "nf_ct_tcp: invalid state ");
 		return -NF_ACCEPT;
 	case TCP_CONNTRACK_CLOSE:
+		if (index == TCP_RST_SET
+		    && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
+		    && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
+			/* Invalid RST */
+			write_unlock_bh(&tcp_lock);
+			if (LOG_INVALID(net, IPPROTO_TCP))
+				nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+					  "nf_ct_tcp: invalid RST ");
+			return -NF_ACCEPT;
+		}
 		if (index == TCP_RST_SET
 		    && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
 			 && ct->proto.tcp.last_index == TCP_SYN_SET)
-- cgit v1.2.3
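
The seq/ack comparisons above rely on the kernel's wraparound-safe helpers. A self-contained sketch of the idea behind before()/after() from net/tcp.h (equivalent in spirit, not the kernel's copy): subtracting two u32 sequence numbers and interpreting the result as signed gives their distance modulo 2^32, so the ordering survives wraparound as long as the two values are less than 2^31 apart:

    #include <linux/types.h>

    static bool seq_before(u32 a, u32 b)
    {
            return (s32)(a - b) < 0;        /* a precedes b, modulo 2^32 */
    }

    static bool seq_after(u32 a, u32 b)
    {
            return seq_before(b, a);
    }

So an RST whose sequence number is before the highest ACK seen from the other direction fails the check and the conntrack entry survives, which mirrors the intent of the hunk above.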
From b38b1f616867c832301f24eaf259889494d495b3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Mon, 25 May 2009 17:29:43 +0200
Subject: netfilter: nf_ct_dccp: add missing DCCP protocol changes in event cache

This patch adds the missing protocol state-change event reporting for DCCP.

$ sudo conntrack -E
[NEW] dccp 33 240 src=192.168.0.2 dst=192.168.1.2 sport=57040 dport=5001 [UNREPLIED] src=192.168.1.2 dst=192.168.1.100 sport=5001 dport=57040

With this patch:

$ sudo conntrack -E
[NEW] dccp 33 240 REQUEST src=192.168.0.2 dst=192.168.1.2 sport=57040 dport=5001 [UNREPLIED] src=192.168.1.2 dst=192.168.1.100 sport=5001 dport=57040

Signed-off-by: Pablo Neira Ayuso
Signed-off-by: Patrick McHardy
---
 net/netfilter/nf_conntrack_proto_dccp.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 8e757dd5339..aee0d6bea30 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include

 static DEFINE_RWLOCK(dccp_lock);
@@ -553,6 +554,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 	ct->proto.dccp.state = new_state;
 	write_unlock_bh(&dccp_lock);

+	if (new_state != old_state)
+		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+
 	dn = dccp_pernet(net);
 	nf_ct_refresh_acct(ct, ctinfo, skb, dn->dccp_timeout[new_state]);
-- cgit v1.2.3

From c80a5cdfc5ca6533cb893154f546370da1fdb8f0 Mon Sep 17 00:00:00 2001
From: Doug Leith
Date: Mon, 25 May 2009 22:44:59 -0700
Subject: tcp: tcp_vegas ssthresh bugfix

This patch fixes ssthresh accounting issues in tcp_vegas when cwnd decreases.

Signed-off-by: Doug Leith
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_vegas.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a453aac91bd..c6743eec9b7 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -158,6 +158,11 @@ void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 }
 EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);

+static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
+{
+	return min(tp->snd_ssthresh, tp->snd_cwnd-1);
+}
+
 static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -221,11 +226,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 			 */
 			diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;

-			if (diff > gamma && tp->snd_ssthresh > 2 ) {
+			if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
 				/* Going too fast. Time to slow down
 				 * and switch to congestion avoidance.
 				 */
-				tp->snd_ssthresh = 2;

 				/* Set cwnd to match the actual rate
 				 * exactly:
 				 *   cwnd = (actual rate) * baseRTT
 				 * Then we add 1 because the integer
 				 * truncation robs us of full link
 				 * utilization.
 				 */
 				tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
+				tp->snd_ssthresh = tcp_vegas_ssthresh(tp);

 			} else if (tp->snd_cwnd <= tp->snd_ssthresh) {
 				/* Slow start.  */
@@ -250,6 +255,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
 					 * we slow down.
 					 */
 					tp->snd_cwnd--;
+					tp->snd_ssthresh
+						= tcp_vegas_ssthresh(tp);
 				} else if (diff < alpha) {
 					/* We don't have enough extra packets
 					 * in the network, so speed up.
-- cgit v1.2.3

From 68743082b560067e3e93eab8b2568f238e486865 Mon Sep 17 00:00:00 2001
From: Vu Pham
Date: Tue, 26 May 2009 14:51:00 -0400
Subject: XPRTRDMA: fix client rpcrdma FRMR registration on mlx4 devices

mlx4/connectX FRMR requires local write enable together with remote rdma write enable. This fixes NFS/RDMA operation over the ConnectX Infiniband HCA in the default memreg mode.

Signed-off-by: Vu Pham
Signed-off-by: Tom Talpey
Signed-off-by: Trond Myklebust
---
 net/sunrpc/xprtrdma/verbs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 3b21e0cc5e6..465aafc2007 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1495,7 +1495,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
 	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
 	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
 	frmr_wr.wr.fast_reg.access_flags = (writing ?
-				IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
+				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+				IB_ACCESS_REMOTE_READ);
 	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
 	DECR_CQCOUNT(&r_xprt->rx_ep);
-- cgit v1.2.3
From e65fcfd63a9a62baa5708484ff8edbe56eb3e7ec Mon Sep 17 00:00:00 2001
From: Paul Menage
Date: Tue, 26 May 2009 20:47:02 -0700
Subject: cls_cgroup: read classid atomically in classifier

Avoid reading the unsynchronized value cs->classid multiple times, since it could change concurrently from non-zero to zero; this would result in the classifier returning a positive result with a bogus (zero) classid.

Signed-off-by: Paul Menage
Reviewed-by: Li Zefan
Signed-off-by: David S. Miller
---
 net/sched/cls_cgroup.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 91a3db4a76f..cc29b44b150 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -104,8 +104,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
 			       struct tcf_result *res)
 {
 	struct cls_cgroup_head *head = tp->root;
-	struct cgroup_cls_state *cs;
-	int ret = 0;
+	u32 classid;

 	/*
 	 * Due to the nature of the classifier it is required to ignore all
@@ -121,17 +120,18 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
 		return -1;

 	rcu_read_lock();
-	cs = task_cls_state(current);
-	if (cs->classid && tcf_em_tree_match(skb, &head->ematches, NULL)) {
-		res->classid = cs->classid;
-		res->class = 0;
-		ret = tcf_exts_exec(skb, &head->exts, res);
-	} else
-		ret = -1;
-
+	classid = task_cls_state(current)->classid;
 	rcu_read_unlock();

-	return ret;
+	if (!classid)
+		return -1;
+
+	if (!tcf_em_tree_match(skb, &head->ematches, NULL))
+		return -1;
+
+	res->classid = classid;
+	res->class = 0;
+	return tcf_exts_exec(skb, &head->exts, res);
 }

 static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle)
-- cgit v1.2.3
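
The pattern behind the rewrite, in miniature (hypothetical names): take one snapshot of a field that can change concurrently, then make every later decision against that snapshot, so the classifier can never pass the non-zero test on one value and then return another:

    #include <linux/rcupdate.h>
    #include <linux/types.h>

    struct shared_state { u32 classid; };     /* hypothetical */

    static int classify(const struct shared_state *st)
    {
            u32 id;

            rcu_read_lock();
            id = st->classid;       /* the single read of the racy field */
            rcu_read_unlock();

            if (!id)
                    return -1;      /* tested and used: the same value */
            return id;
    }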
From 4c713189485dbea875aecd1990daed74908e181d Mon Sep 17 00:00:00 2001
From: Dave Young
Date: Wed, 27 May 2009 09:10:28 +0200
Subject: Bluetooth: Remove useless flush_work() causing lockdep warnings

The calls to flush_work() are pointless in a single-threaded workqueue, and they are actually causing a lockdep warning:

=============================================
[ INFO: possible recursive locking detected ]
2.6.30-rc6-02911-gbb803cf #16
---------------------------------------------
bluetooth/2518 is trying to acquire lock:
 (bluetooth){+.+.+.}, at: [] flush_work+0x28/0xb0

but task is already holding lock:
 (bluetooth){+.+.+.}, at: [] worker_thread+0x149/0x25e

other info that might help us debug this:
2 locks held by bluetooth/2518:
 #0:  (bluetooth){+.+.+.}, at: [] worker_thread+0x149/0x25e
 #1:  (&conn->work_del){+.+...}, at: [] worker_thread+0x149/0x25e

stack backtrace:
Pid: 2518, comm: bluetooth Not tainted 2.6.30-rc6-02911-gbb803cf #16
Call Trace:
 [] ? printk+0xf/0x11
 [] __lock_acquire+0x7ce/0xb1b
 [] lock_acquire+0x90/0xad
 [] ? flush_work+0x28/0xb0
 [] flush_work+0x42/0xb0
 [] ? flush_work+0x28/0xb0
 [] del_conn+0x1c/0x84 [bluetooth]
 [] worker_thread+0x18e/0x25e
 [] ? worker_thread+0x149/0x25e
 [] ? del_conn+0x0/0x84 [bluetooth]
 [] ? autoremove_wake_function+0x0/0x33
 [] ? worker_thread+0x0/0x25e
 [] kthread+0x45/0x6b
 [] ? kthread+0x0/0x6b
 [] kernel_thread_helper+0x7/0x10

Based on a report by Oliver Hartkopp

Signed-off-by: Dave Young
Tested-by: Oliver Hartkopp
Signed-off-by: Marcel Holtmann
---
 net/bluetooth/hci_sysfs.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 4cc3624bd22..95f7a7a544b 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -90,9 +90,6 @@ static void add_conn(struct work_struct *work)
 	struct hci_conn *conn = container_of(work, struct hci_conn, work_add);
 	struct hci_dev *hdev = conn->hdev;

-	/* ensure previous del is complete */
-	flush_work(&conn->work_del);
-
 	dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle);

 	if (device_add(&conn->dev) < 0) {
@@ -118,9 +115,6 @@ static void del_conn(struct work_struct *work)
 	struct hci_conn *conn = container_of(work, struct hci_conn, work_del);
 	struct hci_dev *hdev = conn->hdev;

-	/* ensure previous add is complete */
-	flush_work(&conn->work_add);
-
 	if (!device_is_registered(&conn->dev))
 		return;
-- cgit v1.2.3

From 683a04cebc63819a36b1db19843bd17771f05b55 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer
Date: Wed, 27 May 2009 15:45:34 +0200
Subject: netfilter: xt_hashlimit does a wrong SEQ_SKIP

The function dl_seq_show() returns 1 (equal to SEQ_SKIP) in case a seq_printf() call returns -1. It should return -1. This SEQ_SKIP behaviour breaks processing the proc file, e.g. via a pipe or just through less.

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: Patrick McHardy
---
 net/netfilter/xt_hashlimit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index a5b5369c30f..219dcdbe388 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -926,7 +926,7 @@ static int dl_seq_show(struct seq_file *s, void *v)
 	if (!hlist_empty(&htable->hash[*bucket])) {
 		hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node)
 			if (dl_seq_real_show(ent, htable->family, s))
-				return 1;
+				return -1;
 	}
 	return 0;
 }
-- cgit v1.2.3

From eeff9beec3d2563c42cca41e66d4169592bb5475 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 27 May 2009 15:49:11 +0200
Subject: netfilter: nfnetlink_log: fix wrong skbuff size calculation

This problem was introduced in 72961ecf84d67d6359a1b30f9b2a8427f13e1e71 since no space was reserved for the new attributes NFULA_HWTYPE, NFULA_HWLEN and NFULA_HWHEADER.

Signed-off-by: Pablo Neira Ayuso
Signed-off-by: Patrick McHardy
---
 net/netfilter/nfnetlink_log.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index fd326ac27ec..66a6dd5c519 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -581,6 +581,12 @@ nfulnl_log_packet(u_int8_t pf,
 		+ nla_total_size(sizeof(struct nfulnl_msg_packet_hw))
 		+ nla_total_size(sizeof(struct nfulnl_msg_packet_timestamp));

+	if (in && skb_mac_header_was_set(skb)) {
+		size += nla_total_size(skb->dev->hard_header_len)
+			+ nla_total_size(sizeof(u_int16_t))	/* hwtype */
+			+ nla_total_size(sizeof(u_int16_t));	/* hwlen */
+	}
+
 	spin_lock_bh(&inst->lock);

 	if (inst->flags & NFULNL_CFG_F_SEQ)
-- cgit v1.2.3
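
The sizing rule the fix applies: every attribute that may later be written into the message must be counted into the skb allocation with nla_total_size(), which accounts for the attribute payload plus its netlink header and alignment padding; under-counting is what made the original message overrun. A sketch of just the arithmetic for the three attributes named in the commit:

    #include <net/netlink.h>

    /* Space the three hardware-header attributes occupy in the message. */
    static size_t hw_attr_space(unsigned int hard_header_len)
    {
            return nla_total_size(hard_header_len)          /* NFULA_HWHEADER */
                   + nla_total_size(sizeof(u_int16_t))      /* NFULA_HWTYPE */
                   + nla_total_size(sizeof(u_int16_t));     /* NFULA_HWLEN */
    }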
Bruce Fields" Date: Wed, 27 May 2009 18:51:06 -0400 Subject: nfsd: Revert "svcrpc: take advantage of tcp autotuning" This reverts commit 47a14ef1af48c696b214ac168f056ddc79793d0e "svcrpc: take advantage of tcp autotuning", which uncovered some further problems in the server rpc code, causing significant performance regressions in common cases. We will likely reinstate this patch after releasing 2.6.30 and applying some work on the underlying fixes to the problem (developed by Trond). Reported-by: Jeff Moyer Cc: Olga Kornievskaia Cc: Jim Rees Cc: Trond Myklebust Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index af3198814c1..9d504234af4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -345,6 +345,7 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, lock_sock(sock->sk); sock->sk->sk_sndbuf = snd * 2; sock->sk->sk_rcvbuf = rcv * 2; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; release_sock(sock->sk); #endif } @@ -796,6 +797,23 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) + /* sndbuf needs to have room for one request + * per thread, otherwise we can stall even when the + * network isn't a bottleneck. + * + * We count all threads rather than threads in a + * particular pool, which provides an upper bound + * on the number of threads which will access the socket. + * + * rcvbuf just needs to be able to hold a few requests. + * Normally they will be removed from the queue + * as soon a a complete request arrives. + */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_max_mesg, + 3 * serv->sv_max_mesg); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* Receive data. If we haven't got the record length yet, get @@ -1043,6 +1061,15 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; + /* initialise setting must have enough space to + * receive and respond to one request. + * svc_tcp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); + + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); @@ -1112,14 +1139,8 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Initialize the socket */ if (sock->type == SOCK_DGRAM) svc_udp_init(svsk, serv); - else { - /* initialise setting must have enough space to - * receive and respond to one request. - */ - svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg, - 4 * serv->sv_max_mesg); + else svc_tcp_init(svsk, serv); - } dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); -- cgit v1.2.3 From 98779be861a05c4cb75bed916df72ec0cba8b53d Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 14 May 2009 16:34:28 -0500 Subject: svcrdma: dma unmap the correct length for the RPCRDMA header page. The svcrdma module was incorrectly unmapping the RPCRDMA header page. On IBM pserver systems this causes a resource leak that results in running out of bus address space (10 cthon iterations will reproduce it). 
From 98779be861a05c4cb75bed916df72ec0cba8b53d Mon Sep 17 00:00:00 2001
From: Steve Wise
Date: Thu, 14 May 2009 16:34:28 -0500
Subject: svcrdma: dma unmap the correct length for the RPCRDMA header page.

The svcrdma module was incorrectly unmapping the RPCRDMA header page. On IBM pserver systems this causes a resource leak that results in running out of bus address space (10 cthon iterations will reproduce it).

The code was mapping the full page but only unmapping the actual header length. The fix is to only map the header length.

I also cleaned up the use of ib_dma_map_page() calls since the unmap logic always uses ib_dma_unmap_single(). I made these symmetrical.

Signed-off-by: Steve Wise
Signed-off-by: Tom Tucker
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/xprtrdma/svc_rdma_sendto.c    | 12 ++++++------
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 8b510c5e877..f11be72a1a8 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -128,7 +128,8 @@ static int fast_reg_xdr(struct svcxprt_rdma *xprt,
 		page_bytes -= sge_bytes;

 		frmr->page_list->page_list[page_no] =
-			ib_dma_map_page(xprt->sc_cm_id->device, page, 0,
+			ib_dma_map_single(xprt->sc_cm_id->device,
+					  page_address(page),
 					  PAGE_SIZE, DMA_TO_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 					 frmr->page_list->page_list[page_no]))
@@ -532,18 +533,17 @@ static int send_reply(struct svcxprt_rdma *rdma,
 	clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);

 	/* Prepare the SGE for the RPCRDMA Header */
+	ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+	ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
 	ctxt->sge[0].addr =
-		ib_dma_map_page(rdma->sc_cm_id->device,
-				page, 0, PAGE_SIZE, DMA_TO_DEVICE);
+		ib_dma_map_single(rdma->sc_cm_id->device, page_address(page),
+				  ctxt->sge[0].length, DMA_TO_DEVICE);
 	if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
 		goto err;
 	atomic_inc(&rdma->sc_dma_used);

 	ctxt->direction = DMA_TO_DEVICE;

-	ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
-	ctxt->sge[0].lkey = rdma->sc_dma_lkey;
-
 	/* Determine how many of our SGE are to be transmitted */
 	for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
 		sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 4b0c2fa15e0..5151f9f6c57 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -500,8 +500,8 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
 		BUG_ON(sge_no >= xprt->sc_max_sge);
 		page = svc_rdma_get_page();
 		ctxt->pages[sge_no] = page;
-		pa = ib_dma_map_page(xprt->sc_cm_id->device,
-				     page, 0, PAGE_SIZE,
+		pa = ib_dma_map_single(xprt->sc_cm_id->device,
+				     page_address(page), PAGE_SIZE,
 				     DMA_FROM_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
 			goto err_put_ctxt;
@@ -1315,8 +1315,8 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);

 	/* Prepare SGE for local address */
-	sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
-				   p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+	sge.addr = ib_dma_map_single(xprt->sc_cm_id->device,
+				   page_address(p), PAGE_SIZE, DMA_FROM_DEVICE);
 	if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) {
 		put_page(p);
 		return;
@@ -1343,7 +1343,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 	if (ret) {
 		dprintk("svcrdma: Error %d posting send for protocol error\n",
 			ret);
-		ib_dma_unmap_page(xprt->sc_cm_id->device,
+		ib_dma_unmap_single(xprt->sc_cm_id->device,
 				  sge.addr, PAGE_SIZE,
 				  DMA_FROM_DEVICE);
 		svc_rdma_put_context(ctxt, 1);
-- cgit v1.2.3
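
Reduced to a rule of thumb, the invariant the patch restores is that a DMA mapping must be released through the matching unmap call with exactly the address and length that were mapped. A generic sketch (hypothetical driver code using the plain dma_* API rather than the ib_dma_* wrappers above):

    #include <linux/dma-mapping.h>
    #include <linux/errno.h>

    static int do_transfer(struct device *dev, void *buf, size_t len)
    {
            dma_addr_t pa = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

            if (dma_mapping_error(dev, pa))
                    return -EIO;
            /* ... hand 'pa' to the hardware and wait for completion ... */
            dma_unmap_single(dev, pa, len, DMA_TO_DEVICE);  /* same pa, same len */
            return 0;
    }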
From 12186be7d2e1106cede1cc728526e3d7998cbe94 Mon Sep 17 00:00:00 2001
From: Minoru Usui
Date: Tue, 2 Jun 2009 02:17:34 -0700
Subject: net_cls: fix unconfigured struct tcf_proto keeps chaining and avoid kernel panic when we use cls_cgroup

This patch fixes a bug where an unconfigured struct tcf_proto is kept chained in tc_ctl_tfilter(), and avoids a kernel panic in cls_cgroup_classify() when cls_cgroup is used.

When we execute 'tc filter add', a tcf_proto is allocated, initialized by the classifier's init(), and chained. After it is chained, tc_ctl_tfilter() calls the classifier's change(). When the classifier's change() fails, tc_ctl_tfilter() does not free the tcf_proto but keeps it chained.

In addition, cls_cgroup is initialized in change(), not in init(). It accesses the unconfigured struct tcf_proto that was chained before change(), and then hits an Oops.

Signed-off-by: Minoru Usui
Signed-off-by: Jarek Poplawski
Signed-off-by: Jamal Hadi Salim
Tested-by: Minoru Usui
Signed-off-by: David S. Miller
---
 net/sched/cls_api.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 0759f32e9dc..09cdcdfe7e9 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -135,6 +135,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
 	unsigned long cl;
 	unsigned long fh;
 	int err;
+	int tp_created = 0;

 	if (net != &init_net)
 		return -EINVAL;
@@ -266,10 +267,7 @@ replay:
 			goto errout;
 		}

-		spin_lock_bh(root_lock);
-		tp->next = *back;
-		*back = tp;
-		spin_unlock_bh(root_lock);
+		tp_created = 1;

 	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
 		goto errout;
@@ -296,8 +294,11 @@ replay:
 	switch (n->nlmsg_type) {
 	case RTM_NEWTFILTER:
 		err = -EEXIST;
-		if (n->nlmsg_flags & NLM_F_EXCL)
+		if (n->nlmsg_flags & NLM_F_EXCL) {
+			if (tp_created)
+				tcf_destroy(tp);
 			goto errout;
+		}
 		break;
 	case RTM_DELTFILTER:
 		err = tp->ops->delete(tp, fh);
@@ -314,8 +315,18 @@ replay:
 	}

 	err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
-	if (err == 0)
+	if (err == 0) {
+		if (tp_created) {
+			spin_lock_bh(root_lock);
+			tp->next = *back;
+			*back = tp;
+			spin_unlock_bh(root_lock);
+		}
 		tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
+	} else {
+		if (tp_created)
+			tcf_destroy(tp);
+	}

 errout:
 	if (cl)
-- cgit v1.2.3
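
The ordering the fix restores generalizes: allocate and fully configure an object first, and only link it into the shared structure once configuration has succeeded, so that concurrent readers can never observe a half-initialized entry. A minimal sketch with hypothetical names (node_configure() stands in for tp->ops->change()):

    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct node {
            struct node *next;
            int configured;
    };

    static int node_configure(struct node *n)
    {
            n->configured = 1;      /* stands in for tp->ops->change() */
            return 0;
    }

    static int node_add(struct node **head, spinlock_t *lock, gfp_t gfp)
    {
            struct node *n = kzalloc(sizeof(*n), gfp);
            int err;

            if (!n)
                    return -ENOMEM;

            err = node_configure(n);
            if (err) {              /* never published, so just free it */
                    kfree(n);
                    return err;
            }

            spin_lock_bh(lock);     /* publish only after success */
            n->next = *head;
            *head = n;
            spin_unlock_bh(lock);
            return 0;
    }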