From fff9cfd99c0f88645c3f50d7476d6c8cef99f140 Mon Sep 17 00:00:00 2001
From:  <jt@hpl.hp.com>
Date: Thu, 12 May 2005 20:24:19 -0400
Subject:   [PATCH] Wireless Extensions 18 (aka WPA)

        This is version 18 of the Wireless Extensions. The main change
  is that it adds all the necessary APIs for WPA and WPA2 support. This
  work was entirely done by Jouni Malinen, so let's thank him for both
  his hard work and deep expertise on the subject ;-)
        This APIs obviously doesn't do much by itself and works in
  concert with driver support (Jouni already sent you the HostAP
  changes) and userspace (Jouni is updating wpa_supplicant). This is
  also orthogonal with the ongoing work on in-kernel IEEE support (but
  potentially useful).
        The patch is attached, tested with 2.6.11. Normally, I would
  ask you to push that directly in the kernel (99% of the patch has been
  on my web page for ages and it does not affect non-WPA stuff), but
  Jouni convinced me that it should bake a few weeks in wireless-2.6
  first, so that other driver maintainers can get up to speed with it.

  Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
---
 net/core/wireless.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 72 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/wireless.c b/net/core/wireless.c
index 750cc5daeb0..b2fe378dfbf 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -2,7 +2,7 @@
  * This file implement the Wireless Extensions APIs.
  *
  * Authors :	Jean Tourrilhes - HPL - <jt@hpl.hp.com>
- * Copyright (c) 1997-2004 Jean Tourrilhes, All Rights Reserved.
+ * Copyright (c) 1997-2005 Jean Tourrilhes, All Rights Reserved.
  *
  * (As all part of the Linux kernel, this file is GPL)
  */
@@ -187,6 +187,12 @@ static const struct iw_ioctl_description standard_ioctl[] = {
 		.header_type	= IW_HEADER_TYPE_ADDR,
 		.flags		= IW_DESCR_FLAG_DUMP,
 	},
+	[SIOCSIWMLME	- SIOCIWFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.min_tokens	= sizeof(struct iw_mlme),
+		.max_tokens	= sizeof(struct iw_mlme),
+	},
 	[SIOCGIWAPLIST	- SIOCIWFIRST] = {
 		.header_type	= IW_HEADER_TYPE_POINT,
 		.token_size	= sizeof(struct sockaddr) +
@@ -195,7 +201,10 @@ static const struct iw_ioctl_description standard_ioctl[] = {
 		.flags		= IW_DESCR_FLAG_NOMAX,
 	},
 	[SIOCSIWSCAN	- SIOCIWFIRST] = {
-		.header_type	= IW_HEADER_TYPE_PARAM,
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.min_tokens	= 0,
+		.max_tokens	= sizeof(struct iw_scan_req),
 	},
 	[SIOCGIWSCAN	- SIOCIWFIRST] = {
 		.header_type	= IW_HEADER_TYPE_POINT,
@@ -273,6 +282,42 @@ static const struct iw_ioctl_description standard_ioctl[] = {
 	[SIOCGIWPOWER	- SIOCIWFIRST] = {
 		.header_type	= IW_HEADER_TYPE_PARAM,
 	},
+	[SIOCSIWGENIE	- SIOCIWFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_GENERIC_IE_MAX,
+	},
+	[SIOCGIWGENIE	- SIOCIWFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_GENERIC_IE_MAX,
+	},
+	[SIOCSIWAUTH	- SIOCIWFIRST] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[SIOCGIWAUTH	- SIOCIWFIRST] = {
+		.header_type	= IW_HEADER_TYPE_PARAM,
+	},
+	[SIOCSIWENCODEEXT - SIOCIWFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.min_tokens	= sizeof(struct iw_encode_ext),
+		.max_tokens	= sizeof(struct iw_encode_ext) +
+				  IW_ENCODING_TOKEN_MAX,
+	},
+	[SIOCGIWENCODEEXT - SIOCIWFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.min_tokens	= sizeof(struct iw_encode_ext),
+		.max_tokens	= sizeof(struct iw_encode_ext) +
+				  IW_ENCODING_TOKEN_MAX,
+	},
+	[SIOCSIWPMKSA - SIOCIWFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.min_tokens	= sizeof(struct iw_pmksa),
+		.max_tokens	= sizeof(struct iw_pmksa),
+	},
 };
 static const int standard_ioctl_num = (sizeof(standard_ioctl) /
 				       sizeof(struct iw_ioctl_description));
@@ -299,6 +344,31 @@ static const struct iw_ioctl_description standard_event[] = {
 	[IWEVEXPIRED	- IWEVFIRST] = {
 		.header_type	= IW_HEADER_TYPE_ADDR, 
 	},
+	[IWEVGENIE	- IWEVFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_GENERIC_IE_MAX,
+	},
+	[IWEVMICHAELMICFAILURE	- IWEVFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT, 
+		.token_size	= 1,
+		.max_tokens	= sizeof(struct iw_michaelmicfailure),
+	},
+	[IWEVASSOCREQIE	- IWEVFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_GENERIC_IE_MAX,
+	},
+	[IWEVASSOCRESPIE	- IWEVFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= IW_GENERIC_IE_MAX,
+	},
+	[IWEVPMKIDCAND	- IWEVFIRST] = {
+		.header_type	= IW_HEADER_TYPE_POINT,
+		.token_size	= 1,
+		.max_tokens	= sizeof(struct iw_pmkid_cand),
+	},
 };
 static const int standard_event_num = (sizeof(standard_event) /
 				       sizeof(struct iw_ioctl_description));
-- 
cgit v1.2.3


From 3ec3b2fba526ead2fa3f3d7c91924f39a0733749 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Tue, 17 May 2005 12:08:48 +0100
Subject: AUDIT: Capture sys_socketcall arguments and sockaddrs

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 net/socket.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index cec0cb38b9c..6b7c3b51a7c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -81,6 +81,7 @@
 #include <linux/syscalls.h>
 #include <linux/compat.h>
 #include <linux/kmod.h>
+#include <linux/audit.h>
 
 #ifdef CONFIG_NET_RADIO
 #include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
@@ -226,7 +227,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
 		return 0;
 	if(copy_from_user(kaddr,uaddr,ulen))
 		return -EFAULT;
-	return 0;
+	return audit_sockaddr(ulen, kaddr);
 }
 
 /**
@@ -1906,7 +1907,11 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 	/* copy_from_user should be SMP safe. */
 	if (copy_from_user(a, args, nargs[call]))
 		return -EFAULT;
-		
+
+	err = audit_socketcall(nargs[call]/sizeof(unsigned long), args);
+	if (err)
+		return err;
+
 	a0=a[0];
 	a1=a[1];
 	
-- 
cgit v1.2.3


From 314324121f9b94b2ca657a494cf2b9cb0e4a28cc Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 23 May 2005 12:03:06 -0700
Subject: [TCP]: Fix stretch ACK performance killer when doing ucopy.

When we are doing ucopy, we try to defer the ACK generation to
cleanup_rbuf().  This works most of the time very well, but if the
ucopy prequeue is large, this ACKing behavior kills performance.

With TSO, it is possible to fill the prequeue so large that by the
time the ACK is sent and gets back to the sender, most of the window
has emptied of data and performance suffers significantly.

This behavior does help in some cases, so we should think about
re-enabling this trick in the future, using some kind of limit in
order to avoid the bug case.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 79835a67a27..5bad504630a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4355,16 +4355,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 					goto no_ack;
 			}
 
-			if (eaten) {
-				if (tcp_in_quickack_mode(tp)) {
-					tcp_send_ack(sk);
-				} else {
-					tcp_send_delayed_ack(sk);
-				}
-			} else {
-				__tcp_ack_snd_check(sk, 0);
-			}
-
+			__tcp_ack_snd_check(sk, 0);
 no_ack:
 			if (eaten)
 				__kfree_skb(skb);
-- 
cgit v1.2.3


From 180e42503300629692b513daeb55a6bb0b51500c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 23 May 2005 13:11:07 -0700
Subject: [IPV6]: Fix xfrm tunnel oops with large packets

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/xfrm6_output.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 601a148f60f..6b9867717d1 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -84,6 +84,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb)
 		mtu = IPV6_MIN_MTU;
 
 	if (skb->len > mtu) {
+		skb->dev = dst->dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
 		ret = -EMSGSIZE;
 	}
-- 
cgit v1.2.3


From 0afb51e72855971dba83b3c6b70c547c2d1161fd Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 26 May 2005 12:53:49 -0700
Subject: [PKT_SCHED]: netem: reinsert for duplication

Handle duplication of packets in netem by re-inserting at top of qdisc tree.
This avoid problems with qlen accounting with nested qdisc. This recursion
requires no additional locking but will potentially increase stack depth.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 53 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e0c9fbe73b1..5c0f0c209a4 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -203,42 +203,47 @@ static int netem_run(struct Qdisc *sch)
 	return 0;
 }
 
+/*
+ * Insert one skb into qdisc.
+ * Note: parent depends on return value to account for queue length.
+ * 	NET_XMIT_DROP: queue length didn't change.
+ *      NET_XMIT_SUCCESS: one skb was queued.
+ */
 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb2;
 	int ret;
+	int count = 1;
 
 	pr_debug("netem_enqueue skb=%p\n", skb);
 
+	/* Random duplication */
+	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
+		++count;
+
 	/* Random packet drop 0 => none, ~0 => all */
-	if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
-		pr_debug("netem_enqueue: random loss\n");
+	if (q->loss && q->loss >= get_crandom(&q->loss_cor))
+		--count;
+
+	if (count == 0) {
 		sch->qstats.drops++;
 		kfree_skb(skb);
-		return 0;	/* lie about loss so TCP doesn't know */
+		return NET_XMIT_DROP;
 	}
 
-	/* Random duplication */
-	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
-		struct sk_buff *skb2;
-
-		skb2 = skb_clone(skb, GFP_ATOMIC);
-		if (skb2 && netem_delay(sch, skb2) == NET_XMIT_SUCCESS) {
-			struct Qdisc *qp;
-
-			/* Since one packet can generate two packets in the
-			 * queue, the parent's qlen accounting gets confused,
-			 * so fix it.
-			 */
-			qp = qdisc_lookup(sch->dev, TC_H_MAJ(sch->parent));
-			if (qp)
-				qp->q.qlen++;
-
-			sch->q.qlen++;
-			sch->bstats.bytes += skb2->len;
-			sch->bstats.packets++;
-		} else
-			sch->qstats.drops++;
+	/*
+	 * If we need to duplicate packet, then re-insert at top of the
+	 * qdisc tree, since parent queuer expects that only one
+	 * skb will be queued.
+	 */
+	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
+		struct Qdisc *rootq = sch->dev->qdisc;
+		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
+		q->duplicate = 0;
+
+		rootq->enqueue(skb2, rootq);
+		q->duplicate = dupsave;
 	}
 
 	/* If doing simple delay then gap == 0 so all packets
-- 
cgit v1.2.3


From 0f9f32ac65ee4a452a912a8440cebbc4dff73852 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 26 May 2005 12:55:01 -0700
Subject: [PKT_SCHED] netem: use only inner qdisc -- no private skbuff queue

Netem works better if there if packets are just queued in the inner discipline
rather than having a separate delayed queue. Change to use the dequeue/requeue
to peek like TBF does.

By doing this potential qlen problems with the old method are avoided. The problems
happened when the netem_run that moved packets from the inner discipline to the nested
discipline failed (because inner queue was full). This happened in dequeue, so the
effective qlen of the netem would be decreased (because of the drop), but there was
no way to keep the outer qdisc (caller of netem dequeue) in sync.

The problem window is still there since this patch doesn't address the issue of
requeue failing in netem_dequeue, but that shouldn't happen since the sequence dequeue/requeue
should always work.  Long term correct fix is to implement qdisc->peek in all the qdisc's
to allow for this (needed by several other qdisc's as well).

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 124 +++++++++++++++-----------------------------------
 1 file changed, 36 insertions(+), 88 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 5c0f0c209a4..48360f7eec5 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -53,7 +53,6 @@
 
 struct netem_sched_data {
 	struct Qdisc	*qdisc;
-	struct sk_buff_head delayed;
 	struct timer_list timer;
 
 	u32 latency;
@@ -137,72 +136,6 @@ static long tabledist(unsigned long mu, long sigma,
 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 }
 
-/* Put skb in the private delayed queue. */
-static int netem_delay(struct Qdisc *sch, struct sk_buff *skb)
-{
-	struct netem_sched_data *q = qdisc_priv(sch);
-	psched_tdiff_t td;
-	psched_time_t now;
-	
-	PSCHED_GET_TIME(now);
-	td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
-	
-	/* Always queue at tail to keep packets in order */
-	if (likely(q->delayed.qlen < q->limit)) {
-		struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
-	
-		PSCHED_TADD2(now, td, cb->time_to_send);
-
-		pr_debug("netem_delay: skb=%p now=%llu tosend=%llu\n", skb, 
-			 now, cb->time_to_send);
-	
-		__skb_queue_tail(&q->delayed, skb);
-		return NET_XMIT_SUCCESS;
-	}
-
-	pr_debug("netem_delay: queue over limit %d\n", q->limit);
-	sch->qstats.overlimits++;
-	kfree_skb(skb);
-	return NET_XMIT_DROP;
-}
-
-/*
- *  Move a packet that is ready to send from the delay holding
- *  list to the underlying qdisc.
- */
-static int netem_run(struct Qdisc *sch)
-{
-	struct netem_sched_data *q = qdisc_priv(sch);
-	struct sk_buff *skb;
-	psched_time_t now;
-
-	PSCHED_GET_TIME(now);
-
-	skb = skb_peek(&q->delayed);
-	if (skb) {
-		const struct netem_skb_cb *cb
-			= (const struct netem_skb_cb *)skb->cb;
-		long delay 
-			= PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
-		pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
-
-		/* if more time remaining? */
-		if (delay > 0) {
-			mod_timer(&q->timer, jiffies + delay);
-			return 1;
-		}
-
-		__skb_unlink(skb, &q->delayed);
-		
-		if (q->qdisc->enqueue(skb, q->qdisc)) {
-			sch->q.qlen--;
-			sch->qstats.drops++;
-		} 
-	}
-
-	return 0;
-}
-
 /*
  * Insert one skb into qdisc.
  * Note: parent depends on return value to account for queue length.
@@ -212,6 +145,7 @@ static int netem_run(struct Qdisc *sch)
 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
+	struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
 	struct sk_buff *skb2;
 	int ret;
 	int count = 1;
@@ -246,18 +180,24 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		q->duplicate = dupsave;
 	}
 
-	/* If doing simple delay then gap == 0 so all packets
-	 * go into the delayed holding queue
-	 * otherwise if doing out of order only "1 out of gap"
-	 * packets will be delayed.
+	/* 
+	 * Do re-ordering by putting one out of N packets at the front
+	 * of the queue.
+	 * gap == 0 is special case for no-reordering.
 	 */
-	if (q->counter < q->gap) {
+	if (q->gap == 0 || q->counter != q->gap) {
+		psched_time_t now;
+		PSCHED_GET_TIME(now);
+		PSCHED_TADD2(now, 
+			     tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist),
+			     cb->time_to_send);
+		
 		++q->counter;
 		ret = q->qdisc->enqueue(skb, q->qdisc);
 	} else {
 		q->counter = 0;
-		ret = netem_delay(sch, skb);
-		netem_run(sch);
+		PSCHED_GET_TIME(cb->time_to_send);
+		ret = q->qdisc->ops->requeue(skb, q->qdisc);
 	}
 
 	if (likely(ret == NET_XMIT_SUCCESS)) {
@@ -301,22 +241,33 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
-	int pending;
-
-	pending = netem_run(sch);
 
 	skb = q->qdisc->dequeue(q->qdisc);
 	if (skb) {
-		pr_debug("netem_dequeue: return skb=%p\n", skb);
-		sch->q.qlen--;
-		sch->flags &= ~TCQ_F_THROTTLED;
-	}
-	else if (pending) {
-		pr_debug("netem_dequeue: throttling\n");
+		const struct netem_skb_cb *cb
+			= (const struct netem_skb_cb *)skb->cb;
+		psched_time_t now;
+		long delay;
+
+		/* if more time remaining? */
+		PSCHED_GET_TIME(now);
+		delay = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
+		pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
+		if (delay <= 0) {
+			pr_debug("netem_dequeue: return skb=%p\n", skb);
+			sch->q.qlen--;
+			sch->flags &= ~TCQ_F_THROTTLED;
+			return skb;
+		}
+
+		mod_timer(&q->timer, jiffies + delay);
 		sch->flags |= TCQ_F_THROTTLED;
-	} 
 
-	return skb;
+		if (q->qdisc->ops->requeue(skb, q->qdisc) != 0)
+			sch->qstats.drops++;
+	}
+
+	return NULL;
 }
 
 static void netem_watchdog(unsigned long arg)
@@ -333,8 +284,6 @@ static void netem_reset(struct Qdisc *sch)
 	struct netem_sched_data *q = qdisc_priv(sch);
 
 	qdisc_reset(q->qdisc);
-	skb_queue_purge(&q->delayed);
-
 	sch->q.qlen = 0;
 	sch->flags &= ~TCQ_F_THROTTLED;
 	del_timer_sync(&q->timer);
@@ -460,7 +409,6 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
 	if (!opt)
 		return -EINVAL;
 
-	skb_queue_head_init(&q->delayed);
 	init_timer(&q->timer);
 	q->timer.function = netem_watchdog;
 	q->timer.data = (unsigned long) sch;
-- 
cgit v1.2.3


From 0dca51d362b8e4af6b0dbc9e54d1e5165341918a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 26 May 2005 12:55:48 -0700
Subject: [PKT_SCHED] netem: allow random reordering (with fix)

Here is a fixed up version of the reorder feature of netem.
It is the same as the earlier patch plus with the bugfix from Julio merged in.
Has expected backwards compatibility behaviour.

Go ahead and merge this one, the TCP strangeness I was seeing was due
to the reordering bug, and previous version of TSO patch.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_netem.c | 54 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 42 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 48360f7eec5..bb9bf8d5003 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -62,11 +62,12 @@ struct netem_sched_data {
 	u32 gap;
 	u32 jitter;
 	u32 duplicate;
+	u32 reorder;
 
 	struct crndstate {
 		unsigned long last;
 		unsigned long rho;
-	} delay_cor, loss_cor, dup_cor;
+	} delay_cor, loss_cor, dup_cor, reorder_cor;
 
 	struct disttable {
 		u32  size;
@@ -180,23 +181,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		q->duplicate = dupsave;
 	}
 
-	/* 
-	 * Do re-ordering by putting one out of N packets at the front
-	 * of the queue.
-	 * gap == 0 is special case for no-reordering.
-	 */
-	if (q->gap == 0 || q->counter != q->gap) {
+	if (q->gap == 0 		/* not doing reordering */
+	    || q->counter < q->gap 	/* inside last reordering gap */
+	    || q->reorder < get_crandom(&q->reorder_cor)) {
 		psched_time_t now;
 		PSCHED_GET_TIME(now);
-		PSCHED_TADD2(now, 
-			     tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist),
+		PSCHED_TADD2(now, tabledist(q->latency, q->jitter, 
+					    &q->delay_cor, q->delay_dist),
 			     cb->time_to_send);
-		
 		++q->counter;
 		ret = q->qdisc->enqueue(skb, q->qdisc);
 	} else {
-		q->counter = 0;
+		/* 
+		 * Do re-ordering by putting one out of N packets at the front
+		 * of the queue.
+		 */
 		PSCHED_GET_TIME(cb->time_to_send);
+		q->counter = 0;
 		ret = q->qdisc->ops->requeue(skb, q->qdisc);
 	}
 
@@ -351,6 +352,19 @@ static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
 	return 0;
 }
 
+static int get_reorder(struct Qdisc *sch, const struct rtattr *attr)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	const struct tc_netem_reorder *r = RTA_DATA(attr);
+
+	if (RTA_PAYLOAD(attr) != sizeof(*r))
+		return -EINVAL;
+
+	q->reorder = r->probability;
+	init_crandom(&q->reorder_cor, r->correlation);
+	return 0;
+}
+
 static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
@@ -371,9 +385,15 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 	q->jitter = qopt->jitter;
 	q->limit = qopt->limit;
 	q->gap = qopt->gap;
+	q->counter = 0;
 	q->loss = qopt->loss;
 	q->duplicate = qopt->duplicate;
 
+	/* for compatiablity with earlier versions.
+	 * if gap is set, need to assume 100% probablity
+	 */
+	q->reorder = ~0;
+
 	/* Handle nested options after initial queue options.
 	 * Should have put all options in nested format but too late now.
 	 */ 
@@ -395,6 +415,11 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt)
 			if (ret)
 				return ret;
 		}
+		if (tb[TCA_NETEM_REORDER-1]) {
+			ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]);
+			if (ret)
+				return ret;
+		}
 	}
 
 
@@ -412,7 +437,6 @@ static int netem_init(struct Qdisc *sch, struct rtattr *opt)
 	init_timer(&q->timer);
 	q->timer.function = netem_watchdog;
 	q->timer.data = (unsigned long) sch;
-	q->counter = 0;
 
 	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
 	if (!q->qdisc) {
@@ -444,6 +468,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct rtattr *rta = (struct rtattr *) b;
 	struct tc_netem_qopt qopt;
 	struct tc_netem_corr cor;
+	struct tc_netem_reorder reorder;
 
 	qopt.latency = q->latency;
 	qopt.jitter = q->jitter;
@@ -457,6 +482,11 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 	cor.loss_corr = q->loss_cor.rho;
 	cor.dup_corr = q->dup_cor.rho;
 	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
+
+	reorder.probability = q->reorder;
+	reorder.correlation = q->reorder_cor.rho;
+	RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder);
+
 	rta->rta_len = skb->tail - b;
 
 	return skb->len;
-- 
cgit v1.2.3


From 92d63decc0b6a5d600f792fcf5f3ff9718c09a3d Mon Sep 17 00:00:00 2001
From: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Date: Thu, 26 May 2005 12:58:04 -0700
Subject: From: Kazunori Miyazawa <kazunori@miyazawa.org>

[XFRM] Call dst_check() with appropriate cookie

This fixes infinite loop issue with IPv6 tunnel mode.

Signed-off-by: Kazunori Miyazawa <kazunori@miyazawa.org>
Signed-off-by: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/xfrm6_policy.c | 4 ++++
 net/xfrm/xfrm_policy.c  | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 4429b1a1fe5..cf1d91e74c8 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -113,6 +113,8 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 
 		xdst = (struct xfrm_dst *)dst1;
 		xdst->route = &rt->u.dst;
+		if (rt->rt6i_node)
+			xdst->route_cookie = rt->rt6i_node->fn_sernum;
 
 		dst1->next = dst_prev;
 		dst_prev = dst1;
@@ -137,6 +139,8 @@ __xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int
 
 	dst_prev->child = &rt->u.dst;
 	dst->path = &rt->u.dst;
+	if (rt->rt6i_node)
+		((struct xfrm_dst *)dst)->path_cookie = rt->rt6i_node->fn_sernum;
 
 	*dst_p = dst;
 	dst = dst_prev;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 55ed979db14..d07f5ce3182 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1136,7 +1136,7 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family)
 	struct xfrm_dst *last;
 	u32 mtu;
 
-	if (!dst_check(dst->path, 0) ||
+	if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) ||
 	    (dst->dev && !netif_running(dst->dev)))
 		return 0;
 
@@ -1156,7 +1156,7 @@ int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family)
 			xdst->child_mtu_cached = mtu;
 		}
 
-		if (!dst_check(xdst->route, 0))
+		if (!dst_check(xdst->route, xdst->route_cookie))
 			return 0;
 		mtu = dst_mtu(xdst->route);
 		if (xdst->route_mtu_cached != mtu) {
-- 
cgit v1.2.3


From c6b3365391c626206f6789354794a81a010cb7a1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 26 May 2005 12:59:05 -0700
Subject: [TOKENRING]: be'ify trh_hdr, trllc, rif_cache_s

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/802/tr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/802/tr.c b/net/802/tr.c
index 85293ccf7ef..3bfb86260f2 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -50,8 +50,8 @@ static void rif_check_expire(unsigned long dummy);
 struct rif_cache_s {	
 	unsigned char addr[TR_ALEN];
 	int iface;
-	__u16 rcf;
-	__u16 rseg[8];
+	__be16 rcf;
+	__be16 rseg[8];
 	struct rif_cache_s *next;
 	unsigned long last_used;
 	unsigned char local_ring;
-- 
cgit v1.2.3


From c8b35d2a29ec3c93e3b9c1e70d649a77a214b1c1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 26 May 2005 12:59:42 -0700
Subject: [TOKENRING]: net/802/tr.c: s/struct rif_cache_s/struct rif_cache/

"_s" suffix is certainly of hungarian origin.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/802/tr.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/802/tr.c b/net/802/tr.c
index 3bfb86260f2..a755e880f4b 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -47,12 +47,12 @@ static void rif_check_expire(unsigned long dummy);
  *	Each RIF entry we learn is kept this way
  */
  
-struct rif_cache_s {	
+struct rif_cache {
 	unsigned char addr[TR_ALEN];
 	int iface;
 	__be16 rcf;
 	__be16 rseg[8];
-	struct rif_cache_s *next;
+	struct rif_cache *next;
 	unsigned long last_used;
 	unsigned char local_ring;
 };
@@ -64,7 +64,7 @@ struct rif_cache_s {
  *	up a lot.
  */
  
-static struct rif_cache_s *rif_table[RIF_TABLE_SIZE];
+static struct rif_cache *rif_table[RIF_TABLE_SIZE];
 
 static DEFINE_SPINLOCK(rif_lock);
 
@@ -249,7 +249,7 @@ void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,struct net_device *
 {
 	int slack;
 	unsigned int hash;
-	struct rif_cache_s *entry;
+	struct rif_cache *entry;
 	unsigned char *olddata;
 	static const unsigned char mcast_func_addr[] 
 		= {0xC0,0x00,0x00,0x04,0x00,0x00};
@@ -337,7 +337,7 @@ printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0],
 static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev)
 {
 	unsigned int hash, rii_p = 0;
-	struct rif_cache_s *entry;
+	struct rif_cache *entry;
 
 
 	spin_lock_bh(&rif_lock);
@@ -373,7 +373,7 @@ printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n",
 		 *	FIXME: We ought to keep some kind of cache size
 		 *	limiting and adjust the timers to suit.
 		 */
-		entry=kmalloc(sizeof(struct rif_cache_s),GFP_ATOMIC);
+		entry=kmalloc(sizeof(struct rif_cache),GFP_ATOMIC);
 
 		if(!entry) 
 		{
@@ -435,7 +435,7 @@ static void rif_check_expire(unsigned long dummy)
 	spin_lock_bh(&rif_lock);
 	
 	for(i =0; i < RIF_TABLE_SIZE; i++) {
-		struct rif_cache_s *entry, **pentry;
+		struct rif_cache *entry, **pentry;
 		
 		pentry = rif_table+i;
 		while((entry=*pentry) != NULL) {
@@ -467,10 +467,10 @@ static void rif_check_expire(unsigned long dummy)
  
 #ifdef CONFIG_PROC_FS
 
-static struct rif_cache_s *rif_get_idx(loff_t pos)
+static struct rif_cache *rif_get_idx(loff_t pos)
 {
 	int i;
-	struct rif_cache_s *entry;
+	struct rif_cache *entry;
 	loff_t off = 0;
 
 	for(i = 0; i < RIF_TABLE_SIZE; i++) 
@@ -493,7 +493,7 @@ static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
 static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	int i;
-	struct rif_cache_s *ent = v;
+	struct rif_cache *ent = v;
 
 	++*pos;
 
@@ -522,7 +522,7 @@ static void rif_seq_stop(struct seq_file *seq, void *v)
 static int rif_seq_show(struct seq_file *seq, void *v)
 {
 	int j, rcf_len, segment, brdgnmb;
-	struct rif_cache_s *entry = v;
+	struct rif_cache *entry = v;
 
 	if (v == SEQ_START_TOKEN)
 		seq_puts(seq,
-- 
cgit v1.2.3


From d8a33ac435c43a1a404b2ec560ef1d1536710c36 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 29 May 2005 14:13:47 -0700
Subject: [BRIDGE]: features change notification

Resend of earlier patch (no changes) from Catalin used to provide
device feature change notification.

Signed-off-by: Catalin BOIE <catab at umbrella.ro>
Acked-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c     | 12 ++++++++++++
 net/core/ethtool.c |  8 +++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index d4d9e2680ad..f15a3ffff63 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -760,6 +760,18 @@ int dev_change_name(struct net_device *dev, char *newname)
 	return err;
 }
 
+/**
+ *	netdev_features_change - device changes fatures
+ *	@dev: device to cause notification
+ *
+ *	Called to indicate a device has changed features.
+ */
+void netdev_features_change(struct net_device *dev)
+{
+	notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
+}
+EXPORT_SYMBOL(netdev_features_change);
+
 /**
  *	netdev_state_change - device changes state
  *	@dev: device to cause notification
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f05fde97c43..252bfc6f03f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -682,6 +682,7 @@ int dev_ethtool(struct ifreq *ifr)
 	void __user *useraddr = ifr->ifr_data;
 	u32 ethcmd;
 	int rc;
+	int old_features;
 
 	/*
 	 * XXX: This can be pushed down into the ethtool_* handlers that
@@ -703,6 +704,8 @@ int dev_ethtool(struct ifreq *ifr)
 		if ((rc = dev->ethtool_ops->begin(dev)) < 0)
 			return rc;
 
+	old_features = dev->features;
+
 	switch (ethcmd) {
 	case ETHTOOL_GSET:
 		rc = ethtool_get_settings(dev, useraddr);
@@ -712,7 +715,6 @@ int dev_ethtool(struct ifreq *ifr)
 		break;
 	case ETHTOOL_GDRVINFO:
 		rc = ethtool_get_drvinfo(dev, useraddr);
-
 		break;
 	case ETHTOOL_GREGS:
 		rc = ethtool_get_regs(dev, useraddr);
@@ -801,6 +803,10 @@ int dev_ethtool(struct ifreq *ifr)
 	
 	if(dev->ethtool_ops->complete)
 		dev->ethtool_ops->complete(dev);
+
+	if (old_features != dev->features)
+		netdev_features_change(dev);
+
 	return rc;
 
  ioctl:
-- 
cgit v1.2.3


From 81e8157583c559c27aac75c708d40a35f563d734 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 29 May 2005 14:14:35 -0700
Subject: [BRIDGE]: make dev->features unsigned

The features field in netdevice is really a bitmask, and bitmask's should
be unsigned.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 252bfc6f03f..2a56a521836 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -682,7 +682,7 @@ int dev_ethtool(struct ifreq *ifr)
 	void __user *useraddr = ifr->ifr_data;
 	u32 ethcmd;
 	int rc;
-	int old_features;
+	unsigned long old_features;
 
 	/*
 	 * XXX: This can be pushed down into the ethtool_* handlers that
-- 
cgit v1.2.3


From 81d35307dd468b92fe8c58797abb13c62e3e64dd Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 29 May 2005 14:15:17 -0700
Subject: [BRIDGE]: set features based on enslaved devices

Make features of the bridge pseudo-device be a subset of the underlying
devices.  Motivated by Xen and others who use bridging to do failover.

Signed-off-by: Catalin BOIE <catab at umrella.ro>
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_device.c  | 15 +++++++--------
 net/bridge/br_if.c      | 23 +++++++++++++++++++++++
 net/bridge/br_notify.c  |  9 +++++++++
 net/bridge/br_private.h |  1 +
 4 files changed, 40 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index d9b72fde433..f564ee99782 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -21,10 +21,7 @@
 
 static struct net_device_stats *br_dev_get_stats(struct net_device *dev)
 {
-	struct net_bridge *br;
-
-	br = dev->priv;
-
+	struct net_bridge *br = netdev_priv(dev);
 	return &br->statistics;
 }
 
@@ -54,9 +51,11 @@ int br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static int br_dev_open(struct net_device *dev)
 {
-	netif_start_queue(dev);
+	struct net_bridge *br = netdev_priv(dev);
 
-	br_stp_enable_bridge(dev->priv);
+	br_features_recompute(br);
+	netif_start_queue(dev);
+	br_stp_enable_bridge(br);
 
 	return 0;
 }
@@ -67,7 +66,7 @@ static void br_dev_set_multicast_list(struct net_device *dev)
 
 static int br_dev_stop(struct net_device *dev)
 {
-	br_stp_disable_bridge(dev->priv);
+	br_stp_disable_bridge(netdev_priv(dev));
 
 	netif_stop_queue(dev);
 
@@ -76,7 +75,7 @@ static int br_dev_stop(struct net_device *dev)
 
 static int br_change_mtu(struct net_device *dev, int new_mtu)
 {
-	if ((new_mtu < 68) || new_mtu > br_min_mtu(dev->priv))
+	if (new_mtu < 68 || new_mtu > br_min_mtu(netdev_priv(dev)))
 		return -EINVAL;
 
 	dev->mtu = new_mtu;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 69872bf3b87..91bb895375f 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -314,6 +314,28 @@ int br_min_mtu(const struct net_bridge *br)
 	return mtu;
 }
 
+/*
+ * Recomputes features using slave's features
+ */
+void br_features_recompute(struct net_bridge *br)
+{
+	struct net_bridge_port *p;
+	unsigned long features, checksum;
+
+	features = NETIF_F_SG | NETIF_F_FRAGLIST 
+		| NETIF_F_HIGHDMA | NETIF_F_TSO;
+	checksum = NETIF_F_IP_CSUM;	/* least commmon subset */
+
+	list_for_each_entry(p, &br->port_list, list) {
+		if (!(p->dev->features 
+		      & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)))
+			checksum = 0;
+		features &= p->dev->features;
+	}
+
+	br->dev->features = features | checksum | NETIF_F_LLTX;
+}
+
 /* called with RTNL */
 int br_add_if(struct net_bridge *br, struct net_device *dev)
 {
@@ -368,6 +390,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
 
 	spin_lock_bh(&br->lock);
 	br_stp_recalculate_bridge_id(br);
+	br_features_recompute(br);
 	spin_unlock_bh(&br->lock);
 
 	return 0;
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c
index f8fb49e3476..917311c6828 100644
--- a/net/bridge/br_notify.c
+++ b/net/bridge/br_notify.c
@@ -65,6 +65,15 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 		}
 		break;
 
+	case NETDEV_FEAT_CHANGE:
+		if (br->dev->flags & IFF_UP) 
+			br_features_recompute(br);
+
+		/* could do recursive feature change notification
+		 * but who would care?? 
+		 */
+		break;
+
 	case NETDEV_DOWN:
 		if (br->dev->flags & IFF_UP)
 			br_stp_disable_port(p);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 54d63f1372a..bdf95a74d8c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -174,6 +174,7 @@ extern int br_add_if(struct net_bridge *br,
 extern int br_del_if(struct net_bridge *br,
 	      struct net_device *dev);
 extern int br_min_mtu(const struct net_bridge *br);
+extern void br_features_recompute(struct net_bridge *br);
 
 /* br_input.c */
 extern int br_handle_frame_finish(struct sk_buff *skb);
-- 
cgit v1.2.3


From 85967bb46dd1f8f2c49b85a313866c00ac0c9b59 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 29 May 2005 14:15:55 -0700
Subject: [BRIDGE]: prevent bad forwarding table updates

Avoid poisoning of the bridge forwarding table by frames that have been
dropped by filtering. This prevents spoofed source addresses on hostile
side of bridge from causing packet leakage, a small but possible security
risk.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_input.c    | 6 ++++--
 net/bridge/br_stp_bpdu.c | 3 +++
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 2b1cce46cab..2aa5dda24a0 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -54,6 +54,9 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	struct net_bridge_fdb_entry *dst;
 	int passedup = 0;
 
+	/* insert into forwarding database after filtering to avoid spoofing */
+	br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
+
 	if (br->dev->flags & IFF_PROMISC) {
 		struct sk_buff *skb2;
 
@@ -108,8 +111,7 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb)
 	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
 		goto err;
 
-	if (p->state == BR_STATE_LEARNING ||
-	    p->state == BR_STATE_FORWARDING)
+	if (p->state == BR_STATE_LEARNING)
 		br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
 
 	if (p->br->stp_enabled &&
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index b91a875aca0..d071f1c9ad0 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -140,6 +140,9 @@ int br_stp_handle_bpdu(struct sk_buff *skb)
 	struct net_bridge *br = p->br;
 	unsigned char *buf;
 
+	/* insert into forwarding database after filtering to avoid spoofing */
+	br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
+
 	/* need at least the 802 and STP headers */
 	if (!pskb_may_pull(skb, sizeof(header)+1) ||
 	    memcmp(skb->data, header, sizeof(header)))
-- 
cgit v1.2.3


From 7ce54e3f428b33af714271140601c87b8bf2c544 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Sun, 29 May 2005 14:16:48 -0700
Subject: [BRIDGE]: receive path optimization

This improves the bridge local receive path by avoiding going
through another softirq.  The bridge receive path is already being called
from a netif_receive_skb() there is no point in going through another
receiveq round trip.

Recursion is limited because bridge can never be a port of a bridge
so handle_bridge() always returns.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 2aa5dda24a0..8f5f2e73099 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -26,7 +26,7 @@ static int br_pass_frame_up_finish(struct sk_buff *skb)
 #ifdef CONFIG_NETFILTER_DEBUG
 	skb->nf_debug = 0;
 #endif
-	netif_rx(skb);
+	netif_receive_skb(skb);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 8f937c6099858eee15fae14009dcbd05177fa91d Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@gnumonks.org>
Date: Sun, 29 May 2005 20:23:46 -0700
Subject: [IPV4]: Primary and secondary addresses

Add an option to make secondary IP addresses get promoted
when primary IP addresses are removed from the device.
It defaults to off to preserve existing behavior.

Signed-off-by: Harald Welte <laforge@gnumonks.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 3cc96730c4e..478a30179a5 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -233,11 +233,14 @@ int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
 static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			 int destroy)
 {
+	struct in_ifaddr *promote = NULL;
 	struct in_ifaddr *ifa1 = *ifap;
 
 	ASSERT_RTNL();
 
-	/* 1. Deleting primary ifaddr forces deletion all secondaries */
+	/* 1. Deleting primary ifaddr forces deletion all secondaries 
+	 * unless alias promotion is set
+	 **/
 
 	if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
 		struct in_ifaddr *ifa;
@@ -251,11 +254,16 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 				continue;
 			}
 
-			*ifap1 = ifa->ifa_next;
+			if (!IN_DEV_PROMOTE_SECONDARIES(in_dev)) {
+				*ifap1 = ifa->ifa_next;
 
-			rtmsg_ifa(RTM_DELADDR, ifa);
-			notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
-			inet_free_ifa(ifa);
+				rtmsg_ifa(RTM_DELADDR, ifa);
+				notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
+				inet_free_ifa(ifa);
+			} else {
+				promote = ifa;
+				break;
+			}
 		}
 	}
 
@@ -281,6 +289,13 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 		if (!in_dev->ifa_list)
 			inetdev_destroy(in_dev);
 	}
+
+	if (promote && IN_DEV_PROMOTE_SECONDARIES(in_dev)) {
+		/* not sure if we should send a delete notify first? */
+		promote->ifa_flags &= ~IFA_F_SECONDARY;
+		rtmsg_ifa(RTM_NEWADDR, promote);
+		notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote);
+	}
 }
 
 static int inet_insert_ifa(struct in_ifaddr *ifa)
@@ -1384,6 +1399,15 @@ static struct devinet_sysctl_table {
 			.proc_handler	= &ipv4_doint_and_flush,
 			.strategy	= &ipv4_doint_and_flush_strategy,
 		},
+		{
+			.ctl_name	= NET_IPV4_CONF_PROMOTE_SECONDARIES,
+			.procname	= "promote_secondaries",
+			.data		= &ipv4_devconf.promote_secondaries,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &ipv4_doint_and_flush,
+			.strategy	= &ipv4_doint_and_flush_strategy,
+		},
 	},
 	.devinet_dev = {
 		{
-- 
cgit v1.2.3


From 37e20a66db02eff9adbeee043af990cca85d0034 Mon Sep 17 00:00:00 2001
From: "Pravin B. Shelar" <pravins@calsoftinc.com>
Date: Sun, 29 May 2005 20:26:44 -0700
Subject: [IPV4]: Kill MULTIPATHHOLDROUTE flag.

It cannot work properly, so just ignore it in drr
and rr multipath algorithms just like the random
multipath algorithm does.

Suggested by Herbert Xu.

Signed-off by: Pravin B. Shelar <pravins@calsoftinc.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/multipath_drr.c | 18 +-----------------
 net/ipv4/multipath_rr.c  | 20 --------------------
 2 files changed, 1 insertion(+), 37 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index 9349686131f..cf2e6bcf797 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -57,7 +57,6 @@ struct multipath_device {
 
 static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES];
 static DEFINE_SPINLOCK(state_lock);
-static struct rtable *last_selection = NULL;
 
 static int inline __multipath_findslot(void)
 {
@@ -111,11 +110,6 @@ struct notifier_block drr_dev_notifier = {
 	.notifier_call	= drr_dev_event,
 };
 
-static void drr_remove(struct rtable *rt)
-{
-	if (last_selection == rt)
-		last_selection = NULL;
-}
 
 static void drr_safe_inc(atomic_t *usecount)
 {
@@ -144,14 +138,6 @@ static void drr_select_route(const struct flowi *flp,
 	int devidx = -1;
 	int cur_min_devidx = -1;
 
-       	/* if necessary and possible utilize the old alternative */
-	if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
-	    last_selection != NULL) {
-		result = last_selection;
-		*rp = result;
-		return;
-	}
-
 	/* 1. make sure all alt. nexthops have the same GC related data */
 	/* 2. determine the new candidate to be returned */
 	result = NULL;
@@ -229,12 +215,10 @@ static void drr_select_route(const struct flowi *flp,
 	}
 
 	*rp = result;
-	last_selection = result;
 }
 
 static struct ip_mp_alg_ops drr_ops = {
 	.mp_alg_select_route	=	drr_select_route,
-	.mp_alg_remove		=	drr_remove,
 };
 
 static int __init drr_init(void)
@@ -244,7 +228,7 @@ static int __init drr_init(void)
 	if (err)
 		return err;
 
-	err = multipath_alg_register(&drr_ops, IP_MP_ALG_RR);
+	err = multipath_alg_register(&drr_ops, IP_MP_ALG_DRR);
 	if (err)
 		goto fail;
 
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
index 554a8256816..061b6b25398 100644
--- a/net/ipv4/multipath_rr.c
+++ b/net/ipv4/multipath_rr.c
@@ -47,29 +47,12 @@
 #include <net/checksum.h>
 #include <net/ip_mp_alg.h>
 
-#define MULTIPATH_MAX_CANDIDATES 40
-
-static struct rtable* last_used = NULL;
-
-static void rr_remove(struct rtable *rt)
-{
-	if (last_used == rt)
-		last_used = NULL;
-}
-
 static void rr_select_route(const struct flowi *flp,
 			    struct rtable *first, struct rtable **rp)
 {
 	struct rtable *nh, *result, *min_use_cand = NULL;
 	int min_use = -1;
 
-	/* if necessary and possible utilize the old alternative */
-	if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
-	    last_used != NULL) {
-		result = last_used;
-		goto out;
-	}
-
 	/* 1. make sure all alt. nexthops have the same GC related data
 	 * 2. determine the new candidate to be returned
 	 */
@@ -90,15 +73,12 @@ static void rr_select_route(const struct flowi *flp,
 	if (!result)
 		result = first;
 
-out:
-	last_used = result;
 	result->u.dst.__use++;
 	*rp = result;
 }
 
 static struct ip_mp_alg_ops rr_ops = {
 	.mp_alg_select_route	=	rr_select_route,
-	.mp_alg_remove		=	rr_remove,
 };
 
 static int __init rr_init(void)
-- 
cgit v1.2.3


From 69f6a0fafcdf0bfe85af182695d6d38f80f9d549 Mon Sep 17 00:00:00 2001
From: Jon Mason <jdmason@us.ibm.com>
Date: Sun, 29 May 2005 20:27:24 -0700
Subject: [NET]: Add ethtool support for NETIF_F_HW_CSUM.

Signed-off-by: Jon Mason <jdmason@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 2a56a521836..8ec484894d6 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -29,7 +29,7 @@ u32 ethtool_op_get_link(struct net_device *dev)
 
 u32 ethtool_op_get_tx_csum(struct net_device *dev)
 {
-	return (dev->features & NETIF_F_IP_CSUM) != 0;
+	return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0;
 }
 
 int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
@@ -42,6 +42,15 @@ int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
 	return 0;
 }
 
+int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
+{
+	if (data)
+		dev->features |= NETIF_F_HW_CSUM;
+	else
+		dev->features &= ~NETIF_F_HW_CSUM;
+
+	return 0;
+}
 u32 ethtool_op_get_sg(struct net_device *dev)
 {
 	return (dev->features & NETIF_F_SG) != 0;
@@ -823,3 +832,4 @@ EXPORT_SYMBOL(ethtool_op_get_tx_csum);
 EXPORT_SYMBOL(ethtool_op_set_sg);
 EXPORT_SYMBOL(ethtool_op_set_tso);
 EXPORT_SYMBOL(ethtool_op_set_tx_csum);
+EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
-- 
cgit v1.2.3


From 6c94d3611be61e4cff33b311f1a626d93d1d3e92 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 29 May 2005 20:28:01 -0700
Subject: [IPV6]: Clear up user copy warning in flowlabel code.

We are intentionally ignoring the copy_to_user() value,
make it clear to the compiler too.

Noted by Jeff Garzik.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_flowlabel.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index a93f6dc5197..0e5f7499deb 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -535,10 +535,12 @@ release:
 		if (err)
 			goto done;
 
-		/* Do not check for fault */
-		if (!freq.flr_label)
-			copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label,
-				     &fl->label, sizeof(fl->label));
+		if (!freq.flr_label) {
+			if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label,
+					 &fl->label, sizeof(fl->label))) {
+				/* Intentionally ignore fault. */
+			}
+		}
 
 		sfl1->fl = fl;
 		sfl1->next = np->ipv6_fl_list;
-- 
cgit v1.2.3


From d1102b59ca7b3a3c58912330a4ae38f549c8d569 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 29 May 2005 20:28:25 -0700
Subject: [NET]: Use %lx for netdev->features sysfs formatting.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 060f703659e..910eb4c05a4 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -21,6 +21,7 @@
 #define to_net_dev(class) container_of(class, struct net_device, class_dev)
 
 static const char fmt_hex[] = "%#x\n";
+static const char fmt_long_hex[] = "%#lx\n";
 static const char fmt_dec[] = "%d\n";
 static const char fmt_ulong[] = "%lu\n";
 
@@ -91,7 +92,7 @@ static CLASS_DEVICE_ATTR(field, S_IRUGO, show_##field, NULL)		\
 NETDEVICE_ATTR(addr_len, fmt_dec);
 NETDEVICE_ATTR(iflink, fmt_dec);
 NETDEVICE_ATTR(ifindex, fmt_dec);
-NETDEVICE_ATTR(features, fmt_hex);
+NETDEVICE_ATTR(features, fmt_long_hex);
 NETDEVICE_ATTR(type, fmt_dec);
 
 /* use same locking rules as GIFHWADDR ioctl's */
-- 
cgit v1.2.3


From 9bb7bc942d3da606f184ac6a4dfc7e4d470c831b Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 30 May 2005 15:35:26 -0700
Subject: [NETFILTER]: Fix deadlock with ip_queue and tcp local input path.

When we have ip_queue being used from LOCAL_IN, then we end up with a
situation where the verdicts coming back from userspace traverse the TCP
input path from syscall context.  While this seems to work most of the
time, there's an ugly deadlock:

syscall context is interrupted by the timer interrupt.  When the timer
interrupt leaves, the timer softirq get's scheduled and calls
tcp_delack_timer() and alike.  They themselves do bh_lock_sock(sk),
which is already held from somewhere else -> boom.

I've now tested the suggested solution by Patrick McHardy and Herbert Xu to
simply use local_bh_{en,dis}able().

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_queue.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index e5746b67441..eda1fba431a 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -3,6 +3,7 @@
  * communicating with userspace via netlink.
  *
  * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -17,6 +18,7 @@
  * 2005-01-10: Added /proc counter for dropped packets; fixed so
  *             packets aren't delivered to user space if they're going 
  *             to be dropped. 
+ * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte)
  *
  */
 #include <linux/module.h>
@@ -71,7 +73,15 @@ static DECLARE_MUTEX(ipqnl_sem);
 static void
 ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
 {
+	/* TCP input path (and probably other bits) assume to be called
+	 * from softirq context, not from syscall, like ipq_issue_verdict is
+	 * called.  TCP input path deadlocks with locks taken from timer
+	 * softirq, e.g.  We therefore emulate this by local_bh_disable() */
+
+	local_bh_disable();
 	nf_reinject(entry->skb, entry->info, verdict);
+	local_bh_enable();
+
 	kfree(entry);
 }
 
-- 
cgit v1.2.3


From 208d89843b7b03978d8e748b8b991c1be81c4f43 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 30 May 2005 15:50:15 -0700
Subject: [IPV4]: Fix BUG() in 2.6.x, udp_poll(), fragments + CONFIG_HIGHMEM

Steven Hand <Steven.Hand@cl.cam.ac.uk> wrote:
>
> Reconstructed forward trace:
>
>   net/ipv4/udp.c:1334   spin_lock_irq()
>   net/ipv4/udp.c:1336   udp_checksum_complete()
> net/core/skbuff.c:1069   skb_shinfo(skb)->nr_frags > 1
> net/core/skbuff.c:1086   kunmap_skb_frag()
> net/core/skbuff.h:1087   local_bh_enable()
> kernel/softirq.c:0140   WARN_ON(irqs_disabled());

The receive queue lock is never taken in IRQs (and should never be) so
we can simply substitute bh for irq.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4a6952e3fee..7c24e64b443 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -738,7 +738,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 			unsigned long amount;
 
 			amount = 0;
-			spin_lock_irq(&sk->sk_receive_queue.lock);
+			spin_lock_bh(&sk->sk_receive_queue.lock);
 			skb = skb_peek(&sk->sk_receive_queue);
 			if (skb != NULL) {
 				/*
@@ -748,7 +748,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 				 */
 				amount = skb->len - sizeof(struct udphdr);
 			}
-			spin_unlock_irq(&sk->sk_receive_queue.lock);
+			spin_unlock_bh(&sk->sk_receive_queue.lock);
 			return put_user(amount, (int __user *)arg);
 		}
 
@@ -848,12 +848,12 @@ csum_copy_err:
 	/* Clear queue. */
 	if (flags&MSG_PEEK) {
 		int clear = 0;
-		spin_lock_irq(&sk->sk_receive_queue.lock);
+		spin_lock_bh(&sk->sk_receive_queue.lock);
 		if (skb == skb_peek(&sk->sk_receive_queue)) {
 			__skb_unlink(skb, &sk->sk_receive_queue);
 			clear = 1;
 		}
-		spin_unlock_irq(&sk->sk_receive_queue.lock);
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
 		if (clear)
 			kfree_skb(skb);
 	}
@@ -1334,7 +1334,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		struct sk_buff_head *rcvq = &sk->sk_receive_queue;
 		struct sk_buff *skb;
 
-		spin_lock_irq(&rcvq->lock);
+		spin_lock_bh(&rcvq->lock);
 		while ((skb = skb_peek(rcvq)) != NULL) {
 			if (udp_checksum_complete(skb)) {
 				UDP_INC_STATS_BH(UDP_MIB_INERRORS);
@@ -1345,7 +1345,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 				break;
 			}
 		}
-		spin_unlock_irq(&rcvq->lock);
+		spin_unlock_bh(&rcvq->lock);
 
 		/* nothing to see, move along */
 		if (skb == NULL)
-- 
cgit v1.2.3


From 0451eb074eef30240c6c06dacf2911bee26831e1 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 31 May 2005 15:15:58 -0700
Subject: [PKT_SCHED]: Fix dsmark to count ignored indices while walking

Unused indices which are ignored while walking must still
be counted to avoid dumping the same index twice.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_dsmark.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 8a3db9d95ba..acbe9d2b3e1 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -163,14 +163,15 @@ static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker)
 		return;
 	for (i = 0; i < p->indices; i++) {
 		if (p->mask[i] == 0xff && !p->value[i])
-			continue;
+			goto ignore;
 		if (walker->count >= walker->skip) {
 			if (walker->fn(sch, i+1, walker) < 0) {
 				walker->stop = 1;
 				break;
 			}
 		}
-                walker->count++;
+ignore:		
+		walker->count++;
         }
 }
 
-- 
cgit v1.2.3


From 486b53e59ca8cd07d91ad88375c1c884b15cc9bd Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 31 May 2005 15:16:52 -0700
Subject: [PKT_SCHED]: make dsmark try using pfifo instead of noop while
 grafting

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_dsmark.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index acbe9d2b3e1..a8c948f6297 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -73,8 +73,13 @@ static int dsmark_graft(struct Qdisc *sch,unsigned long arg,
 
 	DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new,
 	    old);
-	if (!new)
-		new = &noop_qdisc;
+
+	if (new == NULL) {
+		new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+		if (new == NULL)
+			new = &noop_qdisc;
+	}
+
 	sch_tree_lock(sch);
 	*old = xchg(&p->q,new);
 	if (*old)
-- 
cgit v1.2.3


From 08e9cd1fc559c00bc05df3fc551efe3b87c57ee3 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 31 May 2005 15:17:28 -0700
Subject: [PKT_SCHED]: Disable dsmark debugging messages by default

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_dsmark.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index a8c948f6297..d8bd2a569c7 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -18,7 +18,7 @@
 #include <asm/byteorder.h>
 
 
-#if 1 /* control */
+#if 0 /* control */
 #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args)
 #else
 #define DPRINTK(format,args...)
-- 
cgit v1.2.3


From 36839836e8132731e0cadddce452423036a1d5b3 Mon Sep 17 00:00:00 2001
From: Edgar E Iglesias <edgar@axis.com>
Date: Tue, 31 May 2005 17:08:05 -0700
Subject: [IPSEC]: Fix esp_decap_data size verification in esp4.

Signed-off-by: Edgar E Iglesias <edgar@axis.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/esp4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 053a883247b..eae84cc39d3 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -478,7 +478,7 @@ static int __init esp4_init(void)
 {
 	struct xfrm_decap_state decap;
 
-	if (sizeof(struct esp_decap_data)  <
+	if (sizeof(struct esp_decap_data)  >
 	    sizeof(decap.decap_data)) {
 		extern void decap_data_too_small(void);
 
-- 
cgit v1.2.3


From 4bcff1b37e7c3aed914d1ce5b45994adc7dbf455 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@shinybook.infradead.org>
Date: Thu, 2 Jun 2005 12:13:21 +0100
Subject: AUDIT: Fix user pointer deref thinko in sys_socketcall().

I cunningly put the audit call immediately after the
copy_from_user().... but used the _userspace_ copy of the args still.
Let's not do that.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 net/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index 6b7c3b51a7c..38729af0946 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1908,7 +1908,7 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 	if (copy_from_user(a, args, nargs[call]))
 		return -EFAULT;
 
-	err = audit_socketcall(nargs[call]/sizeof(unsigned long), args);
+	err = audit_socketcall(nargs[call]/sizeof(unsigned long), a);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 64a6c7aa3836e357499d2e822388f30c11f13604 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 2 Jun 2005 13:02:25 -0700
Subject: [IPVS]: remove net/ipv4/ipvs/ip_vs_proto_icmp.c

ip_vs_proto_icmp.c was never finished.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipvs/Makefile           |   2 +-
 net/ipv4/ipvs/ip_vs_proto.c      |   3 -
 net/ipv4/ipvs/ip_vs_proto_icmp.c | 182 ---------------------------------------
 3 files changed, 1 insertion(+), 186 deletions(-)
 delete mode 100644 net/ipv4/ipvs/ip_vs_proto_icmp.c

(limited to 'net')

diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
index a788461a40c..30e85de9fff 100644
--- a/net/ipv4/ipvs/Makefile
+++ b/net/ipv4/ipvs/Makefile
@@ -11,7 +11,7 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
 
 ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
 		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
-		ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o		   \
+		ip_vs_est.o ip_vs_proto.o 				   \
 		$(ip_vs_proto-objs-y)
 
 
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
index 253c46252bd..867d4e9c659 100644
--- a/net/ipv4/ipvs/ip_vs_proto.c
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -216,9 +216,6 @@ int ip_vs_protocol_init(void)
 #ifdef CONFIG_IP_VS_PROTO_UDP
 	REGISTER_PROTOCOL(&ip_vs_protocol_udp);
 #endif
-#ifdef CONFIG_IP_VS_PROTO_ICMP
-	REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
-#endif
 #ifdef CONFIG_IP_VS_PROTO_AH
 	REGISTER_PROTOCOL(&ip_vs_protocol_ah);
 #endif
diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c
deleted file mode 100644
index 191e94aa1c1..00000000000
--- a/net/ipv4/ipvs/ip_vs_proto_icmp.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * ip_vs_proto_icmp.c:	ICMP load balancing support for IP Virtual Server
- *
- * Authors:	Julian Anastasov <ja@ssi.bg>, March 2002
- *
- *		This program is free software; you can redistribute it and/or
- *		modify it under the terms of the GNU General Public License
- *		version 2 as published by the Free Software Foundation;
- *
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/icmp.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-
-#include <net/ip_vs.h>
-
-
-static int icmp_timeouts[1] =		{ 1*60*HZ };
-
-static char * icmp_state_name_table[1] = { "ICMP" };
-
-static struct ip_vs_conn *
-icmp_conn_in_get(const struct sk_buff *skb,
-		 struct ip_vs_protocol *pp,
-		 const struct iphdr *iph,
-		 unsigned int proto_off,
-		 int inverse)
-{
-#if 0
-	struct ip_vs_conn *cp;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(iph->protocol,
-			iph->saddr, 0,
-			iph->daddr, 0);
-	} else {
-		cp = ip_vs_conn_in_get(iph->protocol,
-			iph->daddr, 0,
-			iph->saddr, 0);
-	}
-
-	return cp;
-
-#else
-	return NULL;
-#endif
-}
-
-static struct ip_vs_conn *
-icmp_conn_out_get(const struct sk_buff *skb,
-		  struct ip_vs_protocol *pp,
-		  const struct iphdr *iph,
-		  unsigned int proto_off,
-		  int inverse)
-{
-#if 0
-	struct ip_vs_conn *cp;
-
-	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(iph->protocol,
-			iph->saddr, 0,
-			iph->daddr, 0);
-	} else {
-		cp = ip_vs_conn_out_get(IPPROTO_UDP,
-			iph->daddr, 0,
-			iph->saddr, 0);
-	}
-
-	return cp;
-#else
-	return NULL;
-#endif
-}
-
-static int
-icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
-		   int *verdict, struct ip_vs_conn **cpp)
-{
-	*verdict = NF_ACCEPT;
-	return 0;
-}
-
-static int
-icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
-{
-	if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) {
-		if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
-			if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) {
-				IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for");
-				return 0;
-			}
-		}
-	}
-	return 1;
-}
-
-static void
-icmp_debug_packet(struct ip_vs_protocol *pp,
-		  const struct sk_buff *skb,
-		  int offset,
-		  const char *msg)
-{
-	char buf[256];
-	struct iphdr _iph, *ih;
-
-	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
-	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
-	else if (ih->frag_off & __constant_htons(IP_OFFSET))
-		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
-			pp->name, NIPQUAD(ih->saddr),
-			NIPQUAD(ih->daddr));
-	else {
-		struct icmphdr _icmph, *ic;
-
-		ic = skb_header_pointer(skb, offset + ih->ihl*4,
-					sizeof(_icmph), &_icmph);
-		if (ic == NULL)
-			sprintf(buf, "%s TRUNCATED to %u bytes\n",
-				pp->name, skb->len - offset);
-		else
-			sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
-				pp->name, NIPQUAD(ih->saddr),
-				NIPQUAD(ih->daddr),
-				ic->type, ic->code);
-	}
-	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
-}
-
-static int
-icmp_state_transition(struct ip_vs_conn *cp, int direction,
-		      const struct sk_buff *skb,
-		      struct ip_vs_protocol *pp)
-{
-	cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
-	return 1;
-}
-
-static int
-icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
-{
-	int num;
-	char **names;
-
-	num = IP_VS_ICMP_S_LAST;
-	names = icmp_state_name_table;
-	return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to);
-}
-
-
-static void icmp_init(struct ip_vs_protocol *pp)
-{
-	pp->timeout_table = icmp_timeouts;
-}
-
-static void icmp_exit(struct ip_vs_protocol *pp)
-{
-}
-
-struct ip_vs_protocol ip_vs_protocol_icmp = {
-	.name =			"ICMP",
-	.protocol =		IPPROTO_ICMP,
-	.dont_defrag =		0,
-	.init =			icmp_init,
-	.exit =			icmp_exit,
-	.conn_schedule =	icmp_conn_schedule,
-	.conn_in_get =		icmp_conn_in_get,
-	.conn_out_get =		icmp_conn_out_get,
-	.snat_handler =		NULL,
-	.dnat_handler =		NULL,
-	.csum_check =		icmp_csum_check,
-	.state_transition =	icmp_state_transition,
-	.register_app =		NULL,
-	.unregister_app =	NULL,
-	.app_conn_bind =	NULL,
-	.debug_packet =		icmp_debug_packet,
-	.timeout_change =	NULL,
-	.set_state_timeout =	icmp_set_state_timeout,
-};
-- 
cgit v1.2.3


From 4fef0304eeaa4156db5625e3578f92ed94645a43 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 2 Jun 2005 13:06:36 -0700
Subject: [IPV6]: Kill export of fl6_sock_lookup.

There is no usage of this EXPORT_SYMBOL in the kernel.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ipv6_syms.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c
index 2f4c91ddc9a..5ade5a5d199 100644
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -37,5 +37,4 @@ EXPORT_SYMBOL(in6_dev_finish_destroy);
 EXPORT_SYMBOL(xfrm6_rcv);
 #endif
 EXPORT_SYMBOL(rt6_lookup);
-EXPORT_SYMBOL(fl6_sock_lookup);
 EXPORT_SYMBOL(ipv6_push_nfrag_opts);
-- 
cgit v1.2.3


From fa04ae5c09f3dfedbc923c2954a9a26a573833f1 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 6 Jun 2005 15:07:19 -0700
Subject: [ETHTOOL]: Check correct pointer in ethtool_set_coalesce().

It was checking the "GET" function pointer instead of
the "SET" one.  Looks like a cut&paste error :-)

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 8ec484894d6..a3eeb88e1c8 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -356,7 +356,7 @@ static int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_coalesce coalesce;
 
-	if (!dev->ethtool_ops->get_coalesce)
+	if (!dev->ethtool_ops->set_coalesce)
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&coalesce, useraddr, sizeof(coalesce)))
-- 
cgit v1.2.3


From 8181b8c1f3a69fe5abcc51cb732eb512ccd1566a Mon Sep 17 00:00:00 2001
From: Gabor Fekete <gfekete@cc.jyu.fi>
Date: Wed, 8 Jun 2005 14:54:38 -0700
Subject: [IPV6]: Update parm.link in ip6ip6_tnl_change()

Signed-off-by: Gabor Fekete <gfekete@cc.jyu.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 3b1c9fa184a..ba3b0c267f7 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -882,6 +882,7 @@ ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
 	t->parms.hop_limit = p->hop_limit;
 	t->parms.encap_limit = p->encap_limit;
 	t->parms.flowinfo = p->flowinfo;
+	t->parms.link = p->link;
 	ip6ip6_tnl_link_config(t);
 	return 0;
 }
-- 
cgit v1.2.3


From 699a411451a32cc111410f44f172b265f6d679c8 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 8 Jun 2005 14:55:42 -0700
Subject: [NET]: Allow controlling NAPI device weight with sysfs

Simple interface to allow changing network device scheduling weight
with sysfs. Please consider this for 2.6.12, since risk/impact is small.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net-sysfs.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'net')

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 910eb4c05a4..e2137f3e489 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -185,6 +185,22 @@ static ssize_t store_tx_queue_len(struct class_device *dev, const char *buf, siz
 static CLASS_DEVICE_ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, 
 			 store_tx_queue_len);
 
+NETDEVICE_SHOW(weight, fmt_dec);
+
+static int change_weight(struct net_device *net, unsigned long new_weight)
+{
+	net->weight = new_weight;
+	return 0;
+}
+
+static ssize_t store_weight(struct class_device *dev, const char *buf, size_t len)
+{
+	return netdev_store(dev, buf, len, change_weight);
+}
+
+static CLASS_DEVICE_ATTR(weight, S_IRUGO | S_IWUSR, show_weight, 
+			 store_weight);
+
 
 static struct class_device_attribute *net_class_attributes[] = {
 	&class_device_attr_ifindex,
@@ -194,6 +210,7 @@ static struct class_device_attribute *net_class_attributes[] = {
 	&class_device_attr_features,
 	&class_device_attr_mtu,
 	&class_device_attr_flags,
+	&class_device_attr_weight,
 	&class_device_attr_type,
 	&class_device_attr_address,
 	&class_device_attr_broadcast,
-- 
cgit v1.2.3


From e3876605450979fe52a1a03e7eb78a89bf59e76a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 8 Jun 2005 14:56:01 -0700
Subject: [NET]: Fix sysctl net.core.dev_weight

Changing the sysctl net.core.dev_weight has no effect because the weight
of the backlog devices is set during initialization and never changed.

This patch propagates any changes to the global value affected by sysctl
to the per-cpu devices. It is done every time the packet handler
function is run.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index f15a3ffff63..ab935778ce8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1744,6 +1744,7 @@ static int process_backlog(struct net_device *backlog_dev, int *budget)
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
 
+	backlog_dev->weight = weight_p;
 	for (;;) {
 		struct sk_buff *skb;
 		struct net_device *dev;
-- 
cgit v1.2.3


From b824979aeccbfd997e6e5dbe75c47d586b5a2923 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 8 Jun 2005 15:10:22 -0700
Subject: [PKT_SCHED]: Fix typo in NET_EMATCH_STACK help text

Spotted by Geert Uytterhoeven <geert@linux-m68k.org>.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b0941186f86..b22c9beb604 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -405,7 +405,7 @@ config NET_EMATCH_STACK
 	---help---
 	  Size of the local stack variable used while evaluating the tree of
 	  ematches. Limits the depth of the tree, i.e. the number of
-	  encapsulated precedences. Every level requires 4 bytes of addtional
+	  encapsulated precedences. Every level requires 4 bytes of additional
 	  stack space.
 
 config NET_EMATCH_CMP
-- 
cgit v1.2.3


From 4890062960cbc4d3cebdbd8261a68bc85efcf5d4 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 8 Jun 2005 15:10:48 -0700
Subject: [PKT_SCHED]: Allow socket attributes to be matched on via meta ematch

Adds meta collectors for all socket attributes that make sense
to be filtered upon. Some of them are only useful for debugging
but having them doesn't hurt.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/em_meta.c | 291 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 267 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index f1eeaf65cee..ed2a46cbb67 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -32,7 +32,7 @@
  * 	      +-----------+                           +-----------+
  * 	            |                                       |
  * 	            ---> meta_ops[INT][INDEV](...)          |
- *                            |                            |
+ *	                      |                             |
  * 	            -----------                             |
  * 	            V                                       V
  * 	      +-----------+                           +-----------+
@@ -70,6 +70,7 @@
 #include <net/dst.h>
 #include <net/route.h>
 #include <net/pkt_cls.h>
+#include <net/sock.h>
 
 struct meta_obj
 {
@@ -283,6 +284,214 @@ META_COLLECTOR(int_rtiif)
 		dst->value = ((struct rtable*) skb->dst)->fl.iif;
 }
 
+/**************************************************************************
+ * Socket Attributes
+ **************************************************************************/
+
+#define SKIP_NONLOCAL(skb)			\
+	if (unlikely(skb->sk == NULL)) {	\
+		*err = -1;			\
+		return;				\
+	}
+
+META_COLLECTOR(int_sk_family)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_family;
+}
+
+META_COLLECTOR(int_sk_state)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_state;
+}
+
+META_COLLECTOR(int_sk_reuse)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_reuse;
+}
+
+META_COLLECTOR(int_sk_bound_if)
+{
+	SKIP_NONLOCAL(skb);
+	/* No error if bound_dev_if is 0, legal userspace check */
+	dst->value = skb->sk->sk_bound_dev_if;
+}
+
+META_COLLECTOR(var_sk_bound_if)
+{
+	SKIP_NONLOCAL(skb);
+
+	 if (skb->sk->sk_bound_dev_if == 0) {
+		dst->value = (unsigned long) "any";
+		dst->len = 3;
+	 } else  {
+		struct net_device *dev;
+		
+		dev = dev_get_by_index(skb->sk->sk_bound_dev_if);
+		*err = var_dev(dev, dst);
+		if (dev)
+			dev_put(dev);
+	 }
+}
+
+META_COLLECTOR(int_sk_refcnt)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = atomic_read(&skb->sk->sk_refcnt);
+}
+
+META_COLLECTOR(int_sk_rcvbuf)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_rcvbuf;
+}
+
+META_COLLECTOR(int_sk_shutdown)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_shutdown;
+}
+
+META_COLLECTOR(int_sk_proto)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_protocol;
+}
+
+META_COLLECTOR(int_sk_type)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_type;
+}
+
+META_COLLECTOR(int_sk_rmem_alloc)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = atomic_read(&skb->sk->sk_rmem_alloc);
+}
+
+META_COLLECTOR(int_sk_wmem_alloc)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = atomic_read(&skb->sk->sk_wmem_alloc);
+}
+
+META_COLLECTOR(int_sk_omem_alloc)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = atomic_read(&skb->sk->sk_omem_alloc);
+}
+
+META_COLLECTOR(int_sk_rcv_qlen)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_receive_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_snd_qlen)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_write_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_wmem_queued)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_wmem_queued;
+}
+
+META_COLLECTOR(int_sk_fwd_alloc)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_forward_alloc;
+}
+
+META_COLLECTOR(int_sk_sndbuf)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_sndbuf;
+}
+
+META_COLLECTOR(int_sk_alloc)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_allocation;
+}
+
+META_COLLECTOR(int_sk_route_caps)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_route_caps;
+}
+
+META_COLLECTOR(int_sk_hashent)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_hashent;
+}
+
+META_COLLECTOR(int_sk_lingertime)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_lingertime / HZ;
+}
+
+META_COLLECTOR(int_sk_err_qlen)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_error_queue.qlen;
+}
+
+META_COLLECTOR(int_sk_ack_bl)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_ack_backlog;
+}
+
+META_COLLECTOR(int_sk_max_ack_bl)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_max_ack_backlog;
+}
+
+META_COLLECTOR(int_sk_prio)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_priority;
+}
+
+META_COLLECTOR(int_sk_rcvlowat)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_rcvlowat;
+}
+
+META_COLLECTOR(int_sk_rcvtimeo)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_rcvtimeo / HZ;
+}
+
+META_COLLECTOR(int_sk_sndtimeo)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_sndtimeo / HZ;
+}
+
+META_COLLECTOR(int_sk_sendmsg_off)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_sndmsg_off;
+}
+
+META_COLLECTOR(int_sk_write_pend)
+{
+	SKIP_NONLOCAL(skb);
+	dst->value = skb->sk->sk_write_pending;
+}
+
 /**************************************************************************
  * Meta value collectors assignment table
  **************************************************************************/
@@ -293,41 +502,75 @@ struct meta_ops
 			       struct meta_value *, struct meta_obj *, int *);
 };
 
+#define META_ID(name) TCF_META_ID_##name
+#define META_FUNC(name) { .get = meta_##name }
+
 /* Meta value operations table listing all meta value collectors and
  * assigns them to a type and meta id. */
 static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
 	[TCF_META_TYPE_VAR] = {
-		[TCF_META_ID_DEV]	= { .get = meta_var_dev },
-		[TCF_META_ID_INDEV]	= { .get = meta_var_indev },
-		[TCF_META_ID_REALDEV]	= { .get = meta_var_realdev }
+		[META_ID(DEV)]			= META_FUNC(var_dev),
+		[META_ID(INDEV)]		= META_FUNC(var_indev),
+		[META_ID(REALDEV)]		= META_FUNC(var_realdev),
+		[META_ID(SK_BOUND_IF)] 		= META_FUNC(var_sk_bound_if),
 	},
 	[TCF_META_TYPE_INT] = {
-		[TCF_META_ID_RANDOM]	= { .get = meta_int_random },
-		[TCF_META_ID_LOADAVG_0]	= { .get = meta_int_loadavg_0 },
-		[TCF_META_ID_LOADAVG_1]	= { .get = meta_int_loadavg_1 },
-		[TCF_META_ID_LOADAVG_2]	= { .get = meta_int_loadavg_2 },
-		[TCF_META_ID_DEV]	= { .get = meta_int_dev },
-		[TCF_META_ID_INDEV]	= { .get = meta_int_indev },
-		[TCF_META_ID_REALDEV]	= { .get = meta_int_realdev },
-		[TCF_META_ID_PRIORITY]	= { .get = meta_int_priority },
-		[TCF_META_ID_PROTOCOL]	= { .get = meta_int_protocol },
-		[TCF_META_ID_SECURITY]	= { .get = meta_int_security },
-		[TCF_META_ID_PKTTYPE]	= { .get = meta_int_pkttype },
-		[TCF_META_ID_PKTLEN]	= { .get = meta_int_pktlen },
-		[TCF_META_ID_DATALEN]	= { .get = meta_int_datalen },
-		[TCF_META_ID_MACLEN]	= { .get = meta_int_maclen },
+		[META_ID(RANDOM)]		= META_FUNC(int_random),
+		[META_ID(LOADAVG_0)]		= META_FUNC(int_loadavg_0),
+		[META_ID(LOADAVG_1)]		= META_FUNC(int_loadavg_1),
+		[META_ID(LOADAVG_2)]		= META_FUNC(int_loadavg_2),
+		[META_ID(DEV)]			= META_FUNC(int_dev),
+		[META_ID(INDEV)]		= META_FUNC(int_indev),
+		[META_ID(REALDEV)]		= META_FUNC(int_realdev),
+		[META_ID(PRIORITY)]		= META_FUNC(int_priority),
+		[META_ID(PROTOCOL)]		= META_FUNC(int_protocol),
+		[META_ID(SECURITY)]		= META_FUNC(int_security),
+		[META_ID(PKTTYPE)]		= META_FUNC(int_pkttype),
+		[META_ID(PKTLEN)]		= META_FUNC(int_pktlen),
+		[META_ID(DATALEN)]		= META_FUNC(int_datalen),
+		[META_ID(MACLEN)]		= META_FUNC(int_maclen),
 #ifdef CONFIG_NETFILTER
-		[TCF_META_ID_NFMARK]	= { .get = meta_int_nfmark },
+		[META_ID(NFMARK)]		= META_FUNC(int_nfmark),
 #endif
-		[TCF_META_ID_TCINDEX]	= { .get = meta_int_tcindex },
+		[META_ID(TCINDEX)]		= META_FUNC(int_tcindex),
 #ifdef CONFIG_NET_CLS_ACT
-		[TCF_META_ID_TCVERDICT]	= { .get = meta_int_tcverd },
-		[TCF_META_ID_TCCLASSID]	= { .get = meta_int_tcclassid },
+		[META_ID(TCVERDICT)]		= META_FUNC(int_tcverd),
+		[META_ID(TCCLASSID)]		= META_FUNC(int_tcclassid),
 #endif
 #ifdef CONFIG_NET_CLS_ROUTE
-		[TCF_META_ID_RTCLASSID]	= { .get = meta_int_rtclassid },
+		[META_ID(RTCLASSID)]		= META_FUNC(int_rtclassid),
 #endif
-		[TCF_META_ID_RTIIF]	= { .get = meta_int_rtiif }
+		[META_ID(RTIIF)]		= META_FUNC(int_rtiif),
+		[META_ID(SK_FAMILY)]		= META_FUNC(int_sk_family),
+		[META_ID(SK_STATE)]		= META_FUNC(int_sk_state),
+		[META_ID(SK_REUSE)]		= META_FUNC(int_sk_reuse),
+		[META_ID(SK_BOUND_IF)]		= META_FUNC(int_sk_bound_if),
+		[META_ID(SK_REFCNT)]		= META_FUNC(int_sk_refcnt),
+		[META_ID(SK_RCVBUF)]		= META_FUNC(int_sk_rcvbuf),
+		[META_ID(SK_SNDBUF)]		= META_FUNC(int_sk_sndbuf),
+		[META_ID(SK_SHUTDOWN)]		= META_FUNC(int_sk_shutdown),
+		[META_ID(SK_PROTO)]		= META_FUNC(int_sk_proto),
+		[META_ID(SK_TYPE)]		= META_FUNC(int_sk_type),
+		[META_ID(SK_RMEM_ALLOC)]	= META_FUNC(int_sk_rmem_alloc),
+		[META_ID(SK_WMEM_ALLOC)]	= META_FUNC(int_sk_wmem_alloc),
+		[META_ID(SK_OMEM_ALLOC)]	= META_FUNC(int_sk_omem_alloc),
+		[META_ID(SK_WMEM_QUEUED)]	= META_FUNC(int_sk_wmem_queued),
+		[META_ID(SK_RCV_QLEN)]		= META_FUNC(int_sk_rcv_qlen),
+		[META_ID(SK_SND_QLEN)]		= META_FUNC(int_sk_snd_qlen),
+		[META_ID(SK_ERR_QLEN)]		= META_FUNC(int_sk_err_qlen),
+		[META_ID(SK_FORWARD_ALLOCS)]	= META_FUNC(int_sk_fwd_alloc),
+		[META_ID(SK_ALLOCS)]		= META_FUNC(int_sk_alloc),
+		[META_ID(SK_ROUTE_CAPS)]	= META_FUNC(int_sk_route_caps),
+		[META_ID(SK_HASHENT)]		= META_FUNC(int_sk_hashent),
+		[META_ID(SK_LINGERTIME)]	= META_FUNC(int_sk_lingertime),
+		[META_ID(SK_ACK_BACKLOG)]	= META_FUNC(int_sk_ack_bl),
+		[META_ID(SK_MAX_ACK_BACKLOG)]	= META_FUNC(int_sk_max_ack_bl),
+		[META_ID(SK_PRIO)]		= META_FUNC(int_sk_prio),
+		[META_ID(SK_RCVLOWAT)]		= META_FUNC(int_sk_rcvlowat),
+		[META_ID(SK_RCVTIMEO)]		= META_FUNC(int_sk_rcvtimeo),
+		[META_ID(SK_SNDTIMEO)]		= META_FUNC(int_sk_sndtimeo),
+		[META_ID(SK_SENDMSG_OFF)]	= META_FUNC(int_sk_sendmsg_off),
+		[META_ID(SK_WRITE_PENDING)]	= META_FUNC(int_sk_write_pend),
 	}
 };
 
-- 
cgit v1.2.3


From e1e284a4bd827db2288af9536664b44590e419eb Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 8 Jun 2005 15:11:02 -0700
Subject: [PKT_SCHED]: Dump classification result for basic classifier

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_basic.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 0d2d4415f33..dfb300bb6ba 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -261,6 +261,9 @@ static int basic_dump(struct tcf_proto *tp, unsigned long fh,
 	rta = (struct rtattr *) b;
 	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
 
+	if (f->res.classid)
+		RTA_PUT(skb, TCA_BASIC_CLASSID, sizeof(u32), &f->res.classid);
+
 	if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 ||
 	    tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
 		goto rtattr_failure;
-- 
cgit v1.2.3


From 98e56405521b74b4826f855d45ef7859f34548ff Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 8 Jun 2005 15:11:19 -0700
Subject: [PKT_SCHED]: Fix numeric comparison in meta ematch

This patch is brought to you by the department of applied stupidity.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/em_meta.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index ed2a46cbb67..48bb23c2a35 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -639,9 +639,9 @@ static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
 	/* Let gcc optimize it, the unlikely is not really based on
 	 * some numbers but jump free code for mismatches seems
 	 * more logical. */
-	if (unlikely(a == b))
+	if (unlikely(a->value == b->value))
 		return 0;
-	else if (a < b)
+	else if (a->value < b->value)
 		return -1;
 	else
 		return 1;
-- 
cgit v1.2.3


From e7626486c3c4ce456b11a7944edf164ef76fc599 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 13 Jun 2005 14:24:52 -0700
Subject: [TCP]: Adjust TCP mem order check to new alloc_large_system_hash

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a037bafcba3..0d9a4fd5f1a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2338,7 +2338,7 @@ void __init tcp_init(void)
 			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
 			order++)
 		;
-	if (order > 4) {
+	if (order >= 4) {
 		sysctl_local_port_range[0] = 32768;
 		sysctl_local_port_range[1] = 61000;
 		sysctl_tcp_max_tw_buckets = 180000;
-- 
cgit v1.2.3


From 6efd8455cff1979dd081daaa1ce3d3f1764863dc Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Mon, 13 Jun 2005 14:29:06 -0700
Subject: [IPV4]: Multipath modules need a license to prevent kernel tainting.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/multipath_drr.c     | 2 ++
 net/ipv4/multipath_random.c  | 2 ++
 net/ipv4/multipath_rr.c      | 2 ++
 net/ipv4/multipath_wrandom.c | 2 ++
 4 files changed, 8 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
index cf2e6bcf797..c9cf8726051 100644
--- a/net/ipv4/multipath_drr.c
+++ b/net/ipv4/multipath_drr.c
@@ -31,6 +31,7 @@
 #include <linux/igmp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/module.h>
 #include <linux/mroute.h>
 #include <linux/init.h>
 #include <net/ip.h>
@@ -247,3 +248,4 @@ static void __exit drr_exit(void)
 
 module_init(drr_init);
 module_exit(drr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
index 805a16e47de..5249dbe7c55 100644
--- a/net/ipv4/multipath_random.c
+++ b/net/ipv4/multipath_random.c
@@ -31,6 +31,7 @@
 #include <linux/igmp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/module.h>
 #include <linux/mroute.h>
 #include <linux/init.h>
 #include <net/ip.h>
@@ -126,3 +127,4 @@ static void __exit random_exit(void)
 
 module_init(random_init);
 module_exit(random_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
index 061b6b25398..b6cd2870478 100644
--- a/net/ipv4/multipath_rr.c
+++ b/net/ipv4/multipath_rr.c
@@ -31,6 +31,7 @@
 #include <linux/igmp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/module.h>
 #include <linux/mroute.h>
 #include <linux/init.h>
 #include <net/ip.h>
@@ -93,3 +94,4 @@ static void __exit rr_exit(void)
 
 module_init(rr_init);
 module_exit(rr_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
index c3d2ca1a678..bd7d75b6abe 100644
--- a/net/ipv4/multipath_wrandom.c
+++ b/net/ipv4/multipath_wrandom.c
@@ -31,6 +31,7 @@
 #include <linux/igmp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/module.h>
 #include <linux/mroute.h>
 #include <linux/init.h>
 #include <net/ip.h>
@@ -342,3 +343,4 @@ static void __exit wrandom_exit(void)
 
 module_init(wrandom_init);
 module_exit(wrandom_exit);
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 979b6c135fc4d466a39d8e3ec05583e5ee30261a Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Mon, 13 Jun 2005 14:30:40 -0700
Subject: [NET]: Move the netdev list to vger.kernel.org.

From: Ralf Baechle <ralf@linux-mips.org>

There are archives of the old list at http://oss.sgi.com/archives/netdev

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index cafcb084098..914c85ff8fe 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -881,7 +881,7 @@ static int __init tc_action_init(void)
 		link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action;
 	}
 
-	printk("TC classifier action (bugs to netdev@oss.sgi.com cc "
+	printk("TC classifier action (bugs to netdev@vger.kernel.org cc "
 	       "hadi@cyberus.ca)\n");
 	return 0;
 }
-- 
cgit v1.2.3


From 84427d533076a08137779b3182a71c37bf000b27 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Mon, 13 Jun 2005 14:59:44 -0700
Subject: [IPV6]: Ensure to use icmpv6_socket in non-preemptive context.

We saw following trace several times:

|BUG: using smp_processor_id() in preemptible [00000001] code: httpd/30137
|caller is icmpv6_send+0x23/0x540
| [<c01ad63b>] smp_processor_id+0x9b/0xb8
| [<c02993e7>] icmpv6_send+0x23/0x540

This is because of icmpv6_socket, which is the only one user of
smp_processor_id() in icmpv6_send(), AFAIK.

Since it should be used in non-preemptive context,
let's defer the dereference after disabling preemption
(by icmpv6_xmit_lock()).

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/icmp.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 8e0f569b883..ff3ec9822e3 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -277,8 +277,8 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
 {
 	struct inet6_dev *idev = NULL;
 	struct ipv6hdr *hdr = skb->nh.ipv6h;
-	struct sock *sk = icmpv6_socket->sk;
-	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct sock *sk;
+	struct ipv6_pinfo *np;
 	struct in6_addr *saddr = NULL;
 	struct dst_entry *dst;
 	struct icmp6hdr tmp_hdr;
@@ -358,6 +358,9 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info,
 	if (icmpv6_xmit_lock())
 		return;
 
+	sk = icmpv6_socket->sk;
+	np = inet6_sk(sk);
+
 	if (!icmpv6_xrlim_allow(sk, type, &fl))
 		goto out;
 
@@ -423,9 +426,9 @@ out:
 
 static void icmpv6_echo_reply(struct sk_buff *skb)
 {
-	struct sock *sk = icmpv6_socket->sk;
+	struct sock *sk;
 	struct inet6_dev *idev;
-	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct ipv6_pinfo *np;
 	struct in6_addr *saddr = NULL;
 	struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw;
 	struct icmp6hdr tmp_hdr;
@@ -454,6 +457,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
 	if (icmpv6_xmit_lock())
 		return;
 
+	sk = icmpv6_socket->sk;
+	np = inet6_sk(sk);
+
 	if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst))
 		fl.oif = np->mcast_oif;
 
-- 
cgit v1.2.3


From 77bd91967a97e5b94ae36113efe1d9e4f68a716e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=E9mi=20Denis-Courmont?= <rdenis@simphalempin.com>
Date: Mon, 13 Jun 2005 15:01:34 -0700
Subject: [IPv6] Don't generate temporary for TUN devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Userland layer-2 tunneling devices allocated through the TUNTAP driver
(drivers/net/tun.c) have a type of ARPHRD_NONE, and have no link-layer
address. The kernel complains at regular interval when IPv6 Privacy
extension are enabled because it can't find an hardware address :

Dec 29 11:02:04 auguste kernel: __ipv6_regen_rndid(idev=cb3e0c00):
cannot get EUI64 identifier; use random bytes.

IPv6 Privacy extensions should probably be disabled on that sort of
device. They won't work anyway. If userland wants a more usual
Ethernet-ish interface with usual IPv6 autoconfiguration, it will use a
TAP device with an emulated link-layer  and a random hardware address
rather than a TUN device.

As far as I could fine, TUN virtual device from TUNTAP is the very only
sort of device using ARPHRD_NONE as kernel device type.

Signed-off-by: R�mi Denis-Courmont <rdenis@simphalempin.com>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 7744a259269..2720899d516 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -372,6 +372,7 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
 		ndev->regen_timer.data = (unsigned long) ndev;
 		if ((dev->flags&IFF_LOOPBACK) ||
 		    dev->type == ARPHRD_TUNNEL ||
+		    dev->type == ARPHRD_NONE ||
 		    dev->type == ARPHRD_SIT) {
 			printk(KERN_INFO
 				"Disabled Privacy Extensions on device %p(%s)\n",
-- 
cgit v1.2.3


From 4243cac1e76228f7ba916d5df9e75a219352160a Mon Sep 17 00:00:00 2001
From: Vladislav Yasevich <vladislav.yasevich@hp.com>
Date: Mon, 13 Jun 2005 15:10:49 -0700
Subject: [SCTP]: Fix bug in restart of peeled-off associations.

Signed-off-by: Vladislav Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 0b338eca6dc..2a3c0e08a09 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4686,6 +4686,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	struct sctp_endpoint *newep = newsp->ep;
 	struct sk_buff *skb, *tmp;
 	struct sctp_ulpevent *event;
+	int flags = 0;
 
 	/* Migrate socket buffer sizes and all the socket level options to the
 	 * new socket.
@@ -4707,6 +4708,17 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	sctp_sk(newsk)->bind_hash = pp;
 	inet_sk(newsk)->num = inet_sk(oldsk)->num;
 
+	/* Copy the bind_addr list from the original endpoint to the new
+	 * endpoint so that we can handle restarts properly
+	 */
+	if (assoc->peer.ipv4_address)
+		flags |= SCTP_ADDR4_PEERSUPP;
+	if (assoc->peer.ipv6_address)
+		flags |= SCTP_ADDR6_PEERSUPP;
+	sctp_bind_addr_copy(&newsp->ep->base.bind_addr,
+			     &oldsp->ep->base.bind_addr,
+			     SCTP_SCOPE_GLOBAL, GFP_KERNEL, flags);
+
 	/* Move any messages in the old socket's receive queue that are for the
 	 * peeled off association to the new socket's receive queue.
 	 */
-- 
cgit v1.2.3


From 0fd9a65a76e883b7d16e72dde3f8bf20ebc1e82a Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@redhat.com>
Date: Mon, 13 Jun 2005 15:11:24 -0700
Subject: [SCTP] Support SO_BINDTODEVICE socket option on incoming packets.

Signed-off-by: Neil Horman <nhorman@redhat.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/input.c | 49 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/sctp/input.c b/net/sctp/input.c
index b719a77d66b..fffc880a646 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -178,6 +178,37 @@ int sctp_rcv(struct sk_buff *skb)
 
 	asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport);
 
+	if (!asoc)
+		ep = __sctp_rcv_lookup_endpoint(&dest);
+
+	/* Retrieve the common input handling substructure. */
+	rcvr = asoc ? &asoc->base : &ep->base;
+	sk = rcvr->sk;
+
+	/*
+	 * If a frame arrives on an interface and the receiving socket is
+	 * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB
+	 */
+	if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb)))
+	{
+		sock_put(sk);
+		if (asoc) {
+			sctp_association_put(asoc);
+			asoc = NULL;
+		} else {
+			sctp_endpoint_put(ep);
+			ep = NULL;
+		}
+		sk = sctp_get_ctl_sock();
+		ep = sctp_sk(sk)->ep;
+		sctp_endpoint_hold(ep);
+		sock_hold(sk);
+		rcvr = &ep->base;
+	}
+
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		goto discard_release;
+
 	/*
 	 * RFC 2960, 8.4 - Handle "Out of the blue" Packets.
 	 * An SCTP packet is called an "out of the blue" (OOTB)
@@ -187,22 +218,12 @@ int sctp_rcv(struct sk_buff *skb)
 	 * packet belongs.
 	 */
 	if (!asoc) {
-		ep = __sctp_rcv_lookup_endpoint(&dest);
 		if (sctp_rcv_ootb(skb)) {
 			SCTP_INC_STATS_BH(SCTP_MIB_OUTOFBLUES);
 			goto discard_release;
 		}
 	}
 
-	/* Retrieve the common input handling substructure. */
-	rcvr = asoc ? &asoc->base : &ep->base;
-	sk = rcvr->sk;
-
-	if ((sk) && (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)) {
-		goto discard_release;
-	}
-
-
 	/* SCTP seems to always need a timestamp right now (FIXME) */
 	if (skb->stamp.tv_sec == 0) {
 		do_gettimeofday(&skb->stamp);
@@ -265,13 +286,11 @@ discard_it:
 
 discard_release:
 	/* Release any structures we may be holding. */
-	if (asoc) {
-		sock_put(asoc->base.sk);
+	sock_put(sk);
+	if (asoc)
 		sctp_association_put(asoc);
-	} else {
-		sock_put(ep->base.sk);
+	else
 		sctp_endpoint_put(ep);
-	}
 
 	goto discard_it;
 }
-- 
cgit v1.2.3


From bca735bd0d5969497704a125b05344b92155172f Mon Sep 17 00:00:00 2001
From: Vladislav Yasevich <vladislav.yasevich@hp.com>
Date: Mon, 13 Jun 2005 15:11:57 -0700
Subject: [SCTP] Extend the info exported via /proc/net/sctp to support netstat
 for SCTP.

Signed-off-by: Vladislav Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/proc.c | 194 +++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 151 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index e42fd8c2916..98d49ec9b74 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -132,14 +132,25 @@ void sctp_snmp_proc_exit(void)
 static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb)
 {
 	struct list_head *pos;
+	struct sctp_association *asoc;
 	struct sctp_sockaddr_entry *laddr;
-	union sctp_addr *addr;
+	struct sctp_transport *peer;
+	union sctp_addr *addr, *primary = NULL;
 	struct sctp_af *af;
 
+	if (epb->type == SCTP_EP_TYPE_ASSOCIATION) {
+	    asoc = sctp_assoc(epb);
+	    peer = asoc->peer.primary_path;
+	    primary = &peer->saddr;
+	}
+
 	list_for_each(pos, &epb->bind_addr.address_list) {
 		laddr = list_entry(pos, struct sctp_sockaddr_entry, list);
 		addr = (union sctp_addr *)&laddr->a;
 		af = sctp_get_af_specific(addr->sa.sa_family);
+		if (primary && af->cmp_addr(addr, primary)) {
+			seq_printf(seq, "*");
+		}
 		af->seq_dump_addr(seq, addr);
 	}
 }
@@ -149,17 +160,54 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa
 {
 	struct list_head *pos;
 	struct sctp_transport *transport;
-	union sctp_addr *addr;
+	union sctp_addr *addr, *primary;
 	struct sctp_af *af;
 
+	primary = &(assoc->peer.primary_addr);
 	list_for_each(pos, &assoc->peer.transport_addr_list) {
 		transport = list_entry(pos, struct sctp_transport, transports);
 		addr = (union sctp_addr *)&transport->ipaddr;
 		af = sctp_get_af_specific(addr->sa.sa_family);
+		if (af->cmp_addr(addr, primary)) {
+			seq_printf(seq, "*");
+		}
 		af->seq_dump_addr(seq, addr);
 	}
 }
 
+static void * sctp_eps_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos > sctp_ep_hashsize)
+		return NULL;
+
+	if (*pos < 0)
+		*pos = 0;
+
+	if (*pos == 0)
+		seq_printf(seq, " ENDPT     SOCK   STY SST HBKT LPORT   UID INODE LADDRS\n");
+
+	++*pos;
+
+	return (void *)pos;
+}
+
+static void sctp_eps_seq_stop(struct seq_file *seq, void *v)
+{
+	return;
+}
+
+
+static void * sctp_eps_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	if (*pos > sctp_ep_hashsize)
+		return NULL;
+
+	++*pos;
+
+	return pos;
+}
+
+
 /* Display sctp endpoints (/proc/net/sctp/eps). */
 static int sctp_eps_seq_show(struct seq_file *seq, void *v)
 {
@@ -167,38 +215,50 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v)
 	struct sctp_ep_common *epb;
 	struct sctp_endpoint *ep;
 	struct sock *sk;
-	int hash;
-
-	seq_printf(seq, " ENDPT     SOCK   STY SST HBKT LPORT LADDRS\n");
-	for (hash = 0; hash < sctp_ep_hashsize; hash++) {
-		head = &sctp_ep_hashtable[hash];
-		read_lock(&head->lock);
-		for (epb = head->chain; epb; epb = epb->next) {
-			ep = sctp_ep(epb);
-			sk = epb->sk;
-			seq_printf(seq, "%8p %8p %-3d %-3d %-4d %-5d ", ep, sk,
-				   sctp_sk(sk)->type, sk->sk_state, hash,
-				   epb->bind_addr.port);
-			sctp_seq_dump_local_addrs(seq, epb);
-			seq_printf(seq, "\n");
-		}
-		read_unlock(&head->lock);
+	int    hash = *(int *)v;
+
+	if (hash > sctp_ep_hashsize)
+		return -ENOMEM;
+
+	head = &sctp_ep_hashtable[hash-1];
+	sctp_local_bh_disable();
+	read_lock(&head->lock);
+	for (epb = head->chain; epb; epb = epb->next) {
+		ep = sctp_ep(epb);
+		sk = epb->sk;
+		seq_printf(seq, "%8p %8p %-3d %-3d %-4d %-5d %5d %5lu ", ep, sk,
+			   sctp_sk(sk)->type, sk->sk_state, hash-1,
+			   epb->bind_addr.port,
+			   sock_i_uid(sk), sock_i_ino(sk));
+
+		sctp_seq_dump_local_addrs(seq, epb);
+		seq_printf(seq, "\n");
 	}
+	read_unlock(&head->lock);
+	sctp_local_bh_enable();
 
 	return 0;
 }
 
+static struct seq_operations sctp_eps_ops = {
+	.start = sctp_eps_seq_start,
+	.next  = sctp_eps_seq_next,
+	.stop  = sctp_eps_seq_stop,
+	.show  = sctp_eps_seq_show,
+};
+
+
 /* Initialize the seq file operations for 'eps' object. */
 static int sctp_eps_seq_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, sctp_eps_seq_show, NULL);
+	return seq_open(file, &sctp_eps_ops);
 }
 
 static struct file_operations sctp_eps_seq_fops = {
 	.open	 = sctp_eps_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = single_release,
+	.release = seq_release,
 };
 
 /* Set up the proc fs entry for 'eps' object. */
@@ -221,6 +281,40 @@ void sctp_eps_proc_exit(void)
 	remove_proc_entry("eps", proc_net_sctp);
 }
 
+
+static void * sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos > sctp_assoc_hashsize)
+		return NULL;
+
+	if (*pos < 0)
+		*pos = 0;
+
+	if (*pos == 0)
+		seq_printf(seq, " ASSOC     SOCK   STY SST ST HBKT ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT "
+				"RPORT LADDRS <-> RADDRS\n");
+
+	++*pos;
+
+	return (void *)pos;
+}
+
+static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
+{
+	return;
+}
+
+
+static void * sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	if (*pos > sctp_assoc_hashsize)
+		return NULL;
+
+	++*pos;
+
+	return pos;
+}
+
 /* Display sctp associations (/proc/net/sctp/assocs). */
 static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
 {
@@ -228,43 +322,57 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
 	struct sctp_ep_common *epb;
 	struct sctp_association *assoc;
 	struct sock *sk;
-	int hash;
-
-	seq_printf(seq, " ASSOC     SOCK   STY SST ST HBKT LPORT RPORT "
-			"LADDRS <-> RADDRS\n");
-	for (hash = 0; hash < sctp_assoc_hashsize; hash++) {
-		head = &sctp_assoc_hashtable[hash];
-		read_lock(&head->lock);
-		for (epb = head->chain; epb; epb = epb->next) {
-			assoc = sctp_assoc(epb);
-			sk = epb->sk;
-			seq_printf(seq,
-				   "%8p %8p %-3d %-3d %-2d %-4d %-5d %-5d ",
-				   assoc, sk, sctp_sk(sk)->type, sk->sk_state,
-				   assoc->state, hash, epb->bind_addr.port,
-				   assoc->peer.port);
-			sctp_seq_dump_local_addrs(seq, epb);
-			seq_printf(seq, "<-> ");
-			sctp_seq_dump_remote_addrs(seq, assoc);
-			seq_printf(seq, "\n");
-		}
-		read_unlock(&head->lock);
+	int    hash = *(int *)v;
+
+	if (hash > sctp_assoc_hashsize)
+		return -ENOMEM;
+
+	head = &sctp_assoc_hashtable[hash-1];
+	sctp_local_bh_disable();
+	read_lock(&head->lock);
+	for (epb = head->chain; epb; epb = epb->next) {
+		assoc = sctp_assoc(epb);
+		sk = epb->sk;
+		seq_printf(seq,
+			   "%8p %8p %-3d %-3d %-2d %-4d %4d %8d %8d %7d %5lu %-5d %5d ",
+			   assoc, sk, sctp_sk(sk)->type, sk->sk_state,
+			   assoc->state, hash-1, assoc->assoc_id,
+			   (sk->sk_rcvbuf - assoc->rwnd),
+			   assoc->sndbuf_used,
+			   sock_i_uid(sk), sock_i_ino(sk),
+			   epb->bind_addr.port,
+			   assoc->peer.port);
+
+		seq_printf(seq, " ");
+		sctp_seq_dump_local_addrs(seq, epb);
+		seq_printf(seq, "<-> ");
+		sctp_seq_dump_remote_addrs(seq, assoc);
+		seq_printf(seq, "\n");
 	}
+	read_unlock(&head->lock);
+	sctp_local_bh_enable();
 
 	return 0;
 }
 
+static struct seq_operations sctp_assoc_ops = {
+	.start = sctp_assocs_seq_start,
+	.next  = sctp_assocs_seq_next,
+	.stop  = sctp_assocs_seq_stop,
+	.show  = sctp_assocs_seq_show,
+};
+
 /* Initialize the seq file operations for 'assocs' object. */
 static int sctp_assocs_seq_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, sctp_assocs_seq_show, NULL);
+	return seq_open(file, &sctp_assoc_ops);
 }
 
 static struct file_operations sctp_assocs_seq_fops = {
 	.open	 = sctp_assocs_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = single_release,
+	.release = seq_release,
 };
 
 /* Set up the proc fs entry for 'assocs' object. */
-- 
cgit v1.2.3


From cdac4e07748934e37e415437055ed591aed9eb21 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@redhat.com>
Date: Mon, 13 Jun 2005 15:12:33 -0700
Subject: [SCTP] Add support for ip_nonlocal_bind sysctl & IP_FREEBIND socket
 option

Signed-off-by: Neil Horman <nhorman@redhat.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c  | 1 +
 net/sctp/protocol.c | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index b3cb49ce5fa..03942f13394 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1181,6 +1181,7 @@ EXPORT_SYMBOL(inet_stream_connect);
 EXPORT_SYMBOL(inet_stream_ops);
 EXPORT_SYMBOL(inet_unregister_protosw);
 EXPORT_SYMBOL(net_statistics);
+EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
 
 #ifdef INET_REFCNT_DEBUG
 EXPORT_SYMBOL(inet_sock_nr);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2e1f9c3556f..5135e1a25d2 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -378,10 +378,13 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp)
 {
 	int ret = inet_addr_type(addr->v4.sin_addr.s_addr);
 
-	/* FIXME: ip_nonlocal_bind sysctl support. */
 
-	if (addr->v4.sin_addr.s_addr != INADDR_ANY && ret != RTN_LOCAL)
+	if (addr->v4.sin_addr.s_addr != INADDR_ANY &&
+	   ret != RTN_LOCAL &&
+	   !sp->inet.freebind &&
+	   !sysctl_ip_nonlocal_bind)
 		return 0;
+
 	return 1;
 }
 
-- 
cgit v1.2.3


From 6a6ddb2a9c11fcc3e8d7517841d28c9ea206ddef Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala <sri@us.ibm.com>
Date: Mon, 13 Jun 2005 15:13:05 -0700
Subject: [SCTP] Fix incorrect setting of sk_bound_dev_if when binding/sending
 to a ipv6 link local address.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/ipv6.c | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index c9d9ea06473..c7e42d125b9 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -812,26 +812,23 @@ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr)
 	if (addr->sa.sa_family != AF_INET6)
 		af = sctp_get_af_specific(addr->sa.sa_family);
 	else {
-		struct sock *sk;
 		int type = ipv6_addr_type(&addr->v6.sin6_addr);
-		sk = sctp_opt2sk(opt);
+		struct net_device *dev;
+
 		if (type & IPV6_ADDR_LINKLOCAL) {
-			/* Note: Behavior similar to af_inet6.c:
-			 *  1) Overrides previous bound_dev_if
-			 *  2) Destructive even if bind isn't successful.
-			 */
-
-			if (addr->v6.sin6_scope_id)
-				sk->sk_bound_dev_if = addr->v6.sin6_scope_id;
-			if (!sk->sk_bound_dev_if)
+			if (!addr->v6.sin6_scope_id)
+				return 0;
+			dev = dev_get_by_index(addr->v6.sin6_scope_id);
+			if (!dev)
 				return 0;
+			dev_put(dev);
 		}
 		af = opt->pf->af;
 	}
 	return af->available(addr, opt);
 }
 
-/* Verify that the provided sockaddr looks bindable.   Common verification,
+/* Verify that the provided sockaddr looks sendable.   Common verification,
  * has already been taken care of.
  */
 static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
@@ -842,19 +839,16 @@ static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr)
 	if (addr->sa.sa_family != AF_INET6)
 		af = sctp_get_af_specific(addr->sa.sa_family);
 	else {
-		struct sock *sk;
 		int type = ipv6_addr_type(&addr->v6.sin6_addr);
-		sk = sctp_opt2sk(opt);
+		struct net_device *dev;
+
 		if (type & IPV6_ADDR_LINKLOCAL) {
-			/* Note: Behavior similar to af_inet6.c:
-			 *  1) Overrides previous bound_dev_if
-			 *  2) Destructive even if bind isn't successful.
-			 */
-
-			if (addr->v6.sin6_scope_id)
-				sk->sk_bound_dev_if = addr->v6.sin6_scope_id;
-			if (!sk->sk_bound_dev_if)
+			if (!addr->v6.sin6_scope_id)
+				return 0;
+			dev = dev_get_by_index(addr->v6.sin6_scope_id);
+			if (!dev)
 				return 0;
+			dev_put(dev);
 		}
 		af = opt->pf->af;
 	}
-- 
cgit v1.2.3


From 1c2fb7f93cb20621772bf304f3dba0849942e5db Mon Sep 17 00:00:00 2001
From: "J. Simonetti" <jeroen@simonetti.nl>
Date: Mon, 13 Jun 2005 15:19:03 -0700
Subject: [IPV4]: Sysctl configurable icmp error source address.

This patch alows you to change the source address of icmp error
messages. It applies cleanly to 2.6.11.11 and retains the default
behaviour.

In the old (default) behaviour icmp error messages are sent with the ip
of the exiting interface.

The new behaviour (when the sysctl variable is toggled on), it will send
the message with the ip of the interface that received the packet that
caused the icmp error. This is the behaviour network administrators will
expect from a router. It makes debugging complicated network layouts
much easier. Also, all 'vendor routers' I know of have the later
behaviour.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c            | 9 +++++++--
 net/ipv4/sysctl_net_ipv4.c | 9 +++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 85bf0d3e294..cb759484979 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -207,6 +207,7 @@ int sysctl_icmp_ignore_bogus_error_responses;
 
 int sysctl_icmp_ratelimit = 1 * HZ;
 int sysctl_icmp_ratemask = 0x1818;
+int sysctl_icmp_errors_use_inbound_ifaddr;
 
 /*
  *	ICMP control array. This specifies what to do with each ICMP.
@@ -511,8 +512,12 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
 	 */
 
 	saddr = iph->daddr;
-	if (!(rt->rt_flags & RTCF_LOCAL))
-		saddr = 0;
+	if (!(rt->rt_flags & RTCF_LOCAL)) {
+		if (sysctl_icmp_errors_use_inbound_ifaddr)
+			saddr = inet_select_addr(skb_in->dev, 0, RT_SCOPE_LINK);
+		else
+			saddr = 0;
+	}
 
 	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
 					   IPTOS_PREC_INTERNETCONTROL) :
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3aafb298c1c..23068bddbf0 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -23,6 +23,7 @@ extern int sysctl_ip_nonlocal_bind;
 extern int sysctl_icmp_echo_ignore_all;
 extern int sysctl_icmp_echo_ignore_broadcasts;
 extern int sysctl_icmp_ignore_bogus_error_responses;
+extern int sysctl_icmp_errors_use_inbound_ifaddr;
 
 /* From ip_fragment.c */
 extern int sysctl_ipfrag_low_thresh;
@@ -395,6 +396,14 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,
+		.procname	= "icmp_errors_use_inbound_ifaddr",
+		.data		= &sysctl_icmp_errors_use_inbound_ifaddr,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE,
 		.procname	= "route",
-- 
cgit v1.2.3


From a96aca88ac71f75e566981b554da44bfd0d111e8 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 13 Jun 2005 18:27:13 -0700
Subject: [NETFILTER]: Advance seq-file position in exp_next_seq()

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_conntrack_standalone.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 46ca45f74d8..bc59f7b3980 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -256,6 +256,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
 {
  	struct list_head *e = v;
 
+	++*pos;
 	e = e->next;
 
 	if (e == &ip_conntrack_expect_list)
-- 
cgit v1.2.3


From bcfff0b471a60df350338bcd727fc9b8a6aa54b2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 15 Jun 2005 20:51:14 -0700
Subject: [NETFILTER]: ipt_recent: last_pkts is an array of "unsigned long" not
 "u_int32_t"

This fixes various crashes on 64-bit when using this module.

Based upon a patch by Juergen Kreileder <jk@blackdown.de>.

Signed-off-by: David S. Miller <davem@davemloft.net>
ACKed-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ipt_recent.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
index 25ab9fabdcb..2d44b07688a 100644
--- a/net/ipv4/netfilter/ipt_recent.c
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -223,7 +223,7 @@ static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned
 			curr_table->table[count].last_seen = 0;
 			curr_table->table[count].addr = 0;
 			curr_table->table[count].ttl = 0;
-			memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
+			memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
 			curr_table->table[count].oldest_pkt = 0;
 			curr_table->table[count].time_pos = 0;
 			curr_table->time_info[count].position = count;
@@ -502,7 +502,7 @@ match(const struct sk_buff *skb,
 		location = time_info[curr_table->time_pos].position;
 		hash_table[r_list[location].hash_entry] = -1;
 		hash_table[hash_result] = location;
-		memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
+		memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
 		r_list[location].time_pos = curr_table->time_pos;
 		r_list[location].addr = addr;
 		r_list[location].ttl = ttl;
@@ -631,7 +631,7 @@ match(const struct sk_buff *skb,
 			r_list[location].last_seen = 0;
 			r_list[location].addr = 0;
 			r_list[location].ttl = 0;
-			memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
+			memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long));
 			r_list[location].oldest_pkt = 0;
 			ans = !info->invert;
 		}
@@ -734,10 +734,10 @@ checkentry(const char *tablename,
 	memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
 #ifdef DEBUG
 	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
-			sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
+			sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
 #endif
 
-	hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
+	hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot);
 #ifdef DEBUG
 	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
 #endif
-- 
cgit v1.2.3


From 26b15dad9f1c19d6d4f7b999b07eaa6d98e4b375 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Sat, 18 Jun 2005 22:42:13 -0700
Subject: [IPSEC] Add complete xfrm event notification

Heres the final patch.
What this patch provides

- netlink xfrm events
- ability to have events generated by netlink propagated to pfkey
  and vice versa.
- fixes the acquire lets-be-happy-with-one-success issue

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/key/af_key.c      | 357 +++++++++++++++++++++++++++++++++++++-------------
 net/xfrm/xfrm_state.c |  74 ++++++++---
 net/xfrm/xfrm_user.c  | 272 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 588 insertions(+), 115 deletions(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index ce980aa94ed..d086c117f5f 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1240,13 +1240,85 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
 	return 0;
 }
 
+static inline int event2poltype(int event)
+{
+	switch (event) {
+	case XFRM_SAP_DELETED:
+		return SADB_X_SPDDELETE;
+	case XFRM_SAP_ADDED:
+		return SADB_X_SPDADD;
+	case XFRM_SAP_UPDATED:
+		return SADB_X_SPDUPDATE;
+	case XFRM_SAP_EXPIRED:
+	//	return SADB_X_SPDEXPIRE;
+	default:
+		printk("pfkey: Unknown policy event %d\n", event);
+		break;
+	}
+
+	return 0;
+}
+
+static inline int event2keytype(int event)
+{
+	switch (event) {
+	case XFRM_SAP_DELETED:
+		return SADB_DELETE;
+	case XFRM_SAP_ADDED:
+		return SADB_ADD;
+	case XFRM_SAP_UPDATED:
+		return SADB_UPDATE;
+	case XFRM_SAP_EXPIRED:
+		return SADB_EXPIRE;
+	default:
+		printk("pfkey: Unknown SA event %d\n", event);
+		break;
+	}
+
+	return 0;
+}
+
+/* ADD/UPD/DEL */
+static int key_notify_sa(struct xfrm_state *x, struct km_event *c)
+{
+	struct sk_buff *skb;
+	struct sadb_msg *hdr;
+	int hsc = 3;
+
+	if (c->event == XFRM_SAP_DELETED)
+		hsc = 0;
+
+	if (c->event == XFRM_SAP_EXPIRED) {
+		if (c->data)
+			hsc = 2;
+		else
+			hsc = 1;
+	}
+
+	skb = pfkey_xfrm_state2msg(x, 0, hsc);
+
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	hdr = (struct sadb_msg *) skb->data;
+	hdr->sadb_msg_version = PF_KEY_V2;
+	hdr->sadb_msg_type = event2keytype(c->event);
+	hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
+	hdr->sadb_msg_errno = 0;
+	hdr->sadb_msg_reserved = 0;
+	hdr->sadb_msg_seq = c->seq;
+	hdr->sadb_msg_pid = c->pid;
+
+	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
+
+	return 0;
+}
 
 static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
 {
-	struct sk_buff *out_skb;
-	struct sadb_msg *out_hdr;
 	struct xfrm_state *x;
 	int err;
+	struct km_event c;
 
 	xfrm_probe_algs();
 	
@@ -1254,6 +1326,7 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
 	if (IS_ERR(x))
 		return PTR_ERR(x);
 
+	xfrm_state_hold(x);
 	if (hdr->sadb_msg_type == SADB_ADD)
 		err = xfrm_state_add(x);
 	else
@@ -1265,27 +1338,23 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
 		return err;
 	}
 
-	out_skb = pfkey_xfrm_state2msg(x, 0, 3);
-	if (IS_ERR(out_skb))
-		return  PTR_ERR(out_skb); /* XXX Should we return 0 here ? */
-
-	out_hdr = (struct sadb_msg *) out_skb->data;
-	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
-	out_hdr->sadb_msg_type = hdr->sadb_msg_type;
-	out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto);
-	out_hdr->sadb_msg_errno = 0;
-	out_hdr->sadb_msg_reserved = 0;
-	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
-	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
-
-	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
+	if (hdr->sadb_msg_type == SADB_ADD)
+		c.event = XFRM_SAP_ADDED;
+	else
+		c.event = XFRM_SAP_UPDATED;
+	c.seq = hdr->sadb_msg_seq;
+	c.pid = hdr->sadb_msg_pid;
+	km_state_notify(x, &c);
+	xfrm_state_put(x);
 
-	return 0;
+	return err;
 }
 
 static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
 {
 	struct xfrm_state *x;
+	struct km_event c;
+	int err;
 
 	if (!ext_hdrs[SADB_EXT_SA-1] ||
 	    !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
@@ -1301,13 +1370,19 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 		return -EPERM;
 	}
 	
-	xfrm_state_delete(x);
-	xfrm_state_put(x);
+	err = xfrm_state_delete(x);
+	if (err < 0) {
+		xfrm_state_put(x);
+		return err;
+	}
 
-	pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, 
-			BROADCAST_ALL, sk);
+	c.seq = hdr->sadb_msg_seq;
+	c.pid = hdr->sadb_msg_pid;
+	c.event = XFRM_SAP_DELETED;
+	km_state_notify(x, &c);
+	xfrm_state_put(x);
 
-	return 0;
+	return err;
 }
 
 static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
@@ -1445,28 +1520,42 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 	return 0;
 }
 
+static int key_notify_sa_flush(struct km_event *c)
+{
+	struct sk_buff *skb;
+	struct sadb_msg *hdr;
+
+	skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
+	if (!skb)
+		return -ENOBUFS;
+	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
+	hdr->sadb_msg_satype = pfkey_proto2satype(c->data);
+	hdr->sadb_msg_seq = c->seq;
+	hdr->sadb_msg_pid = c->pid;
+	hdr->sadb_msg_version = PF_KEY_V2;
+	hdr->sadb_msg_errno = (uint8_t) 0;
+	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+
+	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
+
+	return 0;
+}
+
 static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
 {
 	unsigned proto;
-	struct sk_buff *skb_out;
-	struct sadb_msg *hdr_out;
+	struct km_event c;
 
 	proto = pfkey_satype2proto(hdr->sadb_msg_satype);
 	if (proto == 0)
 		return -EINVAL;
 
-	skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
-	if (!skb_out)
-		return -ENOBUFS;
-
 	xfrm_state_flush(proto);
-
-	hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
-	pfkey_hdr_dup(hdr_out, hdr);
-	hdr_out->sadb_msg_errno = (uint8_t) 0;
-	hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
-
-	pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL);
+	c.data = proto;
+	c.seq = hdr->sadb_msg_seq;
+	c.pid = hdr->sadb_msg_pid;
+	c.event = XFRM_SAP_FLUSHED;
+	km_state_notify(NULL, &c);
 
 	return 0;
 }
@@ -1859,6 +1948,35 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i
 	hdr->sadb_msg_reserved = atomic_read(&xp->refcnt);
 }
 
+static int key_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+	int err;
+
+	out_skb = pfkey_xfrm_policy2msg_prep(xp);
+	if (IS_ERR(out_skb)) {
+		err = PTR_ERR(out_skb);
+		goto out;
+	}
+	pfkey_xfrm_policy2msg(out_skb, xp, dir);
+
+	out_hdr = (struct sadb_msg *) out_skb->data;
+	out_hdr->sadb_msg_version = PF_KEY_V2;
+
+	if (c->data && c->event == XFRM_SAP_DELETED)
+		out_hdr->sadb_msg_type = SADB_X_SPDDELETE2;
+	else
+		out_hdr->sadb_msg_type = event2poltype(c->event);
+	out_hdr->sadb_msg_errno = 0;
+	out_hdr->sadb_msg_seq = c->seq;
+	out_hdr->sadb_msg_pid = c->pid;
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL);
+out:
+	return 0;
+
+}
+
 static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
 {
 	int err;
@@ -1866,8 +1984,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	struct sadb_address *sa;
 	struct sadb_x_policy *pol;
 	struct xfrm_policy *xp;
-	struct sk_buff *out_skb;
-	struct sadb_msg *out_hdr;
+	struct km_event c;
 
 	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
 				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -1935,31 +2052,23 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	    (err = parse_ipsecrequests(xp, pol)) < 0)
 		goto out;
 
-	out_skb = pfkey_xfrm_policy2msg_prep(xp);
-	if (IS_ERR(out_skb)) {
-		err =  PTR_ERR(out_skb);
-		goto out;
-	}
-
 	err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp,
 				 hdr->sadb_msg_type != SADB_X_SPDUPDATE);
 	if (err) {
-		kfree_skb(out_skb);
-		goto out;
+		kfree(xp);
+		return err;
 	}
 
-	pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
+	if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
+		c.event = XFRM_SAP_UPDATED;
+	else
+		c.event = XFRM_SAP_ADDED;
 
-	xfrm_pol_put(xp);
+	c.seq = hdr->sadb_msg_seq;
+	c.pid = hdr->sadb_msg_pid;
 
-	out_hdr = (struct sadb_msg *) out_skb->data;
-	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
-	out_hdr->sadb_msg_type = hdr->sadb_msg_type;
-	out_hdr->sadb_msg_satype = 0;
-	out_hdr->sadb_msg_errno = 0;
-	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
-	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
-	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
+	km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+	xfrm_pol_put(xp);
 	return 0;
 
 out:
@@ -1973,9 +2082,8 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 	struct sadb_address *sa;
 	struct sadb_x_policy *pol;
 	struct xfrm_policy *xp;
-	struct sk_buff *out_skb;
-	struct sadb_msg *out_hdr;
 	struct xfrm_selector sel;
+	struct km_event c;
 
 	if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1],
 				     ext_hdrs[SADB_EXT_ADDRESS_DST-1]) ||
@@ -2010,25 +2118,40 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 
 	err = 0;
 
+	c.seq = hdr->sadb_msg_seq;
+	c.pid = hdr->sadb_msg_pid;
+	c.event = XFRM_SAP_DELETED;
+	km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+
+	xfrm_pol_put(xp);
+	return err;
+}
+
+static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, struct sadb_msg *hdr, int dir)
+{
+	int err;
+	struct sk_buff *out_skb;
+	struct sadb_msg *out_hdr;
+	err = 0;
+
 	out_skb = pfkey_xfrm_policy2msg_prep(xp);
 	if (IS_ERR(out_skb)) {
 		err =  PTR_ERR(out_skb);
 		goto out;
 	}
-	pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
+	pfkey_xfrm_policy2msg(out_skb, xp, dir);
 
 	out_hdr = (struct sadb_msg *) out_skb->data;
 	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
-	out_hdr->sadb_msg_type = SADB_X_SPDDELETE;
+	out_hdr->sadb_msg_type = hdr->sadb_msg_type;
 	out_hdr->sadb_msg_satype = 0;
 	out_hdr->sadb_msg_errno = 0;
 	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
 	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
-	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
+	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk);
 	err = 0;
 
 out:
-	xfrm_pol_put(xp);
 	return err;
 }
 
@@ -2037,8 +2160,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	int err;
 	struct sadb_x_policy *pol;
 	struct xfrm_policy *xp;
-	struct sk_buff *out_skb;
-	struct sadb_msg *out_hdr;
+	struct km_event c;
 
 	if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL)
 		return -EINVAL;
@@ -2050,24 +2172,16 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 
 	err = 0;
 
-	out_skb = pfkey_xfrm_policy2msg_prep(xp);
-	if (IS_ERR(out_skb)) {
-		err =  PTR_ERR(out_skb);
-		goto out;
+	c.seq = hdr->sadb_msg_seq;
+	c.pid = hdr->sadb_msg_pid;
+	if (hdr->sadb_msg_type == SADB_X_SPDDELETE2) {
+		c.data = 1; // to signal pfkey of SADB_X_SPDDELETE2
+		c.event = XFRM_SAP_DELETED;
+		km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
+	} else {
+		err = key_pol_get_resp(sk, xp, hdr, pol->sadb_x_policy_dir-1);
 	}
-	pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1);
 
-	out_hdr = (struct sadb_msg *) out_skb->data;
-	out_hdr->sadb_msg_version = hdr->sadb_msg_version;
-	out_hdr->sadb_msg_type = hdr->sadb_msg_type;
-	out_hdr->sadb_msg_satype = 0;
-	out_hdr->sadb_msg_errno = 0;
-	out_hdr->sadb_msg_seq = hdr->sadb_msg_seq;
-	out_hdr->sadb_msg_pid = hdr->sadb_msg_pid;
-	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk);
-	err = 0;
-
-out:
 	xfrm_pol_put(xp);
 	return err;
 }
@@ -2102,22 +2216,34 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
 	return xfrm_policy_walk(dump_sp, &data);
 }
 
-static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+static int key_notify_policy_flush(struct km_event *c)
 {
 	struct sk_buff *skb_out;
-	struct sadb_msg *hdr_out;
+	struct sadb_msg *hdr;
 
-	skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL);
+	skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC);
 	if (!skb_out)
 		return -ENOBUFS;
+	hdr = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
+	hdr->sadb_msg_seq = c->seq;
+	hdr->sadb_msg_pid = c->pid;
+	hdr->sadb_msg_version = PF_KEY_V2;
+	hdr->sadb_msg_errno = (uint8_t) 0;
+	hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
+	pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL);
+	return 0;
 
-	xfrm_policy_flush();
+}
+
+static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs)
+{
+	struct km_event c;
 
-	hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg));
-	pfkey_hdr_dup(hdr_out, hdr);
-	hdr_out->sadb_msg_errno = (uint8_t) 0;
-	hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t));
-	pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL);
+	xfrm_policy_flush();
+	c.event = XFRM_SAP_FLUSHED;
+	c.pid = hdr->sadb_msg_pid;
+	c.seq = hdr->sadb_msg_seq;
+	km_policy_notify(NULL, 0, &c);
 
 	return 0;
 }
@@ -2317,11 +2443,23 @@ static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t)
 	}
 }
 
-static int pfkey_send_notify(struct xfrm_state *x, int hard)
+static int key_notify_policy_expire(struct xfrm_policy *xp, struct km_event *c)
+{
+	return 0;
+}
+
+static int key_notify_sa_expire(struct xfrm_state *x, struct km_event *c)
 {
 	struct sk_buff *out_skb;
 	struct sadb_msg *out_hdr;
-	int hsc = (hard ? 2 : 1);
+	int hard;
+	int hsc;
+
+	hard = c->data;
+	if (hard)
+		hsc = 2;
+	else
+		hsc = 1;
 
 	out_skb = pfkey_xfrm_state2msg(x, 0, hsc);
 	if (IS_ERR(out_skb))
@@ -2340,6 +2478,44 @@ static int pfkey_send_notify(struct xfrm_state *x, int hard)
 	return 0;
 }
 
+static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
+{
+	switch (c->event) {
+	case XFRM_SAP_EXPIRED:
+		return key_notify_sa_expire(x, c);
+	case XFRM_SAP_DELETED:
+	case XFRM_SAP_ADDED:
+	case XFRM_SAP_UPDATED:
+		return key_notify_sa(x, c);
+	case XFRM_SAP_FLUSHED:
+		return key_notify_sa_flush(c);
+	default:
+		printk("pfkey: Unknown SA event %d\n", c->event);
+		break;
+	}
+
+	return 0;
+}
+
+static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+	switch (c->event) {
+	case XFRM_SAP_EXPIRED:
+		return key_notify_policy_expire(xp, c);
+	case XFRM_SAP_DELETED:
+	case XFRM_SAP_ADDED:
+	case XFRM_SAP_UPDATED:
+		return key_notify_policy(xp, dir, c);
+	case XFRM_SAP_FLUSHED:
+		return key_notify_policy_flush(c);
+	default:
+		printk("pfkey: Unknown policy event %d\n", c->event);
+		break;
+	}
+
+	return 0;
+}
+
 static u32 get_acqseq(void)
 {
 	u32 res;
@@ -2856,6 +3032,7 @@ static struct xfrm_mgr pfkeyv2_mgr =
 	.acquire	= pfkey_send_acquire,
 	.compile_policy	= pfkey_compile_policy,
 	.new_mapping	= pfkey_send_new_mapping,
+	.notify_policy	= pfkey_send_policy_notify,
 };
 
 static void __exit ipsec_pfkey_exit(void)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index d11747c2a76..918a94c552a 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -50,7 +50,7 @@ static DEFINE_SPINLOCK(xfrm_state_gc_lock);
 
 static int xfrm_state_gc_flush_bundles;
 
-static void __xfrm_state_delete(struct xfrm_state *x);
+static int __xfrm_state_delete(struct xfrm_state *x);
 
 static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family);
 static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
@@ -215,8 +215,10 @@ void __xfrm_state_destroy(struct xfrm_state *x)
 }
 EXPORT_SYMBOL(__xfrm_state_destroy);
 
-static void __xfrm_state_delete(struct xfrm_state *x)
+static int __xfrm_state_delete(struct xfrm_state *x)
 {
+	int err = -ESRCH;
+
 	if (x->km.state != XFRM_STATE_DEAD) {
 		x->km.state = XFRM_STATE_DEAD;
 		spin_lock(&xfrm_state_lock);
@@ -245,14 +247,21 @@ static void __xfrm_state_delete(struct xfrm_state *x)
 		 * is what we are dropping here.
 		 */
 		atomic_dec(&x->refcnt);
+		err = 0;
 	}
+
+	return err;
 }
 
-void xfrm_state_delete(struct xfrm_state *x)
+int xfrm_state_delete(struct xfrm_state *x)
 {
+	int err;
+
 	spin_lock_bh(&x->lock);
-	__xfrm_state_delete(x);
+	err = __xfrm_state_delete(x);
 	spin_unlock_bh(&x->lock);
+
+	return err;
 }
 EXPORT_SYMBOL(xfrm_state_delete);
 
@@ -796,34 +805,60 @@ EXPORT_SYMBOL(xfrm_replay_advance);
 static struct list_head xfrm_km_list = LIST_HEAD_INIT(xfrm_km_list);
 static DEFINE_RWLOCK(xfrm_km_lock);
 
-static void km_state_expired(struct xfrm_state *x, int hard)
+void km_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
 {
 	struct xfrm_mgr *km;
 
-	if (hard)
-		x->km.state = XFRM_STATE_EXPIRED;
-	else
-		x->km.dying = 1;
+	read_lock(&xfrm_km_lock);
+	list_for_each_entry(km, &xfrm_km_list, list)
+		if (km->notify_policy)
+			km->notify_policy(xp, dir, c);
+	read_unlock(&xfrm_km_lock);
+}
 
+void km_state_notify(struct xfrm_state *x, struct km_event *c)
+{
+	struct xfrm_mgr *km;
 	read_lock(&xfrm_km_lock);
 	list_for_each_entry(km, &xfrm_km_list, list)
-		km->notify(x, hard);
+		if (km->notify)
+			km->notify(x, c);
 	read_unlock(&xfrm_km_lock);
+}
+
+EXPORT_SYMBOL(km_policy_notify);
+EXPORT_SYMBOL(km_state_notify);
+
+static void km_state_expired(struct xfrm_state *x, int hard)
+{
+	struct km_event c;
+
+	if (hard)
+		x->km.state = XFRM_STATE_EXPIRED;
+	else
+		x->km.dying = 1;
+	c.data = hard;
+	c.event = XFRM_SAP_EXPIRED;
+	km_state_notify(x, &c);
 
 	if (hard)
 		wake_up(&km_waitq);
 }
 
+/*
+ * We send to all registered managers regardless of failure
+ * We are happy with one success
+*/
 static int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
 {
-	int err = -EINVAL;
+	int err = -EINVAL, acqret;
 	struct xfrm_mgr *km;
 
 	read_lock(&xfrm_km_lock);
 	list_for_each_entry(km, &xfrm_km_list, list) {
-		err = km->acquire(x, t, pol, XFRM_POLICY_OUT);
-		if (!err)
-			break;
+		acqret = km->acquire(x, t, pol, XFRM_POLICY_OUT);
+		if (!acqret)
+			err = acqret;
 	}
 	read_unlock(&xfrm_km_lock);
 	return err;
@@ -848,13 +883,12 @@ EXPORT_SYMBOL(km_new_mapping);
 
 void km_policy_expired(struct xfrm_policy *pol, int dir, int hard)
 {
-	struct xfrm_mgr *km;
+	struct km_event c;
 
-	read_lock(&xfrm_km_lock);
-	list_for_each_entry(km, &xfrm_km_list, list)
-		if (km->notify_policy)
-			km->notify_policy(pol, dir, hard);
-	read_unlock(&xfrm_km_lock);
+	c.data = hard;
+	c.data = hard;
+	c.event = XFRM_SAP_EXPIRED;
+	km_policy_notify(pol, dir, &c);
 
 	if (hard)
 		wake_up(&km_waitq);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 97509011c27..bd8e6882c08 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -277,6 +277,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	struct xfrm_usersa_info *p = NLMSG_DATA(nlh);
 	struct xfrm_state *x;
 	int err;
+	struct km_event c;
 
 	err = verify_newsa_info(p, (struct rtattr **) xfrma);
 	if (err)
@@ -286,6 +287,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	if (!x)
 		return err;
 
+	xfrm_state_hold(x);
 	if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
 		err = xfrm_state_add(x);
 	else
@@ -294,14 +296,27 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	if (err < 0) {
 		x->km.state = XFRM_STATE_DEAD;
 		xfrm_state_put(x);
+		return err;
 	}
 
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
+		c.event = XFRM_SAP_ADDED;
+	else
+		c.event = XFRM_SAP_UPDATED;
+
+	km_state_notify(x, &c);
+	xfrm_state_put(x);
+
 	return err;
 }
 
 static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 {
 	struct xfrm_state *x;
+	int err;
+	struct km_event c;
 	struct xfrm_usersa_id *p = NLMSG_DATA(nlh);
 
 	x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family);
@@ -313,10 +328,19 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 		return -EPERM;
 	}
 
-	xfrm_state_delete(x);
+	err = xfrm_state_delete(x);
+	if (err < 0) {
+		xfrm_state_put(x);
+		return err;
+	}
+
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	c.event = XFRM_SAP_DELETED;
+	km_state_notify(x, &c);
 	xfrm_state_put(x);
 
-	return 0;
+	return err;
 }
 
 static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p)
@@ -681,6 +705,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 {
 	struct xfrm_userpolicy_info *p = NLMSG_DATA(nlh);
 	struct xfrm_policy *xp;
+	struct km_event c;
 	int err;
 	int excl;
 
@@ -692,6 +717,10 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 	if (!xp)
 		return err;
 
+	/* shouldnt excl be based on nlh flags??
+	 * Aha! this is anti-netlink really i.e  more pfkey derived
+	 * in netlink excl is a flag and you wouldnt need
+	 * a type XFRM_MSG_UPDPOLICY - JHS */
 	excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
 	err = xfrm_policy_insert(p->dir, xp, excl);
 	if (err) {
@@ -699,6 +728,15 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 		return err;
 	}
 
+	if (!excl)
+		c.event = XFRM_SAP_UPDATED;
+	else
+		c.event = XFRM_SAP_ADDED;
+
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	km_policy_notify(xp, p->dir, &c);
+
 	xfrm_pol_put(xp);
 
 	return 0;
@@ -816,6 +854,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 	struct xfrm_policy *xp;
 	struct xfrm_userpolicy_id *p;
 	int err;
+	struct km_event c;
 	int delete;
 
 	p = NLMSG_DATA(nlh);
@@ -843,6 +882,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 					      NETLINK_CB(skb).pid,
 					      MSG_DONTWAIT);
 		}
+	} else {
+		c.event = XFRM_SAP_DELETED;
+		c.seq = nlh->nlmsg_seq;
+		c.pid = nlh->nlmsg_pid;
+		km_policy_notify(xp, p->dir, &c);
 	}
 
 	xfrm_pol_put(xp);
@@ -852,15 +896,28 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 
 static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 {
+	struct km_event c;
 	struct xfrm_usersa_flush *p = NLMSG_DATA(nlh);
 
 	xfrm_state_flush(p->proto);
+	c.data = p->proto;
+	c.event = XFRM_SAP_FLUSHED;
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	km_state_notify(NULL, &c);
+
 	return 0;
 }
 
 static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 {
+	struct km_event c;
+
 	xfrm_policy_flush();
+	c.event = XFRM_SAP_FLUSHED;
+	c.seq = nlh->nlmsg_seq;
+	c.pid = nlh->nlmsg_pid;
+	km_policy_notify(NULL, 0, &c);
 	return 0;
 }
 
@@ -1069,10 +1126,12 @@ nlmsg_failure:
 	return -1;
 }
 
-static int xfrm_send_state_notify(struct xfrm_state *x, int hard)
+static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
 {
 	struct sk_buff *skb;
+	int hard = c ->data;
 
+	/* fix to do alloc using NLM macros */
 	skb = alloc_skb(sizeof(struct xfrm_user_expire) + 16, GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
@@ -1085,6 +1144,122 @@ static int xfrm_send_state_notify(struct xfrm_state *x, int hard)
 	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
 }
 
+static int xfrm_notify_sa_flush(struct km_event *c)
+{
+	struct xfrm_usersa_flush *p;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	unsigned char *b;
+	int len = NLMSG_LENGTH(sizeof(struct xfrm_usersa_flush));
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq,
+			XFRM_MSG_FLUSHSA, sizeof(*p));
+	nlh->nlmsg_flags = 0;
+
+	p = NLMSG_DATA(nlh);
+	p->proto = c->data;
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int inline xfrm_sa_len(struct xfrm_state *x)
+{
+	int l = NLMSG_LENGTH(sizeof(struct xfrm_usersa_info));
+	if (x->aalg)
+		l += RTA_SPACE(sizeof(*x->aalg) + (x->aalg->alg_key_len+7)/8);
+	if (x->ealg)
+		l += RTA_SPACE(sizeof(*x->ealg) + (x->ealg->alg_key_len+7)/8);
+	if (x->calg)
+		l += RTA_SPACE(sizeof(*x->calg));
+	if (x->encap)
+		l += RTA_SPACE(sizeof(*x->encap));
+
+	return l;
+}
+
+static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
+{
+	struct xfrm_usersa_info *p;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	u32 nlt;
+	unsigned char *b;
+	int len = xfrm_sa_len(x);
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+	if (c->event == XFRM_SAP_ADDED)
+		nlt = XFRM_MSG_NEWSA;
+	else if (c->event == XFRM_SAP_UPDATED)
+		nlt = XFRM_MSG_UPDSA;
+	else if (c->event == XFRM_SAP_DELETED)
+		nlt = XFRM_MSG_DELSA;
+	else
+		goto nlmsg_failure;
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, nlt, sizeof(*p));
+	nlh->nlmsg_flags = 0;
+
+	p = NLMSG_DATA(nlh);
+	copy_to_user_state(x, p);
+
+	if (x->aalg)
+		RTA_PUT(skb, XFRMA_ALG_AUTH,
+			sizeof(*(x->aalg))+(x->aalg->alg_key_len+7)/8, x->aalg);
+	if (x->ealg)
+		RTA_PUT(skb, XFRMA_ALG_CRYPT,
+			sizeof(*(x->ealg))+(x->ealg->alg_key_len+7)/8, x->ealg);
+	if (x->calg)
+		RTA_PUT(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg);
+
+	if (x->encap)
+		RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap);
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_SA, GFP_ATOMIC);
+
+nlmsg_failure:
+rtattr_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int xfrm_send_state_notify(struct xfrm_state *x, struct km_event *c)
+{
+
+	switch (c->event) {
+	case XFRM_SAP_EXPIRED:
+		return xfrm_exp_state_notify(x, c);
+	case XFRM_SAP_DELETED:
+	case XFRM_SAP_UPDATED:
+	case XFRM_SAP_ADDED:
+		return xfrm_notify_sa(x, c);
+	case XFRM_SAP_FLUSHED:
+		return xfrm_notify_sa_flush(c);
+	default:
+		 printk("xfrm_user: Unknown SA event %d\n", c->event);
+		 break;
+	}
+
+	return 0;
+
+}
+
 static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
 			 struct xfrm_tmpl *xt, struct xfrm_policy *xp,
 			 int dir)
@@ -1218,7 +1393,7 @@ nlmsg_failure:
 	return -1;
 }
 
-static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
+static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
 {
 	struct sk_buff *skb;
 	size_t len;
@@ -1229,7 +1404,7 @@ static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
 	if (skb == NULL)
 		return -ENOMEM;
 
-	if (build_polexpire(skb, xp, dir, hard) < 0)
+	if (build_polexpire(skb, xp, dir, c->data) < 0)
 		BUG();
 
 	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
@@ -1237,6 +1412,93 @@ static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
 	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
 }
 
+static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+	struct xfrm_userpolicy_info *p;
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	u32 nlt = 0 ;
+	unsigned char *b;
+	int len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
+	len += NLMSG_SPACE(sizeof(struct xfrm_userpolicy_info));
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+	if (c->event == XFRM_SAP_ADDED)
+		nlt = XFRM_MSG_NEWPOLICY;
+	else if (c->event == XFRM_SAP_UPDATED)
+		nlt = XFRM_MSG_UPDPOLICY;
+	else if (c->event == XFRM_SAP_DELETED)
+		nlt = XFRM_MSG_DELPOLICY;
+	else
+		goto nlmsg_failure;
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, nlt, sizeof(*p));
+
+	p = NLMSG_DATA(nlh);
+
+	nlh->nlmsg_flags = 0;
+
+	copy_to_user_policy(xp, p, dir);
+	if (copy_to_user_tmpl(xp, skb) < 0)
+		goto nlmsg_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int xfrm_notify_policy_flush(struct km_event *c)
+{
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+	unsigned char *b;
+	int len = NLMSG_LENGTH(0);
+
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	b = skb->tail;
+
+
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, XFRM_MSG_FLUSHPOLICY, 0);
+
+	nlh->nlmsg_len = skb->tail - b;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
+
+nlmsg_failure:
+	kfree_skb(skb);
+	return -1;
+}
+
+static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
+{
+
+	switch (c->event) {
+	case XFRM_SAP_ADDED:
+	case XFRM_SAP_UPDATED:
+	case XFRM_SAP_DELETED:
+		return xfrm_notify_policy(xp, dir, c);
+	case XFRM_SAP_FLUSHED:
+		return xfrm_notify_policy_flush(c);
+	case XFRM_SAP_EXPIRED:
+		return xfrm_exp_policy_notify(xp, dir, c);
+	default:
+		printk("xfrm_user: Unknown Policy event %d\n", c->event);
+	}
+
+	return 0;
+
+}
+
 static struct xfrm_mgr netlink_mgr = {
 	.id		= "netlink",
 	.notify		= xfrm_send_state_notify,
-- 
cgit v1.2.3


From 4666faab095230ec8aa62da6c33391287f281154 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 18 Jun 2005 22:43:22 -0700
Subject: [IPSEC] Kill spurious hard expire messages

This patch ensures that the hard state/policy expire notifications are
only sent when the state/policy is successfully removed from their
respective tables.

As it is, it's possible for a state/policy to both expire through
reaching a hard limit, as well as being deleted by the user.

Note that this behaviour isn't actually forbidden by RFC 2367.
However, it is a quality of implementation issue.

As an added bonus, the restructuring in this patch will help
eventually in moving the expire notifications from softirq
context into process context, thus improving their reliability.

One important side-effect from this change is that SAs reaching
their hard byte/packet limits are now deleted immediately, just
like SAs that have reached their hard time limits.

Previously they were announced immediately but only deleted after
30 seconds.

This is bad because it prevents the system from issuing an ACQUIRE
command until the existing state was deleted by the user or expires
after the time is up.

In the scenario where the expire notification was lost this introduces
a 30 second delay into the system for no good reason.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/xfrm/xfrm_policy.c |  8 +++++---
 net/xfrm/xfrm_state.c  | 16 +++++++---------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d07f5ce3182..0a4260719a1 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -216,8 +216,8 @@ out:
 
 expired:
 	read_unlock(&xp->lock);
-	km_policy_expired(xp, dir, 1);
-	xfrm_policy_delete(xp, dir);
+	if (!xfrm_policy_delete(xp, dir))
+		km_policy_expired(xp, dir, 1);
 	xfrm_pol_put(xp);
 }
 
@@ -555,7 +555,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 	return NULL;
 }
 
-void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
+int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 {
 	write_lock_bh(&xfrm_policy_lock);
 	pol = __xfrm_policy_unlink(pol, dir);
@@ -564,7 +564,9 @@ void xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 		if (dir < XFRM_POLICY_MAX)
 			atomic_inc(&flow_cache_genid);
 		xfrm_policy_kill(pol);
+		return 0;
 	}
+	return -ENOENT;
 }
 
 int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 918a94c552a..94f7416a4ab 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -154,6 +154,7 @@ static void xfrm_timer_handler(unsigned long data)
 			next = tmo;
 	}
 
+	x->km.dying = warn;
 	if (warn)
 		km_state_expired(x, 0);
 resched:
@@ -169,9 +170,8 @@ expired:
 		next = 2;
 		goto resched;
 	}
-	if (x->id.spi != 0)
+	if (!__xfrm_state_delete(x) && x->id.spi)
 		km_state_expired(x, 1);
-	__xfrm_state_delete(x);
 
 out:
 	spin_unlock(&x->lock);
@@ -566,16 +566,18 @@ int xfrm_state_check_expire(struct xfrm_state *x)
 
 	if (x->curlft.bytes >= x->lft.hard_byte_limit ||
 	    x->curlft.packets >= x->lft.hard_packet_limit) {
-		km_state_expired(x, 1);
-		if (!mod_timer(&x->timer, jiffies + XFRM_ACQ_EXPIRES*HZ))
+		x->km.state = XFRM_STATE_EXPIRED;
+		if (!mod_timer(&x->timer, jiffies))
 			xfrm_state_hold(x);
 		return -EINVAL;
 	}
 
 	if (!x->km.dying &&
 	    (x->curlft.bytes >= x->lft.soft_byte_limit ||
-	     x->curlft.packets >= x->lft.soft_packet_limit))
+	     x->curlft.packets >= x->lft.soft_packet_limit)) {
+		x->km.dying = 1;
 		km_state_expired(x, 0);
+	}
 	return 0;
 }
 EXPORT_SYMBOL(xfrm_state_check_expire);
@@ -833,10 +835,6 @@ static void km_state_expired(struct xfrm_state *x, int hard)
 {
 	struct km_event c;
 
-	if (hard)
-		x->km.state = XFRM_STATE_EXPIRED;
-	else
-		x->km.dying = 1;
 	c.data = hard;
 	c.event = XFRM_SAP_EXPIRED;
 	km_state_notify(x, &c);
-- 
cgit v1.2.3


From 4f09f0bbc1cb3c74e8f2047ad4be201a059829ee Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 18 Jun 2005 22:43:43 -0700
Subject: [IPSEC] Fix xfrm to pfkey SA state conversion

This patch adjusts the SA state conversion in af_key such that
XFRM_STATE_ERROR/XFRM_STATE_DEAD will be converted to SADB_STATE_DEAD
instead of SADB_STATE_DYING.

According to RFC 2367, SADB_STATE_DYING SAs can be turned into
mature ones through updating their lifetime settings.  Since SAs
which are in the states XFRM_STATE_ERROR/XFRM_STATE_DEAD cannot
be resurrected, this value is unsuitable.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/key/af_key.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index d086c117f5f..560c93c1089 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -656,13 +656,18 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
 	sa->sadb_sa_exttype = SADB_EXT_SA;
 	sa->sadb_sa_spi = x->id.spi;
 	sa->sadb_sa_replay = x->props.replay_window;
-	sa->sadb_sa_state = SADB_SASTATE_DYING;
-	if (x->km.state == XFRM_STATE_VALID && !x->km.dying)
-		sa->sadb_sa_state = SADB_SASTATE_MATURE;
-	else if (x->km.state == XFRM_STATE_ACQ)
+	switch (x->km.state) {
+	case XFRM_STATE_VALID:
+		sa->sadb_sa_state = x->km.dying ?
+			SADB_SASTATE_DYING : SADB_SASTATE_MATURE;
+		break;
+	case XFRM_STATE_ACQ:
 		sa->sadb_sa_state = SADB_SASTATE_LARVAL;
-	else if (x->km.state == XFRM_STATE_EXPIRED)
+		break;
+	default:
 		sa->sadb_sa_state = SADB_SASTATE_DEAD;
+		break;
+	}
 	sa->sadb_sa_auth = 0;
 	if (x->aalg) {
 		struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
-- 
cgit v1.2.3


From bf08867f91a43aa3ba2e4598c06c4769a6cdddf6 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 18 Jun 2005 22:44:00 -0700
Subject: [IPSEC] Turn km_event.data into a union

This patch turns km_event.data into a union.  This makes code that
uses it clearer.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/key/af_key.c      | 17 +++++------------
 net/xfrm/xfrm_state.c |  5 ++---
 net/xfrm/xfrm_user.c  |  9 ++++-----
 3 files changed, 11 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 560c93c1089..3fae5c4b48f 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1293,13 +1293,6 @@ static int key_notify_sa(struct xfrm_state *x, struct km_event *c)
 	if (c->event == XFRM_SAP_DELETED)
 		hsc = 0;
 
-	if (c->event == XFRM_SAP_EXPIRED) {
-		if (c->data)
-			hsc = 2;
-		else
-			hsc = 1;
-	}
-
 	skb = pfkey_xfrm_state2msg(x, 0, hsc);
 
 	if (IS_ERR(skb))
@@ -1534,7 +1527,7 @@ static int key_notify_sa_flush(struct km_event *c)
 	if (!skb)
 		return -ENOBUFS;
 	hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg));
-	hdr->sadb_msg_satype = pfkey_proto2satype(c->data);
+	hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto);
 	hdr->sadb_msg_seq = c->seq;
 	hdr->sadb_msg_pid = c->pid;
 	hdr->sadb_msg_version = PF_KEY_V2;
@@ -1556,7 +1549,7 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hd
 		return -EINVAL;
 
 	xfrm_state_flush(proto);
-	c.data = proto;
+	c.data.proto = proto;
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
 	c.event = XFRM_SAP_FLUSHED;
@@ -1969,7 +1962,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c
 	out_hdr = (struct sadb_msg *) out_skb->data;
 	out_hdr->sadb_msg_version = PF_KEY_V2;
 
-	if (c->data && c->event == XFRM_SAP_DELETED)
+	if (c->data.byid && c->event == XFRM_SAP_DELETED)
 		out_hdr->sadb_msg_type = SADB_X_SPDDELETE2;
 	else
 		out_hdr->sadb_msg_type = event2poltype(c->event);
@@ -2180,7 +2173,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
 	if (hdr->sadb_msg_type == SADB_X_SPDDELETE2) {
-		c.data = 1; // to signal pfkey of SADB_X_SPDDELETE2
+		c.data.byid = 1;
 		c.event = XFRM_SAP_DELETED;
 		km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
 	} else {
@@ -2460,7 +2453,7 @@ static int key_notify_sa_expire(struct xfrm_state *x, struct km_event *c)
 	int hard;
 	int hsc;
 
-	hard = c->data;
+	hard = c->data.hard;
 	if (hard)
 		hsc = 2;
 	else
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 94f7416a4ab..e068bd72105 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -835,7 +835,7 @@ static void km_state_expired(struct xfrm_state *x, int hard)
 {
 	struct km_event c;
 
-	c.data = hard;
+	c.data.hard = hard;
 	c.event = XFRM_SAP_EXPIRED;
 	km_state_notify(x, &c);
 
@@ -883,8 +883,7 @@ void km_policy_expired(struct xfrm_policy *pol, int dir, int hard)
 {
 	struct km_event c;
 
-	c.data = hard;
-	c.data = hard;
+	c.data.hard = hard;
 	c.event = XFRM_SAP_EXPIRED;
 	km_policy_notify(pol, dir, &c);
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index bd8e6882c08..4d3237d08ff 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -900,7 +900,7 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma
 	struct xfrm_usersa_flush *p = NLMSG_DATA(nlh);
 
 	xfrm_state_flush(p->proto);
-	c.data = p->proto;
+	c.data.proto = p->proto;
 	c.event = XFRM_SAP_FLUSHED;
 	c.seq = nlh->nlmsg_seq;
 	c.pid = nlh->nlmsg_pid;
@@ -1129,14 +1129,13 @@ nlmsg_failure:
 static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
 {
 	struct sk_buff *skb;
-	int hard = c ->data;
 
 	/* fix to do alloc using NLM macros */
 	skb = alloc_skb(sizeof(struct xfrm_user_expire) + 16, GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
 
-	if (build_expire(skb, x, hard) < 0)
+	if (build_expire(skb, x, c->data.hard) < 0)
 		BUG();
 
 	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
@@ -1162,7 +1161,7 @@ static int xfrm_notify_sa_flush(struct km_event *c)
 	nlh->nlmsg_flags = 0;
 
 	p = NLMSG_DATA(nlh);
-	p->proto = c->data;
+	p->proto = c->data.proto;
 
 	nlh->nlmsg_len = skb->tail - b;
 
@@ -1404,7 +1403,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
 	if (skb == NULL)
 		return -ENOMEM;
 
-	if (build_polexpire(skb, xp, dir, c->data) < 0)
+	if (build_polexpire(skb, xp, dir, c->data.hard) < 0)
 		BUG();
 
 	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
-- 
cgit v1.2.3


From e7443892f656d760ec1b9d92567178c87e100f4a Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 18 Jun 2005 22:44:18 -0700
Subject: [IPSEC] Set byid for km_event in xfrm_get_policy

This patch fixes policy deletion in xfrm_user so that it sets
km_event.data.byid.  This puts xfrm_user on par with what af_key
does in this case.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/xfrm/xfrm_user.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 4d3237d08ff..ff6fc610730 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -883,6 +883,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 					      MSG_DONTWAIT);
 		}
 	} else {
+		c.data.byid = p->index;
 		c.event = XFRM_SAP_DELETED;
 		c.seq = nlh->nlmsg_seq;
 		c.pid = nlh->nlmsg_pid;
-- 
cgit v1.2.3


From f60f6b8f70c756fc786d68f02ec17a1e84db645f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 18 Jun 2005 22:44:37 -0700
Subject: [IPSEC] Use XFRM_MSG_* instead of XFRM_SAP_*

This patch removes XFRM_SAP_* and converts them over to XFRM_MSG_*.
The netlink interface is meant to map directly onto the underlying
xfrm subsystem.  Therefore rather than using a new independent
representation for the events we can simply use the existing ones
from xfrm_user.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 net/key/af_key.c      | 60 ++++++++++++++++++++++++------------------------
 net/xfrm/xfrm_state.c |  4 ++--
 net/xfrm/xfrm_user.c  | 63 +++++++++++++++------------------------------------
 3 files changed, 50 insertions(+), 77 deletions(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 3fae5c4b48f..577f0bb5bb3 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1248,13 +1248,13 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, struct sadb_msg *
 static inline int event2poltype(int event)
 {
 	switch (event) {
-	case XFRM_SAP_DELETED:
+	case XFRM_MSG_DELPOLICY:
 		return SADB_X_SPDDELETE;
-	case XFRM_SAP_ADDED:
+	case XFRM_MSG_NEWPOLICY:
 		return SADB_X_SPDADD;
-	case XFRM_SAP_UPDATED:
+	case XFRM_MSG_UPDPOLICY:
 		return SADB_X_SPDUPDATE;
-	case XFRM_SAP_EXPIRED:
+	case XFRM_MSG_POLEXPIRE:
 	//	return SADB_X_SPDEXPIRE;
 	default:
 		printk("pfkey: Unknown policy event %d\n", event);
@@ -1267,13 +1267,13 @@ static inline int event2poltype(int event)
 static inline int event2keytype(int event)
 {
 	switch (event) {
-	case XFRM_SAP_DELETED:
+	case XFRM_MSG_DELSA:
 		return SADB_DELETE;
-	case XFRM_SAP_ADDED:
+	case XFRM_MSG_NEWSA:
 		return SADB_ADD;
-	case XFRM_SAP_UPDATED:
+	case XFRM_MSG_UPDSA:
 		return SADB_UPDATE;
-	case XFRM_SAP_EXPIRED:
+	case XFRM_MSG_EXPIRE:
 		return SADB_EXPIRE;
 	default:
 		printk("pfkey: Unknown SA event %d\n", event);
@@ -1290,7 +1290,7 @@ static int key_notify_sa(struct xfrm_state *x, struct km_event *c)
 	struct sadb_msg *hdr;
 	int hsc = 3;
 
-	if (c->event == XFRM_SAP_DELETED)
+	if (c->event == XFRM_MSG_DELSA)
 		hsc = 0;
 
 	skb = pfkey_xfrm_state2msg(x, 0, hsc);
@@ -1337,9 +1337,9 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
 	}
 
 	if (hdr->sadb_msg_type == SADB_ADD)
-		c.event = XFRM_SAP_ADDED;
+		c.event = XFRM_MSG_NEWSA;
 	else
-		c.event = XFRM_SAP_UPDATED;
+		c.event = XFRM_MSG_UPDSA;
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
 	km_state_notify(x, &c);
@@ -1376,7 +1376,7 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
-	c.event = XFRM_SAP_DELETED;
+	c.event = XFRM_MSG_DELSA;
 	km_state_notify(x, &c);
 	xfrm_state_put(x);
 
@@ -1552,7 +1552,7 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hd
 	c.data.proto = proto;
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
-	c.event = XFRM_SAP_FLUSHED;
+	c.event = XFRM_MSG_FLUSHSA;
 	km_state_notify(NULL, &c);
 
 	return 0;
@@ -1962,7 +1962,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c
 	out_hdr = (struct sadb_msg *) out_skb->data;
 	out_hdr->sadb_msg_version = PF_KEY_V2;
 
-	if (c->data.byid && c->event == XFRM_SAP_DELETED)
+	if (c->data.byid && c->event == XFRM_MSG_DELPOLICY)
 		out_hdr->sadb_msg_type = SADB_X_SPDDELETE2;
 	else
 		out_hdr->sadb_msg_type = event2poltype(c->event);
@@ -2058,9 +2058,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	}
 
 	if (hdr->sadb_msg_type == SADB_X_SPDUPDATE)
-		c.event = XFRM_SAP_UPDATED;
-	else
-		c.event = XFRM_SAP_ADDED;
+		c.event = XFRM_MSG_UPDPOLICY;
+	else 
+		c.event = XFRM_MSG_NEWPOLICY;
 
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
@@ -2118,7 +2118,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
-	c.event = XFRM_SAP_DELETED;
+	c.event = XFRM_MSG_DELPOLICY;
 	km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
 
 	xfrm_pol_put(xp);
@@ -2174,7 +2174,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h
 	c.pid = hdr->sadb_msg_pid;
 	if (hdr->sadb_msg_type == SADB_X_SPDDELETE2) {
 		c.data.byid = 1;
-		c.event = XFRM_SAP_DELETED;
+		c.event = XFRM_MSG_DELPOLICY;
 		km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c);
 	} else {
 		err = key_pol_get_resp(sk, xp, hdr, pol->sadb_x_policy_dir-1);
@@ -2238,7 +2238,7 @@ static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg
 	struct km_event c;
 
 	xfrm_policy_flush();
-	c.event = XFRM_SAP_FLUSHED;
+	c.event = XFRM_MSG_FLUSHPOLICY;
 	c.pid = hdr->sadb_msg_pid;
 	c.seq = hdr->sadb_msg_seq;
 	km_policy_notify(NULL, 0, &c);
@@ -2479,13 +2479,13 @@ static int key_notify_sa_expire(struct xfrm_state *x, struct km_event *c)
 static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
 {
 	switch (c->event) {
-	case XFRM_SAP_EXPIRED:
+	case XFRM_MSG_EXPIRE:
 		return key_notify_sa_expire(x, c);
-	case XFRM_SAP_DELETED:
-	case XFRM_SAP_ADDED:
-	case XFRM_SAP_UPDATED:
+	case XFRM_MSG_DELSA:
+	case XFRM_MSG_NEWSA:
+	case XFRM_MSG_UPDSA:
 		return key_notify_sa(x, c);
-	case XFRM_SAP_FLUSHED:
+	case XFRM_MSG_FLUSHSA:
 		return key_notify_sa_flush(c);
 	default:
 		printk("pfkey: Unknown SA event %d\n", c->event);
@@ -2498,13 +2498,13 @@ static int pfkey_send_notify(struct xfrm_state *x, struct km_event *c)
 static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_event *c)
 {
 	switch (c->event) {
-	case XFRM_SAP_EXPIRED:
+	case XFRM_MSG_POLEXPIRE:
 		return key_notify_policy_expire(xp, c);
-	case XFRM_SAP_DELETED:
-	case XFRM_SAP_ADDED:
-	case XFRM_SAP_UPDATED:
+	case XFRM_MSG_DELPOLICY:
+	case XFRM_MSG_NEWPOLICY:
+	case XFRM_MSG_UPDPOLICY:
 		return key_notify_policy(xp, dir, c);
-	case XFRM_SAP_FLUSHED:
+	case XFRM_MSG_FLUSHPOLICY:
 		return key_notify_policy_flush(c);
 	default:
 		printk("pfkey: Unknown policy event %d\n", c->event);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index e068bd72105..2537f26f097 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -836,7 +836,7 @@ static void km_state_expired(struct xfrm_state *x, int hard)
 	struct km_event c;
 
 	c.data.hard = hard;
-	c.event = XFRM_SAP_EXPIRED;
+	c.event = XFRM_MSG_EXPIRE;
 	km_state_notify(x, &c);
 
 	if (hard)
@@ -884,7 +884,7 @@ void km_policy_expired(struct xfrm_policy *pol, int dir, int hard)
 	struct km_event c;
 
 	c.data.hard = hard;
-	c.event = XFRM_SAP_EXPIRED;
+	c.event = XFRM_MSG_POLEXPIRE;
 	km_policy_notify(pol, dir, &c);
 
 	if (hard)
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ff6fc610730..bc4fd745687 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -301,10 +301,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 
 	c.seq = nlh->nlmsg_seq;
 	c.pid = nlh->nlmsg_pid;
-	if (nlh->nlmsg_type == XFRM_MSG_NEWSA)
-		c.event = XFRM_SAP_ADDED;
-	else
-		c.event = XFRM_SAP_UPDATED;
+	c.event = nlh->nlmsg_type;
 
 	km_state_notify(x, &c);
 	xfrm_state_put(x);
@@ -336,7 +333,7 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 
 	c.seq = nlh->nlmsg_seq;
 	c.pid = nlh->nlmsg_pid;
-	c.event = XFRM_SAP_DELETED;
+	c.event = nlh->nlmsg_type;
 	km_state_notify(x, &c);
 	xfrm_state_put(x);
 
@@ -728,11 +725,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 		return err;
 	}
 
-	if (!excl)
-		c.event = XFRM_SAP_UPDATED;
-	else
-		c.event = XFRM_SAP_ADDED;
-
+	c.event = nlh->nlmsg_type;
 	c.seq = nlh->nlmsg_seq;
 	c.pid = nlh->nlmsg_pid;
 	km_policy_notify(xp, p->dir, &c);
@@ -884,7 +877,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr
 		}
 	} else {
 		c.data.byid = p->index;
-		c.event = XFRM_SAP_DELETED;
+		c.event = nlh->nlmsg_type;
 		c.seq = nlh->nlmsg_seq;
 		c.pid = nlh->nlmsg_pid;
 		km_policy_notify(xp, p->dir, &c);
@@ -902,7 +895,7 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma
 
 	xfrm_state_flush(p->proto);
 	c.data.proto = p->proto;
-	c.event = XFRM_SAP_FLUSHED;
+	c.event = nlh->nlmsg_type;
 	c.seq = nlh->nlmsg_seq;
 	c.pid = nlh->nlmsg_pid;
 	km_state_notify(NULL, &c);
@@ -915,7 +908,7 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **x
 	struct km_event c;
 
 	xfrm_policy_flush();
-	c.event = XFRM_SAP_FLUSHED;
+	c.event = nlh->nlmsg_type;
 	c.seq = nlh->nlmsg_seq;
 	c.pid = nlh->nlmsg_pid;
 	km_policy_notify(NULL, 0, &c);
@@ -1193,7 +1186,6 @@ static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
 	struct xfrm_usersa_info *p;
 	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
-	u32 nlt;
 	unsigned char *b;
 	int len = xfrm_sa_len(x);
 
@@ -1202,16 +1194,7 @@ static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
 		return -ENOMEM;
 	b = skb->tail;
 
-	if (c->event == XFRM_SAP_ADDED)
-		nlt = XFRM_MSG_NEWSA;
-	else if (c->event == XFRM_SAP_UPDATED)
-		nlt = XFRM_MSG_UPDSA;
-	else if (c->event == XFRM_SAP_DELETED)
-		nlt = XFRM_MSG_DELSA;
-	else
-		goto nlmsg_failure;
-
-	nlh = NLMSG_PUT(skb, c->pid, c->seq, nlt, sizeof(*p));
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, sizeof(*p));
 	nlh->nlmsg_flags = 0;
 
 	p = NLMSG_DATA(nlh);
@@ -1243,13 +1226,13 @@ static int xfrm_send_state_notify(struct xfrm_state *x, struct km_event *c)
 {
 
 	switch (c->event) {
-	case XFRM_SAP_EXPIRED:
+	case XFRM_MSG_EXPIRE:
 		return xfrm_exp_state_notify(x, c);
-	case XFRM_SAP_DELETED:
-	case XFRM_SAP_UPDATED:
-	case XFRM_SAP_ADDED:
+	case XFRM_MSG_DELSA:
+	case XFRM_MSG_UPDSA:
+	case XFRM_MSG_NEWSA:
 		return xfrm_notify_sa(x, c);
-	case XFRM_SAP_FLUSHED:
+	case XFRM_MSG_FLUSHSA:
 		return xfrm_notify_sa_flush(c);
 	default:
 		 printk("xfrm_user: Unknown SA event %d\n", c->event);
@@ -1417,7 +1400,6 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *
 	struct xfrm_userpolicy_info *p;
 	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
-	u32 nlt = 0 ;
 	unsigned char *b;
 	int len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
 	len += NLMSG_SPACE(sizeof(struct xfrm_userpolicy_info));
@@ -1427,16 +1409,7 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *
 		return -ENOMEM;
 	b = skb->tail;
 
-	if (c->event == XFRM_SAP_ADDED)
-		nlt = XFRM_MSG_NEWPOLICY;
-	else if (c->event == XFRM_SAP_UPDATED)
-		nlt = XFRM_MSG_UPDPOLICY;
-	else if (c->event == XFRM_SAP_DELETED)
-		nlt = XFRM_MSG_DELPOLICY;
-	else
-		goto nlmsg_failure;
-
-	nlh = NLMSG_PUT(skb, c->pid, c->seq, nlt, sizeof(*p));
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, sizeof(*p));
 
 	p = NLMSG_DATA(nlh);
 
@@ -1483,13 +1456,13 @@ static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, struct km_ev
 {
 
 	switch (c->event) {
-	case XFRM_SAP_ADDED:
-	case XFRM_SAP_UPDATED:
-	case XFRM_SAP_DELETED:
+	case XFRM_MSG_NEWPOLICY:
+	case XFRM_MSG_UPDPOLICY:
+	case XFRM_MSG_DELPOLICY:
 		return xfrm_notify_policy(xp, dir, c);
-	case XFRM_SAP_FLUSHED:
+	case XFRM_MSG_FLUSHPOLICY:
 		return xfrm_notify_policy_flush(c);
-	case XFRM_SAP_EXPIRED:
+	case XFRM_MSG_POLEXPIRE:
 		return xfrm_exp_policy_notify(xp, dir, c);
 	default:
 		printk("xfrm_user: Unknown Policy event %d\n", c->event);
-- 
cgit v1.2.3


From 7d6dfe1f5bc4c56e0c31173014a099ec3fa35907 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Sat, 18 Jun 2005 22:45:31 -0700
Subject: [IPSEC] Fix xfrm_state leaks in error path

Herbert Xu wrote:
> @@ -1254,6 +1326,7 @@ static int pfkey_add(struct sock *sk, st
>       if (IS_ERR(x))
>               return PTR_ERR(x);
>
> +     xfrm_state_hold(x);

This introduces a leak when xfrm_state_add()/xfrm_state_update()
fail. We hold two references (one from xfrm_state_alloc(), one
from xfrm_state_hold()), but only drop one. We need to take the
reference because the reference from xfrm_state_alloc() can
be dropped by __xfrm_state_delete(), so the fix is to drop both
references on error. Same problem in xfrm_user.c.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/key/af_key.c     | 4 ++--
 net/xfrm/xfrm_user.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 577f0bb5bb3..98b72f2024f 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1333,7 +1333,7 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
 	if (err < 0) {
 		x->km.state = XFRM_STATE_DEAD;
 		xfrm_state_put(x);
-		return err;
+		goto out;
 	}
 
 	if (hdr->sadb_msg_type == SADB_ADD)
@@ -1343,8 +1343,8 @@ static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
 	c.seq = hdr->sadb_msg_seq;
 	c.pid = hdr->sadb_msg_pid;
 	km_state_notify(x, &c);
+out:
 	xfrm_state_put(x);
-
 	return err;
 }
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index bc4fd745687..9d30f732a27 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -296,7 +296,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	if (err < 0) {
 		x->km.state = XFRM_STATE_DEAD;
 		xfrm_state_put(x);
-		return err;
+		goto out;
 	}
 
 	c.seq = nlh->nlmsg_seq;
@@ -304,8 +304,8 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
 	c.event = nlh->nlmsg_type;
 
 	km_state_notify(x, &c);
+out:
 	xfrm_state_put(x);
-
 	return err;
 }
 
-- 
cgit v1.2.3


From ee57eef99b9e19194677f552ebc0690ec35d62db Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Sat, 18 Jun 2005 22:45:56 -0700
Subject: [IPSEC] Use NLMSG_LENGTH in xfrm_exp_state_notify

Small fixup to use netlink macros instead of hardcoding.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_user.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 9d30f732a27..ffe1b217347 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1123,9 +1123,9 @@ nlmsg_failure:
 static int xfrm_exp_state_notify(struct xfrm_state *x, struct km_event *c)
 {
 	struct sk_buff *skb;
+	int len = NLMSG_LENGTH(sizeof(struct xfrm_user_expire));
 
-	/* fix to do alloc using NLM macros */
-	skb = alloc_skb(sizeof(struct xfrm_user_expire) + 16, GFP_ATOMIC);
+	skb = alloc_skb(len, GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 2e6599cb899ba4b133f42cbf9d2b1883d2dc583a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Date: Sat, 18 Jun 2005 22:46:52 -0700
Subject: [NET] Generalise TCP's struct open_request minisock infrastructure

Kept this first changeset minimal, without changing existing names to
ease peer review.

Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:

->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
  a specific protocol

The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.

I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.

Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)

Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c          | 35 ++++++++++++++++++
 net/ipv4/syncookies.c    | 39 ++++++++++----------
 net/ipv4/tcp.c           |  9 -----
 net/ipv4/tcp_diag.c      | 25 +++++++------
 net/ipv4/tcp_ipv4.c      | 72 ++++++++++++++++++++----------------
 net/ipv4/tcp_minisocks.c | 48 ++++++++++++------------
 net/ipv4/tcp_output.c    | 25 +++++++------
 net/ipv4/tcp_timer.c     |  2 +-
 net/ipv6/tcp_ipv6.c      | 96 +++++++++++++++++++++++++-----------------------
 9 files changed, 200 insertions(+), 151 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 96e00b08698..a6ec3ada7f9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -118,6 +118,7 @@
 #include <linux/netdevice.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
+#include <net/request_sock.h>
 #include <net/sock.h>
 #include <net/xfrm.h>
 #include <linux/ipsec.h>
@@ -1363,6 +1364,7 @@ static LIST_HEAD(proto_list);
 
 int proto_register(struct proto *prot, int alloc_slab)
 {
+	char *request_sock_slab_name;
 	int rc = -ENOBUFS;
 
 	if (alloc_slab) {
@@ -1374,6 +1376,25 @@ int proto_register(struct proto *prot, int alloc_slab)
 			       prot->name);
 			goto out;
 		}
+
+		if (prot->rsk_prot != NULL) {
+			static const char mask[] = "request_sock_%s";
+
+			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+			if (request_sock_slab_name == NULL)
+				goto out_free_sock_slab;
+
+			sprintf(request_sock_slab_name, mask, prot->name);
+			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
+								 prot->rsk_prot->obj_size, 0,
+								 SLAB_HWCACHE_ALIGN, NULL, NULL);
+
+			if (prot->rsk_prot->slab == NULL) {
+				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
+				       prot->name);
+				goto out_free_request_sock_slab_name;
+			}
+		}
 	}
 
 	write_lock(&proto_list_lock);
@@ -1382,6 +1403,12 @@ int proto_register(struct proto *prot, int alloc_slab)
 	rc = 0;
 out:
 	return rc;
+out_free_request_sock_slab_name:
+	kfree(request_sock_slab_name);
+out_free_sock_slab:
+	kmem_cache_destroy(prot->slab);
+	prot->slab = NULL;
+	goto out;
 }
 
 EXPORT_SYMBOL(proto_register);
@@ -1395,6 +1422,14 @@ void proto_unregister(struct proto *prot)
 		prot->slab = NULL;
 	}
 
+	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
+		const char *name = kmem_cache_name(prot->rsk_prot->slab);
+
+		kmem_cache_destroy(prot->rsk_prot->slab);
+		kfree(name);
+		prot->rsk_prot->slab = NULL;
+	}
+
 	list_del(&prot->node);
 	write_unlock(&proto_list_lock);
 }
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index e923d2f021a..dd47e6da6fb 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -190,6 +190,8 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 			     struct ip_options *opt)
 {
+	struct inet_request_sock *ireq;
+	struct tcp_request_sock *treq;
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 cookie = ntohl(skb->h.th->ack_seq) - 1; 
 	struct sock *ret = sk;
@@ -209,19 +211,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 
 	NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
 
-	req = tcp_openreq_alloc();
 	ret = NULL;
+	req = tcp_openreq_alloc(&or_ipv4); /* for safety */
 	if (!req)
 		goto out;
 
-	req->rcv_isn		= htonl(skb->h.th->seq) - 1;
-	req->snt_isn		= cookie; 
+	ireq = inet_rsk(req);
+	treq = tcp_rsk(req);
+	treq->rcv_isn		= htonl(skb->h.th->seq) - 1;
+	treq->snt_isn		= cookie; 
 	req->mss		= mss;
- 	req->rmt_port		= skb->h.th->source;
-	req->af.v4_req.loc_addr = skb->nh.iph->daddr;
-	req->af.v4_req.rmt_addr = skb->nh.iph->saddr;
-	req->class		= &or_ipv4; /* for savety */
-	req->af.v4_req.opt	= NULL;
+ 	ireq->rmt_port		= skb->h.th->source;
+	ireq->loc_addr		= skb->nh.iph->daddr;
+	ireq->rmt_addr		= skb->nh.iph->saddr;
+	ireq->opt		= NULL;
 
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -229,17 +232,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	if (opt && opt->optlen) {
 		int opt_size = sizeof(struct ip_options) + opt->optlen;
 
-		req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC);
-		if (req->af.v4_req.opt) {
-			if (ip_options_echo(req->af.v4_req.opt, skb)) {
-				kfree(req->af.v4_req.opt);
-				req->af.v4_req.opt = NULL;
-			}
+		ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
+		if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) {
+			kfree(ireq->opt);
+			ireq->opt = NULL;
 		}
 	}
 
-	req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0;
-	req->wscale_ok	= req->sack_ok = 0; 
+	ireq->snd_wscale = ireq->rcv_wscale = ireq->tstamp_ok = 0;
+	ireq->wscale_ok	 = ireq->sack_ok = 0; 
 	req->expires	= 0UL; 
 	req->retrans	= 0; 
 	
@@ -253,8 +254,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 		struct flowi fl = { .nl_u = { .ip4_u =
 					      { .daddr = ((opt && opt->srr) ?
 							  opt->faddr :
-							  req->af.v4_req.rmt_addr),
-						.saddr = req->af.v4_req.loc_addr,
+							  ireq->rmt_addr),
+						.saddr = ireq->loc_addr,
 						.tos = RT_CONN_FLAGS(sk) } },
 				    .proto = IPPROTO_TCP,
 				    .uli_u = { .ports =
@@ -272,7 +273,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 				  &req->rcv_wnd, &req->window_clamp, 
 				  0, &rcv_wscale);
 	/* BTW win scale with syncookies is 0 by definition */
-	req->rcv_wscale	  = rcv_wscale; 
+	ireq->rcv_wscale  = rcv_wscale; 
 
 	ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
 out:	return ret;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0d9a4fd5f1a..a3cabfa2022 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -271,7 +271,6 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 
 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
 
-kmem_cache_t *tcp_openreq_cachep;
 kmem_cache_t *tcp_bucket_cachep;
 kmem_cache_t *tcp_timewait_cachep;
 
@@ -2271,13 +2270,6 @@ void __init tcp_init(void)
 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
 					   sizeof(skb->cb));
 
-	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
-						   sizeof(struct open_request),
-					       0, SLAB_HWCACHE_ALIGN,
-					       NULL, NULL);
-	if (!tcp_openreq_cachep)
-		panic("tcp_init: Cannot alloc open_request cache.");
-
 	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
 					      sizeof(struct tcp_bind_bucket),
 					      0, SLAB_HWCACHE_ALIGN,
@@ -2374,7 +2366,6 @@ EXPORT_SYMBOL(tcp_destroy_sock);
 EXPORT_SYMBOL(tcp_disconnect);
 EXPORT_SYMBOL(tcp_getsockopt);
 EXPORT_SYMBOL(tcp_ioctl);
-EXPORT_SYMBOL(tcp_openreq_cachep);
 EXPORT_SYMBOL(tcp_poll);
 EXPORT_SYMBOL(tcp_read_sock);
 EXPORT_SYMBOL(tcp_recvmsg);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 8faa8948f75..700ff241358 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -458,6 +458,7 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
 			    struct open_request *req,
 			    u32 pid, u32 seq)
 {
+	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct inet_sock *inet = inet_sk(sk);
 	unsigned char *b = skb->tail;
 	struct tcpdiagmsg *r;
@@ -482,9 +483,9 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
 		tmo = 0;
 
 	r->id.tcpdiag_sport = inet->sport;
-	r->id.tcpdiag_dport = req->rmt_port;
-	r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr;
-	r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr;
+	r->id.tcpdiag_dport = ireq->rmt_port;
+	r->id.tcpdiag_src[0] = ireq->loc_addr;
+	r->id.tcpdiag_dst[0] = ireq->rmt_addr;
 	r->tcpdiag_expires = jiffies_to_msecs(tmo),
 	r->tcpdiag_rqueue = 0;
 	r->tcpdiag_wqueue = 0;
@@ -493,9 +494,9 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
 #ifdef CONFIG_IP_TCPDIAG_IPV6
 	if (r->tcpdiag_family == AF_INET6) {
 		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
-			       &req->af.v6_req.loc_addr);
+			       &tcp6_rsk(req)->loc_addr);
 		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
-			       &req->af.v6_req.rmt_addr);
+			       &tcp6_rsk(req)->rmt_addr);
 	}
 #endif
 	nlh->nlmsg_len = skb->tail - b;
@@ -545,9 +546,11 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 
 		reqnum = 0;
 		for (req = head; req; reqnum++, req = req->dl_next) {
+			struct inet_request_sock *ireq = inet_rsk(req);
+
 			if (reqnum < s_reqnum)
 				continue;
-			if (r->id.tcpdiag_dport != req->rmt_port &&
+			if (r->id.tcpdiag_dport != ireq->rmt_port &&
 			    r->id.tcpdiag_dport)
 				continue;
 
@@ -555,16 +558,16 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 				entry.saddr =
 #ifdef CONFIG_IP_TCPDIAG_IPV6
 					(entry.family == AF_INET6) ?
-					req->af.v6_req.loc_addr.s6_addr32 :
+					tcp6_rsk(req)->loc_addr.s6_addr32 :
 #endif
-					&req->af.v4_req.loc_addr;
+					&ireq->loc_addr;
 				entry.daddr = 
 #ifdef CONFIG_IP_TCPDIAG_IPV6
 					(entry.family == AF_INET6) ?
-					req->af.v6_req.rmt_addr.s6_addr32 :
+					tcp6_rsk(req)->rmt_addr.s6_addr32 :
 #endif
-					&req->af.v4_req.rmt_addr;
-				entry.dport = ntohs(req->rmt_port);
+					&ireq->rmt_addr;
+				entry.dport = ntohs(ireq->rmt_port);
 
 				if (!tcpdiag_bc_run(RTA_DATA(bc),
 						    RTA_PAYLOAD(bc), &entry))
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dad98e4a504..e156be90df1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -880,9 +880,11 @@ static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
 	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
 	     (req = *prev) != NULL;
 	     prev = &req->dl_next) {
-		if (req->rmt_port == rport &&
-		    req->af.v4_req.rmt_addr == raddr &&
-		    req->af.v4_req.loc_addr == laddr &&
+		const struct inet_request_sock *ireq = inet_rsk(req);
+
+		if (ireq->rmt_port == rport &&
+		    ireq->rmt_addr == raddr &&
+		    ireq->loc_addr == laddr &&
 		    TCP_INET_FAMILY(req->class->family)) {
 			BUG_TRAP(!req->sk);
 			*prevp = prev;
@@ -897,7 +899,7 @@ static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_listen_opt *lopt = tp->listen_opt;
-	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
+	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 
 	req->expires = jiffies + TCP_TIMEOUT_INIT;
 	req->retrans = 0;
@@ -1065,7 +1067,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
 		 */
 		BUG_TRAP(!req->sk);
 
-		if (seq != req->snt_isn) {
+		if (seq != tcp_rsk(req)->snt_isn) {
 			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 			goto out;
 		}
@@ -1256,7 +1258,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 
 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
 {
-	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
+	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 			req->ts_recent);
 }
 
@@ -1264,18 +1266,19 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk,
 					  struct open_request *req)
 {
 	struct rtable *rt;
-	struct ip_options *opt = req->af.v4_req.opt;
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	struct ip_options *opt = inet_rsk(req)->opt;
 	struct flowi fl = { .oif = sk->sk_bound_dev_if,
 			    .nl_u = { .ip4_u =
 				      { .daddr = ((opt && opt->srr) ?
 						  opt->faddr :
-						  req->af.v4_req.rmt_addr),
-					.saddr = req->af.v4_req.loc_addr,
+						  ireq->rmt_addr),
+					.saddr = ireq->loc_addr,
 					.tos = RT_CONN_FLAGS(sk) } },
 			    .proto = IPPROTO_TCP,
 			    .uli_u = { .ports =
 				       { .sport = inet_sk(sk)->sport,
-					 .dport = req->rmt_port } } };
+					 .dport = ireq->rmt_port } } };
 
 	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
 		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
@@ -1297,6 +1300,7 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk,
 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
 			      struct dst_entry *dst)
 {
+	const struct inet_request_sock *ireq = inet_rsk(req);
 	int err = -1;
 	struct sk_buff * skb;
 
@@ -1310,14 +1314,14 @@ static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
 		struct tcphdr *th = skb->h.th;
 
 		th->check = tcp_v4_check(th, skb->len,
-					 req->af.v4_req.loc_addr,
-					 req->af.v4_req.rmt_addr,
+					 ireq->loc_addr,
+					 ireq->rmt_addr,
 					 csum_partial((char *)th, skb->len,
 						      skb->csum));
 
-		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
-					    req->af.v4_req.rmt_addr,
-					    req->af.v4_req.opt);
+		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
+					    ireq->rmt_addr,
+					    ireq->opt);
 		if (err == NET_XMIT_CN)
 			err = 0;
 	}
@@ -1332,8 +1336,8 @@ out:
  */
 static void tcp_v4_or_free(struct open_request *req)
 {
-	if (req->af.v4_req.opt)
-		kfree(req->af.v4_req.opt);
+	if (inet_rsk(req)->opt)
+		kfree(inet_rsk(req)->opt);
 }
 
 static inline void syn_flood_warning(struct sk_buff *skb)
@@ -1387,6 +1391,7 @@ int sysctl_max_syn_backlog = 256;
 
 struct or_calltable or_ipv4 = {
 	.family		=	PF_INET,
+	.obj_size	=	sizeof(struct tcp_request_sock),
 	.rtx_syn_ack	=	tcp_v4_send_synack,
 	.send_ack	=	tcp_v4_or_send_ack,
 	.destructor	=	tcp_v4_or_free,
@@ -1395,6 +1400,7 @@ struct or_calltable or_ipv4 = {
 
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
+	struct inet_request_sock *ireq;
 	struct tcp_options_received tmp_opt;
 	struct open_request *req;
 	__u32 saddr = skb->nh.iph->saddr;
@@ -1433,7 +1439,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
 		goto drop;
 
-	req = tcp_openreq_alloc();
+	req = tcp_openreq_alloc(&or_ipv4);
 	if (!req)
 		goto drop;
 
@@ -1461,10 +1467,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 
 	tcp_openreq_init(req, &tmp_opt, skb);
 
-	req->af.v4_req.loc_addr = daddr;
-	req->af.v4_req.rmt_addr = saddr;
-	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
-	req->class = &or_ipv4;
+	ireq = inet_rsk(req);
+	ireq->loc_addr = daddr;
+	ireq->rmt_addr = saddr;
+	ireq->opt = tcp_v4_save_options(sk, skb);
 	if (!want_cookie)
 		TCP_ECN_create_request(req, skb->h.th);
 
@@ -1523,7 +1529,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 
 		isn = tcp_v4_init_sequence(sk, skb);
 	}
-	req->snt_isn = isn;
+	tcp_rsk(req)->snt_isn = isn;
 
 	if (tcp_v4_send_synack(sk, req, dst))
 		goto drop_and_free;
@@ -1551,6 +1557,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 				  struct open_request *req,
 				  struct dst_entry *dst)
 {
+	struct inet_request_sock *ireq;
 	struct inet_sock *newinet;
 	struct tcp_sock *newtp;
 	struct sock *newsk;
@@ -1570,11 +1577,12 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	newtp		      = tcp_sk(newsk);
 	newinet		      = inet_sk(newsk);
-	newinet->daddr	      = req->af.v4_req.rmt_addr;
-	newinet->rcv_saddr    = req->af.v4_req.loc_addr;
-	newinet->saddr	      = req->af.v4_req.loc_addr;
-	newinet->opt	      = req->af.v4_req.opt;
-	req->af.v4_req.opt    = NULL;
+	ireq		      = inet_rsk(req);
+	newinet->daddr	      = ireq->rmt_addr;
+	newinet->rcv_saddr    = ireq->loc_addr;
+	newinet->saddr	      = ireq->loc_addr;
+	newinet->opt	      = ireq->opt;
+	ireq->opt	      = NULL;
 	newinet->mc_index     = tcp_v4_iif(skb);
 	newinet->mc_ttl	      = skb->nh.iph->ttl;
 	newtp->ext_header_len = 0;
@@ -2454,15 +2462,16 @@ void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
 static void get_openreq4(struct sock *sk, struct open_request *req,
 			 char *tmpbuf, int i, int uid)
 {
+	const struct inet_request_sock *ireq = inet_rsk(req);
 	int ttd = req->expires - jiffies;
 
 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
 		i,
-		req->af.v4_req.loc_addr,
+		ireq->loc_addr,
 		ntohs(inet_sk(sk)->sport),
-		req->af.v4_req.rmt_addr,
-		ntohs(req->rmt_port),
+		ireq->rmt_addr,
+		ntohs(ireq->rmt_port),
 		TCP_SYN_RECV,
 		0, 0, /* could print option size, but that is af dependent. */
 		1,    /* timers active (only the expire timer) */
@@ -2618,6 +2627,7 @@ struct proto tcp_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
+	.rsk_prot		= &or_ipv4,
 };
 
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index eea1a17a9ac..1037401c7cc 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -692,6 +692,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
 	struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
 
 	if(newsk != NULL) {
+		struct inet_request_sock *ireq = inet_rsk(req);
+		struct tcp_request_sock *treq = tcp_rsk(req);
 		struct tcp_sock *newtp;
 		struct sk_filter *filter;
 
@@ -703,7 +705,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
 		tcp_sk(newsk)->bind_hash = NULL;
 
 		/* Clone the TCP header template */
-		inet_sk(newsk)->dport = req->rmt_port;
+		inet_sk(newsk)->dport = ireq->rmt_port;
 
 		sock_lock_init(newsk);
 		bh_lock_sock(newsk);
@@ -739,14 +741,14 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
 		/* Now setup tcp_sock */
 		newtp = tcp_sk(newsk);
 		newtp->pred_flags = 0;
-		newtp->rcv_nxt = req->rcv_isn + 1;
-		newtp->snd_nxt = req->snt_isn + 1;
-		newtp->snd_una = req->snt_isn + 1;
-		newtp->snd_sml = req->snt_isn + 1;
+		newtp->rcv_nxt = treq->rcv_isn + 1;
+		newtp->snd_nxt = treq->snt_isn + 1;
+		newtp->snd_una = treq->snt_isn + 1;
+		newtp->snd_sml = treq->snt_isn + 1;
 
 		tcp_prequeue_init(newtp);
 
-		tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);
+		tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);
 
 		newtp->retransmits = 0;
 		newtp->backoff = 0;
@@ -775,10 +777,10 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
 		tcp_set_ca_state(newtp, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		skb_queue_head_init(&newtp->out_of_order_queue);
-		newtp->rcv_wup = req->rcv_isn + 1;
-		newtp->write_seq = req->snt_isn + 1;
+		newtp->rcv_wup = treq->rcv_isn + 1;
+		newtp->write_seq = treq->snt_isn + 1;
 		newtp->pushed_seq = newtp->write_seq;
-		newtp->copied_seq = req->rcv_isn + 1;
+		newtp->copied_seq = treq->rcv_isn + 1;
 
 		newtp->rx_opt.saw_tstamp = 0;
 
@@ -808,18 +810,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
 		newsk->sk_socket = NULL;
 		newsk->sk_sleep = NULL;
 
-		newtp->rx_opt.tstamp_ok = req->tstamp_ok;
-		if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) {
+		newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
+		if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
 			if (sysctl_tcp_fack)
 				newtp->rx_opt.sack_ok |= 2;
 		}
 		newtp->window_clamp = req->window_clamp;
 		newtp->rcv_ssthresh = req->rcv_wnd;
 		newtp->rcv_wnd = req->rcv_wnd;
-		newtp->rx_opt.wscale_ok = req->wscale_ok;
+		newtp->rx_opt.wscale_ok = ireq->wscale_ok;
 		if (newtp->rx_opt.wscale_ok) {
-			newtp->rx_opt.snd_wscale = req->snd_wscale;
-			newtp->rx_opt.rcv_wscale = req->rcv_wscale;
+			newtp->rx_opt.snd_wscale = ireq->snd_wscale;
+			newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
 		} else {
 			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
 			newtp->window_clamp = min(newtp->window_clamp, 65535U);
@@ -881,7 +883,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 	}
 
 	/* Check for pure retransmitted SYN. */
-	if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
+	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
 	    flg == TCP_FLAG_SYN &&
 	    !paws_reject) {
 		/*
@@ -959,7 +961,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 	 * Invalid ACK: reset will be sent by listening socket
 	 */
 	if ((flg & TCP_FLAG_ACK) &&
-	    (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1))
+	    (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
 		return sk;
 
 	/* Also, it would be not so bad idea to check rcv_tsecr, which
@@ -970,7 +972,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 	/* RFC793: "first check sequence number". */
 
 	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
-					  req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
+					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
 		/* Out of window: send ACK and drop. */
 		if (!(flg & TCP_FLAG_RST))
 			req->class->send_ack(skb, req);
@@ -981,12 +983,12 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 
 	/* In sequence, PAWS is OK. */
 
-	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
+	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
 			req->ts_recent = tmp_opt.rcv_tsval;
 
-		if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
+		if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
 			/* Truncate SYN, it is out of window starting
-			   at req->rcv_isn+1. */
+			   at tcp_rsk(req)->rcv_isn + 1. */
 			flg &= ~TCP_FLAG_SYN;
 		}
 
@@ -1003,8 +1005,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 			return NULL;
 
 		/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
-		if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
-			req->acked = 1;
+		if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
+			inet_rsk(req)->acked = 1;
 			return NULL;
 		}
 
@@ -1026,7 +1028,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 
 	listen_overflow:
 		if (!sysctl_tcp_abort_on_overflow) {
-			req->acked = 1;
+			inet_rsk(req)->acked = 1;
 			return NULL;
 		}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fa24e7ae1f4..f3c8747caf9 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1358,6 +1358,7 @@ int tcp_send_synack(struct sock *sk)
 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 				 struct open_request *req)
 {
+	struct inet_request_sock *ireq = inet_rsk(req);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcphdr *th;
 	int tcp_header_size;
@@ -1373,47 +1374,47 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	skb->dst = dst_clone(dst);
 
 	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
-			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
-			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
+			   (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
+			   (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
 			   /* SACK_PERM is in the place of NOP NOP of TS */
-			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
+			   ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
 	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 
 	memset(th, 0, sizeof(struct tcphdr));
 	th->syn = 1;
 	th->ack = 1;
 	if (dst->dev->features&NETIF_F_TSO)
-		req->ecn_ok = 0;
+		ireq->ecn_ok = 0;
 	TCP_ECN_make_synack(req, th);
 	th->source = inet_sk(sk)->sport;
-	th->dest = req->rmt_port;
-	TCP_SKB_CB(skb)->seq = req->snt_isn;
+	th->dest = ireq->rmt_port;
+	TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
 	TCP_SKB_CB(skb)->sacked = 0;
 	skb_shinfo(skb)->tso_segs = 1;
 	skb_shinfo(skb)->tso_size = 0;
 	th->seq = htonl(TCP_SKB_CB(skb)->seq);
-	th->ack_seq = htonl(req->rcv_isn + 1);
+	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
 	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
 		__u8 rcv_wscale; 
 		/* Set this up on the first call only */
 		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
 		/* tcp_full_space because it is guaranteed to be the first packet */
 		tcp_select_initial_window(tcp_full_space(sk), 
-			dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+			dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
 			&req->rcv_wnd,
 			&req->window_clamp,
-			req->wscale_ok,
+			ireq->wscale_ok,
 			&rcv_wscale);
-		req->rcv_wscale = rcv_wscale; 
+		ireq->rcv_wscale = rcv_wscale; 
 	}
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(req->rcv_wnd);
 
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
-	tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
-			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
+	tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
+			      ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
 			      TCP_SKB_CB(skb)->when,
 			      req->ts_recent);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 799ebe061e2..ba30ca0aa6a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -513,7 +513,7 @@ static void tcp_synack_timer(struct sock *sk)
 		while ((req = *reqp) != NULL) {
 			if (time_after_eq(now, req->expires)) {
 				if ((req->retrans < thresh ||
-				     (req->acked && req->retrans < max_retries))
+				     (inet_rsk(req)->acked && req->retrans < max_retries))
 				    && !req->class->rtx_syn_ack(sk, req, NULL)) {
 					unsigned long timeo;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 0f69e800a0a..9199ad2fde0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -407,11 +407,13 @@ static struct open_request *tcp_v6_search_req(struct tcp_sock *tp,
 	for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
 	     (req = *prev) != NULL;
 	     prev = &req->dl_next) {
-		if (req->rmt_port == rport &&
+		const struct tcp6_request_sock *treq = tcp6_rsk(req);
+
+		if (inet_rsk(req)->rmt_port == rport &&
 		    req->class->family == AF_INET6 &&
-		    ipv6_addr_equal(&req->af.v6_req.rmt_addr, raddr) &&
-		    ipv6_addr_equal(&req->af.v6_req.loc_addr, laddr) &&
-		    (!req->af.v6_req.iif || req->af.v6_req.iif == iif)) {
+		    ipv6_addr_equal(&treq->rmt_addr, raddr) &&
+		    ipv6_addr_equal(&treq->loc_addr, laddr) &&
+		    (!treq->iif || treq->iif == iif)) {
 			BUG_TRAP(req->sk == NULL);
 			*prevp = prev;
 			return req;
@@ -923,7 +925,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		 */
 		BUG_TRAP(req->sk == NULL);
 
-		if (seq != req->snt_isn) {
+		if (seq != tcp_rsk(req)->snt_isn) {
 			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
 			goto out;
 		}
@@ -960,6 +962,7 @@ out:
 static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
 			      struct dst_entry *dst)
 {
+	struct tcp6_request_sock *treq = tcp6_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff * skb;
 	struct ipv6_txoptions *opt = NULL;
@@ -969,19 +972,19 @@ static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
 
 	memset(&fl, 0, sizeof(fl));
 	fl.proto = IPPROTO_TCP;
-	ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
-	ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr);
+	ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
+	ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
 	fl.fl6_flowlabel = 0;
-	fl.oif = req->af.v6_req.iif;
-	fl.fl_ip_dport = req->rmt_port;
+	fl.oif = treq->iif;
+	fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 	fl.fl_ip_sport = inet_sk(sk)->sport;
 
 	if (dst == NULL) {
 		opt = np->opt;
 		if (opt == NULL &&
 		    np->rxopt.bits.srcrt == 2 &&
-		    req->af.v6_req.pktopts) {
-			struct sk_buff *pktopts = req->af.v6_req.pktopts;
+		    treq->pktopts) {
+			struct sk_buff *pktopts = treq->pktopts;
 			struct inet6_skb_parm *rxopt = IP6CB(pktopts);
 			if (rxopt->srcrt)
 				opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt));
@@ -1008,10 +1011,10 @@ static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
 		struct tcphdr *th = skb->h.th;
 
 		th->check = tcp_v6_check(th, skb->len,
-					 &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr,
+					 &treq->loc_addr, &treq->rmt_addr,
 					 csum_partial((char *)th, skb->len, skb->csum));
 
-		ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
+		ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
 		err = ip6_xmit(sk, skb, &fl, opt, 0);
 		if (err == NET_XMIT_CN)
 			err = 0;
@@ -1026,12 +1029,13 @@ done:
 
 static void tcp_v6_or_free(struct open_request *req)
 {
-	if (req->af.v6_req.pktopts)
-		kfree_skb(req->af.v6_req.pktopts);
+	if (tcp6_rsk(req)->pktopts)
+		kfree_skb(tcp6_rsk(req)->pktopts);
 }
 
 static struct or_calltable or_ipv6 = {
 	.family		=	AF_INET6,
+	.obj_size	=	sizeof(struct tcp6_request_sock),
 	.rtx_syn_ack	=	tcp_v6_send_synack,
 	.send_ack	=	tcp_v6_or_send_ack,
 	.destructor	=	tcp_v6_or_free,
@@ -1221,7 +1225,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 
 static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req)
 {
-	tcp_v6_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent);
+	tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent);
 }
 
 
@@ -1264,7 +1268,7 @@ static void tcp_v6_synq_add(struct sock *sk, struct open_request *req)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_listen_opt *lopt = tp->listen_opt;
-	u32 h = tcp_v6_synq_hash(&req->af.v6_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
+	u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 
 	req->sk = NULL;
 	req->expires = jiffies + TCP_TIMEOUT_INIT;
@@ -1284,6 +1288,7 @@ static void tcp_v6_synq_add(struct sock *sk, struct open_request *req)
  */
 static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 {
+	struct tcp6_request_sock *treq;
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_options_received tmp_opt;
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -1308,7 +1313,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
 		goto drop;
 
-	req = tcp_openreq_alloc();
+	req = tcp_openreq_alloc(&or_ipv6);
 	if (req == NULL)
 		goto drop;
 
@@ -1321,28 +1326,28 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb);
 
-	req->class = &or_ipv6;
-	ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr);
-	ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr);
+	treq = tcp6_rsk(req);
+	ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr);
+	ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr);
 	TCP_ECN_create_request(req, skb->h.th);
-	req->af.v6_req.pktopts = NULL;
+	treq->pktopts = NULL;
 	if (ipv6_opt_accepted(sk, skb) ||
 	    np->rxopt.bits.rxinfo ||
 	    np->rxopt.bits.rxhlim) {
 		atomic_inc(&skb->users);
-		req->af.v6_req.pktopts = skb;
+		treq->pktopts = skb;
 	}
-	req->af.v6_req.iif = sk->sk_bound_dev_if;
+	treq->iif = sk->sk_bound_dev_if;
 
 	/* So that link locals have meaning */
 	if (!sk->sk_bound_dev_if &&
-	    ipv6_addr_type(&req->af.v6_req.rmt_addr) & IPV6_ADDR_LINKLOCAL)
-		req->af.v6_req.iif = tcp_v6_iif(skb);
+	    ipv6_addr_type(&treq->rmt_addr) & IPV6_ADDR_LINKLOCAL)
+		treq->iif = tcp_v6_iif(skb);
 
 	if (isn == 0) 
 		isn = tcp_v6_init_sequence(sk,skb);
 
-	req->snt_isn = isn;
+	tcp_rsk(req)->snt_isn = isn;
 
 	if (tcp_v6_send_synack(sk, req, NULL))
 		goto drop;
@@ -1363,6 +1368,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 					  struct open_request *req,
 					  struct dst_entry *dst)
 {
+	struct tcp6_request_sock *treq = tcp6_rsk(req);
 	struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
 	struct tcp6_sock *newtcp6sk;
 	struct inet_sock *newinet;
@@ -1426,10 +1432,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		goto out_overflow;
 
 	if (np->rxopt.bits.srcrt == 2 &&
-	    opt == NULL && req->af.v6_req.pktopts) {
-		struct inet6_skb_parm *rxopt = IP6CB(req->af.v6_req.pktopts);
+	    opt == NULL && treq->pktopts) {
+		struct inet6_skb_parm *rxopt = IP6CB(treq->pktopts);
 		if (rxopt->srcrt)
-			opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(req->af.v6_req.pktopts->nh.raw+rxopt->srcrt));
+			opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr *)(treq->pktopts->nh.raw + rxopt->srcrt));
 	}
 
 	if (dst == NULL) {
@@ -1438,16 +1444,16 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 		memset(&fl, 0, sizeof(fl));
 		fl.proto = IPPROTO_TCP;
-		ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr);
+		ipv6_addr_copy(&fl.fl6_dst, &treq->rmt_addr);
 		if (opt && opt->srcrt) {
 			struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
 			ipv6_addr_copy(&final, &fl.fl6_dst);
 			ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
 			final_p = &final;
 		}
-		ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr);
+		ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
 		fl.oif = sk->sk_bound_dev_if;
-		fl.fl_ip_dport = req->rmt_port;
+		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
 		fl.fl_ip_sport = inet_sk(sk)->sport;
 
 		if (ip6_dst_lookup(sk, &dst, &fl))
@@ -1482,10 +1488,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
 
-	ipv6_addr_copy(&newnp->daddr, &req->af.v6_req.rmt_addr);
-	ipv6_addr_copy(&newnp->saddr, &req->af.v6_req.loc_addr);
-	ipv6_addr_copy(&newnp->rcv_saddr, &req->af.v6_req.loc_addr);
-	newsk->sk_bound_dev_if = req->af.v6_req.iif;
+	ipv6_addr_copy(&newnp->daddr, &treq->rmt_addr);
+	ipv6_addr_copy(&newnp->saddr, &treq->loc_addr);
+	ipv6_addr_copy(&newnp->rcv_saddr, &treq->loc_addr);
+	newsk->sk_bound_dev_if = treq->iif;
 
 	/* Now IPv6 options... 
 
@@ -1498,11 +1504,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	/* Clone pktoptions received with SYN */
 	newnp->pktoptions = NULL;
-	if (req->af.v6_req.pktopts) {
-		newnp->pktoptions = skb_clone(req->af.v6_req.pktopts,
-					      GFP_ATOMIC);
-		kfree_skb(req->af.v6_req.pktopts);
-		req->af.v6_req.pktopts = NULL;
+	if (treq->pktopts != NULL) {
+		newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC);
+		kfree_skb(treq->pktopts);
+		treq->pktopts = NULL;
 		if (newnp->pktoptions)
 			skb_set_owner_r(newnp->pktoptions, newsk);
 	}
@@ -2058,8 +2063,8 @@ static void get_openreq6(struct seq_file *seq,
 	if (ttd < 0)
 		ttd = 0;
 
-	src = &req->af.v6_req.loc_addr;
-	dest = &req->af.v6_req.rmt_addr;
+	src = &tcp6_rsk(req)->loc_addr;
+	dest = &tcp6_rsk(req)->rmt_addr;
 	seq_printf(seq,
 		   "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
 		   "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n",
@@ -2069,7 +2074,7 @@ static void get_openreq6(struct seq_file *seq,
 		   ntohs(inet_sk(sk)->sport),
 		   dest->s6_addr32[0], dest->s6_addr32[1],
 		   dest->s6_addr32[2], dest->s6_addr32[3],
-		   ntohs(req->rmt_port),
+		   ntohs(inet_rsk(req)->rmt_port),
 		   TCP_SYN_RECV,
 		   0,0, /* could print option size, but that is af dependent. */
 		   1,   /* timers active (only the expire timer) */  
@@ -2239,6 +2244,7 @@ struct proto tcpv6_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp6_sock),
+	.rsk_prot		= &or_ipv6,
 };
 
 static struct inet6_protocol tcpv6_protocol = {
-- 
cgit v1.2.3


From 60236fdd08b2169045a3bbfc5ffe1576e6c3c17b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Date: Sat, 18 Jun 2005 22:47:21 -0700
Subject: [NET] Rename open_request to request_sock

Ok, this one just renames some stuff to have a better namespace and to
dissassociate it from TCP:

struct open_request  -> struct request_sock
tcp_openreq_alloc    -> reqsk_alloc
tcp_openreq_free     -> reqsk_free
tcp_openreq_fastfree -> __reqsk_free

With this most of the infrastructure closely resembles a struct
sock methods subset.

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/syncookies.c    | 12 +++++------
 net/ipv4/tcp.c           | 12 +++++------
 net/ipv4/tcp_diag.c      |  4 ++--
 net/ipv4/tcp_ipv4.c      | 56 ++++++++++++++++++++++++------------------------
 net/ipv4/tcp_minisocks.c | 14 ++++++------
 net/ipv4/tcp_output.c    |  2 +-
 net/ipv4/tcp_timer.c     |  6 +++---
 net/ipv6/tcp_ipv6.c      | 42 ++++++++++++++++++------------------
 8 files changed, 74 insertions(+), 74 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index dd47e6da6fb..72d01444218 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -169,10 +169,10 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
 	return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
 }
 
-extern struct or_calltable or_ipv4;
+extern struct request_sock_ops tcp_request_sock_ops;
 
 static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
-					   struct open_request *req,
+					   struct request_sock *req,
 					   struct dst_entry *dst)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -182,7 +182,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 	if (child)
 		tcp_acceptq_queue(sk, req, child);
 	else
-		tcp_openreq_free(req);
+		reqsk_free(req);
 
 	return child;
 }
@@ -195,7 +195,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 cookie = ntohl(skb->h.th->ack_seq) - 1; 
 	struct sock *ret = sk;
-	struct open_request *req; 
+	struct request_sock *req; 
 	int mss; 
 	struct rtable *rt; 
 	__u8 rcv_wscale;
@@ -212,7 +212,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
 
 	ret = NULL;
-	req = tcp_openreq_alloc(&or_ipv4); /* for safety */
+	req = reqsk_alloc(&tcp_request_sock_ops); /* for safety */
 	if (!req)
 		goto out;
 
@@ -262,7 +262,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 					       { .sport = skb->h.th->dest,
 						 .dport = skb->h.th->source } } };
 		if (ip_route_output_key(&rt, &fl)) {
-			tcp_openreq_free(req);
+			reqsk_free(req);
 			goto out; 
 		}
 	}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a3cabfa2022..1c29feb6b35 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -516,8 +516,8 @@ static void tcp_listen_stop (struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_listen_opt *lopt = tp->listen_opt;
-	struct open_request *acc_req = tp->accept_queue;
-	struct open_request *req;
+	struct request_sock *acc_req = tp->accept_queue;
+	struct request_sock *req;
 	int i;
 
 	tcp_delete_keepalive_timer(sk);
@@ -533,7 +533,7 @@ static void tcp_listen_stop (struct sock *sk)
 			while ((req = lopt->syn_table[i]) != NULL) {
 				lopt->syn_table[i] = req->dl_next;
 				lopt->qlen--;
-				tcp_openreq_free(req);
+				reqsk_free(req);
 
 		/* Following specs, it would be better either to send FIN
 		 * (and enter FIN-WAIT-1, it is normal close)
@@ -573,7 +573,7 @@ static void tcp_listen_stop (struct sock *sk)
 		sock_put(child);
 
 		sk_acceptq_removed(sk);
-		tcp_openreq_fastfree(req);
+		__reqsk_free(req);
 	}
 	BUG_TRAP(!sk->sk_ack_backlog);
 }
@@ -1894,7 +1894,7 @@ static int wait_for_connect(struct sock *sk, long timeo)
 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct open_request *req;
+	struct request_sock *req;
 	struct sock *newsk;
 	int error;
 
@@ -1927,7 +1927,7 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err)
 
  	newsk = req->sk;
 	sk_acceptq_removed(sk);
-	tcp_openreq_fastfree(req);
+	__reqsk_free(req);
 	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
 	release_sock(sk);
 	return newsk;
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 700ff241358..67277800d0c 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -455,7 +455,7 @@ static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
 }
 
 static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
-			    struct open_request *req,
+			    struct request_sock *req,
 			    u32 pid, u32 seq)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -542,7 +542,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 	}
 
 	for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
-		struct open_request *req, *head = lopt->syn_table[j];
+		struct request_sock *req, *head = lopt->syn_table[j];
 
 		reqnum = 0;
 		for (req = head; req; reqnum++, req = req->dl_next) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e156be90df1..95528a75a63 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -36,7 +36,7 @@
  *					ACK bit.
  *		Andi Kleen :		Implemented fast path mtu discovery.
  *	     				Fixed many serious bugs in the
- *					open_request handling and moved
+ *					request_sock handling and moved
  *					most of it into the af independent code.
  *					Added tail drop and some other bugfixes.
  *					Added new listen sematics.
@@ -869,13 +869,13 @@ static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
 	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
 }
 
-static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
-					      struct open_request ***prevp,
+static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
+					      struct request_sock ***prevp,
 					      __u16 rport,
 					      __u32 raddr, __u32 laddr)
 {
 	struct tcp_listen_opt *lopt = tp->listen_opt;
-	struct open_request *req, **prev;
+	struct request_sock *req, **prev;
 
 	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
 	     (req = *prev) != NULL;
@@ -885,7 +885,7 @@ static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
 		if (ireq->rmt_port == rport &&
 		    ireq->rmt_addr == raddr &&
 		    ireq->loc_addr == laddr &&
-		    TCP_INET_FAMILY(req->class->family)) {
+		    TCP_INET_FAMILY(req->rsk_ops->family)) {
 			BUG_TRAP(!req->sk);
 			*prevp = prev;
 			break;
@@ -895,7 +895,7 @@ static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
 	return req;
 }
 
-static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
+static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_listen_opt *lopt = tp->listen_opt;
@@ -1052,7 +1052,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
 	}
 
 	switch (sk->sk_state) {
-		struct open_request *req, **prev;
+		struct request_sock *req, **prev;
 	case TCP_LISTEN:
 		if (sock_owned_by_user(sk))
 			goto out;
@@ -1256,14 +1256,14 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 	tcp_tw_put(tw);
 }
 
-static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
+static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 {
 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 			req->ts_recent);
 }
 
 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
-					  struct open_request *req)
+					  struct request_sock *req)
 {
 	struct rtable *rt;
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -1294,10 +1294,10 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk,
 
 /*
  *	Send a SYN-ACK after having received an ACK.
- *	This still operates on a open_request only, not on a big
+ *	This still operates on a request_sock only, not on a big
  *	socket.
  */
-static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
+static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 			      struct dst_entry *dst)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -1332,9 +1332,9 @@ out:
 }
 
 /*
- *	IPv4 open_request destructor.
+ *	IPv4 request_sock destructor.
  */
-static void tcp_v4_or_free(struct open_request *req)
+static void tcp_v4_reqsk_destructor(struct request_sock *req)
 {
 	if (inet_rsk(req)->opt)
 		kfree(inet_rsk(req)->opt);
@@ -1353,7 +1353,7 @@ static inline void syn_flood_warning(struct sk_buff *skb)
 }
 
 /*
- * Save and compile IPv4 options into the open_request if needed.
+ * Save and compile IPv4 options into the request_sock if needed.
  */
 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
 						     struct sk_buff *skb)
@@ -1389,12 +1389,12 @@ static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
  */
 int sysctl_max_syn_backlog = 256;
 
-struct or_calltable or_ipv4 = {
+struct request_sock_ops tcp_request_sock_ops = {
 	.family		=	PF_INET,
 	.obj_size	=	sizeof(struct tcp_request_sock),
 	.rtx_syn_ack	=	tcp_v4_send_synack,
-	.send_ack	=	tcp_v4_or_send_ack,
-	.destructor	=	tcp_v4_or_free,
+	.send_ack	=	tcp_v4_reqsk_send_ack,
+	.destructor	=	tcp_v4_reqsk_destructor,
 	.send_reset	=	tcp_v4_send_reset,
 };
 
@@ -1402,7 +1402,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_request_sock *ireq;
 	struct tcp_options_received tmp_opt;
-	struct open_request *req;
+	struct request_sock *req;
 	__u32 saddr = skb->nh.iph->saddr;
 	__u32 daddr = skb->nh.iph->daddr;
 	__u32 isn = TCP_SKB_CB(skb)->when;
@@ -1439,7 +1439,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
 		goto drop;
 
-	req = tcp_openreq_alloc(&or_ipv4);
+	req = reqsk_alloc(&tcp_request_sock_ops);
 	if (!req)
 		goto drop;
 
@@ -1535,14 +1535,14 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		goto drop_and_free;
 
 	if (want_cookie) {
-	   	tcp_openreq_free(req);
+	   	reqsk_free(req);
 	} else {
 		tcp_v4_synq_add(sk, req);
 	}
 	return 0;
 
 drop_and_free:
-	tcp_openreq_free(req);
+	reqsk_free(req);
 drop:
 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 	return 0;
@@ -1554,7 +1554,7 @@ drop:
  * now create the new socket.
  */
 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
-				  struct open_request *req,
+				  struct request_sock *req,
 				  struct dst_entry *dst)
 {
 	struct inet_request_sock *ireq;
@@ -1613,9 +1613,9 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 	struct iphdr *iph = skb->nh.iph;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sock *nsk;
-	struct open_request **prev;
+	struct request_sock **prev;
 	/* Find possible connection requests. */
-	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
+	struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
 						     iph->saddr, iph->daddr);
 	if (req)
 		return tcp_check_req(sk, skb, req, prev);
@@ -2152,13 +2152,13 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 	++st->num;
 
 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
-		struct open_request *req = cur;
+		struct request_sock *req = cur;
 
 	       	tp = tcp_sk(st->syn_wait_sk);
 		req = req->dl_next;
 		while (1) {
 			while (req) {
-				if (req->class->family == st->family) {
+				if (req->rsk_ops->family == st->family) {
 					cur = req;
 					goto out;
 				}
@@ -2459,7 +2459,7 @@ void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
 	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
 }
 
-static void get_openreq4(struct sock *sk, struct open_request *req,
+static void get_openreq4(struct sock *sk, struct request_sock *req,
 			 char *tmpbuf, int i, int uid)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
@@ -2627,7 +2627,7 @@ struct proto tcp_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp_sock),
-	.rsk_prot		= &or_ipv4,
+	.rsk_prot		= &tcp_request_sock_ops,
 };
 
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1037401c7cc..0e6d525a834 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -684,7 +684,7 @@ out:
  * Actually, we could lots of memory writes here. tp of listening
  * socket contains all necessary default parameters.
  */
-struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
 {
 	/* allocate the newsk from the same slab of the master sock,
 	 * if not, at sk_free time we'll try to free it from the wrong
@@ -853,12 +853,12 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
 
 /* 
  *	Process an incoming packet for SYN_RECV sockets represented
- *	as an open_request.
+ *	as a request_sock.
  */
 
 struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
-			   struct open_request *req,
-			   struct open_request **prev)
+			   struct request_sock *req,
+			   struct request_sock **prev)
 {
 	struct tcphdr *th = skb->h.th;
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -903,7 +903,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 		 * Enforce "SYN-ACK" according to figure 8, figure 6
 		 * of RFC793, fixed by RFC1122.
 		 */
-		req->class->rtx_syn_ack(sk, req, NULL);
+		req->rsk_ops->rtx_syn_ack(sk, req, NULL);
 		return NULL;
 	}
 
@@ -975,7 +975,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 					  tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
 		/* Out of window: send ACK and drop. */
 		if (!(flg & TCP_FLAG_RST))
-			req->class->send_ack(skb, req);
+			req->rsk_ops->send_ack(skb, req);
 		if (paws_reject)
 			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
 		return NULL;
@@ -1035,7 +1035,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
 	embryonic_reset:
 		NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
 		if (!(flg & TCP_FLAG_RST))
-			req->class->send_reset(skb);
+			req->rsk_ops->send_reset(skb);
 
 		tcp_synq_drop(sk, req, prev);
 		return NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f3c8747caf9..f17c6577e33 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1356,7 +1356,7 @@ int tcp_send_synack(struct sock *sk)
  * Prepare a SYN-ACK.
  */
 struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
-				 struct open_request *req)
+				 struct request_sock *req)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ba30ca0aa6a..f03efe5fb76 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -468,7 +468,7 @@ static void tcp_synack_timer(struct sock *sk)
 	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
 	int thresh = max_retries;
 	unsigned long now = jiffies;
-	struct open_request **reqp, *req;
+	struct request_sock **reqp, *req;
 	int i, budget;
 
 	if (lopt == NULL || lopt->qlen == 0)
@@ -514,7 +514,7 @@ static void tcp_synack_timer(struct sock *sk)
 			if (time_after_eq(now, req->expires)) {
 				if ((req->retrans < thresh ||
 				     (inet_rsk(req)->acked && req->retrans < max_retries))
-				    && !req->class->rtx_syn_ack(sk, req, NULL)) {
+				    && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
 					unsigned long timeo;
 
 					if (req->retrans++ == 0)
@@ -533,7 +533,7 @@ static void tcp_synack_timer(struct sock *sk)
 				lopt->qlen--;
 				if (req->retrans == 0)
 					lopt->qlen_young--;
-				tcp_openreq_free(req);
+				reqsk_free(req);
 				continue;
 			}
 			reqp = &req->dl_next;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9199ad2fde0..068cd4a8c29 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -65,7 +65,7 @@
 #include <linux/seq_file.h>
 
 static void	tcp_v6_send_reset(struct sk_buff *skb);
-static void	tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req);
+static void	tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req);
 static void	tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, 
 				  struct sk_buff *skb);
 
@@ -394,15 +394,15 @@ static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd)
 	return c & (TCP_SYNQ_HSIZE - 1);
 }
 
-static struct open_request *tcp_v6_search_req(struct tcp_sock *tp,
-					      struct open_request ***prevp,
+static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp,
+					      struct request_sock ***prevp,
 					      __u16 rport,
 					      struct in6_addr *raddr,
 					      struct in6_addr *laddr,
 					      int iif)
 {
 	struct tcp_listen_opt *lopt = tp->listen_opt;
-	struct open_request *req, **prev;  
+	struct request_sock *req, **prev;  
 
 	for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
 	     (req = *prev) != NULL;
@@ -410,7 +410,7 @@ static struct open_request *tcp_v6_search_req(struct tcp_sock *tp,
 		const struct tcp6_request_sock *treq = tcp6_rsk(req);
 
 		if (inet_rsk(req)->rmt_port == rport &&
-		    req->class->family == AF_INET6 &&
+		    req->rsk_ops->family == AF_INET6 &&
 		    ipv6_addr_equal(&treq->rmt_addr, raddr) &&
 		    ipv6_addr_equal(&treq->loc_addr, laddr) &&
 		    (!treq->iif || treq->iif == iif)) {
@@ -908,9 +908,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 	icmpv6_err_convert(type, code, &err);
 
-	/* Might be for an open_request */
+	/* Might be for an request_sock */
 	switch (sk->sk_state) {
-		struct open_request *req, **prev;
+		struct request_sock *req, **prev;
 	case TCP_LISTEN:
 		if (sock_owned_by_user(sk))
 			goto out;
@@ -959,7 +959,7 @@ out:
 }
 
 
-static int tcp_v6_send_synack(struct sock *sk, struct open_request *req,
+static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 			      struct dst_entry *dst)
 {
 	struct tcp6_request_sock *treq = tcp6_rsk(req);
@@ -1027,18 +1027,18 @@ done:
 	return err;
 }
 
-static void tcp_v6_or_free(struct open_request *req)
+static void tcp_v6_reqsk_destructor(struct request_sock *req)
 {
 	if (tcp6_rsk(req)->pktopts)
 		kfree_skb(tcp6_rsk(req)->pktopts);
 }
 
-static struct or_calltable or_ipv6 = {
+static struct request_sock_ops tcp6_request_sock_ops = {
 	.family		=	AF_INET6,
 	.obj_size	=	sizeof(struct tcp6_request_sock),
 	.rtx_syn_ack	=	tcp_v6_send_synack,
-	.send_ack	=	tcp_v6_or_send_ack,
-	.destructor	=	tcp_v6_or_free,
+	.send_ack	=	tcp_v6_reqsk_send_ack,
+	.destructor	=	tcp_v6_reqsk_destructor,
 	.send_reset	=	tcp_v6_send_reset
 };
 
@@ -1223,7 +1223,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 	tcp_tw_put(tw);
 }
 
-static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req)
+static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
 {
 	tcp_v6_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, req->ts_recent);
 }
@@ -1231,7 +1231,7 @@ static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req)
 
 static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 {
-	struct open_request *req, **prev;
+	struct request_sock *req, **prev;
 	struct tcphdr *th = skb->h.th;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sock *nsk;
@@ -1264,7 +1264,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 	return sk;
 }
 
-static void tcp_v6_synq_add(struct sock *sk, struct open_request *req)
+static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_listen_opt *lopt = tp->listen_opt;
@@ -1292,7 +1292,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_options_received tmp_opt;
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct open_request *req = NULL;
+	struct request_sock *req = NULL;
 	__u32 isn = TCP_SKB_CB(skb)->when;
 
 	if (skb->protocol == htons(ETH_P_IP))
@@ -1313,7 +1313,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
 		goto drop;
 
-	req = tcp_openreq_alloc(&or_ipv6);
+	req = reqsk_alloc(&tcp6_request_sock_ops);
 	if (req == NULL)
 		goto drop;
 
@@ -1358,14 +1358,14 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 
 drop:
 	if (req)
-		tcp_openreq_free(req);
+		reqsk_free(req);
 
 	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
 	return 0; /* don't send reset */
 }
 
 static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
-					  struct open_request *req,
+					  struct request_sock *req,
 					  struct dst_entry *dst)
 {
 	struct tcp6_request_sock *treq = tcp6_rsk(req);
@@ -2055,7 +2055,7 @@ static int tcp_v6_destroy_sock(struct sock *sk)
 
 /* Proc filesystem TCPv6 sock list dumping. */
 static void get_openreq6(struct seq_file *seq, 
-			 struct sock *sk, struct open_request *req, int i, int uid)
+			 struct sock *sk, struct request_sock *req, int i, int uid)
 {
 	struct in6_addr *dest, *src;
 	int ttd = req->expires - jiffies;
@@ -2244,7 +2244,7 @@ struct proto tcpv6_prot = {
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
 	.obj_size		= sizeof(struct tcp6_sock),
-	.rsk_prot		= &or_ipv6,
+	.rsk_prot		= &tcp6_request_sock_ops,
 };
 
 static struct inet6_protocol tcpv6_protocol = {
-- 
cgit v1.2.3


From 0e87506fcc734647c7b2497eee4eb81e785c857a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Date: Sat, 18 Jun 2005 22:47:59 -0700
Subject: [NET] Generalise tcp_listen_opt

This chunks out the accept_queue and tcp_listen_opt code and moves
them to net/core/request_sock.c and include/net/request_sock.h, to
make it useful for other transport protocols, DCCP being the first one
to use it.

Next patches will rename tcp_listen_opt to accept_sock and remove the
inline tcp functions that just call a reqsk_queue_ function.

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/Makefile        |  3 ++-
 net/core/request_sock.c  | 48 ++++++++++++++++++++++++++++++++++
 net/ipv4/tcp.c           | 67 +++++++++++++++---------------------------------
 net/ipv4/tcp_diag.c      |  6 ++---
 net/ipv4/tcp_ipv4.c      | 32 +++++++++--------------
 net/ipv4/tcp_minisocks.c |  6 ++---
 net/ipv4/tcp_timer.c     | 10 +++-----
 net/ipv6/tcp_ipv6.c      | 14 +++-------
 8 files changed, 94 insertions(+), 92 deletions(-)
 create mode 100644 net/core/request_sock.c

(limited to 'net')

diff --git a/net/core/Makefile b/net/core/Makefile
index 81f03243fe2..5e0c56b7f60 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -2,7 +2,8 @@
 # Makefile for the Linux networking core.
 #
 
-obj-y := sock.o skbuff.o iovec.o datagram.o stream.o scm.o gen_stats.o gen_estimator.o
+obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
+	 gen_stats.o gen_estimator.o
 
 obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
new file mode 100644
index 00000000000..1258333ca00
--- /dev/null
+++ b/net/core/request_sock.c
@@ -0,0 +1,48 @@
+/*
+ * NET		Generic infrastructure for Network protocols.
+ *
+ * Authors:	Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * 		From code originally in include/net/tcp.h
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <net/request_sock.h>
+
+int reqsk_queue_alloc(struct request_sock_queue *queue,
+		      const int nr_table_entries)
+{
+	const int lopt_size = sizeof(struct tcp_listen_opt) +
+			      nr_table_entries * sizeof(struct request_sock *);
+	struct tcp_listen_opt *lopt = kmalloc(lopt_size, GFP_KERNEL);
+
+	if (lopt == NULL)
+		return -ENOMEM;
+
+	memset(lopt, 0, lopt_size);
+
+	for (lopt->max_qlen_log = 6;
+	     (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
+	     lopt->max_qlen_log++);
+
+	get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
+	rwlock_init(&queue->syn_wait_lock);
+	queue->rskq_accept_head = queue->rskq_accept_head = NULL;
+
+	write_lock_bh(&queue->syn_wait_lock);
+	queue->listen_opt = lopt;
+	write_unlock_bh(&queue->syn_wait_lock);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(reqsk_queue_alloc);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1c29feb6b35..b85a46dd40a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -316,7 +316,7 @@ EXPORT_SYMBOL(tcp_enter_memory_pressure);
 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
 					       poll_table *wait)
 {
-	return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
+	return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0;
 }
 
 /*
@@ -462,28 +462,15 @@ int tcp_listen_start(struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt;
+	int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE);
+
+	if (rc != 0)
+		return rc;
 
 	sk->sk_max_ack_backlog = 0;
 	sk->sk_ack_backlog = 0;
-	tp->accept_queue = tp->accept_queue_tail = NULL;
-	rwlock_init(&tp->syn_wait_lock);
 	tcp_delack_init(tp);
 
-	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
-	if (!lopt)
-		return -ENOMEM;
-
-	memset(lopt, 0, sizeof(struct tcp_listen_opt));
-	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
-		if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
-			break;
-	get_random_bytes(&lopt->hash_rnd, 4);
-
-	write_lock_bh(&tp->syn_wait_lock);
-	tp->listen_opt = lopt;
-	write_unlock_bh(&tp->syn_wait_lock);
-
 	/* There is race window here: we announce ourselves listening,
 	 * but this transition is still not validated by get_port().
 	 * It is OK, because this socket enters to hash table only
@@ -500,10 +487,7 @@ int tcp_listen_start(struct sock *sk)
 	}
 
 	sk->sk_state = TCP_CLOSE;
-	write_lock_bh(&tp->syn_wait_lock);
-	tp->listen_opt = NULL;
-	write_unlock_bh(&tp->syn_wait_lock);
-	kfree(lopt);
+	reqsk_queue_destroy(&tp->accept_queue);
 	return -EADDRINUSE;
 }
 
@@ -515,18 +499,16 @@ int tcp_listen_start(struct sock *sk)
 static void tcp_listen_stop (struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt = tp->listen_opt;
-	struct request_sock *acc_req = tp->accept_queue;
+	struct tcp_listen_opt *lopt;
+	struct request_sock *acc_req;
 	struct request_sock *req;
 	int i;
 
 	tcp_delete_keepalive_timer(sk);
 
 	/* make all the listen_opt local to us */
-	write_lock_bh(&tp->syn_wait_lock);
-	tp->listen_opt = NULL;
-	write_unlock_bh(&tp->syn_wait_lock);
-	tp->accept_queue = tp->accept_queue_tail = NULL;
+	lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue);
+	acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue);
 
 	if (lopt->qlen) {
 		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
@@ -1867,11 +1849,11 @@ static int wait_for_connect(struct sock *sk, long timeo)
 		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
 					  TASK_INTERRUPTIBLE);
 		release_sock(sk);
-		if (!tp->accept_queue)
+		if (reqsk_queue_empty(&tp->accept_queue))
 			timeo = schedule_timeout(timeo);
 		lock_sock(sk);
 		err = 0;
-		if (tp->accept_queue)
+		if (!reqsk_queue_empty(&tp->accept_queue))
 			break;
 		err = -EINVAL;
 		if (sk->sk_state != TCP_LISTEN)
@@ -1894,7 +1876,6 @@ static int wait_for_connect(struct sock *sk, long timeo)
 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct request_sock *req;
 	struct sock *newsk;
 	int error;
 
@@ -1905,37 +1886,31 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err)
 	 */
 	error = -EINVAL;
 	if (sk->sk_state != TCP_LISTEN)
-		goto out;
+		goto out_err;
 
 	/* Find already established connection */
-	if (!tp->accept_queue) {
+	if (reqsk_queue_empty(&tp->accept_queue)) {
 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
 
 		/* If this is a non blocking socket don't sleep */
 		error = -EAGAIN;
 		if (!timeo)
-			goto out;
+			goto out_err;
 
 		error = wait_for_connect(sk, timeo);
 		if (error)
-			goto out;
+			goto out_err;
 	}
 
-	req = tp->accept_queue;
-	if ((tp->accept_queue = req->dl_next) == NULL)
-		tp->accept_queue_tail = NULL;
-
- 	newsk = req->sk;
-	sk_acceptq_removed(sk);
-	__reqsk_free(req);
+	newsk = reqsk_queue_get_child(&tp->accept_queue, sk);
 	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
-	release_sock(sk);
-	return newsk;
-
 out:
 	release_sock(sk);
+	return newsk;
+out_err:
+	newsk = NULL;
 	*err = error;
-	return NULL;
+	goto out;
 }
 
 /*
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 67277800d0c..c3328fa4883 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -529,9 +529,9 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 
 	entry.family = sk->sk_family;
 
-	read_lock_bh(&tp->syn_wait_lock);
+	read_lock_bh(&tp->accept_queue.syn_wait_lock);
 
-	lopt = tp->listen_opt;
+	lopt = tp->accept_queue.listen_opt;
 	if (!lopt || !lopt->qlen)
 		goto out;
 
@@ -588,7 +588,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 	}
 
 out:
-	read_unlock_bh(&tp->syn_wait_lock);
+	read_unlock_bh(&tp->accept_queue.syn_wait_lock);
 
 	return err;
 }
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 95528a75a63..1745dc8d25e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -874,7 +874,7 @@ static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
 					      __u16 rport,
 					      __u32 raddr, __u32 laddr)
 {
-	struct tcp_listen_opt *lopt = tp->listen_opt;
+	struct tcp_listen_opt *lopt = tp->accept_queue.listen_opt;
 	struct request_sock *req, **prev;
 
 	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
@@ -898,18 +898,10 @@ static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt = tp->listen_opt;
+	struct tcp_listen_opt *lopt = tp->accept_queue.listen_opt;
 	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 
-	req->expires = jiffies + TCP_TIMEOUT_INIT;
-	req->retrans = 0;
-	req->sk = NULL;
-	req->dl_next = lopt->syn_table[h];
-
-	write_lock(&tp->syn_wait_lock);
-	lopt->syn_table[h] = req;
-	write_unlock(&tp->syn_wait_lock);
-
+	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
 	tcp_synq_added(sk);
 }
 
@@ -2167,17 +2159,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
 			if (++st->sbucket >= TCP_SYNQ_HSIZE)
 				break;
 get_req:
-			req = tp->listen_opt->syn_table[st->sbucket];
+			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
 		}
 		sk	  = sk_next(st->syn_wait_sk);
 		st->state = TCP_SEQ_STATE_LISTENING;
-		read_unlock_bh(&tp->syn_wait_lock);
+		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
 	} else {
 	       	tp = tcp_sk(sk);
-		read_lock_bh(&tp->syn_wait_lock);
-		if (tp->listen_opt && tp->listen_opt->qlen)
+		read_lock_bh(&tp->accept_queue.syn_wait_lock);
+		if (reqsk_queue_len(&tp->accept_queue))
 			goto start_req;
-		read_unlock_bh(&tp->syn_wait_lock);
+		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
 		sk = sk_next(sk);
 	}
 get_sk:
@@ -2187,8 +2179,8 @@ get_sk:
 			goto out;
 		}
 	       	tp = tcp_sk(sk);
-		read_lock_bh(&tp->syn_wait_lock);
-		if (tp->listen_opt && tp->listen_opt->qlen) {
+		read_lock_bh(&tp->accept_queue.syn_wait_lock);
+		if (reqsk_queue_len(&tp->accept_queue)) {
 start_req:
 			st->uid		= sock_i_uid(sk);
 			st->syn_wait_sk = sk;
@@ -2196,7 +2188,7 @@ start_req:
 			st->sbucket	= 0;
 			goto get_req;
 		}
-		read_unlock_bh(&tp->syn_wait_lock);
+		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
 	}
 	if (++st->bucket < TCP_LHTABLE_SIZE) {
 		sk = sk_head(&tcp_listening_hash[st->bucket]);
@@ -2383,7 +2375,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 	case TCP_SEQ_STATE_OPENREQ:
 		if (v) {
 			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
-			read_unlock_bh(&tp->syn_wait_lock);
+			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
 		}
 	case TCP_SEQ_STATE_LISTENING:
 		if (v != SEQ_START_TOKEN)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 0e6d525a834..b3943e7562f 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -790,10 +790,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->probes_out = 0;
 		newtp->rx_opt.num_sacks = 0;
 		newtp->urg_data = 0;
-		newtp->listen_opt = NULL;
-		newtp->accept_queue = newtp->accept_queue_tail = NULL;
-		/* Deinitialize syn_wait_lock to trap illegal accesses. */
-		memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
+		/* Deinitialize accept_queue to trap illegal accesses. */
+		memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));
 
 		/* Back to base struct sock members. */
 		newsk->sk_err = 0;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f03efe5fb76..d97d191149c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -464,7 +464,7 @@ out_unlock:
 static void tcp_synack_timer(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt = tp->listen_opt;
+	struct tcp_listen_opt *lopt = tp->accept_queue.listen_opt;
 	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
 	int thresh = max_retries;
 	unsigned long now = jiffies;
@@ -527,12 +527,8 @@ static void tcp_synack_timer(struct sock *sk)
 				}
 
 				/* Drop this request */
-				write_lock(&tp->syn_wait_lock);
-				*reqp = req->dl_next;
-				write_unlock(&tp->syn_wait_lock);
-				lopt->qlen--;
-				if (req->retrans == 0)
-					lopt->qlen_young--;
+				tcp_synq_unlink(tp, req, reqp);
+				reqsk_queue_removed(&tp->accept_queue, req);
 				reqsk_free(req);
 				continue;
 			}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 068cd4a8c29..84091daad6b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -401,7 +401,7 @@ static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp,
 					      struct in6_addr *laddr,
 					      int iif)
 {
-	struct tcp_listen_opt *lopt = tp->listen_opt;
+	struct tcp_listen_opt *lopt = tp->accept_queue.listen_opt;
 	struct request_sock *req, **prev;  
 
 	for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
@@ -1267,18 +1267,10 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt = tp->listen_opt;
+	struct tcp_listen_opt *lopt = tp->accept_queue.listen_opt;
 	u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 
-	req->sk = NULL;
-	req->expires = jiffies + TCP_TIMEOUT_INIT;
-	req->retrans = 0;
-	req->dl_next = lopt->syn_table[h];
-
-	write_lock(&tp->syn_wait_lock);
-	lopt->syn_table[h] = req;
-	write_unlock(&tp->syn_wait_lock);
-
+	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
 	tcp_synq_added(sk);
 }
 
-- 
cgit v1.2.3


From 2ad69c55a282315e6119cf7fd744f26a925bdfd2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Date: Sat, 18 Jun 2005 22:48:55 -0700
Subject: [NET] rename struct tcp_listen_opt to struct listen_sock

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/request_sock.c | 4 ++--
 net/ipv4/tcp.c          | 2 +-
 net/ipv4/tcp_diag.c     | 2 +-
 net/ipv4/tcp_ipv4.c     | 4 ++--
 net/ipv4/tcp_timer.c    | 2 +-
 net/ipv6/tcp_ipv6.c     | 4 ++--
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 1258333ca00..78fd60a46bf 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -21,9 +21,9 @@
 int reqsk_queue_alloc(struct request_sock_queue *queue,
 		      const int nr_table_entries)
 {
-	const int lopt_size = sizeof(struct tcp_listen_opt) +
+	const int lopt_size = sizeof(struct listen_sock) +
 			      nr_table_entries * sizeof(struct request_sock *);
-	struct tcp_listen_opt *lopt = kmalloc(lopt_size, GFP_KERNEL);
+	struct listen_sock *lopt = kmalloc(lopt_size, GFP_KERNEL);
 
 	if (lopt == NULL)
 		return -ENOMEM;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b85a46dd40a..3a4c52e77e0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -499,7 +499,7 @@ int tcp_listen_start(struct sock *sk)
 static void tcp_listen_stop (struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt;
+	struct listen_sock *lopt;
 	struct request_sock *acc_req;
 	struct request_sock *req;
 	int i;
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index c3328fa4883..634befc0792 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -514,7 +514,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
 	struct tcpdiag_entry entry;
 	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt;
+	struct listen_sock *lopt;
 	struct rtattr *bc = NULL;
 	struct inet_sock *inet = inet_sk(sk);
 	int j, s_j;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1745dc8d25e..485ca9cb170 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -874,7 +874,7 @@ static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
 					      __u16 rport,
 					      __u32 raddr, __u32 laddr)
 {
-	struct tcp_listen_opt *lopt = tp->accept_queue.listen_opt;
+	struct listen_sock *lopt = tp->accept_queue.listen_opt;
 	struct request_sock *req, **prev;
 
 	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
@@ -898,7 +898,7 @@ static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt = tp->accept_queue.listen_opt;
+	struct listen_sock *lopt = tp->accept_queue.listen_opt;
 	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 
 	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d97d191149c..b127b449856 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -464,7 +464,7 @@ out_unlock:
 static void tcp_synack_timer(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt = tp->accept_queue.listen_opt;
+	struct listen_sock *lopt = tp->accept_queue.listen_opt;
 	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
 	int thresh = max_retries;
 	unsigned long now = jiffies;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 84091daad6b..2414937f2a8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -401,7 +401,7 @@ static struct request_sock *tcp_v6_search_req(struct tcp_sock *tp,
 					      struct in6_addr *laddr,
 					      int iif)
 {
-	struct tcp_listen_opt *lopt = tp->accept_queue.listen_opt;
+	struct listen_sock *lopt = tp->accept_queue.listen_opt;
 	struct request_sock *req, **prev;  
 
 	for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)];
@@ -1267,7 +1267,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_listen_opt *lopt = tp->accept_queue.listen_opt;
+	struct listen_sock *lopt = tp->accept_queue.listen_opt;
 	u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
 
 	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
-- 
cgit v1.2.3


From e52c1f17e4ea8e61bd26eb25f1a184202693c2b9 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 18 Jun 2005 22:49:40 -0700
Subject: [NET]: Move sysctl_max_syn_backlog into request_sock.c

This fixes the CONFIG_INET=n build failure noticed
by Andrew Morton.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/request_sock.c | 16 ++++++++++++++++
 net/ipv4/tcp_ipv4.c     | 16 ----------------
 2 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 78fd60a46bf..bb55675f068 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -18,6 +18,22 @@
 
 #include <net/request_sock.h>
 
+/*
+ * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
+ * One SYN_RECV socket costs about 80bytes on a 32bit machine.
+ * It would be better to replace it with a global counter for all sockets
+ * but then some measure against one socket starving all other sockets
+ * would be needed.
+ *
+ * It was 128 by default. Experiments with real servers show, that
+ * it is absolutely not enough even at 100conn/sec. 256 cures most
+ * of problems. This value is adjusted to 128 for very small machines
+ * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * Further increasing requires to change hash table size.
+ */
+int sysctl_max_syn_backlog = 256;
+EXPORT_SYMBOL(sysctl_max_syn_backlog);
+
 int reqsk_queue_alloc(struct request_sock_queue *queue,
 		      const int nr_table_entries)
 {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 485ca9cb170..2d41d5d6ad1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1366,21 +1366,6 @@ static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
 	return dopt;
 }
 
-/*
- * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
- * One SYN_RECV socket costs about 80bytes on a 32bit machine.
- * It would be better to replace it with a global counter for all sockets
- * but then some measure against one socket starving all other sockets
- * would be needed.
- *
- * It was 128 by default. Experiments with real servers show, that
- * it is absolutely not enough even at 100conn/sec. 256 cures most
- * of problems. This value is adjusted to 128 for very small machines
- * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
- * Further increasing requires to change hash table size.
- */
-int sysctl_max_syn_backlog = 256;
-
 struct request_sock_ops tcp_request_sock_ops = {
 	.family		=	PF_INET,
 	.obj_size	=	sizeof(struct tcp_request_sock),
@@ -2662,7 +2647,6 @@ EXPORT_SYMBOL(tcp_proc_register);
 EXPORT_SYMBOL(tcp_proc_unregister);
 #endif
 EXPORT_SYMBOL(sysctl_local_port_range);
-EXPORT_SYMBOL(sysctl_max_syn_backlog);
 EXPORT_SYMBOL(sysctl_tcp_low_latency);
 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
 
-- 
cgit v1.2.3


From c7fb64db001f83ece669c76a02d8ec2fdb1dd307 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:50:55 -0700
Subject: [NETLINK]: Neighbour table configuration and statistics via rtnetlink

To retrieve the neighbour tables send RTM_GETNEIGHTBL with the
NLM_F_DUMP flag set. Every neighbour table configuration is
spread over multiple messages to avoid running into message
size limits on systems with many interfaces. The first message
in the sequence transports all not device specific data such as
statistics, configuration, and the default parameter set.
This message is followed by 0..n messages carrying device
specific parameter sets.

Although the ordering should be sufficient, NDTA_NAME can be
used to identify sequences. The initial message can be identified
by checking for NDTA_CONFIG. The device specific messages do
not contain this TLV but have NDTPA_IFINDEX set to the
corresponding interface index.

To change neighbour table attributes, send RTM_SETNEIGHTBL
with NDTA_NAME set. Changeable attribute include NDTA_THRESH[1-3],
NDTA_GC_INTERVAL, and all TLVs in NDTA_PARMS unless marked
otherwise. Device specific parameter sets can be changed by
setting NDTPA_IFINDEX to the interface index of the corresponding
device.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 317 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/core/rtnetlink.c |  20 ++--
 2 files changed, 326 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 43bdc521e20..0841ac78c67 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1276,9 +1276,14 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 		INIT_RCU_HEAD(&p->rcu_head);
 		p->reachable_time =
 				neigh_rand_reach_time(p->base_reachable_time);
-		if (dev && dev->neigh_setup && dev->neigh_setup(dev, p)) {
-			kfree(p);
-			return NULL;
+		if (dev) {
+			if (dev->neigh_setup && dev->neigh_setup(dev, p)) {
+				kfree(p);
+				return NULL;
+			}
+
+			dev_hold(dev);
+			p->dev = dev;
 		}
 		p->sysctl_table = NULL;
 		write_lock_bh(&tbl->lock);
@@ -1309,6 +1314,8 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
 			*p = parms->next;
 			parms->dead = 1;
 			write_unlock_bh(&tbl->lock);
+			if (parms->dev)
+				dev_put(parms->dev);
 			call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
 			return;
 		}
@@ -1546,6 +1553,308 @@ out:
 	return err;
 }
 
+static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
+{
+	struct rtattr *nest = RTA_NEST(skb, NDTA_PARMS);
+
+	if (parms->dev)
+		RTA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
+
+	RTA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
+	RTA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+	RTA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
+	RTA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
+	RTA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
+	RTA_PUT_U32(skb, NDTPA_MCAST_PROBES, parms->mcast_probes);
+	RTA_PUT_MSECS(skb, NDTPA_REACHABLE_TIME, parms->reachable_time);
+	RTA_PUT_MSECS(skb, NDTPA_BASE_REACHABLE_TIME,
+		      parms->base_reachable_time);
+	RTA_PUT_MSECS(skb, NDTPA_GC_STALETIME, parms->gc_staletime);
+	RTA_PUT_MSECS(skb, NDTPA_DELAY_PROBE_TIME, parms->delay_probe_time);
+	RTA_PUT_MSECS(skb, NDTPA_RETRANS_TIME, parms->retrans_time);
+	RTA_PUT_MSECS(skb, NDTPA_ANYCAST_DELAY, parms->anycast_delay);
+	RTA_PUT_MSECS(skb, NDTPA_PROXY_DELAY, parms->proxy_delay);
+	RTA_PUT_MSECS(skb, NDTPA_LOCKTIME, parms->locktime);
+
+	return RTA_NEST_END(skb, nest);
+
+rtattr_failure:
+	return RTA_NEST_CANCEL(skb, nest);
+}
+
+static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
+			      struct netlink_callback *cb)
+{
+	struct nlmsghdr *nlh;
+	struct ndtmsg *ndtmsg;
+
+	nlh = NLMSG_PUT_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg));
+	ndtmsg = NLMSG_DATA(nlh);
+
+	NLMSG_SET_MULTIPART(nlh);
+
+	read_lock_bh(&tbl->lock);
+	ndtmsg->ndtm_family = tbl->family;
+
+	RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
+	RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
+	RTA_PUT_U32(skb, NDTA_THRESH1, tbl->gc_thresh1);
+	RTA_PUT_U32(skb, NDTA_THRESH2, tbl->gc_thresh2);
+	RTA_PUT_U32(skb, NDTA_THRESH3, tbl->gc_thresh3);
+
+	{
+		unsigned long now = jiffies;
+		unsigned int flush_delta = now - tbl->last_flush;
+		unsigned int rand_delta = now - tbl->last_rand;
+
+		struct ndt_config ndc = {
+			.ndtc_key_len		= tbl->key_len,
+			.ndtc_entry_size	= tbl->entry_size,
+			.ndtc_entries		= atomic_read(&tbl->entries),
+			.ndtc_last_flush	= jiffies_to_msecs(flush_delta),
+			.ndtc_last_rand		= jiffies_to_msecs(rand_delta),
+			.ndtc_hash_rnd		= tbl->hash_rnd,
+			.ndtc_hash_mask		= tbl->hash_mask,
+			.ndtc_hash_chain_gc	= tbl->hash_chain_gc,
+			.ndtc_proxy_qlen	= tbl->proxy_queue.qlen,
+		};
+
+		RTA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
+	}
+
+	{
+		int cpu;
+		struct ndt_stats ndst;
+
+		memset(&ndst, 0, sizeof(ndst));
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			struct neigh_statistics	*st;
+
+			if (!cpu_possible(cpu))
+				continue;
+
+			st = per_cpu_ptr(tbl->stats, cpu);
+			ndst.ndts_allocs		+= st->allocs;
+			ndst.ndts_destroys		+= st->destroys;
+			ndst.ndts_hash_grows		+= st->hash_grows;
+			ndst.ndts_res_failed		+= st->res_failed;
+			ndst.ndts_lookups		+= st->lookups;
+			ndst.ndts_hits			+= st->hits;
+			ndst.ndts_rcv_probes_mcast	+= st->rcv_probes_mcast;
+			ndst.ndts_rcv_probes_ucast	+= st->rcv_probes_ucast;
+			ndst.ndts_periodic_gc_runs	+= st->periodic_gc_runs;
+			ndst.ndts_forced_gc_runs	+= st->forced_gc_runs;
+		}
+
+		RTA_PUT(skb, NDTA_STATS, sizeof(ndst), &ndst);
+	}
+
+	BUG_ON(tbl->parms.dev);
+	if (neightbl_fill_parms(skb, &tbl->parms) < 0)
+		goto rtattr_failure;
+
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_END(skb, nlh);
+
+rtattr_failure:
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_CANCEL(skb, nlh);
+ 
+nlmsg_failure:
+	return -1;
+}
+
+static int neightbl_fill_param_info(struct neigh_table *tbl,
+				    struct neigh_parms *parms,
+				    struct sk_buff *skb,
+				    struct netlink_callback *cb)
+{
+	struct ndtmsg *ndtmsg;
+	struct nlmsghdr *nlh;
+
+	nlh = NLMSG_PUT_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg));
+	ndtmsg = NLMSG_DATA(nlh);
+
+	NLMSG_SET_MULTIPART(nlh);
+
+	read_lock_bh(&tbl->lock);
+	ndtmsg->ndtm_family = tbl->family;
+	RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
+
+	if (neightbl_fill_parms(skb, parms) < 0)
+		goto rtattr_failure;
+
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_END(skb, nlh);
+
+rtattr_failure:
+	read_unlock_bh(&tbl->lock);
+	return NLMSG_CANCEL(skb, nlh);
+
+nlmsg_failure:
+	return -1;
+}
+ 
+static inline struct neigh_parms *lookup_neigh_params(struct neigh_table *tbl,
+						      int ifindex)
+{
+	struct neigh_parms *p;
+	
+	for (p = &tbl->parms; p; p = p->next)
+		if ((p->dev && p->dev->ifindex == ifindex) ||
+		    (!p->dev && !ifindex))
+			return p;
+
+	return NULL;
+}
+
+int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct neigh_table *tbl;
+	struct ndtmsg *ndtmsg = NLMSG_DATA(nlh);
+	struct rtattr **tb = arg;
+	int err = -EINVAL;
+
+	if (!tb[NDTA_NAME - 1] || !RTA_PAYLOAD(tb[NDTA_NAME - 1]))
+		return -EINVAL;
+
+	read_lock(&neigh_tbl_lock);
+	for (tbl = neigh_tables; tbl; tbl = tbl->next) {
+		if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
+			continue;
+
+		if (!rtattr_strcmp(tb[NDTA_NAME - 1], tbl->id))
+			break;
+	}
+
+	if (tbl == NULL) {
+		err = -ENOENT;
+		goto errout;
+	}
+
+	/* 
+	 * We acquire tbl->lock to be nice to the periodic timers and
+	 * make sure they always see a consistent set of values.
+	 */
+	write_lock_bh(&tbl->lock);
+
+	if (tb[NDTA_THRESH1 - 1])
+		tbl->gc_thresh1 = RTA_GET_U32(tb[NDTA_THRESH1 - 1]);
+
+	if (tb[NDTA_THRESH2 - 1])
+		tbl->gc_thresh2 = RTA_GET_U32(tb[NDTA_THRESH2 - 1]);
+
+	if (tb[NDTA_THRESH3 - 1])
+		tbl->gc_thresh3 = RTA_GET_U32(tb[NDTA_THRESH3 - 1]);
+
+	if (tb[NDTA_GC_INTERVAL - 1])
+		tbl->gc_interval = RTA_GET_MSECS(tb[NDTA_GC_INTERVAL - 1]);
+
+	if (tb[NDTA_PARMS - 1]) {
+		struct rtattr *tbp[NDTPA_MAX];
+		struct neigh_parms *p;
+		u32 ifindex = 0;
+
+		if (rtattr_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS - 1]) < 0)
+			goto rtattr_failure;
+
+		if (tbp[NDTPA_IFINDEX - 1])
+			ifindex = RTA_GET_U32(tbp[NDTPA_IFINDEX - 1]);
+
+		p = lookup_neigh_params(tbl, ifindex);
+		if (p == NULL) {
+			err = -ENOENT;
+			goto rtattr_failure;
+		}
+	
+		if (tbp[NDTPA_QUEUE_LEN - 1])
+			p->queue_len = RTA_GET_U32(tbp[NDTPA_QUEUE_LEN - 1]);
+
+		if (tbp[NDTPA_PROXY_QLEN - 1])
+			p->proxy_qlen = RTA_GET_U32(tbp[NDTPA_PROXY_QLEN - 1]);
+
+		if (tbp[NDTPA_APP_PROBES - 1])
+			p->app_probes = RTA_GET_U32(tbp[NDTPA_APP_PROBES - 1]);
+
+		if (tbp[NDTPA_UCAST_PROBES - 1])
+			p->ucast_probes =
+			   RTA_GET_U32(tbp[NDTPA_UCAST_PROBES - 1]);
+
+		if (tbp[NDTPA_MCAST_PROBES - 1])
+			p->mcast_probes =
+			   RTA_GET_U32(tbp[NDTPA_MCAST_PROBES - 1]);
+
+		if (tbp[NDTPA_BASE_REACHABLE_TIME - 1])
+			p->base_reachable_time =
+			   RTA_GET_MSECS(tbp[NDTPA_BASE_REACHABLE_TIME - 1]);
+
+		if (tbp[NDTPA_GC_STALETIME - 1])
+			p->gc_staletime =
+			   RTA_GET_MSECS(tbp[NDTPA_GC_STALETIME - 1]);
+
+		if (tbp[NDTPA_DELAY_PROBE_TIME - 1])
+			p->delay_probe_time =
+			   RTA_GET_MSECS(tbp[NDTPA_DELAY_PROBE_TIME - 1]);
+
+		if (tbp[NDTPA_RETRANS_TIME - 1])
+			p->retrans_time =
+			   RTA_GET_MSECS(tbp[NDTPA_RETRANS_TIME - 1]);
+
+		if (tbp[NDTPA_ANYCAST_DELAY - 1])
+			p->anycast_delay =
+			   RTA_GET_MSECS(tbp[NDTPA_ANYCAST_DELAY - 1]);
+
+		if (tbp[NDTPA_PROXY_DELAY - 1])
+			p->proxy_delay =
+			   RTA_GET_MSECS(tbp[NDTPA_PROXY_DELAY - 1]);
+
+		if (tbp[NDTPA_LOCKTIME - 1])
+			p->locktime = RTA_GET_MSECS(tbp[NDTPA_LOCKTIME - 1]);
+	}
+
+	err = 0;
+
+rtattr_failure:
+	write_unlock_bh(&tbl->lock);
+errout:
+	read_unlock(&neigh_tbl_lock);
+	return err;
+}
+
+int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int idx, family;
+	int s_idx = cb->args[0];
+	struct neigh_table *tbl;
+
+	family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family;
+
+	read_lock(&neigh_tbl_lock);
+	for (tbl = neigh_tables, idx = 0; tbl; tbl = tbl->next) {
+		struct neigh_parms *p;
+
+		if (idx < s_idx || (family && tbl->family != family))
+			continue;
+
+		if (neightbl_fill_info(tbl, skb, cb) <= 0)
+			break;
+
+		for (++idx, p = tbl->parms.next; p; p = p->next, idx++) {
+			if (idx < s_idx)
+				continue;
+
+			if (neightbl_fill_param_info(tbl, p, skb, cb) <= 0)
+				goto out;
+		}
+
+	}
+out:
+	read_unlock(&neigh_tbl_lock);
+	cb->args[0] = idx;
+
+	return skb->len;
+}
 
 static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
 			   u32 pid, u32 seq, int event)
@@ -2352,6 +2661,8 @@ EXPORT_SYMBOL(neigh_update);
 EXPORT_SYMBOL(neigh_update_hhs);
 EXPORT_SYMBOL(pneigh_enqueue);
 EXPORT_SYMBOL(pneigh_lookup);
+EXPORT_SYMBOL(neightbl_dump_info);
+EXPORT_SYMBOL(neightbl_set);
 
 #ifdef CONFIG_ARPD
 EXPORT_SYMBOL(neigh_app_ns);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 00caf4b318b..56a20f014b8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -100,6 +100,7 @@ static const int rtm_min[RTM_NR_FAMILIES] =
 	[RTM_FAM(RTM_NEWPREFIX)]    = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
 	[RTM_FAM(RTM_GETMULTICAST)] = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
 	[RTM_FAM(RTM_GETANYCAST)]   = NLMSG_LENGTH(sizeof(struct rtgenmsg)),
+	[RTM_FAM(RTM_NEWNEIGHTBL)]  = NLMSG_LENGTH(sizeof(struct ndtmsg)),
 };
 
 static const int rta_max[RTM_NR_FAMILIES] =
@@ -113,6 +114,7 @@ static const int rta_max[RTM_NR_FAMILIES] =
 	[RTM_FAM(RTM_NEWTCLASS)]    = TCA_MAX,
 	[RTM_FAM(RTM_NEWTFILTER)]   = TCA_MAX,
 	[RTM_FAM(RTM_NEWACTION)]    = TCAA_MAX,
+	[RTM_FAM(RTM_NEWNEIGHTBL)]  = NDTA_MAX,
 };
 
 void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
@@ -649,14 +651,16 @@ static void rtnetlink_rcv(struct sock *sk, int len)
 
 static struct rtnetlink_link link_rtnetlink_table[RTM_NR_MSGTYPES] =
 {
-	[RTM_GETLINK  - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo },
-	[RTM_SETLINK  - RTM_BASE] = { .doit   = do_setlink	      },
-	[RTM_GETADDR  - RTM_BASE] = { .dumpit = rtnetlink_dump_all    },
-	[RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all    },
-	[RTM_NEWNEIGH - RTM_BASE] = { .doit   = neigh_add	      },
-	[RTM_DELNEIGH - RTM_BASE] = { .doit   = neigh_delete	      },
-	[RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info	      },
-	[RTM_GETRULE  - RTM_BASE] = { .dumpit = rtnetlink_dump_all    },
+	[RTM_GETLINK     - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo },
+	[RTM_SETLINK     - RTM_BASE] = { .doit   = do_setlink		 },
+	[RTM_GETADDR     - RTM_BASE] = { .dumpit = rtnetlink_dump_all	 },
+	[RTM_GETROUTE    - RTM_BASE] = { .dumpit = rtnetlink_dump_all	 },
+	[RTM_NEWNEIGH    - RTM_BASE] = { .doit   = neigh_add		 },
+	[RTM_DELNEIGH    - RTM_BASE] = { .doit   = neigh_delete		 },
+	[RTM_GETNEIGH    - RTM_BASE] = { .dumpit = neigh_dump_info	 },
+	[RTM_GETRULE     - RTM_BASE] = { .dumpit = rtnetlink_dump_all	 },
+	[RTM_GETNEIGHTBL - RTM_BASE] = { .dumpit = neightbl_dump_info	 },
+	[RTM_SETNEIGHTBL - RTM_BASE] = { .doit   = neightbl_set		 },
 };
 
 static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
-- 
cgit v1.2.3


From 4b6ea82dd18c97598c3caaa8d0b1feec87857e70 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:51:43 -0700
Subject: [NETLINK]: Kill bogus NLMSG_SET_MULTIPART uses.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 0841ac78c67..d1f8f7847f7 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1589,9 +1589,9 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
 	struct ndtmsg *ndtmsg;
 
 	nlh = NLMSG_PUT_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg));
-	ndtmsg = NLMSG_DATA(nlh);
+	nlh->nlmsg_flags |= NLM_F_MULTI;
 
-	NLMSG_SET_MULTIPART(nlh);
+	ndtmsg = NLMSG_DATA(nlh);
 
 	read_lock_bh(&tbl->lock);
 	ndtmsg->ndtm_family = tbl->family;
@@ -1674,9 +1674,9 @@ static int neightbl_fill_param_info(struct neigh_table *tbl,
 	struct nlmsghdr *nlh;
 
 	nlh = NLMSG_PUT_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg));
-	ndtmsg = NLMSG_DATA(nlh);
+	nlh->nlmsg_flags |= NLM_F_MULTI;
 
-	NLMSG_SET_MULTIPART(nlh);
+	ndtmsg = NLMSG_DATA(nlh);
 
 	read_lock_bh(&tbl->lock);
 	ndtmsg->ndtm_family = tbl->family;
-- 
cgit v1.2.3


From e386c6eb431ca2e435d0202ad6997f3d2ccab2ce Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:52:09 -0700
Subject: [NEIGH]: Fix use of uninitialized variable when trimming in
 neightbl_fill_parms

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d1f8f7847f7..2296a145fb7 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1555,7 +1555,9 @@ out:
 
 static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
 {
-	struct rtattr *nest = RTA_NEST(skb, NDTA_PARMS);
+	struct rtattr *nest = NULL;
+	
+	nest = RTA_NEST(skb, NDTA_PARMS);
 
 	if (parms->dev)
 		RTA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
-- 
cgit v1.2.3


From 758cc43c6d7326c62751fb516485e8e188854637 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:52:54 -0700
Subject: [PKT_SCHED]: Fix dsmark to apply changes consistent

Fixes dsmark to do all configuration sanity checks first and
only apply the changes if all of them can be applied without
any errors. Also fixes the weak sanity checks for DSMARK_VALUE
and DSMASK_MASK.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_dsmark.c | 131 +++++++++++++++++++++++++++++++------------------
 1 file changed, 82 insertions(+), 49 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index d8bd2a569c7..66abf139f4b 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -62,6 +62,21 @@ struct dsmark_qdisc_data {
 	int			set_tc_index;
 };
 
+static inline int dsmark_valid_indices(u16 indices)
+{
+	while (indices != 1) {
+		if (indices & 1)
+			return 0;
+		indices >>= 1;
+	}
+ 
+	return 1;
+}
+
+static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
+{
+	return (index <= p->indices && index > 0);
+}
 
 /* ------------------------- Class/flow operations ------------------------- */
 
@@ -120,31 +135,39 @@ static void dsmark_put(struct Qdisc *sch, unsigned long cl)
 
 
 static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
-    struct rtattr **tca, unsigned long *arg)
+			 struct rtattr **tca, unsigned long *arg)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
 	struct rtattr *opt = tca[TCA_OPTIONS-1];
 	struct rtattr *tb[TCA_DSMARK_MAX];
+	int err = -EINVAL;
+	u8 mask = 0;
 
 	DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x),"
-	    "arg 0x%lx\n",sch,p,classid,parent,*arg);
-	if (*arg > p->indices)
-		return -ENOENT;
-	if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt))
-		return -EINVAL;
-	if (tb[TCA_DSMARK_MASK-1]) {
-		if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1]))
-			return -EINVAL;
-		p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]);
-	}
-	if (tb[TCA_DSMARK_VALUE-1]) {
-		if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1]))
-			return -EINVAL;
-		p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]);
+		"arg 0x%lx\n", sch, p, classid, parent, *arg);
+
+	if (!dsmark_valid_index(p, *arg)) {
+		err = -ENOENT;
+		goto rtattr_failure;
 	}
-	return 0;
-}
 
+	if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt))
+		goto rtattr_failure;
+
+	if (tb[TCA_DSMARK_MASK-1])
+		mask = RTA_GET_U8(tb[TCA_DSMARK_MASK-1]);
+
+	if (tb[TCA_DSMARK_VALUE-1])
+		p->value[*arg-1] = RTA_GET_U8(tb[TCA_DSMARK_VALUE-1]);
+		
+	if (tb[TCA_DSMARK_MASK-1])
+		p->mask[*arg-1] = mask;
+
+	err = 0;
+
+rtattr_failure:
+	return err;
+}
 
 static int dsmark_delete(struct Qdisc *sch,unsigned long arg)
 {
@@ -328,43 +351,53 @@ static unsigned int dsmark_drop(struct Qdisc *sch)
 }
 
 
-static int dsmark_init(struct Qdisc *sch,struct rtattr *opt)
+static int dsmark_init(struct Qdisc *sch, struct rtattr *opt)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
 	struct rtattr *tb[TCA_DSMARK_MAX];
-	__u16 tmp;
-
-	DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt);
-	if (!opt ||
-	    rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 ||
-	    !tb[TCA_DSMARK_INDICES-1] ||
-	    RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16))
-                return -EINVAL;
-	p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]);
-	if (!p->indices)
-		return -EINVAL;
-	for (tmp = p->indices; tmp != 1; tmp >>= 1) {
-		if (tmp & 1)
-			return -EINVAL;
-	}
-	p->default_index = NO_DEFAULT_INDEX;
-	if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) {
-		if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16))
-			return -EINVAL;
-		p->default_index =
-		    *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]);
+	int err = -EINVAL;
+	u32 default_index = NO_DEFAULT_INDEX;
+	u16 indices;
+	u8 *mask;
+
+	DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
+
+	if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt) < 0)
+		goto errout;
+
+	indices = RTA_GET_U16(tb[TCA_DSMARK_INDICES-1]);
+	if (!indices || !dsmark_valid_indices(indices))
+		goto errout;
+
+	if (tb[TCA_DSMARK_DEFAULT_INDEX-1])
+		default_index = RTA_GET_U16(tb[TCA_DSMARK_DEFAULT_INDEX-1]);
+
+	mask = kmalloc(indices * 2, GFP_KERNEL);
+	if (mask == NULL) {
+		err = -ENOMEM;
+		goto errout;
 	}
-	p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1];
-	p->mask = kmalloc(p->indices*2,GFP_KERNEL);
-	if (!p->mask)
-		return -ENOMEM;
-	p->value = p->mask+p->indices;
-	memset(p->mask,0xff,p->indices);
-	memset(p->value,0,p->indices);
-	if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)))
+
+	p->mask = mask;
+	memset(p->mask, 0xff, indices);
+
+	p->value = p->mask + indices;
+	memset(p->value, 0, indices);
+
+	p->indices = indices;
+	p->default_index = default_index;
+	p->set_tc_index = RTA_GET_FLAG(tb[TCA_DSMARK_SET_TC_INDEX-1]);
+
+	p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
+	if (p->q == NULL)
 		p->q = &noop_qdisc;
-	DPRINTK("dsmark_init: qdisc %p\n",&p->q);
-	return 0;
+
+	DPRINTK("dsmark_init: qdisc %p\n", p->q);
+
+	err = 0;
+errout:
+rtattr_failure:
+	return err;
 }
 
 
-- 
cgit v1.2.3


From 02f23f095f1d9534873ecb5d94bbdb0ab67f1d8e Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:53:12 -0700
Subject: [PKT_SCHED]: Make dsmark use the new dumping macros

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_dsmark.c | 52 +++++++++++++++++++++++---------------------------
 1 file changed, 24 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 66abf139f4b..ac0efeae312 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -428,50 +428,46 @@ static void dsmark_destroy(struct Qdisc *sch)
 
 
 static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
-    struct sk_buff *skb, struct tcmsg *tcm)
+			     struct sk_buff *skb, struct tcmsg *tcm)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
-	unsigned char *b = skb->tail;
-	struct rtattr *rta;
+	struct rtattr *opts = NULL;
 
-	DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl);
-	if (!cl || cl > p->indices)
+	DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n", sch, p, cl);
+
+	if (!dsmark_valid_index(p, cl))
 		return -EINVAL;
-	tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1);
-	rta = (struct rtattr *) b;
-	RTA_PUT(skb,TCA_OPTIONS,0,NULL);
-	RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]);
-	RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]);
-	rta->rta_len = skb->tail-b;
-	return skb->len;
+
+	tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1);
+
+	opts = RTA_NEST(skb, TCA_OPTIONS);
+	RTA_PUT_U8(skb,TCA_DSMARK_MASK, p->mask[cl-1]);
+	RTA_PUT_U8(skb,TCA_DSMARK_VALUE, p->value[cl-1]);
+
+	return RTA_NEST_END(skb, opts);
 
 rtattr_failure:
-	skb_trim(skb,b-skb->data);
-	return -1;
+	return RTA_NEST_CANCEL(skb, opts);
 }
 
 static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
-	unsigned char *b = skb->tail;
-	struct rtattr *rta;
+	struct rtattr *opts = NULL;
 
-	rta = (struct rtattr *) b;
-	RTA_PUT(skb,TCA_OPTIONS,0,NULL);
-	RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices);
-	if (p->default_index != NO_DEFAULT_INDEX) {
-		__u16 tmp = p->default_index;
+	opts = RTA_NEST(skb, TCA_OPTIONS);
+	RTA_PUT_U16(skb, TCA_DSMARK_INDICES, p->indices);
+
+	if (p->default_index != NO_DEFAULT_INDEX)
+		RTA_PUT_U16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index);
 
-		RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp);
-	}
 	if (p->set_tc_index)
-		RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL);
-	rta->rta_len = skb->tail-b;
-	return skb->len;
+		RTA_PUT_FLAG(skb, TCA_DSMARK_SET_TC_INDEX);
+
+	return RTA_NEST_END(skb, opts);
 
 rtattr_failure:
-	skb_trim(skb,b-skb->data);
-	return -1;
+	return RTA_NEST_CANCEL(skb, opts);
 }
 
 static struct Qdisc_class_ops dsmark_class_ops = {
-- 
cgit v1.2.3


From af0d114176720c2100dfa624ab433796d333d730 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:53:29 -0700
Subject: [PKT_SCHED]: Logic simplifications and codingstyle/whitespace
 cleanups

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_dsmark.c | 174 +++++++++++++++++++++++++------------------------
 1 file changed, 88 insertions(+), 86 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index ac0efeae312..13e0e7b3856 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -31,7 +31,7 @@
 #endif
 
 
-#define PRIV(sch) qdisc_priv(sch)
+#define PRIV(sch) ((struct dsmark_qdisc_data *) qdisc_priv(sch))
 
 
 /*
@@ -55,10 +55,10 @@
 struct dsmark_qdisc_data {
 	struct Qdisc		*q;
 	struct tcf_proto	*filter_list;
-	__u8			*mask;	/* "owns" the array */
-	__u8			*value;
-	__u16			indices;
-	__u32			default_index;	/* index range is 0...0xffff */
+	u8			*mask;	/* "owns" the array */
+	u8			*value;
+	u16			indices;
+	u32			default_index;	/* index range is 0...0xffff */
 	int			set_tc_index;
 };
 
@@ -80,14 +80,13 @@ static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
 
 /* ------------------------- Class/flow operations ------------------------- */
 
-
-static int dsmark_graft(struct Qdisc *sch,unsigned long arg,
-    struct Qdisc *new,struct Qdisc **old)
+static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
+			struct Qdisc *new, struct Qdisc **old)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
 
-	DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new,
-	    old);
+	DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",
+		sch, p, new, old);
 
 	if (new == NULL) {
 		new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
@@ -96,44 +95,37 @@ static int dsmark_graft(struct Qdisc *sch,unsigned long arg,
 	}
 
 	sch_tree_lock(sch);
-	*old = xchg(&p->q,new);
-	if (*old)
-		qdisc_reset(*old);
+	*old = xchg(&p->q, new);
+	qdisc_reset(*old);
 	sch->q.qlen = 0;
-	sch_tree_unlock(sch); /* @@@ move up ? */
+	sch_tree_unlock(sch);
+
         return 0;
 }
 
-
 static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
 {
-	struct dsmark_qdisc_data *p = PRIV(sch);
-
-	return p->q;
+	return PRIV(sch)->q;
 }
 
-
-static unsigned long dsmark_get(struct Qdisc *sch,u32 classid)
+static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
 {
-	struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch);
+	DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",
+		sch, PRIV(sch), classid);
 
-	DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid);
-	return TC_H_MIN(classid)+1;
+	return TC_H_MIN(classid) + 1;
 }
 
-
 static unsigned long dsmark_bind_filter(struct Qdisc *sch,
-    unsigned long parent, u32 classid)
+					unsigned long parent, u32 classid)
 {
-	return dsmark_get(sch,classid);
+	return dsmark_get(sch, classid);
 }
 
-
 static void dsmark_put(struct Qdisc *sch, unsigned long cl)
 {
 }
 
-
 static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
 			 struct rtattr **tca, unsigned long *arg)
 {
@@ -169,26 +161,29 @@ rtattr_failure:
 	return err;
 }
 
-static int dsmark_delete(struct Qdisc *sch,unsigned long arg)
+static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
 
-	if (!arg || arg > p->indices)
+	if (!dsmark_valid_index(p, arg))
 		return -EINVAL;
+	
 	p->mask[arg-1] = 0xff;
 	p->value[arg-1] = 0;
+
 	return 0;
 }
 
-
 static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
 	int i;
 
-	DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker);
+	DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
+
 	if (walker->stop)
 		return;
+
 	for (i = 0; i < p->indices; i++) {
 		if (p->mask[i] == 0xff && !p->value[i])
 			goto ignore;
@@ -203,26 +198,20 @@ ignore:
         }
 }
 
-
 static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl)
 {
-	struct dsmark_qdisc_data *p = PRIV(sch);
-
-	return &p->filter_list;
+	return &PRIV(sch)->filter_list;
 }
 
-
 /* --------------------------- Qdisc operations ---------------------------- */
 
-
 static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
-	struct tcf_result res;
-	int result;
-	int ret = NET_XMIT_POLICED;
+	int err;
+
+	D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
 
-	D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
 	if (p->set_tc_index) {
 		/* FIXME: Safe with non-linear skbs? --RR */
 		switch (skb->protocol) {
@@ -239,17 +228,21 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
 				break;
 		};
 	}
-	result = TC_POLICE_OK; /* be nice to gcc */
-	if (TC_H_MAJ(skb->priority) == sch->handle) {
+
+	if (TC_H_MAJ(skb->priority) == sch->handle)
 		skb->tc_index = TC_H_MIN(skb->priority);
-	} else {
-		result = tc_classify(skb,p->filter_list,&res);
-		D2PRINTK("result %d class 0x%04x\n",result,res.classid);
+	else {
+		struct tcf_result res;
+		int result = tc_classify(skb, p->filter_list, &res);
+
+		D2PRINTK("result %d class 0x%04x\n", result, res.classid);
+
 		switch (result) {
 #ifdef CONFIG_NET_CLS_POLICE
 			case TC_POLICE_SHOT:
 				kfree_skb(skb);
-				break;
+				sch->qstats.drops++;
+				return NET_XMIT_POLICED;
 #if 0
 			case TC_POLICE_RECLASSIFY:
 				/* FIXME: what to do here ??? */
@@ -266,43 +259,45 @@ static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch)
 				break;
 		};
 	}
-	if (
-#ifdef CONFIG_NET_CLS_POLICE
-	    result == TC_POLICE_SHOT ||
-#endif
 
-	    ((ret = p->q->enqueue(skb,p->q)) != 0)) {
+	err = p->q->enqueue(skb,p->q);
+	if (err != NET_XMIT_SUCCESS) {
 		sch->qstats.drops++;
-		return ret;
+		return err;
 	}
+
 	sch->bstats.bytes += skb->len;
 	sch->bstats.packets++;
 	sch->q.qlen++;
-	return ret;
-}
 
+	return NET_XMIT_SUCCESS;
+}
 
 static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
 	struct sk_buff *skb;
-	int index;
+	u32 index;
+
+	D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n", sch, p);
 
-	D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p);
 	skb = p->q->ops->dequeue(p->q);
-	if (!skb)
+	if (skb == NULL)
 		return NULL;
+
 	sch->q.qlen--;
-	index = skb->tc_index & (p->indices-1);
-	D2PRINTK("index %d->%d\n",skb->tc_index,index);
+
+	index = skb->tc_index & (p->indices - 1);
+	D2PRINTK("index %d->%d\n", skb->tc_index, index);
+
 	switch (skb->protocol) {
 		case __constant_htons(ETH_P_IP):
-			ipv4_change_dsfield(skb->nh.iph,
-			    p->mask[index],p->value[index]);
+			ipv4_change_dsfield(skb->nh.iph, p->mask[index],
+					    p->value[index]);
 			break;
 		case __constant_htons(ETH_P_IPV6):
-			ipv6_change_dsfield(skb->nh.ipv6h,
-			    p->mask[index],p->value[index]);
+			ipv6_change_dsfield(skb->nh.ipv6h, p->mask[index],
+					    p->value[index]);
 			break;
 		default:
 			/*
@@ -316,41 +311,46 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
 				       htons(skb->protocol));
 			break;
 	};
+
 	return skb;
 }
 
-
 static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch)
 {
-	int ret;
 	struct dsmark_qdisc_data *p = PRIV(sch);
+	int err;
 
-	D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p);
-        if ((ret = p->q->ops->requeue(skb, p->q)) == 0) {
-		sch->q.qlen++;
-		sch->qstats.requeues++;
-		return 0;
+	D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
+
+	err = p->q->ops->requeue(skb, p->q);
+	if (err != NET_XMIT_SUCCESS) {
+		sch->qstats.drops++;
+		return err;
 	}
-	sch->qstats.drops++;
-	return ret;
-}
 
+	sch->q.qlen++;
+	sch->qstats.requeues++;
+
+	return NET_XMIT_SUCCESS;
+}
 
 static unsigned int dsmark_drop(struct Qdisc *sch)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
 	unsigned int len;
 	
-	DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p);
-	if (!p->q->ops->drop)
-		return 0;
-	if (!(len = p->q->ops->drop(p->q)))
+	DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
+
+	if (p->q->ops->drop == NULL)
 		return 0;
-	sch->q.qlen--;
+
+	len = p->q->ops->drop(p->q);
+	if (len)
+		sch->q.qlen--;
+
 	return len;
 }
 
-
 static int dsmark_init(struct Qdisc *sch, struct rtattr *opt)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
@@ -400,33 +400,32 @@ rtattr_failure:
 	return err;
 }
 
-
 static void dsmark_reset(struct Qdisc *sch)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
 
-	DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p);
+	DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n", sch, p);
 	qdisc_reset(p->q);
 	sch->q.qlen = 0;
 }
 
-
 static void dsmark_destroy(struct Qdisc *sch)
 {
 	struct dsmark_qdisc_data *p = PRIV(sch);
 	struct tcf_proto *tp;
 
-	DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p);
+	DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n", sch, p);
+
 	while (p->filter_list) {
 		tp = p->filter_list;
 		p->filter_list = tp->next;
 		tcf_destroy(tp);
 	}
+
 	qdisc_destroy(p->q);
 	kfree(p->mask);
 }
 
-
 static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
 			     struct sk_buff *skb, struct tcmsg *tcm)
 {
@@ -505,10 +504,13 @@ static int __init dsmark_module_init(void)
 {
 	return register_qdisc(&dsmark_qdisc_ops);
 }
+
 static void __exit dsmark_module_exit(void) 
 {
 	unregister_qdisc(&dsmark_qdisc_ops);
 }
+
 module_init(dsmark_module_init)
 module_exit(dsmark_module_exit)
+
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 1797754ea7ee5e0d859b0a32506ff999f8d5fb71 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:53:48 -0700
Subject: [NETLINK]: Introduce NLMSG_NEW macro to better handle netlink flags

Introduces a new macro NLMSG_NEW which extends NLMSG_PUT but takes
a flags argument. NLMSG_PUT stays there for compatibility but now
calls NLMSG_NEW with flags == 0. NLMSG_PUT_ANSWER is renamed to
NLMSG_NEW_ANSWER which now also takes a flags argument.

Also converts the users of NLMSG_PUT_ANSWER to use NLMSG_NEW_ANSWER
and fixes the two direct users of __nlmsg_put to either provide
the flags or use NLMSG_NEW(_ANSWER).

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c     | 8 ++++----
 net/netlink/af_netlink.c | 8 +++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 2296a145fb7..0fb742e228c 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1590,8 +1590,8 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
 	struct nlmsghdr *nlh;
 	struct ndtmsg *ndtmsg;
 
-	nlh = NLMSG_PUT_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg));
-	nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg),
+			       NLM_F_MULTI);
 
 	ndtmsg = NLMSG_DATA(nlh);
 
@@ -1675,8 +1675,8 @@ static int neightbl_fill_param_info(struct neigh_table *tbl,
 	struct ndtmsg *ndtmsg;
 	struct nlmsghdr *nlh;
 
-	nlh = NLMSG_PUT_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg));
-	nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWNEIGHTBL, sizeof(struct ndtmsg),
+			       NLM_F_MULTI);
 
 	ndtmsg = NLMSG_DATA(nlh);
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e41ce458c2a..70bcd4744d9 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1095,8 +1095,7 @@ static int netlink_dump(struct sock *sk)
 		return 0;
 	}
 
-	nlh = __nlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLMSG_DONE, sizeof(int));
-	nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW_ANSWER(skb, cb, NLMSG_DONE, sizeof(len), NLM_F_MULTI);
 	memcpy(NLMSG_DATA(nlh), &len, sizeof(len));
 	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk, skb->len);
@@ -1107,6 +1106,9 @@ static int netlink_dump(struct sock *sk)
 
 	netlink_destroy_callback(cb);
 	return 0;
+
+nlmsg_failure:
+	return -ENOBUFS;
 }
 
 int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
@@ -1178,7 +1180,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
 	}
 
 	rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
-			  NLMSG_ERROR, sizeof(struct nlmsgerr));
+			  NLMSG_ERROR, sizeof(struct nlmsgerr), 0);
 	errmsg = NLMSG_DATA(rep);
 	errmsg->error = err;
 	memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(struct nlmsghdr));
-- 
cgit v1.2.3


From b6544c0b4cf2bd96195f3cdb7cebfb35090fc557 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Sat, 18 Jun 2005 22:54:12 -0700
Subject: [NETLINK]: Correctly set NLM_F_MULTI without checking the pid

This patch rectifies some rtnetlink message builders that derive the
flags from the pid. It is now explicit like the other cases
which get it right. Also fixes half a dozen dumpers which did not
set NLM_F_MULTI at all.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c     | 14 +++++++-------
 net/core/rtnetlink.c     | 13 ++++++++-----
 net/decnet/dn_dev.c      |  9 +++++----
 net/decnet/dn_route.c    | 11 ++++++-----
 net/decnet/dn_rules.c    |  7 ++++---
 net/decnet/dn_table.c    |  8 ++++----
 net/ipv4/devinet.c       |  9 ++++-----
 net/ipv4/fib_hash.c      |  3 ++-
 net/ipv4/fib_lookup.h    |  3 ++-
 net/ipv4/fib_rules.c     |  7 ++++---
 net/ipv4/fib_semantics.c |  6 +++---
 net/ipv4/route.c         | 11 +++++------
 net/ipv6/addrconf.c      | 49 +++++++++++++++++++++++-------------------------
 net/ipv6/route.c         | 11 ++++++-----
 14 files changed, 83 insertions(+), 78 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 0fb742e228c..f6bdcad47da 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1859,18 +1859,17 @@ out:
 }
 
 static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
-			   u32 pid, u32 seq, int event)
+			   u32 pid, u32 seq, int event, unsigned int flags)
 {
 	unsigned long now = jiffies;
 	unsigned char *b = skb->tail;
 	struct nda_cacheinfo ci;
 	int locked = 0;
 	u32 probes;
-	struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, event,
-					 sizeof(struct ndmsg));
+	struct nlmsghdr *nlh = NLMSG_NEW(skb, pid, seq, event,
+					 sizeof(struct ndmsg), flags);
 	struct ndmsg *ndm = NLMSG_DATA(nlh);
 
-	nlh->nlmsg_flags = pid ? NLM_F_MULTI : 0;
 	ndm->ndm_family	 = n->ops->family;
 	ndm->ndm_flags	 = n->flags;
 	ndm->ndm_type	 = n->type;
@@ -1920,7 +1919,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 				continue;
 			if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid,
 					    cb->nlh->nlmsg_seq,
-					    RTM_NEWNEIGH) <= 0) {
+					    RTM_NEWNEIGH,
+					    NLM_F_MULTI) <= 0) {
 				read_unlock_bh(&tbl->lock);
 				rc = -1;
 				goto out;
@@ -2329,7 +2329,7 @@ void neigh_app_ns(struct neighbour *n)
 	if (!skb)
 		return;
 
-	if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) {
+	if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH, 0) < 0) {
 		kfree_skb(skb);
 		return;
 	}
@@ -2348,7 +2348,7 @@ static void neigh_app_notify(struct neighbour *n)
 	if (!skb)
 		return;
 
-	if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) {
+	if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH, 0) < 0) {
 		kfree_skb(skb);
 		return;
 	}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 56a20f014b8..63bd8866518 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -178,14 +178,14 @@ rtattr_failure:
 
 
 static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
-				 int type, u32 pid, u32 seq, u32 change)
+				 int type, u32 pid, u32 seq, u32 change, 
+				 unsigned int flags)
 {
 	struct ifinfomsg *r;
 	struct nlmsghdr  *nlh;
 	unsigned char	 *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r));
-	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags);
 	r = NLMSG_DATA(nlh);
 	r->ifi_family = AF_UNSPEC;
 	r->ifi_type = dev->type;
@@ -275,7 +275,10 @@ static int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *c
 	for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
 		if (idx < s_idx)
 			continue;
-		if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0)
+		if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK,
+					  NETLINK_CB(cb->skb).pid,
+					  cb->nlh->nlmsg_seq, 0,
+					  NLM_F_MULTI) <= 0)
 			break;
 	}
 	read_unlock(&dev_base_lock);
@@ -449,7 +452,7 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 	if (!skb)
 		return;
 
-	if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, change) < 0) {
+	if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, change, 0) < 0) {
 		kfree_skb(skb);
 		return;
 	}
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index ee7bf46eb78..00233ecbc9c 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -716,13 +716,13 @@ static int dn_dev_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *a
 }
 
 static int dn_dev_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa,
-				u32 pid, u32 seq, int event)
+				u32 pid, u32 seq, int event, unsigned int flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr *nlh;
 	unsigned char *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
 	ifm = NLMSG_DATA(nlh);
 
 	ifm->ifa_family = AF_DECnet;
@@ -755,7 +755,7 @@ static void rtmsg_ifa(int event, struct dn_ifaddr *ifa)
 		netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS);
 		return;
 	}
-	if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+	if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL);
 		return;
@@ -790,7 +790,8 @@ static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 			if (dn_dev_fill_ifaddr(skb, ifa,
 					       NETLINK_CB(cb->skb).pid,
 					       cb->nlh->nlmsg_seq,
-					       RTM_NEWADDR) <= 0)
+					       RTM_NEWADDR,
+					       NLM_F_MULTI) <= 0)
 				goto done;
 		}
 	}
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 1e7b5c3ea21..2399fa8a3f8 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1465,7 +1465,8 @@ int dn_route_input(struct sk_buff *skb)
 	return dn_route_input_slow(skb);
 }
 
-static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait)
+static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+			   int event, int nowait, unsigned int flags)
 {
 	struct dn_route *rt = (struct dn_route *)skb->dst;
 	struct rtmsg *r;
@@ -1473,9 +1474,8 @@ static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int
 	unsigned char *b = skb->tail;
 	struct rta_cacheinfo ci;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
 	r = NLMSG_DATA(nlh);
-	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
 	r->rtm_family = AF_DECnet;
 	r->rtm_dst_len = 16;
 	r->rtm_src_len = 0;
@@ -1596,7 +1596,7 @@ int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
 
 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
 
-	err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0);
+	err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0);
 
 	if (err == 0)
 		goto out_free;
@@ -1644,7 +1644,8 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
 				continue;
 			skb->dst = dst_clone(&rt->u.dst);
 			if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
-					cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) {
+					cb->nlh->nlmsg_seq, RTM_NEWROUTE, 
+					1, NLM_F_MULTI) <= 0) {
 				dst_release(xchg(&skb->dst, NULL));
 				rcu_read_unlock_bh();
 				goto done;
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 597587d170d..1060de70bc0 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -342,14 +342,15 @@ static struct notifier_block dn_fib_rules_notifier = {
 	.notifier_call =	dn_fib_rules_event,
 };
 
-static int dn_fib_fill_rule(struct sk_buff *skb, struct dn_fib_rule *r, struct netlink_callback *cb)
+static int dn_fib_fill_rule(struct sk_buff *skb, struct dn_fib_rule *r,
+			    struct netlink_callback *cb, unsigned int flags)
 {
 	struct rtmsg *rtm;
 	struct nlmsghdr *nlh;
 	unsigned char *b = skb->tail;
 
 
-	nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm));
+	nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
 	rtm = NLMSG_DATA(nlh);
 	rtm->rtm_family = AF_DECnet;
 	rtm->rtm_dst_len = r->r_dst_len;
@@ -394,7 +395,7 @@ int dn_fib_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
 	for(r = dn_fib_rules, idx = 0; r; r = r->r_next, idx++) {
 		if (idx < s_idx)
 			continue;
-		if (dn_fib_fill_rule(skb, r, cb) < 0)
+		if (dn_fib_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
 			break;
 	}
 	read_unlock(&dn_fib_rules_lock);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index dad5603912b..28ba5777a25 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -270,13 +270,13 @@ static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern
 
 static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                         u8 tb_id, u8 type, u8 scope, void *dst, int dst_len,
-                        struct dn_fib_info *fi)
+                        struct dn_fib_info *fi, unsigned int flags)
 {
         struct rtmsg *rtm;
         struct nlmsghdr *nlh;
         unsigned char *b = skb->tail;
 
-        nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
+        nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
         rtm = NLMSG_DATA(nlh);
         rtm->rtm_family = AF_DECnet;
         rtm->rtm_dst_len = dst_len;
@@ -345,7 +345,7 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id,
 
         if (dn_fib_dump_info(skb, pid, nlh->nlmsg_seq, event, tb_id, 
                                 f->fn_type, f->fn_scope, &f->fn_key, z, 
-                                DN_FIB_INFO(f)) < 0) {
+                                DN_FIB_INFO(f), 0) < 0) {
                 kfree_skb(skb);
                 return;
         }
@@ -377,7 +377,7 @@ static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb,
 				tb->n, 
 				(f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type,
 				f->fn_scope, &f->fn_key, dz->dz_order, 
-				f->fn_info) < 0) {
+				f->fn_info, NLM_F_MULTI) < 0) {
 			cb->args[3] = i;
 			return -1;
 		}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 478a30179a5..fd47a1e890f 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1030,14 +1030,13 @@ static struct notifier_block ip_netdev_notifier = {
 };
 
 static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
-			    u32 pid, u32 seq, int event)
+			    u32 pid, u32 seq, int event, unsigned int flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr  *nlh;
 	unsigned char	 *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
-	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
 	ifm = NLMSG_DATA(nlh);
 	ifm->ifa_family = AF_INET;
 	ifm->ifa_prefixlen = ifa->ifa_prefixlen;
@@ -1090,7 +1089,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 				continue;
 			if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
 					     cb->nlh->nlmsg_seq,
-					     RTM_NEWADDR) <= 0) {
+					     RTM_NEWADDR, NLM_F_MULTI) <= 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -1113,7 +1112,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
 
 	if (!skb)
 		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
-	else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+	else if (inet_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
 	} else {
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 6506dcc01b4..b10d6bb5ef3 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -703,7 +703,8 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
 					  &f->fn_key,
 					  fz->fz_order,
 					  fa->fa_tos,
-					  fa->fa_info) < 0) {
+					  fa->fa_info,
+					  NLM_F_MULTI) < 0) {
 				cb->args[3] = i;
 				return -1;
 			}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index ac4485f75e9..b729d97cfa9 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -30,7 +30,8 @@ extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
 			struct kern_rta *rta, struct fib_info *fi);
 extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 			 u8 tb_id, u8 type, u8 scope, void *dst,
-			 int dst_len, u8 tos, struct fib_info *fi);
+			 int dst_len, u8 tos, struct fib_info *fi,
+			 unsigned int);
 extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
 		      int z, int tb_id,
 		      struct nlmsghdr *n, struct netlink_skb_parms *req);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 39d0aadb9a2..0b298bbc151 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -367,13 +367,14 @@ static struct notifier_block fib_rules_notifier = {
 
 static __inline__ int inet_fill_rule(struct sk_buff *skb,
 				     struct fib_rule *r,
-				     struct netlink_callback *cb)
+				     struct netlink_callback *cb,
+				     unsigned int flags)
 {
 	struct rtmsg *rtm;
 	struct nlmsghdr  *nlh;
 	unsigned char	 *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm));
+	nlh = NLMSG_NEW_ANSWER(skb, cb, RTM_NEWRULE, sizeof(*rtm), flags);
 	rtm = NLMSG_DATA(nlh);
 	rtm->rtm_family = AF_INET;
 	rtm->rtm_dst_len = r->r_dst_len;
@@ -422,7 +423,7 @@ int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
 	for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
 		if (idx < s_idx)
 			continue;
-		if (inet_fill_rule(skb, r, cb) < 0)
+		if (inet_fill_rule(skb, r, cb, NLM_F_MULTI) < 0)
 			break;
 	}
 	read_unlock(&fib_rules_lock);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 029362d6613..a9a44b4bef4 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -286,7 +286,7 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
 	if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
 			  fa->fa_type, fa->fa_scope, &key, z,
 			  fa->fa_tos,
-			  fa->fa_info) < 0) {
+			  fa->fa_info, 0) < 0) {
 		kfree_skb(skb);
 		return;
 	}
@@ -932,13 +932,13 @@ u32 __fib_res_prefsrc(struct fib_result *res)
 int
 fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 	      u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
-	      struct fib_info *fi)
+	      struct fib_info *fi, unsigned int flags)
 {
 	struct rtmsg *rtm;
 	struct nlmsghdr  *nlh;
 	unsigned char	 *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
 	rtm = NLMSG_DATA(nlh);
 	rtm->rtm_family = AF_INET;
 	rtm->rtm_dst_len = dst_len;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a682d28e247..f4d53c91986 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2581,7 +2581,7 @@ int ip_route_output_key(struct rtable **rp, struct flowi *flp)
 }
 
 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
-			int nowait)
+			int nowait, unsigned int flags)
 {
 	struct rtable *rt = (struct rtable*)skb->dst;
 	struct rtmsg *r;
@@ -2591,9 +2591,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 #ifdef CONFIG_IP_MROUTE
 	struct rtattr *eptr;
 #endif
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
 	r = NLMSG_DATA(nlh);
-	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
 	r->rtm_family	 = AF_INET;
 	r->rtm_dst_len	= 32;
 	r->rtm_src_len	= 0;
@@ -2744,7 +2743,7 @@ int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
 
 	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
-				RTM_NEWROUTE, 0);
+				RTM_NEWROUTE, 0, 0);
 	if (!err)
 		goto out_free;
 	if (err < 0) {
@@ -2781,8 +2780,8 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 				continue;
 			skb->dst = dst_clone(&rt->u.dst);
 			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
-					 cb->nlh->nlmsg_seq,
-					 RTM_NEWROUTE, 1) <= 0) {
+					 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 
+					 1, NLM_F_MULTI) <= 0) {
 				dst_release(xchg(&skb->dst, NULL));
 				rcu_read_unlock_bh();
 				goto done;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2720899d516..cdd19c54c03 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2622,15 +2622,14 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 }
 
 static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
-			     u32 pid, u32 seq, int event)
+			     u32 pid, u32 seq, int event, unsigned int flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr  *nlh;
 	struct ifa_cacheinfo ci;
 	unsigned char	 *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
-	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
 	ifm = NLMSG_DATA(nlh);
 	ifm->ifa_family = AF_INET6;
 	ifm->ifa_prefixlen = ifa->prefix_len;
@@ -2672,15 +2671,14 @@ rtattr_failure:
 }
 
 static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
-				u32 pid, u32 seq, int event)
+				u32 pid, u32 seq, int event, unsigned flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr  *nlh;
 	struct ifa_cacheinfo ci;
 	unsigned char	 *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
-	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
 	ifm = NLMSG_DATA(nlh);
 	ifm->ifa_family = AF_INET6;	
 	ifm->ifa_prefixlen = 128;
@@ -2709,15 +2707,14 @@ rtattr_failure:
 }
 
 static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
-				u32 pid, u32 seq, int event)
+				u32 pid, u32 seq, int event, unsigned int flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr  *nlh;
 	struct ifa_cacheinfo ci;
 	unsigned char	 *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
-	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*ifm), flags);
 	ifm = NLMSG_DATA(nlh);
 	ifm->ifa_family = AF_INET6;	
 	ifm->ifa_prefixlen = 128;
@@ -2786,7 +2783,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 					continue;
 				if ((err = inet6_fill_ifaddr(skb, ifa, 
 				    NETLINK_CB(cb->skb).pid, 
-				    cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0)
+				    cb->nlh->nlmsg_seq, RTM_NEWADDR,
+				    NLM_F_MULTI)) <= 0)
 					goto done;
 			}
 			/* temp addr */
@@ -2797,7 +2795,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 					continue;
 				if ((err = inet6_fill_ifaddr(skb, ifa, 
 				    NETLINK_CB(cb->skb).pid, 
-				    cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0) 
+				    cb->nlh->nlmsg_seq, RTM_NEWADDR,
+				    NLM_F_MULTI)) <= 0) 
 					goto done;
 			}
 #endif
@@ -2810,7 +2809,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 					continue;
 				if ((err = inet6_fill_ifmcaddr(skb, ifmca, 
 				    NETLINK_CB(cb->skb).pid, 
-				    cb->nlh->nlmsg_seq, RTM_GETMULTICAST)) <= 0)
+				    cb->nlh->nlmsg_seq, RTM_GETMULTICAST,
+				    NLM_F_MULTI)) <= 0)
 					goto done;
 			}
 			break;
@@ -2822,7 +2822,8 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
 					continue;
 				if ((err = inet6_fill_ifacaddr(skb, ifaca, 
 				    NETLINK_CB(cb->skb).pid, 
-				    cb->nlh->nlmsg_seq, RTM_GETANYCAST)) <= 0) 
+				    cb->nlh->nlmsg_seq, RTM_GETANYCAST,
+				    NLM_F_MULTI)) <= 0) 
 					goto done;
 			}
 			break;
@@ -2872,7 +2873,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS);
 		return;
 	}
-	if (inet6_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+	if (inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL);
 		return;
@@ -2907,7 +2908,7 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf,
 }
 
 static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, 
-			     u32 pid, u32 seq, int event)
+			     u32 pid, u32 seq, int event, unsigned int flags)
 {
 	struct net_device	*dev = idev->dev;
 	__s32			*array = NULL;
@@ -2918,8 +2919,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
 	__u32			mtu = dev->mtu;
 	struct ifla_cacheinfo	ci;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
-	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
 	r = NLMSG_DATA(nlh);
 	r->ifi_family = AF_INET6;
 	r->ifi_type = dev->type;
@@ -2986,7 +2986,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 		if ((idev = in6_dev_get(dev)) == NULL)
 			continue;
 		err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid, 
-				cb->nlh->nlmsg_seq, RTM_NEWLINK);
+				cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI);
 		in6_dev_put(idev);
 		if (err <= 0)
 			break;
@@ -3008,7 +3008,7 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS);
 		return;
 	}
-	if (inet6_fill_ifinfo(skb, idev, 0, 0, event) < 0) {
+	if (inet6_fill_ifinfo(skb, idev, 0, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL);
 		return;
@@ -3018,18 +3018,15 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
 }
 
 static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
-			struct prefix_info *pinfo, u32 pid, u32 seq, int event)
+			struct prefix_info *pinfo, u32 pid, u32 seq, 
+			int event, unsigned int flags)
 {
 	struct prefixmsg	*pmsg;
 	struct nlmsghdr 	*nlh;
 	unsigned char		*b = skb->tail;
 	struct prefix_cacheinfo	ci;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*pmsg));
-	
-	if (pid) 
-		nlh->nlmsg_flags |= NLM_F_MULTI;
-	
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags);
 	pmsg = NLMSG_DATA(nlh);
 	pmsg->prefix_family = AF_INET6;
 	pmsg->prefix_ifindex = idev->dev->ifindex;
@@ -3068,7 +3065,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS);
 		return;
 	}
-	if (inet6_fill_prefix(skb, idev, pinfo, 0, 0, event) < 0) {
+	if (inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL);
 		return;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3bf8a0254f8..1f5b226c357 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1570,7 +1570,8 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
 			 struct in6_addr *src,
 			 int iif,
 			 int type, u32 pid, u32 seq,
-			 struct nlmsghdr *in_nlh, int prefix)
+			 struct nlmsghdr *in_nlh, int prefix,
+			 unsigned int flags)
 {
 	struct rtmsg *rtm;
 	struct nlmsghdr  *nlh;
@@ -1588,7 +1589,7 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
 		pid = in_nlh->nlmsg_pid;
 	}
 
-	nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm));
+	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
 	rtm = NLMSG_DATA(nlh);
 	rtm->rtm_family = AF_INET6;
 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
@@ -1674,7 +1675,7 @@ static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 
 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
-		     NULL, prefix);
+		     NULL, prefix, NLM_F_MULTI);
 }
 
 static int fib6_dump_node(struct fib6_walker_t *w)
@@ -1822,7 +1823,7 @@ int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 			    &fl.fl6_dst, &fl.fl6_src,
 			    iif,
 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
-			    nlh->nlmsg_seq, nlh, 0);
+			    nlh->nlmsg_seq, nlh, 0, 0);
 	if (err < 0) {
 		err = -EMSGSIZE;
 		goto out_free;
@@ -1848,7 +1849,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh)
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
 		return;
 	}
-	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0) < 0) {
+	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
 		return;
-- 
cgit v1.2.3


From 0603eac0d6b77acac5924a2734228cbaf072f993 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 18 Jun 2005 22:54:36 -0700
Subject: [IPSEC]: Add XFRMA_SA/XFRMA_POLICY for delete notification

This patch changes the format of the XFRM_MSG_DELSA and
XFRM_MSG_DELPOLICY notification so that the main message
sent is of the same format as that received by the kernel
if the original message was via netlink.  This also means
that we won't lose the byid information carried in km_event.

Since this user interface is introduced by Jamal's patch
we can still afford to change it.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_user.c | 47 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index ffe1b217347..5ce8558eac9 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1168,7 +1168,7 @@ nlmsg_failure:
 
 static int inline xfrm_sa_len(struct xfrm_state *x)
 {
-	int l = NLMSG_LENGTH(sizeof(struct xfrm_usersa_info));
+	int l = 0;
 	if (x->aalg)
 		l += RTA_SPACE(sizeof(*x->aalg) + (x->aalg->alg_key_len+7)/8);
 	if (x->ealg)
@@ -1184,20 +1184,39 @@ static int inline xfrm_sa_len(struct xfrm_state *x)
 static int xfrm_notify_sa(struct xfrm_state *x, struct km_event *c)
 {
 	struct xfrm_usersa_info *p;
+	struct xfrm_usersa_id *id;
 	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
 	unsigned char *b;
 	int len = xfrm_sa_len(x);
+	int headlen;
+
+	headlen = sizeof(*p);
+	if (c->event == XFRM_MSG_DELSA) {
+		len += RTA_SPACE(headlen);
+		headlen = sizeof(*id);
+	}
+	len += NLMSG_SPACE(headlen);
 
 	skb = alloc_skb(len, GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
 	b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, sizeof(*p));
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, headlen);
 	nlh->nlmsg_flags = 0;
 
 	p = NLMSG_DATA(nlh);
+	if (c->event == XFRM_MSG_DELSA) {
+		id = NLMSG_DATA(nlh);
+		memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
+		id->spi = x->id.spi;
+		id->family = x->props.family;
+		id->proto = x->id.proto;
+
+		p = RTA_DATA(__RTA_PUT(skb, XFRMA_SA, sizeof(*p)));
+	}
+
 	copy_to_user_state(x, p);
 
 	if (x->aalg)
@@ -1398,20 +1417,39 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve
 static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *c)
 {
 	struct xfrm_userpolicy_info *p;
+	struct xfrm_userpolicy_id *id;
 	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
 	unsigned char *b;
 	int len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
-	len += NLMSG_SPACE(sizeof(struct xfrm_userpolicy_info));
+	int headlen;
+
+	headlen = sizeof(*p);
+	if (c->event == XFRM_MSG_DELPOLICY) {
+		len += RTA_SPACE(headlen);
+		headlen = sizeof(*id);
+	}
+	len += NLMSG_SPACE(headlen);
 
 	skb = alloc_skb(len, GFP_ATOMIC);
 	if (skb == NULL)
 		return -ENOMEM;
 	b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, sizeof(*p));
+	nlh = NLMSG_PUT(skb, c->pid, c->seq, c->event, headlen);
 
 	p = NLMSG_DATA(nlh);
+	if (c->event == XFRM_MSG_DELPOLICY) {
+		id = NLMSG_DATA(nlh);
+		memset(id, 0, sizeof(*id));
+		id->dir = dir;
+		if (c->data.byid)
+			id->index = xp->index;
+		else
+			memcpy(&id->sel, &xp->selector, sizeof(id->sel));
+
+		p = RTA_DATA(__RTA_PUT(skb, XFRMA_POLICY, sizeof(*p)));
+	}
 
 	nlh->nlmsg_flags = 0;
 
@@ -1424,6 +1462,7 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, struct km_event *
 	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_POLICY, GFP_ATOMIC);
 
 nlmsg_failure:
+rtattr_failure:
 	kfree_skb(skb);
 	return -1;
 }
-- 
cgit v1.2.3


From 58b82150da90681a4179db1bc94d412938e81b31 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:55:02 -0700
Subject: [DECNET]: Remove unnecessary initilization of unused variable entries

This patch was supposed to be part of the neighbour tables related
patchset but apparently got lost.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/decnet/dn_neigh.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index f6dfe96f45b..f32dba9e26f 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -101,7 +101,6 @@ struct neigh_table dn_neigh_table = {
 	.id =				"dn_neigh_cache",
 	.parms ={
 		.tbl =			&dn_neigh_table,
-		.entries =		0,
 		.base_reachable_time =	30 * HZ,
 		.retrans_time =	1 * HZ,
 		.gc_staletime =	60 * HZ,
-- 
cgit v1.2.3


From e431b8c004af6be03783dddea31b6e514118051d Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Sat, 18 Jun 2005 22:55:31 -0700
Subject: [NETLINK]: Explicit typing

This patch converts "unsigned flags" to use more explict types like u16
instead and incrementally introduces NLMSG_NEW().

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 10 +++++-----
 net/sched/act_api.c | 11 +++++------
 net/sched/cls_api.c |  5 ++---
 net/sched/sch_api.c | 10 ++++------
 4 files changed, 16 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index cdd19c54c03..4d4cb46f043 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -131,7 +131,7 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
 
 static int addrconf_ifdown(struct net_device *dev, int how);
 
-static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags);
+static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags);
 static void addrconf_dad_timer(unsigned long data);
 static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
 static void addrconf_rs_timer(unsigned long data);
@@ -492,7 +492,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
 
 static struct inet6_ifaddr *
 ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
-	      int scope, unsigned flags)
+	      int scope, u32 flags)
 {
 	struct inet6_ifaddr *ifa = NULL;
 	struct rt6_info *rt;
@@ -1320,7 +1320,7 @@ static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
 
 static void
 addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
-		      unsigned long expires, unsigned flags)
+		      unsigned long expires, u32 flags)
 {
 	struct in6_rtmsg rtmsg;
 
@@ -2229,7 +2229,7 @@ out:
 /*
  *	Duplicate Address Detection
  */
-static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags)
+static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
 {
 	struct inet6_dev *idev = ifp->idev;
 	struct net_device *dev = idev->dev;
@@ -2671,7 +2671,7 @@ rtattr_failure:
 }
 
 static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
-				u32 pid, u32 seq, int event, unsigned flags)
+				u32 pid, u32 seq, int event, u16 flags)
 {
 	struct ifaddrmsg *ifm;
 	struct nlmsghdr  *nlh;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 914c85ff8fe..9594206e603 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -428,15 +428,15 @@ errout:
 
 static int
 tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
-             unsigned flags, int event, int bind, int ref)
+             u16 flags, int event, int bind, int ref)
 {
 	struct tcamsg *t;
 	struct nlmsghdr *nlh;
 	unsigned char *b = skb->tail;
 	struct rtattr *x;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t));
-	nlh->nlmsg_flags = flags;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
+
 	t = NLMSG_DATA(nlh);
 	t->tca_family = AF_UNSPEC;
 	
@@ -669,7 +669,7 @@ err:
 }
 
 static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
-                          unsigned flags)
+                          u16 flags)
 {
 	struct tcamsg *t;
 	struct nlmsghdr *nlh;
@@ -684,8 +684,7 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
 
 	b = (unsigned char *)skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t));
-	nlh->nlmsg_flags = flags;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
 	t = NLMSG_DATA(nlh);
 	t->tca_family = AF_UNSPEC;
 	
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 56e66c3fe0f..1616bf5c962 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -322,14 +322,13 @@ errout:
 
 static int
 tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
-	      u32 pid, u32 seq, unsigned flags, int event)
+	      u32 pid, u32 seq, u16 flags, int event)
 {
 	struct tcmsg *tcm;
 	struct nlmsghdr  *nlh;
 	unsigned char	 *b = skb->tail;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
-	nlh->nlmsg_flags = flags;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
 	tcm = NLMSG_DATA(nlh);
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm_ifindex = tp->q->dev->ifindex;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 07977f8f267..97c1c75d5c7 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -760,15 +760,14 @@ graft:
 }
 
 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
-			 u32 pid, u32 seq, unsigned flags, int event)
+			 u32 pid, u32 seq, u16 flags, int event)
 {
 	struct tcmsg *tcm;
 	struct nlmsghdr  *nlh;
 	unsigned char	 *b = skb->tail;
 	struct gnet_dump d;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
-	nlh->nlmsg_flags = flags;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
 	tcm = NLMSG_DATA(nlh);
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm_ifindex = q->dev->ifindex;
@@ -997,7 +996,7 @@ out:
 
 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 			  unsigned long cl,
-			  u32 pid, u32 seq, unsigned flags, int event)
+			  u32 pid, u32 seq, u16 flags, int event)
 {
 	struct tcmsg *tcm;
 	struct nlmsghdr  *nlh;
@@ -1005,8 +1004,7 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
 	struct gnet_dump d;
 	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
 
-	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
-	nlh->nlmsg_flags = flags;
+	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
 	tcm = NLMSG_DATA(nlh);
 	tcm->tcm_family = AF_UNSPEC;
 	tcm->tcm_ifindex = q->dev->ifindex;
-- 
cgit v1.2.3


From 9ed19f339e12e731986de84134ac293cd15910a7 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Sat, 18 Jun 2005 22:55:51 -0700
Subject: [NETLINK]: Set correct pid for ioctl originating netlink events

This patch ensures that netlink events created as a result of programns
using ioctls (such as ifconfig, route etc) contains the correct PID of
those events.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c     | 2 +-
 net/ipv4/devinet.c       | 2 +-
 net/ipv4/fib_semantics.c | 4 ++--
 net/ipv6/addrconf.c      | 6 +++---
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 63bd8866518..e013d836a7a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -452,7 +452,7 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 	if (!skb)
 		return;
 
-	if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, change, 0) < 0) {
+	if (rtnetlink_fill_ifinfo(skb, dev, type, current->pid, 0, change, 0) < 0) {
 		kfree_skb(skb);
 		return;
 	}
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index fd47a1e890f..650dcb12d9a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1112,7 +1112,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
 
 	if (!skb)
 		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
-	else if (inet_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
+	else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
 	} else {
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a9a44b4bef4..c886b28ba9f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -276,7 +276,7 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
 	       struct nlmsghdr *n, struct netlink_skb_parms *req)
 {
 	struct sk_buff *skb;
-	u32 pid = req ? req->pid : 0;
+	u32 pid = req ? req->pid : n->nlmsg_pid;
 	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
 
 	skb = alloc_skb(size, GFP_KERNEL);
@@ -1035,7 +1035,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
 	}
 
 	nl->nlmsg_flags = NLM_F_REQUEST;
-	nl->nlmsg_pid = 0;
+	nl->nlmsg_pid = current->pid;
 	nl->nlmsg_seq = 0;
 	nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
 	if (cmd == SIOCDELRT) {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4d4cb46f043..47a30c3188e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2873,7 +2873,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS);
 		return;
 	}
-	if (inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0) < 0) {
+	if (inet6_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL);
 		return;
@@ -3008,7 +3008,7 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS);
 		return;
 	}
-	if (inet6_fill_ifinfo(skb, idev, 0, 0, event, 0) < 0) {
+	if (inet6_fill_ifinfo(skb, idev, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL);
 		return;
@@ -3065,7 +3065,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS);
 		return;
 	}
-	if (inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0) < 0) {
+	if (inet6_fill_prefix(skb, idev, pinfo, current->pid, 0, event, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL);
 		return;
-- 
cgit v1.2.3


From e0f9f8586a0b21fb3c7a4ead3804008d57dfdef7 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 18 Jun 2005 22:56:18 -0700
Subject: [IPV4/IPV6]: Replace spin_lock_irq with spin_lock_bh

In light of my recent patch to net/ipv4/udp.c that replaced the
spin_lock_irq calls on the receive queue lock with spin_lock_bh,
here is a similar patch for all other occurences of spin_lock_irq
on receive/error queue locks in IPv4 and IPv6.

In these stacks, we know that they can only be entered from user
or softirq context.  Therefore it's safe to disable BH only.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 6 +++---
 net/ipv4/raw.c         | 4 ++--
 net/ipv6/datagram.c    | 6 +++---
 net/ipv6/raw.c         | 8 ++++----
 net/ipv6/udp.c         | 4 ++--
 5 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 47012b93cad..f8b172f8981 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -360,14 +360,14 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
 	err = copied;
 
 	/* Reset and regenerate socket error */
-	spin_lock_irq(&sk->sk_error_queue.lock);
+	spin_lock_bh(&sk->sk_error_queue.lock);
 	sk->sk_err = 0;
 	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
 		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
-		spin_unlock_irq(&sk->sk_error_queue.lock);
+		spin_unlock_bh(&sk->sk_error_queue.lock);
 		sk->sk_error_report(sk);
 	} else
-		spin_unlock_irq(&sk->sk_error_queue.lock);
+		spin_unlock_bh(&sk->sk_error_queue.lock);
 
 out_free_skb:	
 	kfree_skb(skb);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5b1ec586bae..8c1512021ee 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -691,11 +691,11 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
 			struct sk_buff *skb;
 			int amount = 0;
 
-			spin_lock_irq(&sk->sk_receive_queue.lock);
+			spin_lock_bh(&sk->sk_receive_queue.lock);
 			skb = skb_peek(&sk->sk_receive_queue);
 			if (skb != NULL)
 				amount = skb->len;
-			spin_unlock_irq(&sk->sk_receive_queue.lock);
+			spin_unlock_bh(&sk->sk_receive_queue.lock);
 			return put_user(amount, (int __user *)arg);
 		}
 
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 65b9375df57..5229365cd8b 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -353,14 +353,14 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len)
 	err = copied;
 
 	/* Reset and regenerate socket error */
-	spin_lock_irq(&sk->sk_error_queue.lock);
+	spin_lock_bh(&sk->sk_error_queue.lock);
 	sk->sk_err = 0;
 	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
 		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
-		spin_unlock_irq(&sk->sk_error_queue.lock);
+		spin_unlock_bh(&sk->sk_error_queue.lock);
 		sk->sk_error_report(sk);
 	} else {
-		spin_unlock_irq(&sk->sk_error_queue.lock);
+		spin_unlock_bh(&sk->sk_error_queue.lock);
 	}
 
 out_free_skb:	
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 617645bc5ed..e2b848ec985 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -434,12 +434,12 @@ csum_copy_err:
 	/* Clear queue. */
 	if (flags&MSG_PEEK) {
 		int clear = 0;
-		spin_lock_irq(&sk->sk_receive_queue.lock);
+		spin_lock_bh(&sk->sk_receive_queue.lock);
 		if (skb == skb_peek(&sk->sk_receive_queue)) {
 			__skb_unlink(skb, &sk->sk_receive_queue);
 			clear = 1;
 		}
-		spin_unlock_irq(&sk->sk_receive_queue.lock);
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
 		if (clear)
 			kfree_skb(skb);
 	}
@@ -971,11 +971,11 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
 			struct sk_buff *skb;
 			int amount = 0;
 
-			spin_lock_irq(&sk->sk_receive_queue.lock);
+			spin_lock_bh(&sk->sk_receive_queue.lock);
 			skb = skb_peek(&sk->sk_receive_queue);
 			if (skb != NULL)
 				amount = skb->tail - skb->h.raw;
-			spin_unlock_irq(&sk->sk_receive_queue.lock);
+			spin_unlock_bh(&sk->sk_receive_queue.lock);
 			return put_user(amount, (int __user *)arg);
 		}
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e251d0ba4f3..eff050ac704 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -300,12 +300,12 @@ csum_copy_err:
 	/* Clear queue. */
 	if (flags&MSG_PEEK) {
 		int clear = 0;
-		spin_lock_irq(&sk->sk_receive_queue.lock);
+		spin_lock_bh(&sk->sk_receive_queue.lock);
 		if (skb == skb_peek(&sk->sk_receive_queue)) {
 			__skb_unlink(skb, &sk->sk_receive_queue);
 			clear = 1;
 		}
-		spin_unlock_irq(&sk->sk_receive_queue.lock);
+		spin_unlock_bh(&sk->sk_receive_queue.lock);
 		if (clear)
 			kfree_skb(skb);
 	}
-- 
cgit v1.2.3


From 1e061ab2e5aa50a84d68ca654773632f9c425bb6 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 18 Jun 2005 22:56:42 -0700
Subject: [SCTP]: Replace spin_lock_irqsave with spin_lock_bh

This patch replaces the spin_lock_irqsave call on the receive queue
lock in SCTP with spin_lock_bh.  Despite the proliferation of
spin_lock_irqsave calls in this stack, it is only entered from the
IPv4/IPv6 stack and user space.  That is, it is never entered from
hardirq context.

The call in question is only called from recvmsg which means that
IRQs aren't disabled.  Therefore it is safe to replace it with
spin_lock_bh.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 2a3c0e08a09..e6926cb1942 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4368,15 +4368,11 @@ static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
 		 *  However, this function was corrent in any case. 8)
 		 */
 		if (flags & MSG_PEEK) {
-			unsigned long cpu_flags;
-
-			sctp_spin_lock_irqsave(&sk->sk_receive_queue.lock,
-					       cpu_flags);
+			spin_lock_bh(&sk->sk_receive_queue.lock);
 			skb = skb_peek(&sk->sk_receive_queue);
 			if (skb)
 				atomic_inc(&skb->users);
-			sctp_spin_unlock_irqrestore(&sk->sk_receive_queue.lock,
-						    cpu_flags);
+			spin_unlock_bh(&sk->sk_receive_queue.lock);
 		} else {
 			skb = skb_dequeue(&sk->sk_receive_queue);
 		}
-- 
cgit v1.2.3


From aaae3013d186d71a01e1059c9633c4ec8729d891 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:57:42 -0700
Subject: [PKT_SCHED]: Transform fifo qdisc to use generic queue management
 interface

The simplicity of the fifo qdisc allows several qdisc operations to be
redirected to the relevant queue management function directly. Saves
a lot of code lines and gives the pfifo a byte based backlog.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fifo.c | 102 +++++++--------------------------------------------
 1 file changed, 14 insertions(+), 88 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 4888305c96d..83a4db4d3cd 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -47,61 +47,10 @@ bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 {
 	struct fifo_sched_data *q = qdisc_priv(sch);
 
-	if (sch->qstats.backlog + skb->len <= q->limit) {
-		__skb_queue_tail(&sch->q, skb);
-		sch->qstats.backlog += skb->len;
-		sch->bstats.bytes += skb->len;
-		sch->bstats.packets++;
-		return 0;
-	}
-	sch->qstats.drops++;
-#ifdef CONFIG_NET_CLS_POLICE
-	if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
-#endif
-		kfree_skb(skb);
-	return NET_XMIT_DROP;
-}
-
-static int
-bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
-{
-	__skb_queue_head(&sch->q, skb);
-	sch->qstats.backlog += skb->len;
-	sch->qstats.requeues++;
-	return 0;
-}
-
-static struct sk_buff *
-bfifo_dequeue(struct Qdisc* sch)
-{
-	struct sk_buff *skb;
-
-	skb = __skb_dequeue(&sch->q);
-	if (skb)
-		sch->qstats.backlog -= skb->len;
-	return skb;
-}
-
-static unsigned int 
-fifo_drop(struct Qdisc* sch)
-{
-	struct sk_buff *skb;
-
-	skb = __skb_dequeue_tail(&sch->q);
-	if (skb) {
-		unsigned int len = skb->len;
-		sch->qstats.backlog -= len;
-		kfree_skb(skb);
-		return len;
-	}
-	return 0;
-}
+	if (likely(sch->qstats.backlog + skb->len <= q->limit))
+		return qdisc_enqueue_tail(skb, sch);
 
-static void
-fifo_reset(struct Qdisc* sch)
-{
-	skb_queue_purge(&sch->q);
-	sch->qstats.backlog = 0;
+	return qdisc_reshape_fail(skb, sch);
 }
 
 static int
@@ -109,33 +58,10 @@ pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 {
 	struct fifo_sched_data *q = qdisc_priv(sch);
 
-	if (sch->q.qlen < q->limit) {
-		__skb_queue_tail(&sch->q, skb);
-		sch->bstats.bytes += skb->len;
-		sch->bstats.packets++;
-		return 0;
-	}
-	sch->qstats.drops++;
-#ifdef CONFIG_NET_CLS_POLICE
-	if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
-#endif
-		kfree_skb(skb);
-	return NET_XMIT_DROP;
-}
+	if (likely(skb_queue_len(&sch->q) < q->limit))
+		return qdisc_enqueue_tail(skb, sch);
 
-static int
-pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch)
-{
-	__skb_queue_head(&sch->q, skb);
-	sch->qstats.requeues++;
-	return 0;
-}
-
-
-static struct sk_buff *
-pfifo_dequeue(struct Qdisc* sch)
-{
-	return __skb_dequeue(&sch->q);
+	return qdisc_reshape_fail(skb, sch);
 }
 
 static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
@@ -180,11 +106,11 @@ struct Qdisc_ops pfifo_qdisc_ops = {
 	.id		=	"pfifo",
 	.priv_size	=	sizeof(struct fifo_sched_data),
 	.enqueue	=	pfifo_enqueue,
-	.dequeue	=	pfifo_dequeue,
-	.requeue	=	pfifo_requeue,
-	.drop		=	fifo_drop,
+	.dequeue	=	qdisc_dequeue_head,
+	.requeue	=	qdisc_requeue,
+	.drop		=	qdisc_queue_drop,
 	.init		=	fifo_init,
-	.reset		=	fifo_reset,
+	.reset		=	qdisc_reset_queue,
 	.destroy	=	NULL,
 	.change		=	fifo_init,
 	.dump		=	fifo_dump,
@@ -197,11 +123,11 @@ struct Qdisc_ops bfifo_qdisc_ops = {
 	.id		=	"bfifo",
 	.priv_size	=	sizeof(struct fifo_sched_data),
 	.enqueue	=	bfifo_enqueue,
-	.dequeue	=	bfifo_dequeue,
-	.requeue	=	bfifo_requeue,
-	.drop		=	fifo_drop,
+	.dequeue	=	qdisc_dequeue_head,
+	.requeue	=	qdisc_requeue,
+	.drop		=	qdisc_queue_drop,
 	.init		=	fifo_init,
-	.reset		=	fifo_reset,
+	.reset		=	qdisc_reset_queue,
 	.destroy	=	NULL,
 	.change		=	fifo_init,
 	.dump		=	fifo_dump,
-- 
cgit v1.2.3


From 6fc8e84f4cf8d623f98aebfd6996dc3848bcf964 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:58:00 -0700
Subject: [PKT_SCHED]: Cleanup fifo qdisc and remove unnecessary code

Removes the skb trimming code which is not needed since we never
touch the skb upon failure. Removes unnecessary includes,
initializers, and simplifies the code a bit.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_fifo.c | 50 ++++++++++++--------------------------------------
 1 file changed, 12 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index 83a4db4d3cd..033083bf0e7 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -11,39 +11,21 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/in.h>
 #include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/if_ether.h>
-#include <linux/inet.h>
 #include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/notifier.h>
-#include <net/ip.h>
-#include <net/route.h>
 #include <linux/skbuff.h>
-#include <net/sock.h>
 #include <net/pkt_sched.h>
 
 /* 1 band FIFO pseudo-"scheduler" */
 
 struct fifo_sched_data
 {
-	unsigned limit;
+	u32 limit;
 };
 
-static int
-bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 {
 	struct fifo_sched_data *q = qdisc_priv(sch);
 
@@ -53,8 +35,7 @@ bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 	return qdisc_reshape_fail(skb, sch);
 }
 
-static int
-pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 {
 	struct fifo_sched_data *q = qdisc_priv(sch);
 
@@ -69,40 +50,37 @@ static int fifo_init(struct Qdisc *sch, struct rtattr *opt)
 	struct fifo_sched_data *q = qdisc_priv(sch);
 
 	if (opt == NULL) {
-		unsigned int limit = sch->dev->tx_queue_len ? : 1;
+		u32 limit = sch->dev->tx_queue_len ? : 1;
 
 		if (sch->ops == &bfifo_qdisc_ops)
-			q->limit = limit*sch->dev->mtu;
-		else	
-			q->limit = limit;
+			limit *= sch->dev->mtu;
+
+		q->limit = limit;
 	} else {
 		struct tc_fifo_qopt *ctl = RTA_DATA(opt);
-		if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
+
+		if (RTA_PAYLOAD(opt) < sizeof(*ctl))
 			return -EINVAL;
+
 		q->limit = ctl->limit;
 	}
+
 	return 0;
 }
 
 static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct fifo_sched_data *q = qdisc_priv(sch);
-	unsigned char	 *b = skb->tail;
-	struct tc_fifo_qopt opt;
+	struct tc_fifo_qopt opt = { .limit = q->limit };
 
-	opt.limit = q->limit;
 	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
-
 	return skb->len;
 
 rtattr_failure:
-	skb_trim(skb, b - skb->data);
 	return -1;
 }
 
 struct Qdisc_ops pfifo_qdisc_ops = {
-	.next		=	NULL,
-	.cl_ops		=	NULL,
 	.id		=	"pfifo",
 	.priv_size	=	sizeof(struct fifo_sched_data),
 	.enqueue	=	pfifo_enqueue,
@@ -111,15 +89,12 @@ struct Qdisc_ops pfifo_qdisc_ops = {
 	.drop		=	qdisc_queue_drop,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
-	.destroy	=	NULL,
 	.change		=	fifo_init,
 	.dump		=	fifo_dump,
 	.owner		=	THIS_MODULE,
 };
 
 struct Qdisc_ops bfifo_qdisc_ops = {
-	.next		=	NULL,
-	.cl_ops		=	NULL,
 	.id		=	"bfifo",
 	.priv_size	=	sizeof(struct fifo_sched_data),
 	.enqueue	=	bfifo_enqueue,
@@ -128,7 +103,6 @@ struct Qdisc_ops bfifo_qdisc_ops = {
 	.drop		=	qdisc_queue_drop,
 	.init		=	fifo_init,
 	.reset		=	qdisc_reset_queue,
-	.destroy	=	NULL,
 	.change		=	fifo_init,
 	.dump		=	fifo_dump,
 	.owner		=	THIS_MODULE,
-- 
cgit v1.2.3


From 821d24ae741f83ef0754a98b4b8aef7231856543 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:58:15 -0700
Subject: [PKT_SCHED]: Transform pfifo_fast to use generic queue management
 interface

Gives pfifo_fast a byte based backlog.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 87e48a4e105..03cf001adb4 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -318,16 +318,12 @@ pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
 
 	list += prio2band[skb->priority&TC_PRIO_MAX];
 
-	if (list->qlen < qdisc->dev->tx_queue_len) {
-		__skb_queue_tail(list, skb);
+	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
 		qdisc->q.qlen++;
-		qdisc->bstats.bytes += skb->len;
-		qdisc->bstats.packets++;
-		return 0;
+		return __qdisc_enqueue_tail(skb, qdisc, list);
 	}
-	qdisc->qstats.drops++;
-	kfree_skb(skb);
-	return NET_XMIT_DROP;
+
+	return qdisc_drop(skb, qdisc);
 }
 
 static struct sk_buff *
@@ -335,10 +331,9 @@ pfifo_fast_dequeue(struct Qdisc* qdisc)
 {
 	int prio;
 	struct sk_buff_head *list = qdisc_priv(qdisc);
-	struct sk_buff *skb;
 
 	for (prio = 0; prio < 3; prio++, list++) {
-		skb = __skb_dequeue(list);
+		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
 		if (skb) {
 			qdisc->q.qlen--;
 			return skb;
@@ -354,10 +349,8 @@ pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
 
 	list += prio2band[skb->priority&TC_PRIO_MAX];
 
-	__skb_queue_head(list, skb);
 	qdisc->q.qlen++;
-	qdisc->qstats.requeues++;
-	return 0;
+	return __qdisc_requeue(skb, qdisc, list);
 }
 
 static void
@@ -367,7 +360,9 @@ pfifo_fast_reset(struct Qdisc* qdisc)
 	struct sk_buff_head *list = qdisc_priv(qdisc);
 
 	for (prio=0; prio < 3; prio++)
-		skb_queue_purge(list+prio);
+		__qdisc_reset_queue(qdisc, list + prio);
+
+	qdisc->qstats.backlog = 0;
 	qdisc->q.qlen = 0;
 }
 
-- 
cgit v1.2.3


From 321090e7a468ab33917a0af839a3ae923b1c1bc5 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:58:35 -0700
Subject: [PKT_SCHED]: Add and use prio2list() in the pfifo_fast qdisc

prio2list() returns the relevant sk_buff_head for the
band specified by the priority for a given skb.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 03cf001adb4..a08a98e7b94 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -311,12 +311,17 @@ static const u8 prio2band[TC_PRIO_MAX+1] =
    generic prio+fifo combination.
  */
 
-static int
-pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
+					     struct Qdisc *qdisc)
 {
 	struct sk_buff_head *list = qdisc_priv(qdisc);
+	return list + prio2band[skb->priority & TC_PRIO_MAX];
+}
 
-	list += prio2band[skb->priority&TC_PRIO_MAX];
+static int
+pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+{
+	struct sk_buff_head *list = prio2list(skb, qdisc);
 
 	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
 		qdisc->q.qlen++;
@@ -345,12 +350,8 @@ pfifo_fast_dequeue(struct Qdisc* qdisc)
 static int
 pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
 {
-	struct sk_buff_head *list = qdisc_priv(qdisc);
-
-	list += prio2band[skb->priority&TC_PRIO_MAX];
-
 	qdisc->q.qlen++;
-	return __qdisc_requeue(skb, qdisc, list);
+	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
 }
 
 static void
-- 
cgit v1.2.3


From f87a9c3ddf08c10d8427bcedf3f53098113136d0 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:58:53 -0700
Subject: [PKT_SCHED]: Cleanup pfifo_fast qdisc and remove unnecessary code

Removes the skb trimming code which is not needed since we never
touch the skb upon failure. Removes unnecessary initializers,
and simplifies the code a bit.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a08a98e7b94..8bb7c47191e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -311,6 +311,8 @@ static const u8 prio2band[TC_PRIO_MAX+1] =
    generic prio+fifo combination.
  */
 
+#define PFIFO_FAST_BANDS 3
+
 static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
 					     struct Qdisc *qdisc)
 {
@@ -318,8 +320,7 @@ static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
 	return list + prio2band[skb->priority & TC_PRIO_MAX];
 }
 
-static int
-pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
+static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
 {
 	struct sk_buff_head *list = prio2list(skb, qdisc);
 
@@ -331,36 +332,34 @@ pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
 	return qdisc_drop(skb, qdisc);
 }
 
-static struct sk_buff *
-pfifo_fast_dequeue(struct Qdisc* qdisc)
+static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
 {
 	int prio;
 	struct sk_buff_head *list = qdisc_priv(qdisc);
 
-	for (prio = 0; prio < 3; prio++, list++) {
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++, list++) {
 		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
 		if (skb) {
 			qdisc->q.qlen--;
 			return skb;
 		}
 	}
+
 	return NULL;
 }
 
-static int
-pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
+static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
 {
 	qdisc->q.qlen++;
 	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
 }
 
-static void
-pfifo_fast_reset(struct Qdisc* qdisc)
+static void pfifo_fast_reset(struct Qdisc* qdisc)
 {
 	int prio;
 	struct sk_buff_head *list = qdisc_priv(qdisc);
 
-	for (prio=0; prio < 3; prio++)
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
 		__qdisc_reset_queue(qdisc, list + prio);
 
 	qdisc->qstats.backlog = 0;
@@ -369,35 +368,30 @@ pfifo_fast_reset(struct Qdisc* qdisc)
 
 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
 {
-	unsigned char	 *b = skb->tail;
-	struct tc_prio_qopt opt;
+	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
 
-	opt.bands = 3; 
 	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
 	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
 	return skb->len;
 
 rtattr_failure:
-	skb_trim(skb, b - skb->data);
 	return -1;
 }
 
 static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
 {
-	int i;
+	int prio;
 	struct sk_buff_head *list = qdisc_priv(qdisc);
 
-	for (i=0; i<3; i++)
-		skb_queue_head_init(list+i);
+	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
+		skb_queue_head_init(list + prio);
 
 	return 0;
 }
 
 static struct Qdisc_ops pfifo_fast_ops = {
-	.next		=	NULL,
-	.cl_ops		=	NULL,
 	.id		=	"pfifo_fast",
-	.priv_size	=	3 * sizeof(struct sk_buff_head),
+	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
 	.enqueue	=	pfifo_fast_enqueue,
 	.dequeue	=	pfifo_fast_dequeue,
 	.requeue	=	pfifo_fast_requeue,
-- 
cgit v1.2.3


From 94df109a8c802263837baccc1a3eeab9ab9e88db Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Sat, 18 Jun 2005 22:59:08 -0700
Subject: [PKT_SCHED]: noop/noqueue qdisc style cleanups

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_generic.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 8bb7c47191e..7683b34dc6a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -243,31 +243,27 @@ static void dev_watchdog_down(struct net_device *dev)
    cheaper.
  */
 
-static int
-noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
+static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
 {
 	kfree_skb(skb);
 	return NET_XMIT_CN;
 }
 
-static struct sk_buff *
-noop_dequeue(struct Qdisc * qdisc)
+static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
 {
 	return NULL;
 }
 
-static int
-noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
+static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
 {
 	if (net_ratelimit())
-		printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name);
+		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
+		       skb->dev->name);
 	kfree_skb(skb);
 	return NET_XMIT_CN;
 }
 
 struct Qdisc_ops noop_qdisc_ops = {
-	.next		=	NULL,
-	.cl_ops		=	NULL,
 	.id		=	"noop",
 	.priv_size	=	0,
 	.enqueue	=	noop_enqueue,
@@ -285,8 +281,6 @@ struct Qdisc noop_qdisc = {
 };
 
 static struct Qdisc_ops noqueue_qdisc_ops = {
-	.next		=	NULL,
-	.cl_ops		=	NULL,
 	.id		=	"noqueue",
 	.priv_size	=	0,
 	.enqueue	=	noop_enqueue,
-- 
cgit v1.2.3


From 5418c6926fcb0e5a324cd5bc1106fc0941db7aae Mon Sep 17 00:00:00 2001
From: Jesper Juhl <juhl-lkml@dif.dk>
Date: Sat, 18 Jun 2005 22:59:45 -0700
Subject: [IPV4]: [1/4] signed vs unsigned cleanup in net/ipv4/raw.c

This patch silences these two gcc -W warnings in net/ipv4/raw.c :

net/ipv4/raw.c:517: warning: signed and unsigned type in conditional expression
net/ipv4/raw.c:613: warning: signed and unsigned type in conditional expression

It doesn't change the behaviour of the code, simply writes the conditional
expression with plain 'if()' syntax instead of '? :' , but since this
breaks it into sepperate statements gcc no longer complains about having
both a signed and unsigned value in the same conditional expression.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8c1512021ee..8d17dd3542d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -514,7 +514,10 @@ done:
 		kfree(ipc.opt);
 	ip_rt_put(rt);
 
-out:	return err < 0 ? err : len;
+out:
+	if (err < 0)
+		return err;
+	return len;
 
 do_confirm:
 	dst_confirm(&rt->u.dst);
@@ -610,7 +613,10 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		copied = skb->len;
 done:
 	skb_free_datagram(sk, skb);
-out:	return err ? err : copied;
+out:
+	if (err)
+		return err;
+	return copied;
 }
 
 static int raw_init(struct sock *sk)
-- 
cgit v1.2.3


From 926d4b8122fb324de294a09a7d96d8af7cfc6861 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <juhl-lkml@dif.dk>
Date: Sat, 18 Jun 2005 23:00:00 -0700
Subject: [IPV4]: [2/4] signed vs unsigned cleanup in net/ipv4/raw.c

This patch gets rid of the following gcc -W warning in net/ipv4/raw.c :

net/ipv4/raw.c:387: warning: comparison of unsigned expression < 0 is always false

Since 'len' is of type size_t it is unsigned and can thus never be <0, and
since this is obvious from the function declaration just a few lines above
I think it's ok to remove the pointless check for len<0.


Signed-off-by: Jesper Juhl <juhl-lkml@dif.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8d17dd3542d..1fcb23d3351 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -384,7 +384,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	int err;
 
 	err = -EMSGSIZE;
-	if (len < 0 || len > 0xFFFF)
+	if (len > 0xFFFF)
 		goto out;
 
 	/*
-- 
cgit v1.2.3


From 93765d8a435fa021c4b7cd0521b7959239d7158a Mon Sep 17 00:00:00 2001
From: Jesper Juhl <juhl-lkml@dif.dk>
Date: Sat, 18 Jun 2005 23:00:15 -0700
Subject: [IPV4]: [3/4] signed vs unsigned cleanup in net/ipv4/raw.c

This patch changes the type of the local variable 'i' in
raw_probe_proto_opt() from 'int' to 'unsigned int'. The only use of 'i' in
this function is as a counter in a for() loop and subsequent index into
the msg->msg_iov[] array.
Since 'i' is compared in a loop to the unsigned variable msg->msg_iovlen
gcc -W generates this warning :

net/ipv4/raw.c:340: warning: comparison between signed and unsigned

Changing 'i' to unsigned silences this warning and is safe since the array
index can never be negative anyway, so unsigned int is the logical type to
use for 'i' and also enables a larger msg_iov[] array (but I don't know if
that will ever matter).

Signed-off-by: Jesper Juhl <juhl-lkml@dif.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 1fcb23d3351..085a08ea5a5 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -332,7 +332,7 @@ static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
 	u8 __user *type = NULL;
 	u8 __user *code = NULL;
 	int probed = 0;
-	int i;
+	unsigned int i;
 
 	if (!msg->msg_iov)
 		return;
-- 
cgit v1.2.3


From f7d7fc0322c1770fe7ee836ca2732c2f88e2e1a4 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <juhl-lkml@dif.dk>
Date: Sat, 18 Jun 2005 23:00:34 -0700
Subject: [IPV4]: [4/4] signed vs unsigned cleanup in net/ipv4/raw.c

This patch changes the type of the third parameter 'length' of the
raw_send_hdrinc() function from 'int' to 'size_t'.
This makes sense since this function is only ever called from one
location, and the value passed as the third parameter in that location is
itself of type size_t, so this makes the recieving functions parameter
type match. Also, inside raw_send_hdrinc() the 'length' variable is
used in comparisons with unsigned values and passed as parameter to
functions expecting unsigned values (it's used in a single comparison with
a signed value, but that one can never actually be negative so the patch
also casts that one to size_t to stop gcc worrying, and it is passed in a
single instance to memcpy_fromiovecend() which expects a signed int, but
as far as I can see that's not a problem since the value of 'length'
shouldn't ever exceed the value of a signed int).

Signed-off-by: Jesper Juhl <juhl-lkml@dif.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 085a08ea5a5..d1835b1bc8c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -259,7 +259,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static int raw_send_hdrinc(struct sock *sk, void *from, int length,
+static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
 			struct rtable *rt, 
 			unsigned int flags)
 {
@@ -298,7 +298,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, int length,
 		goto error_fault;
 
 	/* We don't modify invalid header */
-	if (length >= sizeof(*iph) && iph->ihl * 4 <= length) {
+	if (length >= sizeof(*iph) && iph->ihl * 4U <= length) {
 		if (!iph->saddr)
 			iph->saddr = rt->rt_src;
 		iph->check   = 0;
-- 
cgit v1.2.3


From 7df551254add79a445d2e47e8f849cef8fee6e38 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 18 Jun 2005 23:01:10 -0700
Subject: [TCP]: Fix sysctl_tcp_low_latency

When enabled, this should disable UCOPY prequeue'ing altogether,
but it does not due to a missing test.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3a4c52e77e0..674bbd8cfd3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1326,7 +1326,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 		cleanup_rbuf(sk, copied);
 
-		if (tp->ucopy.task == user_recv) {
+		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
 			/* Install new reader */
 			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
 				user_recv = current;
-- 
cgit v1.2.3


From 3f7a87d2fa9b42f7aade43914f060df68cc89cc7 Mon Sep 17 00:00:00 2001
From: Frank Filz <ffilzlnx@us.ibm.com>
Date: Mon, 20 Jun 2005 13:14:57 -0700
Subject: [SCTP] sctp_connectx() API support

Implements sctp_connectx() as defined in the SCTP sockets API draft by
tunneling the request through a setsockopt().

Signed-off-by: Frank Filz <ffilzlnx@us.ibm.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/associola.c     | 151 +++++++++++++-----
 net/sctp/endpointola.c   |   1 -
 net/sctp/input.c         |   2 +-
 net/sctp/outqueue.c      |  11 +-
 net/sctp/sm_make_chunk.c |  20 ++-
 net/sctp/sm_sideeffect.c | 105 +++++++++---
 net/sctp/sm_statefuns.c  | 148 +++++++++++------
 net/sctp/sm_statetable.c |   6 +-
 net/sctp/socket.c        | 405 +++++++++++++++++++++++++++++++++--------------
 net/sctp/transport.c     |   4 +-
 10 files changed, 615 insertions(+), 238 deletions(-)

(limited to 'net')

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 663843d97a9..7ae6aa772da 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -191,10 +191,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	asoc->last_cwr_tsn = asoc->ctsn_ack_point;
 	asoc->unack_data = 0;
 
-	SCTP_DEBUG_PRINTK("myctsnap for %s INIT as 0x%x.\n",
-			  asoc->ep->debug_name,
-			  asoc->ctsn_ack_point);
-
 	/* ADDIP Section 4.1 Asconf Chunk Procedures
 	 *
 	 * When an endpoint has an ASCONF signaled change to be sent to the
@@ -211,6 +207,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 
 	/* Make an empty list of remote transport addresses.  */
 	INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
+	asoc->peer.transport_count = 0;
 
 	/* RFC 2960 5.1 Normal Establishment of an Association
 	 *
@@ -288,6 +285,7 @@ struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
 
 	asoc->base.malloced = 1;
 	SCTP_DBG_OBJCNT_INC(assoc);
+	SCTP_DEBUG_PRINTK("Created asoc %p\n", asoc);
 
 	return asoc;
 
@@ -356,6 +354,8 @@ void sctp_association_free(struct sctp_association *asoc)
 		sctp_transport_free(transport);
 	}
 
+	asoc->peer.transport_count = 0;
+
 	/* Free any cached ASCONF_ACK chunk. */
 	if (asoc->addip_last_asconf_ack)
 		sctp_chunk_free(asoc->addip_last_asconf_ack);
@@ -400,7 +400,7 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
 	/* If the primary path is changing, assume that the
 	 * user wants to use this new path.
 	 */
-	if (transport->active)
+	if (transport->state != SCTP_INACTIVE)
 		asoc->peer.active_path = transport;
 
 	/*
@@ -428,10 +428,58 @@ void sctp_assoc_set_primary(struct sctp_association *asoc,
 	transport->cacc.next_tsn_at_change = asoc->next_tsn;
 }
 
+/* Remove a transport from an association.  */
+void sctp_assoc_rm_peer(struct sctp_association *asoc,
+			struct sctp_transport *peer)
+{
+	struct list_head	*pos;
+	struct sctp_transport	*transport;
+
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_rm_peer:association %p addr: ",
+				 " port: %d\n",
+				 asoc,
+				 (&peer->ipaddr),
+				 peer->ipaddr.v4.sin_port);
+
+	/* If we are to remove the current retran_path, update it
+	 * to the next peer before removing this peer from the list.
+	 */
+	if (asoc->peer.retran_path == peer)
+		sctp_assoc_update_retran_path(asoc);
+
+	/* Remove this peer from the list. */
+	list_del(&peer->transports);
+
+	/* Get the first transport of asoc. */
+	pos = asoc->peer.transport_addr_list.next;
+	transport = list_entry(pos, struct sctp_transport, transports);
+
+	/* Update any entries that match the peer to be deleted. */
+	if (asoc->peer.primary_path == peer)
+		sctp_assoc_set_primary(asoc, transport);
+	if (asoc->peer.active_path == peer)
+		asoc->peer.active_path = transport;
+	if (asoc->peer.last_data_from == peer)
+		asoc->peer.last_data_from = transport;
+
+	/* If we remove the transport an INIT was last sent to, set it to
+	 * NULL. Combined with the update of the retran path above, this
+	 * will cause the next INIT to be sent to the next available
+	 * transport, maintaining the cycle.
+	 */
+	if (asoc->init_last_sent_to == peer)
+		asoc->init_last_sent_to = NULL;
+
+	asoc->peer.transport_count--;
+
+	sctp_transport_free(peer);
+}
+
 /* Add a transport address to an association.  */
 struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 					   const union sctp_addr *addr,
-					   int gfp)
+					   const int gfp,
+					   const int peer_state)
 {
 	struct sctp_transport *peer;
 	struct sctp_sock *sp;
@@ -442,14 +490,25 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* AF_INET and AF_INET6 share common port field. */
 	port = addr->v4.sin_port;
 
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_add_peer:association %p addr: ",
+				 " port: %d state:%s\n",
+				 asoc,
+				 addr,
+				 addr->v4.sin_port,
+				 peer_state == SCTP_UNKNOWN?"UNKNOWN":"ACTIVE");
+
 	/* Set the port if it has not been set yet.  */
 	if (0 == asoc->peer.port)
 		asoc->peer.port = port;
 
 	/* Check to see if this is a duplicate. */
 	peer = sctp_assoc_lookup_paddr(asoc, addr);
-	if (peer)
+	if (peer) {
+		if (peer_state == SCTP_ACTIVE &&
+		    peer->state == SCTP_UNKNOWN)
+		     peer->state = SCTP_ACTIVE;
 		return peer;
+	}
 
 	peer = sctp_transport_new(addr, gfp);
 	if (!peer)
@@ -516,8 +575,12 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the transport's RTO.initial value */
 	peer->rto = asoc->rto_initial;
 
+	/* Set the peer's active state. */
+	peer->state = peer_state;
+
 	/* Attach the remote transport to our asoc.  */
 	list_add_tail(&peer->transports, &asoc->peer.transport_addr_list);
+	asoc->peer.transport_count++;
 
 	/* If we do not yet have a primary path, set one.  */
 	if (!asoc->peer.primary_path) {
@@ -525,8 +588,9 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 		asoc->peer.retran_path = peer;
 	}
 
-	if (asoc->peer.active_path == asoc->peer.retran_path)
+	if (asoc->peer.active_path == asoc->peer.retran_path) {
 		asoc->peer.retran_path = peer;
+	}
 
 	return peer;
 }
@@ -537,37 +601,16 @@ void sctp_assoc_del_peer(struct sctp_association *asoc,
 {
 	struct list_head	*pos;
 	struct list_head	*temp;
-	struct sctp_transport	*peer = NULL;
 	struct sctp_transport	*transport;
 
 	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
 		transport = list_entry(pos, struct sctp_transport, transports);
 		if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) {
-			peer = transport;
-			list_del(pos);
+			/* Do book keeping for removing the peer and free it. */
+			sctp_assoc_rm_peer(asoc, transport);
 			break;
 		}
 	}
-
-	/* The address we want delete is not in the association. */
-	if (!peer)
-		return;
-
-	/* Get the first transport of asoc. */ 
-	pos = asoc->peer.transport_addr_list.next;
-	transport = list_entry(pos, struct sctp_transport, transports);
-
-	/* Update any entries that match the peer to be deleted. */  
-	if (asoc->peer.primary_path == peer)
-		sctp_assoc_set_primary(asoc, transport);
-	if (asoc->peer.active_path == peer)
-		asoc->peer.active_path = transport;
-	if (asoc->peer.retran_path == peer)
-		asoc->peer.retran_path = transport;
-	if (asoc->peer.last_data_from == peer)
-		asoc->peer.last_data_from = transport;
-
-	sctp_transport_free(peer);
 }
 
 /* Lookup a transport by address. */
@@ -608,12 +651,12 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	/* Record the transition on the transport.  */
 	switch (command) {
 	case SCTP_TRANSPORT_UP:
-		transport->active = SCTP_ACTIVE;
+		transport->state = SCTP_ACTIVE;
 		spc_state = SCTP_ADDR_AVAILABLE;
 		break;
 
 	case SCTP_TRANSPORT_DOWN:
-		transport->active = SCTP_INACTIVE;
+		transport->state = SCTP_INACTIVE;
 		spc_state = SCTP_ADDR_UNREACHABLE;
 		break;
 
@@ -643,7 +686,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	list_for_each(pos, &asoc->peer.transport_addr_list) {
 		t = list_entry(pos, struct sctp_transport, transports);
 
-		if (!t->active)
+		if (t->state == SCTP_INACTIVE)
 			continue;
 		if (!first || t->last_time_heard > first->last_time_heard) {
 			second = first;
@@ -663,7 +706,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc,
 	 * [If the primary is active but not most recent, bump the most
 	 * recently used transport.]
 	 */
-	if (asoc->peer.primary_path->active &&
+	if (asoc->peer.primary_path->state != SCTP_INACTIVE &&
 	    first != asoc->peer.primary_path) {
 		second = first;
 		first = asoc->peer.primary_path;
@@ -958,7 +1001,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
 					   transports);
 			if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr))
 				sctp_assoc_add_peer(asoc, &trans->ipaddr,
-						    GFP_ATOMIC);
+						    GFP_ATOMIC, SCTP_ACTIVE);
 		}
 
 		asoc->ctsn_ack_point = asoc->next_tsn - 1;
@@ -998,7 +1041,7 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
 
 		/* Try to find an active transport. */
 
-		if (t->active) {
+		if (t->state != SCTP_INACTIVE) {
 			break;
 		} else {
 			/* Keep track of the next transport in case
@@ -1019,6 +1062,40 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc)
 	}
 
 	asoc->peer.retran_path = t;
+
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
+				 " %p addr: ",
+				 " port: %d\n",
+				 asoc,
+				 (&t->ipaddr),
+				 t->ipaddr.v4.sin_port);
+}
+
+/* Choose the transport for sending a INIT packet.  */
+struct sctp_transport *sctp_assoc_choose_init_transport(
+	struct sctp_association *asoc)
+{
+	struct sctp_transport *t;
+
+	/* Use the retran path. If the last INIT was sent over the
+	 * retran path, update the retran path and use it.
+	 */
+	if (!asoc->init_last_sent_to) {
+		t = asoc->peer.active_path;
+	} else {
+		if (asoc->init_last_sent_to == asoc->peer.retran_path)
+			sctp_assoc_update_retran_path(asoc);
+		t = asoc->peer.retran_path;
+	}
+
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_assoc_update_retran_path:association"
+				 " %p addr: ",
+				 " port: %d\n",
+				 asoc,
+				 (&t->ipaddr),
+				 t->ipaddr.v4.sin_port);
+
+	return t;
 }
 
 /* Choose the transport for sending a SHUTDOWN packet.  */
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 334f61773e6..2ec0320fac3 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -134,7 +134,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 	ep->last_key = ep->current_key = 0;
 	ep->key_changed_at = jiffies;
 
-	ep->debug_name = "unnamedEndpoint";
 	return ep;
 }
 
diff --git a/net/sctp/input.c b/net/sctp/input.c
index fffc880a646..339f7acfdb6 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -353,7 +353,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
 
 	sctp_do_sm(SCTP_EVENT_T_OTHER,
 		   SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
-		   asoc->state, asoc->ep, asoc, NULL,
+		   asoc->state, asoc->ep, asoc, t,
 		   GFP_ATOMIC);
 
 }
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 1b2d4adc4dd..4eb81a1407b 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -682,9 +682,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 
 		if (!new_transport) {
 			new_transport = asoc->peer.active_path;
-		} else if (!new_transport->active) {
-			/* If the chunk is Heartbeat or Heartbeat Ack, 
-			 * send it to chunk->transport, even if it's 
+		} else if (new_transport->state == SCTP_INACTIVE) {
+			/* If the chunk is Heartbeat or Heartbeat Ack,
+			 * send it to chunk->transport, even if it's
 			 * inactive.
 			 *
 			 * 3.3.6 Heartbeat Acknowledgement:
@@ -840,7 +840,8 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
 			 * Otherwise, we want to use the active path.
 			 */
 			new_transport = chunk->transport;
-			if (!new_transport || !new_transport->active)
+			if (!new_transport ||
+			    new_transport->state == SCTP_INACTIVE)
 				new_transport = asoc->peer.active_path;
 
 			/* Change packets if necessary.  */
@@ -1454,7 +1455,7 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 			/* Mark the destination transport address as
 			 * active if it is not so marked.
 			 */
-			if (!transport->active) {
+			if (transport->state == SCTP_INACTIVE) {
 				sctp_assoc_control_transport(
 					transport->asoc,
 					transport,
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 33ac8bf47b0..5baed9bb7de 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1830,7 +1830,7 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
 	 * be a a better choice than any of the embedded addresses.
 	 */
 	if (peer_addr)
-		if(!sctp_assoc_add_peer(asoc, peer_addr, gfp))
+		if(!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
 			goto nomem;
 
 	/* Process the initialization parameters.  */
@@ -1841,6 +1841,14 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
                         goto clean_up;
 	}
 
+	/* Walk list of transports, removing transports in the UNKNOWN state. */
+	list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
+		transport = list_entry(pos, struct sctp_transport, transports);
+		if (transport->state == SCTP_UNKNOWN) {
+			sctp_assoc_rm_peer(asoc, transport);
+		}
+	}
+
 	/* The fixed INIT headers are always in network byte
 	 * order.
 	 */
@@ -1906,7 +1914,8 @@ int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
 	 * stream sequence number shall be set to 0.
 	 */
 
-	/* Allocate storage for the negotiated streams if it is not a temporary 	 * association.
+	/* Allocate storage for the negotiated streams if it is not a temporary
+ 	 * association.
 	 */
 	if (!asoc->temp) {
 		int assoc_id;
@@ -1952,6 +1961,9 @@ clean_up:
 		list_del_init(pos);
 		sctp_transport_free(transport);
 	}
+
+	asoc->peer.transport_count = 0;
+
 nomem:
 	return 0;
 }
@@ -1995,7 +2007,7 @@ static int sctp_process_param(struct sctp_association *asoc,
 		af->from_addr_param(&addr, param.addr, asoc->peer.port, 0);
 		scope = sctp_scope(peer_addr);
 		if (sctp_in_scope(&addr, scope))
-			if (!sctp_assoc_add_peer(asoc, &addr, gfp))
+			if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_ACTIVE))
 				return 0;
 		break;
 
@@ -2396,7 +2408,7 @@ static __u16 sctp_process_asconf_param(struct sctp_association *asoc,
 	 	 * Due to Resource Shortage'.
 	 	 */
 
-		peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC);
+		peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_ACTIVE);
 		if (!peer)
 			return SCTP_ERROR_RSRC_LOW;
 
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index f65fa441952..778639db125 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -414,11 +414,13 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
 	 */
 	asoc->overall_error_count++;
 
-	if (transport->active &&
+	if (transport->state != SCTP_INACTIVE &&
 	    (transport->error_count++ >= transport->max_retrans)) {
-		SCTP_DEBUG_PRINTK("transport_strike: transport "
-				  "IP:%d.%d.%d.%d failed.\n",
-				  NIPQUAD(transport->ipaddr.v4.sin_addr));
+		SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
+					 " transport IP: port:%d failed.\n",
+					 asoc,
+					 (&transport->ipaddr),
+					 transport->ipaddr.v4.sin_port);
 		sctp_assoc_control_transport(asoc, transport,
 					     SCTP_TRANSPORT_DOWN,
 					     SCTP_FAILED_THRESHOLD);
@@ -593,7 +595,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
 	/* Mark the destination transport address as active if it is not so
 	 * marked.
 	 */
-	if (!t->active)
+	if (t->state == SCTP_INACTIVE)
 		sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
 					     SCTP_HEARTBEAT_SUCCESS);
 
@@ -665,8 +667,11 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
 
 	asoc->state = state;
 
+	SCTP_DEBUG_PRINTK("sctp_cmd_new_state: asoc %p[%s]\n",
+			  asoc, sctp_state_tbl[state]);
+
 	if (sctp_style(sk, TCP)) {
-		/* Change the sk->sk_state of a TCP-style socket that has 
+		/* Change the sk->sk_state of a TCP-style socket that has
 		 * sucessfully completed a connect() call.
 		 */
 		if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED))
@@ -678,6 +683,16 @@ static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds,
 			sk->sk_shutdown |= RCV_SHUTDOWN;
 	}
 
+	if (sctp_state(asoc, COOKIE_WAIT)) {
+		/* Reset init timeouts since they may have been
+		 * increased due to timer expirations.
+		 */
+		asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
+			asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT];
+		asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
+			asoc->ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE];
+	}
+
 	if (sctp_state(asoc, ESTABLISHED) ||
 	    sctp_state(asoc, CLOSED) ||
 	    sctp_state(asoc, SHUTDOWN_RECEIVED)) {
@@ -1120,10 +1135,10 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 			 * to be executed only during failed attempts of
 			 * association establishment.
 			 */
-			if ((asoc->peer.retran_path != 
-			     asoc->peer.primary_path) && 
-			    (asoc->counters[SCTP_COUNTER_INIT_ERROR] > 0)) {
-				sctp_add_cmd_sf(commands, 
+			if ((asoc->peer.retran_path !=
+			     asoc->peer.primary_path) &&
+			    (asoc->init_err_counter > 0)) {
+				sctp_add_cmd_sf(commands,
 				                SCTP_CMD_FORCE_PRIM_RETRAN,
 						SCTP_NULL());
 			}
@@ -1237,18 +1252,67 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 				sctp_association_put(asoc);
 			break;
 
+		case SCTP_CMD_INIT_CHOOSE_TRANSPORT:
+			chunk = cmd->obj.ptr;
+			t = sctp_assoc_choose_init_transport(asoc);
+			asoc->init_last_sent_to = t;
+			chunk->transport = t;
+			t->init_sent_count++;
+			break;
+
 		case SCTP_CMD_INIT_RESTART:
 			/* Do the needed accounting and updates
 			 * associated with restarting an initialization
-			 * timer.
+			 * timer. Only multiply the timeout by two if
+			 * all transports have been tried at the current
+			 * timeout.
+			 */
+			t = asoc->init_last_sent_to;
+			asoc->init_err_counter++;
+
+			if (t->init_sent_count > (asoc->init_cycle + 1)) {
+				asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] *= 2;
+				if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] >
+				    asoc->max_init_timeo) {
+					asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
+						asoc->max_init_timeo;
+				}
+				asoc->init_cycle++;
+				SCTP_DEBUG_PRINTK(
+					"T1 INIT Timeout adjustment"
+					" init_err_counter: %d"
+					" cycle: %d"
+					" timeout: %d\n",
+					asoc->init_err_counter,
+					asoc->init_cycle,
+					asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT]);
+			}
+
+			sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+					SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+			break;
+
+		case SCTP_CMD_COOKIEECHO_RESTART:
+			/* Do the needed accounting and updates
+			 * associated with restarting an initialization
+			 * timer. Only multiply the timeout by two if
+			 * all transports have been tried at the current
+			 * timeout.
 			 */
-			asoc->counters[SCTP_COUNTER_INIT_ERROR]++;
-			asoc->timeouts[cmd->obj.to] *= 2;
-			if (asoc->timeouts[cmd->obj.to] >
+			asoc->init_err_counter++;
+
+			asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] *= 2;
+			if (asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] >
 			    asoc->max_init_timeo) {
-				asoc->timeouts[cmd->obj.to] =
+				asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
 					asoc->max_init_timeo;
 			}
+			SCTP_DEBUG_PRINTK(
+				"T1 COOKIE Timeout adjustment"
+				" init_err_counter: %d"
+				" timeout: %d\n",
+				asoc->init_err_counter,
+				asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE]);
 
 			/* If we've sent any data bundled with
 			 * COOKIE-ECHO we need to resend.
@@ -1261,7 +1325,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 
 			sctp_add_cmd_sf(commands,
 					SCTP_CMD_TIMER_RESTART,
-					SCTP_TO(cmd->obj.to));
+					SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
 			break;
 
 		case SCTP_CMD_INIT_FAILED:
@@ -1273,12 +1337,13 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 					      subtype, chunk, cmd->obj.u32);
 			break;
 
-		case SCTP_CMD_COUNTER_INC:
-			asoc->counters[cmd->obj.counter]++;
+		case SCTP_CMD_INIT_COUNTER_INC:
+			asoc->init_err_counter++;
 			break;
 
-		case SCTP_CMD_COUNTER_RESET:
-			asoc->counters[cmd->obj.counter] = 0;
+		case SCTP_CMD_INIT_COUNTER_RESET:
+			asoc->init_err_counter = 0;
+			asoc->init_cycle = 0;
 			break;
 
 		case SCTP_CMD_REPORT_DUP:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8e01b8f09ac..058189684c7 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -533,6 +533,9 @@ sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep,
 	sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT,
 			SCTP_PEER_INIT(initchunk));
 
+	/* Reset init error count upon receipt of INIT-ACK.  */
+	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
+
 	/* 5.1 C) "A" shall stop the T1-init timer and leave
 	 * COOKIE-WAIT state.  "A" shall then ... start the T1-cookie
 	 * timer, and enter the COOKIE-ECHOED state.
@@ -775,8 +778,7 @@ sctp_disposition_t sctp_sf_do_5_1E_ca(const struct sctp_endpoint *ep,
 	 * from the COOKIE-ECHOED state to the COOKIE-WAIT
 	 * state is performed.
 	 */
-	sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_RESET,
-	                SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR));
+	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL());
 
 	/* RFC 2960 5.1 Normal Establishment of an Association
 	 *
@@ -1019,10 +1021,22 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep,
 	link = sctp_assoc_lookup_paddr(asoc, &from_addr);
 
 	/* This should never happen, but lets log it if so.  */
-	if (!link) {
-		printk(KERN_WARNING
-		       "%s: Could not find address %d.%d.%d.%d\n",
-		       __FUNCTION__, NIPQUAD(from_addr.v4.sin_addr));
+	if (unlikely(!link)) {
+		if (from_addr.sa.sa_family == AF_INET6) {
+			printk(KERN_WARNING
+			       "%s association %p could not find address "
+			       "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+			       __FUNCTION__,
+			       asoc,
+			       NIP6(from_addr.v6.sin6_addr));
+		} else {
+			printk(KERN_WARNING
+			       "%s association %p could not find address "
+			       "%u.%u.%u.%u\n",
+			       __FUNCTION__,
+			       asoc,
+			       NIPQUAD(from_addr.v4.sin_addr.s_addr));
+		}
 		return SCTP_DISPOSITION_DISCARD;
 	}
 
@@ -2095,9 +2109,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
 	sctp_errhdr_t *err;
 	struct sctp_chunk *reply;
 	struct sctp_bind_addr *bp;
-	int attempts;
-
-	attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1;
+	int attempts = asoc->init_err_counter + 1;
 
 	if (attempts >= asoc->max_init_attempts) {
 		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
@@ -2157,8 +2169,7 @@ static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep,
 	/* Cast away the const modifier, as we want to just
 	 * rerun it through as a sideffect.
 	 */
-	sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_INC,
-			SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR));
+	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_INC, SCTP_NULL());
 
 	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
 			SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
@@ -2281,8 +2292,7 @@ sctp_disposition_t sctp_sf_cookie_wait_abort(const struct sctp_endpoint *ep,
 	if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
 		error = ((sctp_errhdr_t *)chunk->skb->data)->cause;
 
- 	sctp_stop_t1_and_abort(commands, error);
-	return SCTP_DISPOSITION_ABORT;
+	return sctp_stop_t1_and_abort(commands, error, asoc, chunk->transport);
 }
 
 /*
@@ -2294,8 +2304,8 @@ sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(const struct sctp_endpoint *ep
 					void *arg,
 					sctp_cmd_seq_t *commands)
 {
-	sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR);
- 	return SCTP_DISPOSITION_ABORT;
+	return sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR, asoc,
+				      (struct sctp_transport *)arg);
 }
 
 /*
@@ -2318,8 +2328,12 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
  *
  * This is common code called by several sctp_sf_*_abort() functions above.
  */
-void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error)
+sctp_disposition_t  sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+					   __u16 error,
+					   const struct sctp_association *asoc,
+					   struct sctp_transport *transport)
 {
+	SCTP_DEBUG_PRINTK("ABORT received (INIT).\n");
 	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
 			SCTP_STATE(SCTP_STATE_CLOSED));
 	SCTP_INC_STATS(SCTP_MIB_ABORTEDS);
@@ -2328,6 +2342,7 @@ void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error)
 	/* CMD_INIT_FAILED will DELETE_TCB. */
 	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
 			SCTP_U32(error));
+	return SCTP_DISPOSITION_ABORT;
 }
 
 /*
@@ -3805,6 +3820,10 @@ sctp_disposition_t sctp_sf_do_prm_asoc(const struct sctp_endpoint *ep,
 	sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC,
 			SCTP_ASOC((struct sctp_association *) asoc));
 
+	/* Choose transport for INIT. */
+	sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+			SCTP_CHUNK(repl));
+
 	/* After sending the INIT, "A" starts the T1-init timer and
 	 * enters the COOKIE-WAIT state.
 	 */
@@ -4589,7 +4608,7 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
 }
 
 /*
- * sctp_sf_t1_timer_expire
+ * sctp_sf_t1_init_timer_expire
  *
  * Section: 4 Note: 2
  * Verification Tag:
@@ -4603,7 +4622,59 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
  *     endpoint MUST abort the initialization process and report the
  *     error to SCTP user.
  *
- *   3) If the T1-cookie timer expires, the endpoint MUST retransmit
+ * Outputs
+ * (timers, events)
+ *
+ */
+sctp_disposition_t sctp_sf_t1_init_timer_expire(const struct sctp_endpoint *ep,
+					   const struct sctp_association *asoc,
+					   const sctp_subtype_t type,
+					   void *arg,
+					   sctp_cmd_seq_t *commands)
+{
+	struct sctp_chunk *repl = NULL;
+	struct sctp_bind_addr *bp;
+	int attempts = asoc->init_err_counter + 1;
+
+	SCTP_DEBUG_PRINTK("Timer T1 expired (INIT).\n");
+
+	if (attempts < asoc->max_init_attempts) {
+		bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
+		repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
+		if (!repl)
+			return SCTP_DISPOSITION_NOMEM;
+
+		/* Choose transport for INIT. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_CHOOSE_TRANSPORT,
+				SCTP_CHUNK(repl));
+
+		/* Issue a sideeffect to do the needed accounting. */
+		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
+
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
+	} else {
+		SCTP_DEBUG_PRINTK("Giving up on INIT, attempts: %d"
+				  " max_init_attempts: %d\n",
+				  attempts, asoc->max_init_attempts);
+		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
+				SCTP_U32(SCTP_ERROR_NO_ERROR));
+		return SCTP_DISPOSITION_DELETE_TCB;
+	}
+
+	return SCTP_DISPOSITION_CONSUME;
+}
+
+/*
+ * sctp_sf_t1_cookie_timer_expire
+ *
+ * Section: 4 Note: 2
+ * Verification Tag:
+ * Inputs
+ * (endpoint, asoc)
+ *
+ *  RFC 2960 Section 4 Notes
+ *  3) If the T1-cookie timer expires, the endpoint MUST retransmit
  *     COOKIE ECHO and re-start the T1-cookie timer without changing
  *     state.  This MUST be repeated up to 'Max.Init.Retransmits' times.
  *     After that, the endpoint MUST abort the initialization process and
@@ -4613,46 +4684,26 @@ sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep,
  * (timers, events)
  *
  */
-sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep,
+sctp_disposition_t sctp_sf_t1_cookie_timer_expire(const struct sctp_endpoint *ep,
 					   const struct sctp_association *asoc,
 					   const sctp_subtype_t type,
 					   void *arg,
 					   sctp_cmd_seq_t *commands)
 {
-	struct sctp_chunk *repl;
-	struct sctp_bind_addr *bp;
-	sctp_event_timeout_t timer = (sctp_event_timeout_t) arg;
-	int timeout;
-	int attempts;
-
-	timeout = asoc->timeouts[timer];
-	attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1;
-	repl = NULL;
+	struct sctp_chunk *repl = NULL;
+	int attempts = asoc->init_err_counter + 1;
 
-	SCTP_DEBUG_PRINTK("Timer T1 expired.\n");
+	SCTP_DEBUG_PRINTK("Timer T1 expired (COOKIE-ECHO).\n");
 
 	if (attempts < asoc->max_init_attempts) {
-		switch (timer) {
-		case SCTP_EVENT_TIMEOUT_T1_INIT:
-			bp = (struct sctp_bind_addr *) &asoc->base.bind_addr;
-			repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0);
-			break;
-
-		case SCTP_EVENT_TIMEOUT_T1_COOKIE:
-			repl = sctp_make_cookie_echo(asoc, NULL);
-			break;
-
-		default:
-			BUG();
-			break;
-		};
-
+		repl = sctp_make_cookie_echo(asoc, NULL);
 		if (!repl)
-			goto nomem;
+			return SCTP_DISPOSITION_NOMEM;
 
 		/* Issue a sideeffect to do the needed accounting. */
-		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART,
-				SCTP_TO(timer));
+		sctp_add_cmd_sf(commands, SCTP_CMD_COOKIEECHO_RESTART,
+				SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE));
+
 		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl));
 	} else {
 		sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
@@ -4661,9 +4712,6 @@ sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep,
 	}
 
 	return SCTP_DISPOSITION_CONSUME;
-
-nomem:
-	return SCTP_DISPOSITION_NOMEM;
 }
 
 /* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 8967846f69e..75ef1040876 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -783,7 +783,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
 	/* SCTP_STATE_COOKIE_WAIT */ \
 	{.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
 	/* SCTP_STATE_COOKIE_ECHOED */ \
-	{.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \
+	{.fn = sctp_sf_t1_cookie_timer_expire, \
+	 .name = "sctp_sf_t1_cookie_timer_expire"}, \
 	/* SCTP_STATE_ESTABLISHED */ \
 	{.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
 	/* SCTP_STATE_SHUTDOWN_PENDING */ \
@@ -802,7 +803,8 @@ static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_
 	/* SCTP_STATE_CLOSED */ \
 	{.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
 	/* SCTP_STATE_COOKIE_WAIT */ \
-	{.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \
+	{.fn = sctp_sf_t1_init_timer_expire, \
+	 .name = "sctp_sf_t1_init_timer_expire"}, \
 	/* SCTP_STATE_COOKIE_ECHOED */ \
 	{.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \
 	/* SCTP_STATE_ESTABLISHED */ \
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e6926cb1942..aad55dc3792 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -262,18 +262,18 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk,
  *             sockaddr_in6 [RFC 2553]),
  *   addr_len - the size of the address structure.
  */
-SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len)
 {
 	int retval = 0;
 
 	sctp_lock_sock(sk);
 
-	SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, uaddr: %p, addr_len: %d)\n",
-			  sk, uaddr, addr_len);
+	SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, addr: %p, addr_len: %d)\n",
+			  sk, addr, addr_len);
 
 	/* Disallow binding twice. */
 	if (!sctp_sk(sk)->ep->base.bind_addr.port)
-		retval = sctp_do_bind(sk, (union sctp_addr *)uaddr,
+		retval = sctp_do_bind(sk, (union sctp_addr *)addr,
 				      addr_len);
 	else
 		retval = -EINVAL;
@@ -318,23 +318,27 @@ SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
 	unsigned short snum;
 	int ret = 0;
 
-	SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d)\n",
-			  sk, addr, len);
-
 	/* Common sockaddr verification. */
 	af = sctp_sockaddr_af(sp, addr, len);
-	if (!af)
+	if (!af) {
+		SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d) EINVAL\n",
+				  sk, addr, len);
 		return -EINVAL;
+	}
+
+	snum = ntohs(addr->v4.sin_port);
+
+	SCTP_DEBUG_PRINTK_IPADDR("sctp_do_bind(sk: %p, new addr: ",
+				 ", port: %d, new port: %d, len: %d)\n",
+				 sk,
+				 addr,
+				 bp->port, snum,
+				 len);
 
 	/* PF specific bind() address verification. */
 	if (!sp->pf->bind_verify(sp, addr))
 		return -EADDRNOTAVAIL;
 
-	snum= ntohs(addr->v4.sin_port);
-
-	SCTP_DEBUG_PRINTK("sctp_do_bind: port: %d, new port: %d\n",
-			  bp->port, snum);
-
 	/* We must either be unbound, or bind to the same port.  */
 	if (bp->port && (snum != bp->port)) {
 		SCTP_DEBUG_PRINTK("sctp_do_bind:"
@@ -816,7 +820,8 @@ out:
  *
  * Basically do nothing but copying the addresses from user to kernel
  * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk.
- * This is used for tunneling the sctp_bindx() request through sctp_setsockopt() * from userspace.
+ * This is used for tunneling the sctp_bindx() request through sctp_setsockopt()
+ * from userspace.
  *
  * We don't use copy_from_user() for optimization: we first do the
  * sanity checks (buffer size -fast- and access check-healthy
@@ -913,6 +918,243 @@ out:
 	return err;
 }
 
+/* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
+ *
+ * Common routine for handling connect() and sctp_connectx().
+ * Connect will come in with just a single address.
+ */
+static int __sctp_connect(struct sock* sk,
+			  struct sockaddr *kaddrs,
+			  int addrs_size)
+{
+	struct sctp_sock *sp;
+	struct sctp_endpoint *ep;
+	struct sctp_association *asoc = NULL;
+	struct sctp_association *asoc2;
+	struct sctp_transport *transport;
+	union sctp_addr to;
+	struct sctp_af *af;
+	sctp_scope_t scope;
+	long timeo;
+	int err = 0;
+	int addrcnt = 0;
+	int walk_size = 0;
+	struct sockaddr *sa_addr;
+	void *addr_buf;
+
+	sp = sctp_sk(sk);
+	ep = sp->ep;
+
+	/* connect() cannot be done on a socket that is already in ESTABLISHED
+	 * state - UDP-style peeled off socket or a TCP-style socket that
+	 * is already connected.
+	 * It cannot be done even on a TCP-style listening socket.
+	 */
+	if (sctp_sstate(sk, ESTABLISHED) ||
+	    (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
+		err = -EISCONN;
+		goto out_free;
+	}
+
+	/* Walk through the addrs buffer and count the number of addresses. */
+	addr_buf = kaddrs;
+	while (walk_size < addrs_size) {
+		sa_addr = (struct sockaddr *)addr_buf;
+		af = sctp_get_af_specific(sa_addr->sa_family);
+
+		/* If the address family is not supported or if this address
+		 * causes the address buffer to overflow return EINVAL.
+		 */
+		if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
+			err = -EINVAL;
+			goto out_free;
+		}
+
+		err = sctp_verify_addr(sk, (union sctp_addr *)sa_addr,
+				       af->sockaddr_len);
+		if (err)
+			goto out_free;
+
+		memcpy(&to, sa_addr, af->sockaddr_len);
+		to.v4.sin_port = ntohs(to.v4.sin_port);
+
+		/* Check if there already is a matching association on the
+		 * endpoint (other than the one created here).
+		 */
+		asoc2 = sctp_endpoint_lookup_assoc(ep, &to, &transport);
+		if (asoc2 && asoc2 != asoc) {
+			if (asoc2->state >= SCTP_STATE_ESTABLISHED)
+				err = -EISCONN;
+			else
+				err = -EALREADY;
+			goto out_free;
+		}
+
+		/* If we could not find a matching association on the endpoint,
+		 * make sure that there is no peeled-off association matching
+		 * the peer address even on another socket.
+		 */
+		if (sctp_endpoint_is_peeled_off(ep, &to)) {
+			err = -EADDRNOTAVAIL;
+			goto out_free;
+		}
+
+		if (!asoc) {
+			/* If a bind() or sctp_bindx() is not called prior to
+			 * an sctp_connectx() call, the system picks an
+			 * ephemeral port and will choose an address set
+			 * equivalent to binding with a wildcard address.
+			 */
+			if (!ep->base.bind_addr.port) {
+				if (sctp_autobind(sk)) {
+					err = -EAGAIN;
+					goto out_free;
+				}
+			}
+
+			scope = sctp_scope(&to);
+			asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+			if (!asoc) {
+				err = -ENOMEM;
+				goto out_free;
+			}
+		}
+
+		/* Prime the peer's transport structures.  */
+		transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL,
+						SCTP_UNKNOWN);
+		if (!transport) {
+			err = -ENOMEM;
+			goto out_free;
+		}
+
+		addrcnt++;
+		addr_buf += af->sockaddr_len;
+		walk_size += af->sockaddr_len;
+	}
+
+	err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL);
+	if (err < 0) {
+		goto out_free;
+	}
+
+	err = sctp_primitive_ASSOCIATE(asoc, NULL);
+	if (err < 0) {
+		goto out_free;
+	}
+
+	/* Initialize sk's dport and daddr for getpeername() */
+	inet_sk(sk)->dport = htons(asoc->peer.port);
+	af = sctp_get_af_specific(to.sa.sa_family);
+	af->to_sk_daddr(&to, sk);
+
+	timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK);
+	err = sctp_wait_for_connect(asoc, &timeo);
+
+	/* Don't free association on exit. */
+	asoc = NULL;
+
+out_free:
+
+	SCTP_DEBUG_PRINTK("About to exit __sctp_connect() free asoc: %p"
+		          " kaddrs: %p err: %d\n",
+	                  asoc, kaddrs, err);
+	if (asoc)
+		sctp_association_free(asoc);
+	return err;
+}
+
+/* Helper for tunneling sctp_connectx() requests through sctp_setsockopt()
+ *
+ * API 8.9
+ * int sctp_connectx(int sd, struct sockaddr *addrs, int addrcnt);
+ *
+ * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses.
+ * If the sd is an IPv6 socket, the addresses passed can either be IPv4
+ * or IPv6 addresses.
+ *
+ * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see
+ * Section 3.1.2 for this usage.
+ *
+ * addrs is a pointer to an array of one or more socket addresses. Each
+ * address is contained in its appropriate structure (i.e. struct
+ * sockaddr_in or struct sockaddr_in6) the family of the address type
+ * must be used to distengish the address length (note that this
+ * representation is termed a "packed array" of addresses). The caller
+ * specifies the number of addresses in the array with addrcnt.
+ *
+ * On success, sctp_connectx() returns 0. On failure, sctp_connectx() returns
+ * -1, and sets errno to the appropriate error code.
+ *
+ * For SCTP, the port given in each socket address must be the same, or
+ * sctp_connectx() will fail, setting errno to EINVAL.
+ *
+ * An application can use sctp_connectx to initiate an association with
+ * an endpoint that is multi-homed.  Much like sctp_bindx() this call
+ * allows a caller to specify multiple addresses at which a peer can be
+ * reached.  The way the SCTP stack uses the list of addresses to set up
+ * the association is implementation dependant.  This function only
+ * specifies that the stack will try to make use of all the addresses in
+ * the list when needed.
+ *
+ * Note that the list of addresses passed in is only used for setting up
+ * the association.  It does not necessarily equal the set of addresses
+ * the peer uses for the resulting association.  If the caller wants to
+ * find out the set of peer addresses, it must use sctp_getpaddrs() to
+ * retrieve them after the association has been set up.
+ *
+ * Basically do nothing but copying the addresses from user to kernel
+ * land and invoking either sctp_connectx(). This is used for tunneling
+ * the sctp_connectx() request through sctp_setsockopt() from userspace.
+ *
+ * We don't use copy_from_user() for optimization: we first do the
+ * sanity checks (buffer size -fast- and access check-healthy
+ * pointer); if all of those succeed, then we can alloc the memory
+ * (expensive operation) needed to copy the data to kernel. Then we do
+ * the copying without checking the user space area
+ * (__copy_from_user()).
+ *
+ * On exit there is no need to do sockfd_put(), sys_setsockopt() does
+ * it.
+ *
+ * sk        The sk of the socket
+ * addrs     The pointer to the addresses in user land
+ * addrssize Size of the addrs buffer
+ *
+ * Returns 0 if ok, <0 errno code on error.
+ */
+SCTP_STATIC int sctp_setsockopt_connectx(struct sock* sk,
+				      struct sockaddr __user *addrs,
+				      int addrs_size)
+{
+	int err = 0;
+	struct sockaddr *kaddrs;
+
+	SCTP_DEBUG_PRINTK("%s - sk %p addrs %p addrs_size %d\n",
+			  __FUNCTION__, sk, addrs, addrs_size);
+
+	if (unlikely(addrs_size <= 0))
+		return -EINVAL;
+
+	/* Check the user passed a healthy pointer.  */
+	if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size)))
+		return -EFAULT;
+
+	/* Alloc space for the address array in kernel memory.  */
+	kaddrs = (struct sockaddr *)kmalloc(addrs_size, GFP_KERNEL);
+	if (unlikely(!kaddrs))
+		return -ENOMEM;
+
+	if (__copy_from_user(kaddrs, addrs, addrs_size)) {
+		err = -EFAULT;
+	} else {
+		err = __sctp_connect(sk, kaddrs, addrs_size);
+	}
+
+	kfree(kaddrs);
+	return err;
+}
+
 /* API 3.1.4 close() - UDP Style Syntax
  * Applications use close() to perform graceful shutdown (as described in
  * Section 10.1 of [SCTP]) on ALL the associations currently represented
@@ -1095,7 +1337,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
 	sp = sctp_sk(sk);
 	ep = sp->ep;
 
-	SCTP_DEBUG_PRINTK("Using endpoint: %s.\n", ep->debug_name);
+	SCTP_DEBUG_PRINTK("Using endpoint: %p.\n", ep);
 
 	/* We cannot send a message over a TCP-style listening socket. */
 	if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
@@ -1306,7 +1548,7 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
 		}
 
 		/* Prime the peer's transport structures.  */
-		transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL);
+		transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN);
 		if (!transport) {
 			err = -ENOMEM;
 			goto out_free;
@@ -2208,6 +2450,12 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname,
 					       optlen, SCTP_BINDX_REM_ADDR);
 		break;
 
+	case SCTP_SOCKOPT_CONNECTX:
+		/* 'optlen' is the size of the addresses buffer. */
+		retval = sctp_setsockopt_connectx(sk, (struct sockaddr __user *)optval,
+					       optlen);
+		break;
+
 	case SCTP_DISABLE_FRAGMENTS:
 		retval = sctp_setsockopt_disable_fragments(sk, optval, optlen);
 		break;
@@ -2283,112 +2531,29 @@ out_nounlock:
  *
  * len: the size of the address.
  */
-SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *uaddr,
+SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *addr,
 			     int addr_len)
 {
-	struct sctp_sock *sp;
-	struct sctp_endpoint *ep;
-	struct sctp_association *asoc;
-	struct sctp_transport *transport;
-	union sctp_addr to;
-	struct sctp_af *af;
-	sctp_scope_t scope;
-	long timeo;
 	int err = 0;
+	struct sctp_af *af;
 
 	sctp_lock_sock(sk);
 
-	SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d)\n",
-			  __FUNCTION__, sk, uaddr, addr_len);
-
-	sp = sctp_sk(sk);
-	ep = sp->ep;
-
-	/* connect() cannot be done on a socket that is already in ESTABLISHED
-	 * state - UDP-style peeled off socket or a TCP-style socket that
-	 * is already connected.
-	 * It cannot be done even on a TCP-style listening socket.
-	 */
-	if (sctp_sstate(sk, ESTABLISHED) ||
-	    (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
-		err = -EISCONN;
-		goto out_unlock;
-	}
-
-	err = sctp_verify_addr(sk, (union sctp_addr *)uaddr, addr_len);
-	if (err)
-		goto out_unlock;
-
-	if (addr_len > sizeof(to))
-		addr_len = sizeof(to);
-	memcpy(&to, uaddr, addr_len);
-	to.v4.sin_port = ntohs(to.v4.sin_port);
-
-	asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport);
-	if (asoc) {
-		if (asoc->state >= SCTP_STATE_ESTABLISHED)
-			err = -EISCONN;
-		else
-			err = -EALREADY;
-		goto out_unlock;
-	}
-
-	/* If we could not find a matching association on the endpoint,
-	 * make sure that there is no peeled-off association matching the
-	 * peer address even on another socket.
-	 */
-	if (sctp_endpoint_is_peeled_off(ep, &to)) {
-		err = -EADDRNOTAVAIL;
-		goto out_unlock;
-	}
-
-	/* If a bind() or sctp_bindx() is not called prior to a connect()
-	 * call, the system picks an ephemeral port and will choose an address
-	 * set equivalent to binding with a wildcard address.
-	 */
-	if (!ep->base.bind_addr.port) {
-		if (sctp_autobind(sk)) {
-			err = -EAGAIN;
-			goto out_unlock;
-		}
-	}
-
-	scope = sctp_scope(&to);
-	asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
-	if (!asoc) {
-		err = -ENOMEM;
-		goto out_unlock;
-  	}
+	SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d\n",
+			  __FUNCTION__, sk, addr, addr_len);
 
-	/* Prime the peer's transport structures.  */
-	transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL);
-	if (!transport) {
-		sctp_association_free(asoc);
-		goto out_unlock;
-	}
-	err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL);
-	if (err < 0) {
-		sctp_association_free(asoc);
-		goto out_unlock;
-	}
-
-	err = sctp_primitive_ASSOCIATE(asoc, NULL);
-	if (err < 0) {
-		sctp_association_free(asoc);
-		goto out_unlock;
+	/* Validate addr_len before calling common connect/connectx routine. */
+	af = sctp_get_af_specific(addr->sa_family);
+	if (!af || addr_len < af->sockaddr_len) {
+		err = -EINVAL;
+	} else {
+		/* Pass correct addr len to common routine (so it knows there
+		 * is only one address being passed.
+		 */
+		err = __sctp_connect(sk, addr, af->sockaddr_len);
 	}
 
-	/* Initialize sk's dport and daddr for getpeername() */
-	inet_sk(sk)->dport = htons(asoc->peer.port);
-	af = sctp_get_af_specific(to.sa.sa_family);
-	af->to_sk_daddr(&to, sk);
-
-	timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK);
-	err = sctp_wait_for_connect(asoc, &timeo);
-
-out_unlock:
 	sctp_release_sock(sk);
-
 	return err;
 }
 
@@ -2677,12 +2842,15 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
 	/* Map ipv4 address into v4-mapped-on-v6 address.  */
 	sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk),
 		(union sctp_addr *)&status.sstat_primary.spinfo_address);
-	status.sstat_primary.spinfo_state = transport->active;
+	status.sstat_primary.spinfo_state = transport->state;
 	status.sstat_primary.spinfo_cwnd = transport->cwnd;
 	status.sstat_primary.spinfo_srtt = transport->srtt;
 	status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto);
 	status.sstat_primary.spinfo_mtu = transport->pmtu;
 
+	if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN)
+		status.sstat_primary.spinfo_state = SCTP_ACTIVE;
+
 	if (put_user(len, optlen)) {
 		retval = -EFAULT;
 		goto out;
@@ -2733,12 +2901,15 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
 		return -EINVAL;
 
 	pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc);
-	pinfo.spinfo_state = transport->active;
+	pinfo.spinfo_state = transport->state;
 	pinfo.spinfo_cwnd = transport->cwnd;
 	pinfo.spinfo_srtt = transport->srtt;
 	pinfo.spinfo_rto = jiffies_to_msecs(transport->rto);
 	pinfo.spinfo_mtu = transport->pmtu;
 
+	if (pinfo.spinfo_state == SCTP_UNKNOWN)
+		pinfo.spinfo_state = SCTP_ACTIVE;
+
 	if (put_user(len, optlen)) {
 		retval = -EFAULT;
 		goto out;
@@ -3591,7 +3762,8 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
 	int retval = 0;
 	int len;
 
-	SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p, ...)\n", sk);
+	SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p... optname: %d)\n",
+			  sk, optname);
 
 	/* I can hardly begin to describe how wrong this is.  This is
 	 * so broken as to be worse than useless.  The API draft
@@ -4596,8 +4768,7 @@ out:
 	return err;
 
 do_error:
-	if (asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1 >=
-					 	asoc->max_init_attempts)
+	if (asoc->init_err_counter + 1 >= asoc->max_init_attempts)
 		err = -ETIMEDOUT;
 	else
 		err = -ECONNREFUSED;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index f30882e1e96..0ec0fde6e6c 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -83,7 +83,9 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 	peer->last_time_used = jiffies;
 	peer->last_time_ecne_reduced = jiffies;
 
-	peer->active = SCTP_ACTIVE;
+	peer->init_sent_count = 0;
+
+	peer->state = SCTP_ACTIVE;
 	peer->hb_allowed = 0;
 
 	/* Initialize the default path max_retrans.  */
-- 
cgit v1.2.3


From 72cb6962a91f2af9eef69a06198e1949c10259ae Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 20 Jun 2005 13:18:08 -0700
Subject: [IPSEC]: Add xfrm_init_state

This patch adds xfrm_init_state which is simply a wrapper that calls
xfrm_get_type and subsequently x->type->init_state.  It also gets rid
of the unused args argument.

Abstracting it out allows us to add common initialisation code, e.g.,
to set family-specific flags.

The add_time setting in xfrm_user.c was deleted because it's already
set by xfrm_state_alloc.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: James Morris <jmorris@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ah4.c          |  2 +-
 net/ipv4/esp4.c         |  2 +-
 net/ipv4/ipcomp.c       | 11 +++--------
 net/ipv4/xfrm4_tunnel.c |  2 +-
 net/ipv6/ah6.c          |  2 +-
 net/ipv6/esp6.c         |  2 +-
 net/ipv6/ipcomp6.c      |  9 ++-------
 net/ipv6/xfrm6_tunnel.c |  2 +-
 net/key/af_key.c        | 12 +++---------
 net/xfrm/xfrm_policy.c  |  1 -
 net/xfrm/xfrm_state.c   | 21 +++++++++++++++++++++
 net/xfrm/xfrm_user.c    |  9 +--------
 12 files changed, 36 insertions(+), 39 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 0e98f2235b6..514c85b2631 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -200,7 +200,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
 	xfrm_state_put(x);
 }
 
-static int ah_init_state(struct xfrm_state *x, void *args)
+static int ah_init_state(struct xfrm_state *x)
 {
 	struct ah_data *ahp = NULL;
 	struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index eae84cc39d3..ba57446d5d1 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -362,7 +362,7 @@ static void esp_destroy(struct xfrm_state *x)
 	kfree(esp);
 }
 
-static int esp_init_state(struct xfrm_state *x, void *args)
+static int esp_init_state(struct xfrm_state *x)
 {
 	struct esp_data *esp = NULL;
 
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 1a23c5263b9..2065944fd9e 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -236,15 +236,10 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
 	t->props.mode = 1;
 	t->props.saddr.a4 = x->props.saddr.a4;
 	t->props.flags = x->props.flags;
-	
-	t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family);
-	if (t->type == NULL)
-		goto error;
-		
-	if (t->type->init_state(t, NULL))
+
+	if (xfrm_init_state(t))
 		goto error;
 
-	t->km.state = XFRM_STATE_VALID;
 	atomic_set(&t->tunnel_users, 1);
 out:
 	return t;
@@ -422,7 +417,7 @@ static void ipcomp_destroy(struct xfrm_state *x)
 	kfree(ipcd);
 }
 
-static int ipcomp_init_state(struct xfrm_state *x, void *args)
+static int ipcomp_init_state(struct xfrm_state *x)
 {
 	int err;
 	struct ipcomp_data *ipcd;
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 413191f585f..e1fe360ed27 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -84,7 +84,7 @@ static void ipip_err(struct sk_buff *skb, u32 info)
 		handler->err_handler(skb, &arg);
 }
 
-static int ipip_init_state(struct xfrm_state *x, void *args)
+static int ipip_init_state(struct xfrm_state *x)
 {
 	if (!x->props.mode)
 		return -EINVAL;
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index e3ecf626cbf..986fdfdccbc 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -339,7 +339,7 @@ static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	xfrm_state_put(x);
 }
 
-static int ah6_init_state(struct xfrm_state *x, void *args)
+static int ah6_init_state(struct xfrm_state *x)
 {
 	struct ah_data *ahp = NULL;
 	struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index be7095d6bab..324db62515a 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -296,7 +296,7 @@ static void esp6_destroy(struct xfrm_state *x)
 	kfree(esp);
 }
 
-static int esp6_init_state(struct xfrm_state *x, void *args)
+static int esp6_init_state(struct xfrm_state *x)
 {
 	struct esp_data *esp = NULL;
 
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 6cde5310cd7..423feb46ccc 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -234,14 +234,9 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
 	t->props.mode = 1;
 	memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
 
-	t->type = xfrm_get_type(IPPROTO_IPV6, t->props.family);
-	if (t->type == NULL)
+	if (xfrm_init_state(t))
 		goto error;
 
-	if (t->type->init_state(t, NULL))
-		goto error;
-
-	t->km.state = XFRM_STATE_VALID;
 	atomic_set(&t->tunnel_users, 1);
 
 out:
@@ -420,7 +415,7 @@ static void ipcomp6_destroy(struct xfrm_state *x)
 	xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr);
 }
 
-static int ipcomp6_init_state(struct xfrm_state *x, void *args)
+static int ipcomp6_init_state(struct xfrm_state *x)
 {
 	int err;
 	struct ipcomp_data *ipcd;
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index ffcadd68b95..60c26c87277 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -466,7 +466,7 @@ static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	return;
 }
 
-static int xfrm6_tunnel_init_state(struct xfrm_state *x, void *args)
+static int xfrm6_tunnel_init_state(struct xfrm_state *x)
 {
 	if (!x->props.mode)
 		return -EINVAL;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 98b72f2024f..652dd09ccd3 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1096,17 +1096,11 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
 		}
 	}
 
-	x->type = xfrm_get_type(proto, x->props.family);
-	if (x->type == NULL) {
-		err = -ENOPROTOOPT;
-		goto out;
-	}
-	if (x->type->init_state(x, NULL)) {
-		err = -EINVAL;
+	err = xfrm_init_state(x);
+	if (err)
 		goto out;
-	}
+
 	x->km.seq = hdr->sadb_msg_seq;
-	x->km.state = XFRM_STATE_VALID;
 	return x;
 
 out:
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 0a4260719a1..d65ed8684fc 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -118,7 +118,6 @@ retry:
 	xfrm_policy_put_afinfo(afinfo);
 	return type;
 }
-EXPORT_SYMBOL(xfrm_get_type);
 
 int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, 
 		    unsigned short family)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 2537f26f097..1845b73d69f 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1055,6 +1055,27 @@ int xfrm_state_mtu(struct xfrm_state *x, int mtu)
 }
 
 EXPORT_SYMBOL(xfrm_state_mtu);
+
+int xfrm_init_state(struct xfrm_state *x)
+{
+	int err;
+
+	err = -ENOENT;
+	x->type = xfrm_get_type(x->id.proto, x->props.family);
+	if (x->type == NULL)
+		goto error;
+
+	err = x->type->init_state(x);
+	if (err)
+		goto error;
+
+	x->km.state = XFRM_STATE_VALID;
+
+error:
+	return err;
+}
+
+EXPORT_SYMBOL(xfrm_init_state);
  
 void __init xfrm_state_init(void)
 {
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 5ce8558eac9..ecade4893a1 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -249,17 +249,10 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p,
 	if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1])))
 		goto error;
 
-	err = -ENOENT;
-	x->type = xfrm_get_type(x->id.proto, x->props.family);
-	if (x->type == NULL)
-		goto error;
-
-	err = x->type->init_state(x, NULL);
+	err = xfrm_init_state(x);
 	if (err)
 		goto error;
 
-	x->curlft.add_time = (unsigned long) xtime.tv_sec;
-	x->km.state = XFRM_STATE_VALID;
 	x->km.seq = p->seq;
 
 	return x;
-- 
cgit v1.2.3


From d094cd83c06e06e01d8edb540555f3f64e4081c2 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 20 Jun 2005 13:19:41 -0700
Subject: [IPSEC]: Add xfrm_state_afinfo->init_flags

This patch adds the xfrm_state_afinfo->init_flags hook which allows
each address family to perform any common initialisation that does
not require a corresponding destructor call.

It will be used subsequently to set the XFRM_STATE_NOPMTUDISC flag
in IPv4.

It also fixes up the error codes returned by xfrm_init_state.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: James Morris <jmorris@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_state.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 1845b73d69f..9d206c282cf 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1058,10 +1058,26 @@ EXPORT_SYMBOL(xfrm_state_mtu);
 
 int xfrm_init_state(struct xfrm_state *x)
 {
+	struct xfrm_state_afinfo *afinfo;
+	int family = x->props.family;
 	int err;
 
-	err = -ENOENT;
-	x->type = xfrm_get_type(x->id.proto, x->props.family);
+	err = -EAFNOSUPPORT;
+	afinfo = xfrm_state_get_afinfo(family);
+	if (!afinfo)
+		goto error;
+
+	err = 0;
+	if (afinfo->init_flags)
+		err = afinfo->init_flags(x);
+
+	xfrm_state_put_afinfo(afinfo);
+
+	if (err)
+		goto error;
+
+	err = -EPROTONOSUPPORT;
+	x->type = xfrm_get_type(x->id.proto, family);
 	if (x->type == NULL)
 		goto error;
 
-- 
cgit v1.2.3


From dd87147eed934eaff92869f3d158697c7239d1d2 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 20 Jun 2005 13:21:43 -0700
Subject: [IPSEC]: Add XFRM_STATE_NOPMTUDISC flag

This patch adds the flag XFRM_STATE_NOPMTUDISC for xfrm states.  It is
similar to the nopmtudisc on IPIP/GRE tunnels.  It only has an effect
on IPv4 tunnel mode states.  For these states, it will ensure that the
DF flag is always cleared.

This is primarily useful to work around ICMP blackholes.

In future this flag could also allow a larger MTU to be set within the
tunnel just like IPIP/GRE tunnels.  This could be useful for short haul
tunnels where temporary fragmentation outside the tunnel is desired over
smaller fragments inside the tunnel.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: James Morris <jmorris@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/xfrm4_output.c | 8 ++++++--
 net/ipv4/xfrm4_state.c  | 9 +++++++++
 net/key/af_key.c        | 4 ++++
 3 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index af2392ae576..66620a95942 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -33,6 +33,7 @@ static void xfrm4_encap(struct sk_buff *skb)
 	struct dst_entry *dst = skb->dst;
 	struct xfrm_state *x = dst->xfrm;
 	struct iphdr *iph, *top_iph;
+	int flags;
 
 	iph = skb->nh.iph;
 	skb->h.ipiph = iph;
@@ -51,10 +52,13 @@ static void xfrm4_encap(struct sk_buff *skb)
 
 	/* DS disclosed */
 	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
-	if (x->props.flags & XFRM_STATE_NOECN)
+
+	flags = x->props.flags;
+	if (flags & XFRM_STATE_NOECN)
 		IP_ECN_clear(top_iph);
 
-	top_iph->frag_off = iph->frag_off & htons(IP_DF);
+	top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
+		0 : (iph->frag_off & htons(IP_DF));
 	if (!top_iph->frag_off)
 		__ip_select_ident(top_iph, dst, 0);
 
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 223a2e83853..050611d7a96 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -7,12 +7,20 @@
  *
  */
 
+#include <net/ip.h>
 #include <net/xfrm.h>
 #include <linux/pfkeyv2.h>
 #include <linux/ipsec.h>
 
 static struct xfrm_state_afinfo xfrm4_state_afinfo;
 
+static int xfrm4_init_flags(struct xfrm_state *x)
+{
+	if (ipv4_config.no_pmtu_disc)
+		x->props.flags |= XFRM_STATE_NOPMTUDISC;
+	return 0;
+}
+
 static void
 __xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
 		     struct xfrm_tmpl *tmpl,
@@ -109,6 +117,7 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto,
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.family			= AF_INET,
 	.lock			= RW_LOCK_UNLOCKED,
+	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
 	.state_lookup		= __xfrm4_state_lookup,
 	.find_acq		= __xfrm4_find_acq,
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 652dd09ccd3..4879743b945 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -690,6 +690,8 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys,
 		sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN;
 	if (x->props.flags & XFRM_STATE_DECAP_DSCP)
 		sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP;
+	if (x->props.flags & XFRM_STATE_NOPMTUDISC)
+		sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC;
 
 	/* hard time */
 	if (hsc & 2) {
@@ -974,6 +976,8 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr,
 		x->props.flags |= XFRM_STATE_NOECN;
 	if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP)
 		x->props.flags |= XFRM_STATE_DECAP_DSCP;
+	if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC)
+		x->props.flags |= XFRM_STATE_NOPMTUDISC;
 
 	lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1];
 	if (lifetime != NULL) {
-- 
cgit v1.2.3


From f6e276ee67c0ac9efafd24bc6f7a84aa359656df Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 20 Jun 2005 13:32:05 -0700
Subject: [ATALK]: endian annotations

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/appletalk/aarp.c | 2 +-
 net/appletalk/ddp.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 54640c01b50..10d04046102 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -565,7 +565,7 @@ int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb,
 			 *	numbers	we just happen to need. Now put the
 			 *	length in the lower two.
 			 */
-			*((__u16 *)skb->data) = htons(skb->len);
+			*((__be16 *)skb->data) = htons(skb->len);
 			ft = 1;
 		}
 		/*
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 876dbac7106..192b529f86a 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -401,7 +401,7 @@ out_err:
 }
 
 /* Find a match for a specific network:node pair */
-static struct atalk_iface *atalk_find_interface(int net, int node)
+static struct atalk_iface *atalk_find_interface(__be16 net, int node)
 {
 	struct atalk_iface *iface;
 
-- 
cgit v1.2.3


From 246955fe4c38bd706ae30e37c64892c94213775d Mon Sep 17 00:00:00 2001
From: Robert Olsson <Robert.Olsson@data.slu.se>
Date: Mon, 20 Jun 2005 13:36:39 -0700
Subject: [NETLINK]: fib_lookup() via netlink

Below is a more generic patch to do fib_lookup via netlink. For others
we should say that we discussed this as a way to verify route selection.
It's also possible there are others uses for this.

In short the fist half of struct fib_result_nl is filled in by caller
and netlink call fills in the other half and returns it.

In case anyone is interested there is a corresponding user app to compare
the full routing table this was used to test implementation of the LC-trie.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 563e7d61270..cd8e45ab958 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -516,6 +516,60 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 #undef BRD1_OK
 }
 
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
+{
+	
+	struct fib_result       res;
+	struct flowi            fl = { .nl_u = { .ip4_u = { .daddr = frn->fl_addr, 
+							    .fwmark = frn->fl_fwmark,
+							    .tos = frn->fl_tos,
+							    .scope = frn->fl_scope } } };
+	if (tb) {
+		local_bh_disable();
+
+		frn->tb_id = tb->tb_id;
+		frn->err = tb->tb_lookup(tb, &fl, &res);
+
+		if (!frn->err) {
+			frn->prefixlen = res.prefixlen;
+			frn->nh_sel = res.nh_sel;
+			frn->type = res.type;
+			frn->scope = res.scope;
+		}
+		local_bh_enable();
+	}
+}
+
+static void nl_fib_input(struct sock *sk, int len)
+{
+	struct sk_buff *skb = NULL;
+        struct nlmsghdr *nlh = NULL;
+	struct fib_result_nl *frn;
+	int err;
+	u32 pid;     
+	struct fib_table *tb;
+	
+	skb = skb_recv_datagram(sk, 0, 0, &err);
+	nlh = (struct nlmsghdr *)skb->data;
+	
+	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
+	tb = fib_get_table(frn->tb_id_in);
+
+	nl_fib_lookup(frn, tb);
+	
+	pid = nlh->nlmsg_pid;           /*pid of sending process */
+	NETLINK_CB(skb).groups = 0;     /* not in mcast group */
+	NETLINK_CB(skb).pid = 0;         /* from kernel */
+	NETLINK_CB(skb).dst_pid = pid;
+	NETLINK_CB(skb).dst_groups = 0;  /* unicast */
+	netlink_unicast(sk, skb, pid, MSG_DONTWAIT);
+}    
+
+static void nl_fib_lookup_init(void)
+{
+      netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input);
+}
+
 static void fib_disable_ip(struct net_device *dev, int force)
 {
 	if (fib_sync_down(0, dev, force))
@@ -604,6 +658,7 @@ void __init ip_fib_init(void)
 
 	register_netdevice_notifier(&fib_netdev_notifier);
 	register_inetaddr_notifier(&fib_inetaddr_notifier);
+	nl_fib_lookup_init();
 }
 
 EXPORT_SYMBOL(inet_addr_type);
-- 
cgit v1.2.3


From 19baf839ff4a8daa1f2a7400897094fc18e4f5e9 Mon Sep 17 00:00:00 2001
From: Robert Olsson <Robert.Olsson@data.slu.se>
Date: Tue, 21 Jun 2005 12:43:18 -0700
Subject: [IPV4]: Add LC-Trie FIB lookup algorithm.

Signed-off-by: Robert Olsson <Robert.Olsson@data.slu.se>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig    |   26 +
 net/ipv4/Makefile   |    4 +-
 net/ipv4/af_inet.c  |   12 +
 net/ipv4/fib_trie.c | 2454 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 2495 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv4/fib_trie.c

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6d3e8b1bd1f..05107e0dc14 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,6 +1,32 @@
 #
 # IP configuration
 #
+choice 
+	prompt "Choose IP: FIB lookup""
+	depends on INET
+	default IP_FIB_HASH
+
+config IP_FIB_HASH
+	bool "FIB_HASH"
+	---help---
+	Current FIB is very proven and good enough for most users.
+
+config IP_FIB_TRIE
+	bool "FIB_TRIE"
+	---help---
+	Use new experimental LC-trie as FIB lookup algoritm. 
+        This improves lookup performance
+	
+	LC-trie is described in:
+	
+ 	IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ 	IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
+	An experimental study of compression methods for dynamic tries
+ 	Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ 	http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+       
+endchoice
+
 config IP_MULTICAST
 	bool "IP: multicasting"
 	depends on INET
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 8b379627ebb..65d57d8e1ad 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,8 +7,10 @@ obj-y     := utils.o route.o inetpeer.o protocol.o \
 	     ip_output.o ip_sockglue.o \
 	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
 	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
-	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o
+	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
 
+obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
+obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 03942f13394..658e7977924 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1119,6 +1119,10 @@ module_init(inet_init);
 #ifdef CONFIG_PROC_FS
 extern int  fib_proc_init(void);
 extern void fib_proc_exit(void);
+#ifdef CONFIG_IP_FIB_TRIE
+extern int  fib_stat_proc_init(void);
+extern void fib_stat_proc_exit(void);
+#endif
 extern int  ip_misc_proc_init(void);
 extern int  raw_proc_init(void);
 extern void raw_proc_exit(void);
@@ -1139,11 +1143,19 @@ static int __init ipv4_proc_init(void)
 		goto out_udp;
 	if (fib_proc_init())
 		goto out_fib;
+#ifdef CONFIG_IP_FIB_TRIE
+         if (fib_stat_proc_init())
+                 goto out_fib_stat;
+ #endif
 	if (ip_misc_proc_init())
 		goto out_misc;
 out:
 	return rc;
 out_misc:
+#ifdef CONFIG_IP_FIB_TRIE
+ 	fib_stat_proc_exit();
+out_fib_stat:
+#endif
 	fib_proc_exit();
 out_fib:
 	udp4_proc_exit();
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
new file mode 100644
index 00000000000..c0ece94fc63
--- /dev/null
+++ b/net/ipv4/fib_trie.c
@@ -0,0 +1,2454 @@
+/*
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation; either version
+ *   2 of the License, or (at your option) any later version.
+ *
+ *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
+ *     & Swedish University of Agricultural Sciences.
+ *
+ *   Jens Laas <jens.laas@data.slu.se> Swedish University of 
+ *     Agricultural Sciences.
+ * 
+ *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
+ *
+ * This work is based on the LPC-trie which is originally descibed in:
+ * 
+ * An experimental study of compression methods for dynamic tries
+ * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+ *
+ *
+ * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
+ *
+ * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
+ *
+ *
+ * Code from fib_hash has been reused which includes the following header:
+ *
+ *
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 FIB: lookup engine and maintenance routines.
+ *
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#define VERSION "0.323"
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include "fib_lookup.h"
+
+#undef CONFIG_IP_FIB_TRIE_STATS
+#define MAX_CHILDS 16384
+
+#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
+#define KEYLENGTH (8*sizeof(t_key))
+#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
+#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
+
+static DEFINE_RWLOCK(fib_lock);
+
+typedef unsigned int t_key;
+
+#define T_TNODE 0
+#define T_LEAF  1
+#define NODE_TYPE_MASK	0x1UL
+#define NODE_PARENT(_node) \
+((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK))
+#define NODE_SET_PARENT(_node, _ptr) \
+((_node)->_parent = (((unsigned long)(_ptr)) | \
+                     ((_node)->_parent & NODE_TYPE_MASK)))
+#define NODE_INIT_PARENT(_node, _type) \
+((_node)->_parent = (_type))
+#define NODE_TYPE(_node) \
+((_node)->_parent & NODE_TYPE_MASK)
+
+#define IS_TNODE(n) (!(n->_parent & T_LEAF))
+#define IS_LEAF(n) (n->_parent & T_LEAF)
+
+struct node {
+        t_key key;
+	unsigned long _parent;
+};
+
+struct leaf {
+        t_key key;
+	unsigned long _parent;
+	struct hlist_head list;
+};
+
+struct leaf_info {
+	struct hlist_node hlist;
+	int plen;
+	struct list_head falh;
+};
+
+struct tnode {
+        t_key key;
+	unsigned long _parent;
+        unsigned short pos:5;        /* 2log(KEYLENGTH) bits needed */
+        unsigned short bits:5;       /* 2log(KEYLENGTH) bits needed */
+        unsigned short full_children;  /* KEYLENGTH bits needed */
+        unsigned short empty_children; /* KEYLENGTH bits needed */
+        struct node *child[0];
+};
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+struct trie_use_stats {
+	unsigned int gets;
+	unsigned int backtrack;
+	unsigned int semantic_match_passed;
+	unsigned int semantic_match_miss;
+	unsigned int null_node_hit;
+};
+#endif
+
+struct trie_stat {
+	unsigned int totdepth;
+	unsigned int maxdepth;
+	unsigned int tnodes;
+	unsigned int leaves;
+	unsigned int nullpointers;
+	unsigned int nodesizes[MAX_CHILDS];
+};    
+
+struct trie {
+        struct node *trie;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	struct trie_use_stats stats;
+#endif
+        int size;
+	unsigned int revision;
+};
+
+static int trie_debug = 0;
+
+static int tnode_full(struct tnode *tn, struct node *n);
+static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
+static int tnode_child_length(struct tnode *tn);
+static struct node *resize(struct trie *t, struct tnode *tn);
+static struct tnode *inflate(struct trie *t, struct tnode *tn);
+static struct tnode *halve(struct trie *t, struct tnode *tn);
+static void tnode_free(struct tnode *tn);
+static void trie_dump_seq(struct seq_file *seq, struct trie *t);
+extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
+extern int fib_detect_death(struct fib_info *fi, int order,
+                            struct fib_info **last_resort, int *last_idx, int *dflt);
+
+extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
+               struct nlmsghdr *n, struct netlink_skb_parms *req);
+
+static kmem_cache_t *fn_alias_kmem;
+static struct trie *trie_local = NULL, *trie_main = NULL;
+
+static void trie_bug(char *err)
+{
+	printk("Trie Bug: %s\n", err);
+	BUG();
+}
+
+static inline struct node *tnode_get_child(struct tnode *tn, int i) 
+{
+        if (i >=  1<<tn->bits) 
+                trie_bug("tnode_get_child");
+
+        return tn->child[i];
+}
+
+static inline int tnode_child_length(struct tnode *tn)
+{
+        return 1<<tn->bits;
+}
+
+/*
+  _________________________________________________________________
+  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+  ----------------------------------------------------------------
+    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15 
+
+  _________________________________________________________________
+  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+  -----------------------------------------------------------------
+   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
+
+  tp->pos = 7
+  tp->bits = 3
+  n->pos = 15
+  n->bits=4
+  KEYLENGTH=32
+*/
+
+static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
+{
+        if (offset < KEYLENGTH)
+		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
+        else
+		return 0;
+}
+
+static inline int tkey_equals(t_key a, t_key b)
+{
+  return a == b;
+}
+
+static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
+{
+     if (bits == 0 || offset >= KEYLENGTH)
+            return 1;
+        bits = bits > KEYLENGTH ? KEYLENGTH : bits;
+        return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
+}	
+
+static inline int tkey_mismatch(t_key a, int offset, t_key b)
+{
+	t_key diff = a ^ b;
+	int i = offset;
+
+	if(!diff) 
+	  return 0;
+	while((diff << i) >> (KEYLENGTH-1) == 0)
+		i++;
+	return i;
+}
+
+/* Candiate for fib_semantics */
+
+static void fn_free_alias(struct fib_alias *fa)
+{
+	fib_release_info(fa->fa_info);
+	kmem_cache_free(fn_alias_kmem, fa);
+}
+
+/*
+  To understand this stuff, an understanding of keys and all their bits is 
+  necessary. Every node in the trie has a key associated with it, but not 
+  all of the bits in that key are significant.
+
+  Consider a node 'n' and its parent 'tp'.
+
+  If n is a leaf, every bit in its key is significant. Its presence is 
+  necessitaded by path compression, since during a tree traversal (when 
+  searching for a leaf - unless we are doing an insertion) we will completely 
+  ignore all skipped bits we encounter. Thus we need to verify, at the end of 
+  a potentially successful search, that we have indeed been walking the 
+  correct key path.
+
+  Note that we can never "miss" the correct key in the tree if present by 
+  following the wrong path. Path compression ensures that segments of the key 
+  that are the same for all keys with a given prefix are skipped, but the 
+  skipped part *is* identical for each node in the subtrie below the skipped 
+  bit! trie_insert() in this implementation takes care of that - note the 
+  call to tkey_sub_equals() in trie_insert().
+
+  if n is an internal node - a 'tnode' here, the various parts of its key 
+  have many different meanings.
+
+  Example:  
+  _________________________________________________________________
+  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+  -----------------------------------------------------------------
+    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15 
+
+  _________________________________________________________________
+  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+  -----------------------------------------------------------------
+   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
+
+  tp->pos = 7
+  tp->bits = 3
+  n->pos = 15
+  n->bits=4
+
+  First, let's just ignore the bits that come before the parent tp, that is 
+  the bits from 0 to (tp->pos-1). They are *known* but at this point we do 
+  not use them for anything.
+
+  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
+  index into the parent's child array. That is, they will be used to find 
+  'n' among tp's children.
+
+  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
+  for the node n.
+
+  All the bits we have seen so far are significant to the node n. The rest 
+  of the bits are really not needed or indeed known in n->key.
+
+  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into 
+  n's child array, and will of course be different for each child.
+  
+  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
+  at this point.
+
+*/
+
+static void check_tnode(struct tnode *tn)
+{
+	if(tn && tn->pos+tn->bits > 32) {
+		printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
+	}
+}
+
+static int halve_threshold = 25;
+static int inflate_threshold = 50;
+
+static struct leaf *leaf_new(void)
+{
+	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
+	if(l) {
+		NODE_INIT_PARENT(l, T_LEAF);
+		INIT_HLIST_HEAD(&l->list);
+	}
+	return l;
+}
+
+static struct leaf_info *leaf_info_new(int plen)
+{
+	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
+	li->plen = plen;
+	INIT_LIST_HEAD(&li->falh);
+	return li;
+}
+
+static inline void free_leaf(struct leaf *l)
+{
+	kfree(l);
+}
+
+static inline void free_leaf_info(struct leaf_info *li)
+{
+	kfree(li);
+}
+
+static struct tnode* tnode_new(t_key key, int pos, int bits)
+{
+	int nchildren = 1<<bits;
+	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
+	struct tnode *tn = kmalloc(sz,  GFP_KERNEL);
+
+	if(tn)  {
+		memset(tn, 0, sz);
+		NODE_INIT_PARENT(tn, T_TNODE);
+		tn->pos = pos;
+		tn->bits = bits;
+		tn->key = key;
+		tn->full_children = 0;
+		tn->empty_children = 1<<bits;
+	}
+	if(trie_debug > 0) 
+		printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
+		       (unsigned int) (sizeof(struct node) * 1<<bits));
+	return tn;
+}
+
+static void tnode_free(struct tnode *tn)
+{
+	if(!tn) {
+		trie_bug("tnode_free\n");
+	}
+	if(IS_LEAF(tn)) {
+		free_leaf((struct leaf *)tn);
+		if(trie_debug > 0 ) 
+			printk("FL %p \n", tn);
+	}
+	else if(IS_TNODE(tn)) { 
+		kfree(tn);
+		if(trie_debug > 0 ) 
+			printk("FT %p \n", tn);
+	}
+	else {
+		trie_bug("tnode_free\n");
+	}
+}
+
+/*
+ * Check whether a tnode 'n' is "full", i.e. it is an internal node
+ * and no bits are skipped. See discussion in dyntree paper p. 6
+ */
+
+static inline int tnode_full(struct tnode *tn, struct node *n)
+{
+	if(n == NULL || IS_LEAF(n))
+		return 0;
+
+	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
+}
+
+static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n) 
+{
+	tnode_put_child_reorg(tn, i, n, -1);
+}
+
+ /* 
+  * Add a child at position i overwriting the old value.
+  * Update the value of full_children and empty_children.
+  */
+
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) 
+{
+	struct node *chi;
+	int isfull;
+
+	if(i >=  1<<tn->bits) {
+		printk("bits=%d, i=%d\n", tn->bits, i);
+		trie_bug("tnode_put_child_reorg bits");
+	}
+	write_lock_bh(&fib_lock);
+	chi = tn->child[i];	
+
+	/* update emptyChildren */
+	if (n == NULL && chi != NULL)
+		tn->empty_children++;
+	else if (n != NULL && chi == NULL)
+		tn->empty_children--;
+  
+	/* update fullChildren */
+        if (wasfull == -1)
+		wasfull = tnode_full(tn, chi);
+
+	isfull = tnode_full(tn, n);
+	if (wasfull && !isfull) 
+		tn->full_children--;
+	
+	else if (!wasfull && isfull) 
+		tn->full_children++;
+	if(n) 
+		NODE_SET_PARENT(n, tn);	
+
+	tn->child[i] = n;
+	write_unlock_bh(&fib_lock);
+}
+
+static struct node *resize(struct trie *t, struct tnode *tn) 
+{
+	int i;
+
+ 	if (!tn)
+		return NULL;
+
+	if(trie_debug) 
+		printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", 
+		      tn, inflate_threshold, halve_threshold);
+
+	/* No children */
+	if (tn->empty_children == tnode_child_length(tn)) {
+		tnode_free(tn);
+		return NULL;
+	}
+	/* One child */
+	if (tn->empty_children == tnode_child_length(tn) - 1)
+		for (i = 0; i < tnode_child_length(tn); i++) {
+
+			write_lock_bh(&fib_lock);
+			if (tn->child[i] != NULL) {
+
+				/* compress one level */
+				struct node *n = tn->child[i];
+				if(n)
+					NODE_INIT_PARENT(n, NODE_TYPE(n));
+
+				write_unlock_bh(&fib_lock);
+				tnode_free(tn);
+				return n;
+			}
+			write_unlock_bh(&fib_lock);
+		}
+	/* 
+	 * Double as long as the resulting node has a number of
+	 * nonempty nodes that are above the threshold.
+	 */
+
+	/*
+	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of 
+	 * the Helsinki University of Technology and Matti Tikkanen of Nokia 
+	 * Telecommunications, page 6:
+	 * "A node is doubled if the ratio of non-empty children to all 
+	 * children in the *doubled* node is at least 'high'."
+	 *
+	 * 'high' in this instance is the variable 'inflate_threshold'. It 
+	 * is expressed as a percentage, so we multiply it with 
+	 * tnode_child_length() and instead of multiplying by 2 (since the 
+	 * child array will be doubled by inflate()) and multiplying 
+	 * the left-hand side by 100 (to handle the percentage thing) we 
+	 * multiply the left-hand side by 50.
+	 * 
+	 * The left-hand side may look a bit weird: tnode_child_length(tn) 
+	 * - tn->empty_children is of course the number of non-null children 
+	 * in the current node. tn->full_children is the number of "full" 
+	 * children, that is non-null tnodes with a skip value of 0.
+	 * All of those will be doubled in the resulting inflated tnode, so 
+	 * we just count them one extra time here.
+	 * 
+	 * A clearer way to write this would be:
+	 * 
+	 * to_be_doubled = tn->full_children;
+	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - 
+	 *     tn->full_children;
+	 *
+	 * new_child_length = tnode_child_length(tn) * 2;
+	 *
+	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / 
+	 *      new_child_length;
+	 * if (new_fill_factor >= inflate_threshold)
+	 * 
+	 * ...and so on, tho it would mess up the while() loop.
+	 * 
+	 * anyway,
+	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+	 *      inflate_threshold
+	 * 
+	 * avoid a division:
+	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+	 *      inflate_threshold * new_child_length
+	 * 
+	 * expand not_to_be_doubled and to_be_doubled, and shorten:
+	 * 100 * (tnode_child_length(tn) - tn->empty_children + 
+	 *    tn->full_children ) >= inflate_threshold * new_child_length
+	 * 
+	 * expand new_child_length:
+	 * 100 * (tnode_child_length(tn) - tn->empty_children + 
+	 *    tn->full_children ) >=
+	 *      inflate_threshold * tnode_child_length(tn) * 2
+	 * 
+	 * shorten again:
+	 * 50 * (tn->full_children + tnode_child_length(tn) - 
+	 *    tn->empty_children ) >= inflate_threshold * 
+	 *    tnode_child_length(tn)
+	 * 
+	 */
+
+	check_tnode(tn);
+
+	while ((tn->full_children > 0 &&
+	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
+				inflate_threshold * tnode_child_length(tn))) {
+
+		tn = inflate(t, tn);
+	}
+
+	check_tnode(tn);
+
+	/*
+	 * Halve as long as the number of empty children in this
+	 * node is above threshold.
+	 */
+	while (tn->bits > 1 &&
+	       100 * (tnode_child_length(tn) - tn->empty_children) <
+	       halve_threshold * tnode_child_length(tn))
+
+		tn = halve(t, tn);
+  
+	/* Only one child remains */
+
+	if (tn->empty_children == tnode_child_length(tn) - 1)
+		for (i = 0; i < tnode_child_length(tn); i++) {
+			
+			write_lock_bh(&fib_lock);
+			if (tn->child[i] != NULL) {
+				/* compress one level */
+				struct node *n = tn->child[i];
+
+				if(n)
+					NODE_INIT_PARENT(n, NODE_TYPE(n));
+
+				write_unlock_bh(&fib_lock);
+				tnode_free(tn);
+				return n;
+			}
+			write_unlock_bh(&fib_lock);
+		}
+
+	return (struct node *) tn;
+}
+
+static struct tnode *inflate(struct trie *t, struct tnode *tn)
+{
+	struct tnode *inode;
+	struct tnode *oldtnode = tn;
+	int olen = tnode_child_length(tn);
+	int i;
+
+  	if(trie_debug) 
+		printk("In inflate\n");
+
+	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
+
+	if (!tn)
+		trie_bug("tnode_new failed");
+
+	for(i = 0; i < olen; i++) {
+		struct node *node = tnode_get_child(oldtnode, i);
+      
+		/* An empty child */
+		if (node == NULL)
+			continue;
+
+		/* A leaf or an internal node with skipped bits */
+
+		if(IS_LEAF(node) || ((struct tnode *) node)->pos >
+		   tn->pos + tn->bits - 1) {
+			if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
+					     1) == 0)
+				put_child(t, tn, 2*i, node);
+			else
+				put_child(t, tn, 2*i+1, node);
+			continue;
+		}
+
+		/* An internal node with two children */
+		inode = (struct tnode *) node;
+
+		if (inode->bits == 1) {
+			put_child(t, tn, 2*i, inode->child[0]);
+			put_child(t, tn, 2*i+1, inode->child[1]);
+
+			tnode_free(inode);
+		}
+
+			/* An internal node with more than two children */
+		else {
+			struct tnode *left, *right;
+			int size, j;
+
+			/* We will replace this node 'inode' with two new 
+			 * ones, 'left' and 'right', each with half of the 
+			 * original children. The two new nodes will have 
+			 * a position one bit further down the key and this 
+			 * means that the "significant" part of their keys 
+			 * (see the discussion near the top of this file) 
+			 * will differ by one bit, which will be "0" in 
+			 * left's key and "1" in right's key. Since we are 
+			 * moving the key position by one step, the bit that 
+			 * we are moving away from - the bit at position 
+			 * (inode->pos) - is the one that will differ between 
+			 * left and right. So... we synthesize that bit in the
+			 * two  new keys.
+			 * The mask 'm' below will be a single "one" bit at 
+			 * the position (inode->pos)
+			 */
+
+			t_key m = TKEY_GET_MASK(inode->pos, 1);
+ 
+			/* Use the old key, but set the new significant 
+			 *   bit to zero. 
+			 */
+			left = tnode_new(inode->key&(~m), inode->pos + 1,
+					 inode->bits - 1);
+
+			if(!left) 
+				trie_bug("tnode_new failed");
+			
+			
+			/* Use the old key, but set the new significant 
+			 * bit to one. 
+			 */
+			right = tnode_new(inode->key|m, inode->pos + 1,
+					  inode->bits - 1);
+
+			if(!right) 
+				trie_bug("tnode_new failed");
+			
+			size = tnode_child_length(left);
+			for(j = 0; j < size; j++) {
+				put_child(t, left, j, inode->child[j]);
+				put_child(t, right, j, inode->child[j + size]);
+			}
+			put_child(t, tn, 2*i, resize(t, left));
+			put_child(t, tn, 2*i+1, resize(t, right));
+
+			tnode_free(inode);
+		}
+	}
+	tnode_free(oldtnode);
+	return tn;
+}
+
+static struct tnode *halve(struct trie *t, struct tnode *tn)
+{
+	struct tnode *oldtnode = tn;
+	struct node *left, *right;
+	int i;
+	int olen = tnode_child_length(tn);
+
+	if(trie_debug) printk("In halve\n");
+  
+	tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
+
+	if(!tn) 
+		trie_bug("tnode_new failed");
+
+	for(i = 0; i < olen; i += 2) {
+		left = tnode_get_child(oldtnode, i);
+		right = tnode_get_child(oldtnode, i+1);
+    
+		/* At least one of the children is empty */
+		if (left == NULL) {
+			if (right == NULL)    /* Both are empty */
+				continue;
+			put_child(t, tn, i/2, right);
+		} else if (right == NULL)
+			put_child(t, tn, i/2, left);
+     
+		/* Two nonempty children */
+		else {
+			struct tnode *newBinNode =
+				tnode_new(left->key, tn->pos + tn->bits, 1);
+
+			if(!newBinNode) 
+				trie_bug("tnode_new failed");
+
+			put_child(t, newBinNode, 0, left);
+			put_child(t, newBinNode, 1, right);
+			put_child(t, tn, i/2, resize(t, newBinNode));
+		}
+	}
+	tnode_free(oldtnode);
+	return tn;
+}
+
+static void *trie_init(struct trie *t)
+{
+	if(t) {
+		t->size = 0;
+		t->trie = NULL;
+		t->revision = 0;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+       		memset(&t->stats, 0, sizeof(struct trie_use_stats));
+#endif
+	}
+	return t;
+}
+
+static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
+{
+	struct hlist_node *node;
+	struct leaf_info *li;
+
+	hlist_for_each_entry(li, node, head, hlist) {
+		  
+		if ( li->plen == plen )
+			return li;
+	}
+	return NULL;
+}
+
+static inline struct list_head * get_fa_head(struct leaf *l, int plen)
+{
+	struct list_head *fa_head=NULL;
+	struct leaf_info *li = find_leaf_info(&l->list, plen);
+	
+	if(li) 
+		fa_head = &li->falh;
+	
+	return fa_head;
+}
+
+static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
+{
+	struct leaf_info *li=NULL, *last=NULL;
+	struct hlist_node *node, *tmp;
+
+	write_lock_bh(&fib_lock);
+	
+	if(hlist_empty(head))
+		hlist_add_head(&new->hlist, head);
+	else {
+		hlist_for_each_entry_safe(li, node, tmp, head, hlist) {
+			
+			if (new->plen > li->plen) 
+				break;
+			
+			last = li;
+		}
+		if(last) 
+			hlist_add_after(&last->hlist, &new->hlist);
+		else 
+			hlist_add_before(&new->hlist, &li->hlist);
+	}
+	write_unlock_bh(&fib_lock);
+}
+
+static struct leaf *
+fib_find_node(struct trie *t, u32 key)
+{
+	int pos;
+	struct tnode *tn;
+	struct node *n;
+
+	pos = 0;
+	n=t->trie;
+
+	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
+		tn = (struct tnode *) n;
+			
+		check_tnode(tn);
+			
+		if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
+			pos=tn->pos + tn->bits;
+			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+		}
+		else
+			break;
+	}
+	/* Case we have found a leaf. Compare prefixes */
+
+	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
+		struct leaf *l = (struct leaf *) n;
+		return l;
+	}
+	return NULL;
+}
+
+static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
+{
+	int i = 0;
+	int wasfull;
+	t_key cindex, key;
+	struct tnode *tp = NULL;
+
+	if(!tn) 
+		BUG();
+	
+	key = tn->key;
+	i = 0;
+
+	while (tn != NULL && NODE_PARENT(tn) != NULL) {
+
+		if( i > 10 ) {
+			printk("Rebalance tn=%p \n", tn);
+			if(tn) 		printk("tn->parent=%p \n", NODE_PARENT(tn));
+			
+			printk("Rebalance tp=%p \n", tp);
+			if(tp) 		printk("tp->parent=%p \n", NODE_PARENT(tp));
+		}
+
+		if( i > 12 ) BUG();
+		i++;
+
+		tp = NODE_PARENT(tn);
+		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
+		tn = (struct tnode *) resize (t, (struct tnode *)tn);
+		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
+		
+		if(!NODE_PARENT(tn))
+			break;
+
+		tn = NODE_PARENT(tn);
+	}
+	/* Handle last (top) tnode */
+	if (IS_TNODE(tn)) 
+		tn = (struct tnode*) resize(t, (struct tnode *)tn);
+
+	return (struct node*) tn;
+}
+
+static struct list_head *
+fib_insert_node(struct trie *t, u32 key, int plen)
+{
+	int pos, newpos;
+	struct tnode *tp = NULL, *tn = NULL;
+	struct node *n;
+	struct leaf *l;
+	int missbit;
+	struct list_head *fa_head=NULL;
+	struct leaf_info *li;
+	t_key cindex;
+
+	pos = 0;
+	n=t->trie;
+
+	/* If we point to NULL, stop. Either the tree is empty and we should 
+	 * just put a new leaf in if, or we have reached an empty child slot, 
+	 * and we should just put our new leaf in that.
+	 * If we point to a T_TNODE, check if it matches our key. Note that 
+	 * a T_TNODE might be skipping any number of bits - its 'pos' need 
+	 * not be the parent's 'pos'+'bits'!
+	 *
+	 * If it does match the current key, get pos/bits from it, extract 
+	 * the index from our key, push the T_TNODE and walk the tree.
+	 *
+	 * If it doesn't, we have to replace it with a new T_TNODE.
+	 *
+	 * If we point to a T_LEAF, it might or might not have the same key 
+	 * as we do. If it does, just change the value, update the T_LEAF's 
+	 * value, and return it. 
+	 * If it doesn't, we need to replace it with a T_TNODE.
+	 */
+
+	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
+		tn = (struct tnode *) n;
+			
+		check_tnode(tn);
+		
+		if(tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
+			tp = tn;
+			pos=tn->pos + tn->bits;
+			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
+
+			if(n && NODE_PARENT(n) != tn) {
+				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
+				BUG();
+			}
+		}
+		else
+			break;
+	}
+
+	/*
+	 * n  ----> NULL, LEAF or TNODE
+	 *
+	 * tp is n's (parent) ----> NULL or TNODE  
+	 */
+
+	if(tp && IS_LEAF(tp))
+		BUG();
+
+	t->revision++;
+
+	/* Case 1: n is a leaf. Compare prefixes */
+
+	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 
+		struct leaf *l = ( struct leaf *)  n;
+		
+		li = leaf_info_new(plen);
+		
+		if(! li) 
+			BUG();
+
+		fa_head = &li->falh;
+		insert_leaf_info(&l->list, li);
+		goto done;
+	}
+	t->size++;
+	l = leaf_new();
+
+	if(! l) 
+		BUG();
+
+	l->key = key;
+	li = leaf_info_new(plen);
+
+	if(! li) 
+		BUG();
+
+	fa_head = &li->falh;
+	insert_leaf_info(&l->list, li);
+
+	/* Case 2: n is NULL, and will just insert a new leaf */
+	if (t->trie && n == NULL) {
+
+		NODE_SET_PARENT(l, tp);
+		
+		if (!tp) 
+			BUG();
+
+		else {
+			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+			put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+		}
+	}
+	/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
+	else {
+		/* 
+		 *  Add a new tnode here 
+		 *  first tnode need some special handling
+		 */
+
+		if (tp)
+			pos=tp->pos+tp->bits;
+		else
+			pos=0;
+		if(n) {
+			newpos = tkey_mismatch(key, pos, n->key);
+			tn = tnode_new(n->key, newpos, 1);
+		}
+		else {
+			newpos = 0;
+			tn = tnode_new(key, newpos, 1); /* First tnode */ 
+		}
+		if(!tn) 
+			trie_bug("tnode_pfx_new failed");
+
+		NODE_SET_PARENT(tn, tp);
+
+		missbit=tkey_extract_bits(key, newpos, 1);
+		put_child(t, tn, missbit, (struct node *)l);
+		put_child(t, tn, 1-missbit, n);
+
+		if(tp) {
+			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
+		}
+		else { 
+			t->trie = (struct node*) tn; /* First tnode */
+			tp = tn;
+		}
+	}
+	if(tp && tp->pos+tp->bits > 32) {
+		printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 
+		       tp, tp->pos, tp->bits, key, plen);
+	}
+	/* Rebalance the trie */
+	t->trie = trie_rebalance(t, tp);
+done:;
+	return fa_head;
+}
+
+static int
+fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	struct fib_alias *fa, *new_fa;
+	struct list_head *fa_head=NULL;
+	struct fib_info *fi;
+	int plen = r->rtm_dst_len;
+	int type = r->rtm_type;
+	u8 tos = r->rtm_tos;
+	u32 key, mask;
+	int err;
+	struct leaf *l;
+
+	if (plen > 32)
+		return -EINVAL;
+
+	key = 0;
+	if (rta->rta_dst) 
+		memcpy(&key, rta->rta_dst, 4);
+
+	key = ntohl(key);
+
+	if(trie_debug)
+		printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
+
+	mask =  ntohl( inet_make_mask(plen) );
+
+	if(key & ~mask)
+		return -EINVAL;
+
+	key = key & mask;
+
+	if  ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL)
+		goto err;
+
+	l = fib_find_node(t, key);
+	fa = NULL;	
+
+	if(l) {
+		fa_head = get_fa_head(l, plen);
+		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
+	}
+
+	/* Now fa, if non-NULL, points to the first fib alias
+	 * with the same keys [prefix,tos,priority], if such key already
+	 * exists or to the node before which we will insert new one.
+	 *
+	 * If fa is NULL, we will need to allocate a new one and
+	 * insert to the head of f.
+	 *
+	 * If f is NULL, no fib node matched the destination key
+	 * and we need to allocate a new one of those as well.
+	 */
+
+	if (fa &&
+	    fa->fa_info->fib_priority == fi->fib_priority) {
+		struct fib_alias *fa_orig;
+
+		err = -EEXIST;
+		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
+			goto out;
+
+		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
+			struct fib_info *fi_drop;
+			u8 state;
+
+			write_lock_bh(&fib_lock);
+
+			fi_drop = fa->fa_info;
+			fa->fa_info = fi;
+			fa->fa_type = type;
+			fa->fa_scope = r->rtm_scope;
+			state = fa->fa_state;
+			fa->fa_state &= ~FA_S_ACCESSED;
+
+			write_unlock_bh(&fib_lock);
+
+			fib_release_info(fi_drop);
+			if (state & FA_S_ACCESSED)
+			  rt_cache_flush(-1);
+
+			    goto succeeded;
+		}
+		/* Error if we find a perfect match which
+		 * uses the same scope, type, and nexthop
+		 * information.
+		 */
+		fa_orig = fa;
+		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
+			if (fa->fa_tos != tos)
+				break;
+			if (fa->fa_info->fib_priority != fi->fib_priority)
+				break;
+			if (fa->fa_type == type &&
+			    fa->fa_scope == r->rtm_scope &&
+			    fa->fa_info == fi) {
+				goto out;
+			}
+		}
+		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
+			fa = fa_orig;
+	}
+	err = -ENOENT;
+	if (!(nlhdr->nlmsg_flags&NLM_F_CREATE))
+		goto out;
+
+	err = -ENOBUFS;
+	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+	if (new_fa == NULL)
+		goto out;
+
+	new_fa->fa_info = fi;
+	new_fa->fa_tos = tos;
+	new_fa->fa_type = type;
+	new_fa->fa_scope = r->rtm_scope;
+	new_fa->fa_state = 0;
+#if 0
+	new_fa->dst  = NULL;
+#endif
+	/*
+	 * Insert new entry to the list.
+	 */
+
+	if(!fa_head)
+		fa_head = fib_insert_node(t, key, plen);
+
+	write_lock_bh(&fib_lock);
+
+	list_add_tail(&new_fa->fa_list,
+		 (fa ? &fa->fa_list : fa_head));
+
+	write_unlock_bh(&fib_lock);
+
+	rt_cache_flush(-1);
+	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
+succeeded:
+	return 0;
+out:
+	fib_release_info(fi);
+err:;	
+	return err;
+}
+
+static inline int check_leaf(struct trie *t, struct leaf *l,  t_key key, int *plen, const struct flowi *flp, 
+			     struct fib_result *res, int *err)
+{
+	int i;
+	t_key mask;
+	struct leaf_info *li;
+	struct hlist_head *hhead = &l->list;
+	struct hlist_node *node;
+	
+	hlist_for_each_entry(li, node, hhead, hlist) {
+
+		i = li->plen;
+		mask = ntohl(inet_make_mask(i));
+		if (l->key != (key & mask)) 
+			continue;
+
+		if (((*err) = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) == 0) {
+			*plen = i;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.semantic_match_passed++;
+#endif
+			return 1;
+		}
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+		t->stats.semantic_match_miss++;
+#endif
+	}
+	return 0;
+}
+
+static int
+fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	int plen, ret = 0;
+	struct node *n;
+	struct tnode *pn;
+	int pos, bits;
+	t_key key=ntohl(flp->fl4_dst);
+	int chopped_off;
+	t_key cindex = 0;
+	int current_prefix_length = KEYLENGTH;
+	n = t->trie;
+
+	read_lock(&fib_lock);
+	if(!n)
+		goto failed;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	t->stats.gets++;
+#endif
+
+	/* Just a leaf? */
+	if (IS_LEAF(n)) {
+		if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret) )
+			goto found;
+		goto failed;
+	}
+	pn = (struct tnode *) n;
+	chopped_off = 0;
+	
+        while (pn) {
+
+		pos = pn->pos;
+		bits = pn->bits;
+
+		if(!chopped_off) 
+			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
+
+		n = tnode_get_child(pn, cindex);
+
+		if (n == NULL) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.null_node_hit++;
+#endif
+			goto backtrace;
+		}
+
+		if (IS_TNODE(n)) {
+#define HL_OPTIMIZE
+#ifdef HL_OPTIMIZE
+			struct tnode *cn = (struct tnode *)n;
+			t_key node_prefix, key_prefix, pref_mismatch;
+			int mp;
+
+			/*
+			 * It's a tnode, and we can do some extra checks here if we 
+			 * like, to avoid descending into a dead-end branch.
+			 * This tnode is in the parent's child array at index 
+			 * key[p_pos..p_pos+p_bits] but potentially with some bits 
+			 * chopped off, so in reality the index may be just a 
+			 * subprefix, padded with zero at the end.
+			 * We can also take a look at any skipped bits in this 
+			 * tnode - everything up to p_pos is supposed to be ok, 
+			 * and the non-chopped bits of the index (se previous
+			 * paragraph) are also guaranteed ok, but the rest is 
+			 * considered unknown.
+			 *
+			 * The skipped bits are key[pos+bits..cn->pos].
+			 */
+			
+			/* If current_prefix_length < pos+bits, we are already doing 
+			 * actual prefix  matching, which means everything from 
+			 * pos+(bits-chopped_off) onward must be zero along some 
+			 * branch of this subtree - otherwise there is *no* valid 
+			 * prefix present. Here we can only check the skipped
+			 * bits. Remember, since we have already indexed into the 
+			 * parent's child array, we know that the bits we chopped of 
+			 * *are* zero.
+			 */
+
+			/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
+			
+			if (current_prefix_length < pos+bits) {
+				if (tkey_extract_bits(cn->key, current_prefix_length,
+						      cn->pos - current_prefix_length) != 0 ||
+				    !(cn->child[0]))
+					goto backtrace;
+			}
+
+			/*
+			 * If chopped_off=0, the index is fully validated and we 
+			 * only need to look at the skipped bits for this, the new, 
+			 * tnode. What we actually want to do is to find out if
+			 * these skipped bits match our key perfectly, or if we will
+			 * have to count on finding a matching prefix further down, 
+			 * because if we do, we would like to have some way of 
+			 * verifying the existence of such a prefix at this point. 
+			 */
+
+			/* The only thing we can do at this point is to verify that
+			 * any such matching prefix can indeed be a prefix to our
+			 * key, and if the bits in the node we are inspecting that
+			 * do not match our key are not ZERO, this cannot be true.
+			 * Thus, find out where there is a mismatch (before cn->pos)
+			 * and verify that all the mismatching bits are zero in the
+			 * new tnode's key.
+			 */
+
+			/* Note: We aren't very concerned about the piece of the key 
+			 * that precede pn->pos+pn->bits, since these have already been 
+			 * checked. The bits after cn->pos aren't checked since these are 
+			 * by definition "unknown" at this point. Thus, what we want to 
+			 * see is if we are about to enter the "prefix matching" state, 
+			 * and in that case verify that the skipped bits that will prevail 
+			 * throughout this subtree are zero, as they have to be if we are 
+			 * to find a matching prefix.
+			 */
+
+			node_prefix = MASK_PFX(cn->key, cn->pos);
+			key_prefix =  MASK_PFX(key, cn->pos);
+			pref_mismatch = key_prefix^node_prefix;
+			mp = 0;
+
+			/* In short: If skipped bits in this node do not match the search 
+			 * key, enter the "prefix matching" state.directly.
+			 */
+			if (pref_mismatch) {
+				while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
+					mp++;
+					pref_mismatch = pref_mismatch <<1;
+				}
+				key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
+				
+				if (key_prefix != 0)
+					goto backtrace;
+
+				if (current_prefix_length >= cn->pos)
+					current_prefix_length=mp;
+		       }
+#endif
+		       pn = (struct tnode *)n; /* Descend */
+		       chopped_off = 0;
+		       continue;
+		} 
+		if (IS_LEAF(n)) {	
+			if( check_leaf(t, (struct leaf *)n, key, &plen, flp, res, &ret))
+				goto found;
+	       }
+backtrace:
+		chopped_off++;
+
+		/* As zero don't change the child key (cindex) */
+		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) {
+			chopped_off++;
+		}
+
+		/* Decrease current_... with bits chopped off */
+		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
+			current_prefix_length = pn->pos + pn->bits - chopped_off;
+		
+		/*
+		 * Either we do the actual chop off according or if we have 
+		 * chopped off all bits in this tnode walk up to our parent.
+		 */
+
+		if(chopped_off <= pn->bits)
+			cindex &= ~(1 << (chopped_off-1));
+		else {
+			if( NODE_PARENT(pn) == NULL)
+				goto failed;
+			
+			/* Get Child's index */
+			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
+			pn = NODE_PARENT(pn);
+			chopped_off = 0;
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.backtrack++;
+#endif
+			goto backtrace;
+		} 
+	}
+failed:
+	ret =  1;
+found:
+	read_unlock(&fib_lock);
+	return ret;
+}
+
+static int trie_leaf_remove(struct trie *t, t_key key)
+{
+	t_key cindex;
+	struct tnode *tp = NULL;
+	struct node *n = t->trie;
+	struct leaf *l;
+
+	if(trie_debug) 
+		printk("entering trie_leaf_remove(%p)\n", n);
+
+	/* Note that in the case skipped bits, those bits are *not* checked!
+	 * When we finish this, we will have NULL or a T_LEAF, and the 
+	 * T_LEAF may or may not match our key.
+	 */
+
+        while (n != NULL && IS_TNODE(n)) {
+		struct tnode *tn = (struct tnode *) n;
+		check_tnode(tn);
+		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
+
+			if(n && NODE_PARENT(n) != tn) {
+				printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
+				BUG();
+			}
+        }
+	l = (struct leaf *) n;
+
+	if(!n || !tkey_equals(l->key, key)) 
+		return 0;
+    
+	/* 
+	 * Key found. 
+	 * Remove the leaf and rebalance the tree 
+	 */
+
+	t->revision++;
+	t->size--;
+
+	tp = NODE_PARENT(n);
+	tnode_free((struct tnode *) n);
+
+	if(tp) {
+		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+		put_child(t, (struct tnode *)tp, cindex, NULL);
+		t->trie = trie_rebalance(t, tp);
+	}
+	else
+		t->trie = NULL;
+
+	return 1;
+}
+
+static int
+fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	u32 key, mask;
+	int plen = r->rtm_dst_len;
+	u8 tos = r->rtm_tos;
+	struct fib_alias *fa, *fa_to_delete;
+	struct list_head *fa_head;
+	struct leaf *l;
+
+	if (plen > 32) 
+		return -EINVAL;
+
+	key = 0;
+	if (rta->rta_dst) 
+		memcpy(&key, rta->rta_dst, 4);
+
+	key = ntohl(key);
+	mask =  ntohl( inet_make_mask(plen) );
+
+	if(key & ~mask)
+		return -EINVAL;
+
+	key = key & mask;
+	l = fib_find_node(t, key);
+
+	if(!l)
+		return -ESRCH;
+
+	fa_head = get_fa_head(l, plen);
+	fa = fib_find_alias(fa_head, tos, 0);
+
+	if (!fa)
+		return -ESRCH;
+
+	if (trie_debug)
+		printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+
+	fa_to_delete = NULL;
+	fa_head = fa->fa_list.prev;
+	list_for_each_entry(fa, fa_head, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+
+		if (fa->fa_tos != tos)
+			break;
+
+		if ((!r->rtm_type ||
+		     fa->fa_type == r->rtm_type) &&
+		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
+		     fa->fa_scope == r->rtm_scope) &&
+		    (!r->rtm_protocol ||
+		     fi->fib_protocol == r->rtm_protocol) &&
+		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
+			fa_to_delete = fa;
+			break;
+		}
+	}
+
+	if (fa_to_delete) {
+		int kill_li = 0;
+		struct leaf_info *li;
+
+		fa = fa_to_delete;
+		rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
+
+		l = fib_find_node(t, key);
+		li = find_leaf_info(&l->list, plen);
+
+		write_lock_bh(&fib_lock);
+
+		list_del(&fa->fa_list);
+
+		if(list_empty(fa_head)) {
+			hlist_del(&li->hlist);
+			kill_li = 1;
+		}
+		write_unlock_bh(&fib_lock);
+		
+		if(kill_li)
+			free_leaf_info(li);
+
+		if(hlist_empty(&l->list))
+			trie_leaf_remove(t, key);
+
+		if (fa->fa_state & FA_S_ACCESSED)
+			rt_cache_flush(-1);
+
+		fn_free_alias(fa);
+		return 0;
+	}
+	return -ESRCH;
+}
+
+static int trie_flush_list(struct trie *t, struct list_head *head)
+{
+	struct fib_alias *fa, *fa_node;
+	int found = 0;
+
+	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+		
+		if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
+
+ 			write_lock_bh(&fib_lock);	
+			list_del(&fa->fa_list);
+			write_unlock_bh(&fib_lock); 
+
+			fn_free_alias(fa);
+			found++;
+		}
+	}
+	return found;
+}
+
+static int trie_flush_leaf(struct trie *t, struct leaf *l)
+{
+	int found = 0;
+	struct hlist_head *lih = &l->list;
+	struct hlist_node *node, *tmp;
+	struct leaf_info *li = NULL;
+
+	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
+			
+		found += trie_flush_list(t, &li->falh);
+
+		if (list_empty(&li->falh)) {
+
+ 			write_lock_bh(&fib_lock); 
+			hlist_del(&li->hlist);
+			write_unlock_bh(&fib_lock); 
+
+			free_leaf_info(li);
+		}
+	}
+	return found;
+}
+
+static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
+{
+	struct node *c = (struct node *) thisleaf;
+	struct tnode *p;
+	int idx;
+
+	if(c == NULL) {
+		if(t->trie == NULL)
+			return NULL;
+
+		if (IS_LEAF(t->trie))          /* trie w. just a leaf */
+			return (struct leaf *) t->trie;
+
+		p = (struct tnode*) t->trie;  /* Start */
+	}
+	else 
+		p = (struct tnode *) NODE_PARENT(c);
+	while (p) {
+		int pos, last;
+
+		/*  Find the next child of the parent */
+		if(c)
+			pos  = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
+		else 
+			pos = 0;
+
+		last = 1 << p->bits;
+		for(idx = pos; idx < last ; idx++) {
+			if( p->child[idx]) {
+
+				/* Decend if tnode */
+
+				while (IS_TNODE(p->child[idx])) {
+					p = (struct tnode*) p->child[idx];
+					idx = 0;
+					
+					/* Rightmost non-NULL branch */
+					if( p && IS_TNODE(p) )
+						while ( p->child[idx] == NULL && idx < (1 << p->bits) ) idx++;
+
+					/* Done with this tnode? */
+					if( idx >= (1 << p->bits) || p->child[idx] == NULL ) 
+						goto up;
+				}
+				return (struct leaf*) p->child[idx];
+			}
+		}
+up:
+		/* No more children go up one step  */
+		c = (struct node*) p;
+		p = (struct tnode *) NODE_PARENT(p);
+	}
+	return NULL; /* Ready. Root of trie */
+}
+
+static int fn_trie_flush(struct fib_table *tb)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	struct leaf *ll = NULL, *l = NULL;
+	int found = 0, h;
+
+	t->revision++;
+
+	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
+		found += trie_flush_leaf(t, l);
+
+		if (ll && hlist_empty(&ll->list))
+			trie_leaf_remove(t, ll->key);
+		ll = l;
+	}
+
+	if (ll && hlist_empty(&ll->list))
+		trie_leaf_remove(t, ll->key);
+
+	if(trie_debug) 
+		printk("trie_flush found=%d\n", found);
+	return found;
+}
+
+static int trie_last_dflt=-1;
+
+static void
+fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+{
+	struct trie *t = (struct trie *) tb->tb_data;
+	int order, last_idx;
+	struct fib_info *fi = NULL;
+	struct fib_info *last_resort;
+	struct fib_alias *fa = NULL;
+	struct list_head *fa_head;
+	struct leaf *l;
+
+	last_idx = -1;
+	last_resort = NULL;
+	order = -1;
+
+	read_lock(&fib_lock);
+	
+	l = fib_find_node(t, 0);
+	if(!l) 
+		goto out;
+
+	fa_head = get_fa_head(l, 0);
+	if(!fa_head)
+		goto out;
+
+	if (list_empty(fa_head)) 
+		goto out;
+
+	list_for_each_entry(fa, fa_head, fa_list) {
+		struct fib_info *next_fi = fa->fa_info;
+		
+		if (fa->fa_scope != res->scope ||
+		    fa->fa_type != RTN_UNICAST)
+			continue;
+		
+		if (next_fi->fib_priority > res->fi->fib_priority)
+			break;
+		if (!next_fi->fib_nh[0].nh_gw ||
+		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+			continue;
+		fa->fa_state |= FA_S_ACCESSED;
+		
+		if (fi == NULL) {
+			if (next_fi != res->fi)
+				break;
+		} else if (!fib_detect_death(fi, order, &last_resort,
+					     &last_idx, &trie_last_dflt)) {
+			if (res->fi)
+				fib_info_put(res->fi);
+			res->fi = fi;
+			atomic_inc(&fi->fib_clntref);
+			trie_last_dflt = order;
+			goto out;
+		}
+		fi = next_fi;
+		order++;
+	}
+	if (order <= 0 || fi == NULL) {
+		trie_last_dflt = -1;
+		goto out;
+	}
+
+	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
+		if (res->fi)
+			fib_info_put(res->fi);
+		res->fi = fi;
+		atomic_inc(&fi->fib_clntref);
+		trie_last_dflt = order;
+		goto out;
+	}
+	if (last_idx >= 0) {
+		if (res->fi)
+			fib_info_put(res->fi);
+		res->fi = last_resort;
+		if (last_resort)
+			atomic_inc(&last_resort->fib_clntref);
+	}
+	trie_last_dflt = last_idx;
+ out:;
+	read_unlock(&fib_lock);	
+}
+
+static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, 
+			   struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int i, s_i;
+	struct fib_alias *fa;
+
+	u32 xkey=htonl(key);
+
+	s_i=cb->args[3];
+	i = 0;
+
+	list_for_each_entry(fa, fah, fa_list) {
+		if (i < s_i) {
+			i++;
+			continue;
+		}
+		if (fa->fa_info->fib_nh == NULL) {
+			printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
+			i++;
+			continue;
+		}
+		if (fa->fa_info == NULL) {
+			printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
+			i++;
+			continue;
+		}
+
+		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+				  cb->nlh->nlmsg_seq,
+				  RTM_NEWROUTE,
+				  tb->tb_id,
+				  fa->fa_type,
+				  fa->fa_scope,
+				  &xkey,
+				  plen,
+				  fa->fa_tos,
+				  fa->fa_info) < 0) {
+			cb->args[3] = i;
+			return -1;
+			}
+		i++;
+	}
+	cb->args[3]=i;
+	return skb->len;
+}
+
+static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, 
+			     struct netlink_callback *cb)
+{
+	int h, s_h;
+	struct list_head *fa_head;
+	struct leaf *l = NULL;
+	s_h=cb->args[2];
+
+	for (h=0; (l = nextleaf(t, l)) != NULL; h++) {
+
+		if (h < s_h)
+			continue;
+		if (h > s_h)
+			memset(&cb->args[3], 0,
+			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
+
+		fa_head = get_fa_head(l, plen);
+		
+		if(!fa_head)
+			continue;
+
+		if(list_empty(fa_head))
+			continue;
+
+		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
+			cb->args[2]=h;
+			return -1;
+		}
+	}
+	cb->args[2]=h;
+	return skb->len;
+}
+
+static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int m, s_m;
+	struct trie *t = (struct trie *) tb->tb_data;
+
+	s_m = cb->args[1];
+
+	read_lock(&fib_lock);
+	for (m=0; m<=32; m++) {
+
+		if (m < s_m)
+			continue;
+		if (m > s_m)
+			memset(&cb->args[2], 0,
+			       sizeof(cb->args) - 2*sizeof(cb->args[0]));
+
+		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
+			cb->args[1] = m;
+			goto out;
+		}
+	}
+	read_unlock(&fib_lock);
+	cb->args[1] = m;
+	return skb->len;
+ out:
+	read_unlock(&fib_lock);
+	return -1;
+}
+
+/* Fix more generic FIB names for init later */
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+struct fib_table * fib_hash_init(int id)
+#else
+struct fib_table * __init fib_hash_init(int id)
+#endif
+{
+	struct fib_table *tb;
+	struct trie *t;
+
+	if (fn_alias_kmem == NULL)
+		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+						  sizeof(struct fib_alias),
+						  0, SLAB_HWCACHE_ALIGN,
+						  NULL, NULL);
+
+	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
+		     GFP_KERNEL);
+	if (tb == NULL)
+		return NULL;
+
+	tb->tb_id = id;
+	tb->tb_lookup = fn_trie_lookup;
+	tb->tb_insert = fn_trie_insert;
+	tb->tb_delete = fn_trie_delete;
+	tb->tb_flush = fn_trie_flush;
+	tb->tb_select_default = fn_trie_select_default;
+	tb->tb_dump = fn_trie_dump;
+	memset(tb->tb_data, 0, sizeof(struct trie));
+
+	t = (struct trie *) tb->tb_data;
+
+	trie_init(t);
+
+	if (id == RT_TABLE_LOCAL) 
+                trie_local=t;
+	  else if (id == RT_TABLE_MAIN) 
+                trie_main=t;
+
+	if (id == RT_TABLE_LOCAL)
+		printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
+
+	return tb;
+}
+
+/* Trie dump functions */
+
+static void putspace_seq(struct seq_file *seq, int n)
+{
+	while (n--) seq_printf(seq, " ");
+}
+
+static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
+{
+	while (bits--)
+		seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
+}
+
+static void printnode_seq(struct seq_file *seq, int indent, struct node *n, 
+		   int pend, int cindex, int bits)
+{
+	putspace_seq(seq, indent);
+	if (IS_LEAF(n))
+		seq_printf(seq, "|");
+	else
+		seq_printf(seq, "+");
+	if (bits) {
+		seq_printf(seq, "%d/", cindex);
+		printbin_seq(seq, cindex, bits);
+		seq_printf(seq, ": ");
+	}
+	else
+		seq_printf(seq, "<root>: ");
+	seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
+
+	if (IS_LEAF(n))
+		seq_printf(seq, "key=%d.%d.%d.%d\n", 
+			   n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
+	else {
+		int plen=((struct tnode *)n)->pos;
+		t_key prf=MASK_PFX(n->key, plen);
+		seq_printf(seq, "key=%d.%d.%d.%d/%d\n", 
+			   prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
+	}
+	if (IS_LEAF(n)) {
+		struct leaf *l=(struct leaf *)n;
+		struct fib_alias *fa;
+		int i;
+		for (i=32; i>=0; i--)
+		  if(find_leaf_info(&l->list, i)) {
+			
+				struct list_head *fa_head = get_fa_head(l, i);
+				
+				if(!fa_head)
+					continue;
+
+				if(list_empty(fa_head))
+					continue;
+
+				putspace_seq(seq, indent+2);
+				seq_printf(seq, "{/%d...dumping}\n", i);
+
+
+				list_for_each_entry(fa, fa_head, fa_list) {
+					putspace_seq(seq, indent+2);
+					if (fa->fa_info->fib_nh == NULL) {
+						seq_printf(seq, "Error _fib_nh=NULL\n");
+						continue;
+					}
+					if (fa->fa_info == NULL) {
+						seq_printf(seq, "Error fa_info=NULL\n");
+						continue;
+					}
+
+					seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
+					      fa->fa_type,
+					      fa->fa_scope,
+					      fa->fa_tos);
+				}
+			}
+	}
+	else if (IS_TNODE(n)) {
+		struct tnode *tn=(struct tnode *)n;
+		putspace_seq(seq, indent); seq_printf(seq, "|    ");
+		seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos));
+		printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
+		seq_printf(seq, "}\n");
+		putspace_seq(seq, indent); seq_printf(seq, "|    ");
+		seq_printf(seq, "{pos=%d", tn->pos);
+		seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
+		seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
+		putspace_seq(seq, indent); seq_printf(seq, "|    ");
+		seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
+	}
+}
+
+static void trie_dump_seq(struct seq_file *seq, struct trie *t)
+{
+	struct node *n=t->trie;
+	int cindex=0;
+	int indent=1;
+	int pend=0;
+	int depth = 0;
+
+  	read_lock(&fib_lock);
+
+	seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
+	if (n) {
+		printnode_seq(seq, indent, n, pend, cindex, 0);
+		if (IS_TNODE(n)) {
+			struct tnode *tn=(struct tnode *)n;
+			pend = tn->pos+tn->bits;
+			putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
+			indent += 3;
+			depth++;
+
+			while (tn && cindex < (1 << tn->bits)) {
+				if (tn->child[cindex]) {
+					
+					/* Got a child */
+					
+					printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
+					if (IS_LEAF(tn->child[cindex])) { 
+						cindex++;
+						
+					}
+					else {
+						/* 
+						 * New tnode. Decend one level 
+						 */
+						
+						depth++;
+						n=tn->child[cindex];
+						tn=(struct tnode *)n;
+						pend=tn->pos+tn->bits;
+						putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
+						indent+=3;
+						cindex=0;
+					}
+				}
+				else 
+					cindex++;
+
+				/*
+				 * Test if we are done 
+				 */
+				
+				while (cindex >= (1 << tn->bits)) {
+
+					/*
+					 * Move upwards and test for root
+					 * pop off all traversed  nodes
+					 */
+					
+					if (NODE_PARENT(tn) == NULL) {
+						tn = NULL;
+						n = NULL;
+						break;
+					}
+					else {
+						cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
+						tn = NODE_PARENT(tn);
+						cindex++;
+						n=(struct node *)tn;
+						pend=tn->pos+tn->bits;
+						indent-=3;
+						depth--;
+					}
+				}
+			}
+		}
+		else n = NULL;
+	}
+	else seq_printf(seq, "------ trie is empty\n");
+
+  	read_unlock(&fib_lock);
+}
+
+static struct trie_stat *trie_stat_new(void)
+{
+	struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
+	int i;
+	
+	if(s) {
+		s->totdepth = 0;
+		s->maxdepth = 0;
+		s->tnodes = 0;
+		s->leaves = 0;
+		s->nullpointers = 0;
+		
+		for(i=0; i< MAX_CHILDS; i++)
+			s->nodesizes[i] = 0;
+	}
+	return s;
+}    
+
+static struct trie_stat *trie_collect_stats(struct trie *t)
+{
+	struct node *n=t->trie;
+	struct trie_stat *s = trie_stat_new();
+	int cindex = 0;
+	int indent = 1;
+	int pend = 0;
+	int depth = 0;
+
+	read_lock(&fib_lock);		
+
+	if (s) {
+		if (n) {
+			if (IS_TNODE(n)) {
+				struct tnode *tn = (struct tnode *)n;
+				pend=tn->pos+tn->bits;
+				indent += 3;
+				s->nodesizes[tn->bits]++;
+				depth++;
+
+				while (tn && cindex < (1 << tn->bits)) {
+					if (tn->child[cindex]) {
+						/* Got a child */
+					
+						if (IS_LEAF(tn->child[cindex])) { 
+							cindex++;
+						
+							/* stats */
+							if (depth > s->maxdepth)
+								s->maxdepth = depth;
+							s->totdepth += depth;
+							s->leaves++;
+						}
+					
+						else {
+							/* 
+							 * New tnode. Decend one level 
+							 */
+						
+							s->tnodes++;
+							s->nodesizes[tn->bits]++;
+							depth++;
+						
+							n = tn->child[cindex];
+							tn = (struct tnode *)n;
+							pend = tn->pos+tn->bits;
+
+							indent += 3;
+							cindex = 0;
+						}
+					}
+					else {
+						cindex++;
+						s->nullpointers++; 
+					}
+
+					/*
+					 * Test if we are done 
+					 */
+				
+					while (cindex >= (1 << tn->bits)) {
+
+						/*
+						 * Move upwards and test for root
+						 * pop off all traversed  nodes
+						 */
+
+						
+						if (NODE_PARENT(tn) == NULL) {
+							tn = NULL;
+							n = NULL;
+							break;
+						}
+						else {
+							cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
+							tn = NODE_PARENT(tn);
+							cindex++; 
+							n = (struct node *)tn;
+							pend=tn->pos+tn->bits;
+							indent -= 3;
+							depth--;
+						}
+ 					}
+				}
+			}
+			else n = NULL;
+		}
+	}
+
+	read_unlock(&fib_lock);		
+	return s;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
+{
+	return NULL;
+}
+
+static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
+{
+	return NULL;
+}
+
+static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	void *v = NULL;
+
+	if (ip_fib_main_table)
+		v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN;
+	return v;
+}
+
+static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq);
+}
+
+static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+/* 
+ *	This outputs /proc/net/fib_triestats
+ *
+ *	It always works in backward compatibility mode.
+ *	The format of the file is not supposed to be changed.
+ */
+
+static void collect_and_show(struct trie *t, struct seq_file *seq)
+{
+	int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
+	int i, max, pointers;
+        struct trie_stat *stat;
+	int avdepth;
+
+	stat = trie_collect_stats(t);
+
+	bytes=0;
+	seq_printf(seq, "trie=%p\n", t);
+
+	if (stat) {
+		if (stat->leaves)
+			avdepth=stat->totdepth*100 / stat->leaves;
+		else
+			avdepth=0;
+		seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
+		seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
+				
+		seq_printf(seq, "Leaves: %d\n", stat->leaves);
+		bytes += sizeof(struct leaf) * stat->leaves;
+		seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
+		bytes += sizeof(struct tnode) * stat->tnodes;
+
+		max = MAX_CHILDS-1;
+
+		while (max >= 0 && stat->nodesizes[max] == 0)
+			max--;
+		pointers = 0;
+
+		for (i = 1; i <= max; i++) 
+			if (stat->nodesizes[i] != 0) {
+				seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
+				pointers += (1<<i) * stat->nodesizes[i];
+			}
+		seq_printf(seq, "\n");
+		seq_printf(seq, "Pointers: %d\n", pointers);
+		bytes += sizeof(struct node *) * pointers;
+		seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
+		seq_printf(seq, "Total size: %d  kB\n", bytes / 1024);
+
+		kfree(stat);
+	}
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+	seq_printf(seq, "Counters:\n---------\n");
+	seq_printf(seq,"gets = %d\n", t->stats.gets);
+	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
+	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
+	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
+	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
+#ifdef CLEAR_STATS
+	memset(&(t->stats), 0, sizeof(t->stats));
+#endif
+#endif /*  CONFIG_IP_FIB_TRIE_STATS */
+}
+
+static int fib_triestat_seq_show(struct seq_file *seq, void *v)
+{
+	char bf[128];
+    
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", 
+			   sizeof(struct leaf), sizeof(struct tnode));
+		if (trie_local) 
+			collect_and_show(trie_local, seq);
+
+		if (trie_main) 
+			collect_and_show(trie_main, seq);
+	}
+	else {
+		snprintf(bf, sizeof(bf),
+			 "*\t%08X\t%08X", 200, 400);
+		
+		seq_printf(seq, "%-127s\n", bf);
+	}
+	return 0;
+}
+
+static struct seq_operations fib_triestat_seq_ops = {
+	.start  = fib_triestat_seq_start,
+	.next   = fib_triestat_seq_next,
+	.stop   = fib_triestat_seq_stop,
+	.show   = fib_triestat_seq_show,
+};
+
+static int fib_triestat_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+
+	rc = seq_open(file, &fib_triestat_seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	seq	     = file->private_data;
+out:
+	return rc;
+out_kfree:
+	goto out;
+}
+
+static struct file_operations fib_triestat_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open           = fib_triestat_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release_private,
+};
+
+int __init fib_stat_proc_init(void)
+{
+	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
+		return -ENOMEM;
+	return 0;
+}
+
+void __init fib_stat_proc_exit(void)
+{
+	proc_net_remove("fib_triestat");
+}
+
+static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
+{
+	return NULL;
+}
+
+static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
+{
+	return NULL;
+}
+
+static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	void *v = NULL;
+
+	if (ip_fib_main_table)
+		v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN;
+	return v;
+}
+
+static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	return v == SEQ_START_TOKEN ? fib_trie_get_first(seq) : fib_trie_get_next(seq);
+}
+
+static void fib_trie_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+/* 
+ *	This outputs /proc/net/fib_trie.
+ *
+ *	It always works in backward compatibility mode.
+ *	The format of the file is not supposed to be changed.
+ */
+
+static int fib_trie_seq_show(struct seq_file *seq, void *v)
+{
+	char bf[128];
+
+	if (v == SEQ_START_TOKEN) {
+		if (trie_local) 
+			trie_dump_seq(seq, trie_local);
+
+		if (trie_main) 
+			trie_dump_seq(seq, trie_main);
+	}
+
+	else {
+		snprintf(bf, sizeof(bf),
+			 "*\t%08X\t%08X", 200, 400);
+		seq_printf(seq, "%-127s\n", bf);
+	}
+
+	return 0;
+}
+
+static struct seq_operations fib_trie_seq_ops = {
+	.start  = fib_trie_seq_start,
+	.next   = fib_trie_seq_next,
+	.stop   = fib_trie_seq_stop,
+	.show   = fib_trie_seq_show,
+};
+
+static int fib_trie_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+
+	rc = seq_open(file, &fib_trie_seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	seq	     = file->private_data;
+out:
+	return rc;
+out_kfree:
+	goto out;
+}
+
+static struct file_operations fib_trie_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open           = fib_trie_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release_private,
+};
+
+int __init fib_proc_init(void)
+{
+	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
+		return -ENOMEM;
+	return 0;
+}
+
+void __init fib_proc_exit(void)
+{
+	proc_net_remove("fib_trie");
+}
+
+#endif /* CONFIG_PROC_FS */
-- 
cgit v1.2.3


From 0d51aa80a9b1db43920c0770c3bb842dd823c005 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <hadi@cyberus.ca>
Date: Tue, 21 Jun 2005 13:51:04 -0700
Subject: [IPV6]: V6 route events reported with wrong netlink PID and seq
 number

Essentially netlink at the moment always reports a pid and sequence of 0
always for v6 route activities.
To understand the repurcassions of this look at:
http://lists.quagga.net/pipermail/quagga-dev/2005-June/003507.html

While fixing this, i took the liberty to resolve the outstanding issue
of IPV6 routes inserted via ioctls to have the correct pids as well.

This patch tries to behave as close as possible to the v4 routes i.e
maintains whatever PID the socket issuing the command owns as opposed to
the process. That made the patch a little bulky.

I have tested against both netlink derived utility to add/del routes as
well as ioctl derived one. The Quagga folks have tested against quagga.
This fixes the problem and so far hasnt been detected to introduce any
new issues.

Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 14 +++++-----
 net/ipv6/anycast.c  |  4 +--
 net/ipv6/ip6_fib.c  | 19 ++++++-------
 net/ipv6/ndisc.c    |  4 +--
 net/ipv6/route.c    | 78 ++++++++++++++++++++++++++++-------------------------
 5 files changed, 62 insertions(+), 57 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 47a30c3188e..14f5c53235f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -695,7 +695,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
 
 		if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
 			if (onlink == 0) {
-				ip6_del_rt(rt, NULL, NULL);
+				ip6_del_rt(rt, NULL, NULL, NULL);
 				rt = NULL;
 			} else if (!(rt->rt6i_flags & RTF_EXPIRES)) {
 				rt->rt6i_expires = expires;
@@ -1340,7 +1340,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
 	if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT))
 		rtmsg.rtmsg_flags |= RTF_NONEXTHOP;
 
-	ip6_route_add(&rtmsg, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL);
 }
 
 /* Create "default" multicast route to the interface */
@@ -1357,7 +1357,7 @@ static void addrconf_add_mroute(struct net_device *dev)
 	rtmsg.rtmsg_ifindex = dev->ifindex;
 	rtmsg.rtmsg_flags = RTF_UP;
 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
-	ip6_route_add(&rtmsg, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL);
 }
 
 static void sit_route_add(struct net_device *dev)
@@ -1374,7 +1374,7 @@ static void sit_route_add(struct net_device *dev)
 	rtmsg.rtmsg_flags	= RTF_UP|RTF_NONEXTHOP;
 	rtmsg.rtmsg_ifindex	= dev->ifindex;
 
-	ip6_route_add(&rtmsg, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL);
 }
 
 static void addrconf_add_lroute(struct net_device *dev)
@@ -1467,7 +1467,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
 		if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) {
 			if (rt->rt6i_flags&RTF_EXPIRES) {
 				if (valid_lft == 0) {
-					ip6_del_rt(rt, NULL, NULL);
+					ip6_del_rt(rt, NULL, NULL, NULL);
 					rt = NULL;
 				} else {
 					rt->rt6i_expires = rt_expires;
@@ -3094,7 +3094,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 	switch (event) {
 	case RTM_NEWADDR:
 		dst_hold(&ifp->rt->u.dst);
-		if (ip6_ins_rt(ifp->rt, NULL, NULL))
+		if (ip6_ins_rt(ifp->rt, NULL, NULL, NULL))
 			dst_release(&ifp->rt->u.dst);
 		if (ifp->idev->cnf.forwarding)
 			addrconf_join_anycast(ifp);
@@ -3104,7 +3104,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			addrconf_leave_anycast(ifp);
 		addrconf_leave_solict(ifp->idev, &ifp->addr);
 		dst_hold(&ifp->rt->u.dst);
-		if (ip6_del_rt(ifp->rt, NULL, NULL))
+		if (ip6_del_rt(ifp->rt, NULL, NULL, NULL))
 			dst_free(&ifp->rt->u.dst);
 		else
 			dst_release(&ifp->rt->u.dst);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 5d22ca3cca2..6b729404723 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -337,7 +337,7 @@ int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr)
 	write_unlock_bh(&idev->lock);
 
 	dst_hold(&rt->u.dst);
-	if (ip6_ins_rt(rt, NULL, NULL))
+	if (ip6_ins_rt(rt, NULL, NULL, NULL))
 		dst_release(&rt->u.dst);
 
 	addrconf_join_solict(dev, &aca->aca_addr);
@@ -380,7 +380,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr)
 	addrconf_leave_solict(idev, &aca->aca_addr);
 
 	dst_hold(&aca->aca_rt->u.dst);
-	if (ip6_del_rt(aca->aca_rt, NULL, NULL))
+	if (ip6_del_rt(aca->aca_rt, NULL, NULL, NULL))
 		dst_free(&aca->aca_rt->u.dst);
 	else
 		dst_release(&aca->aca_rt->u.dst);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 405740b75ab..1b354aa9793 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -394,7 +394,7 @@ insert_above:
  */
 
 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
-    struct nlmsghdr *nlh)
+		struct nlmsghdr *nlh,  struct netlink_skb_parms *req)
 {
 	struct rt6_info *iter = NULL;
 	struct rt6_info **ins;
@@ -449,7 +449,7 @@ out:
 	*ins = rt;
 	rt->rt6i_node = fn;
 	atomic_inc(&rt->rt6i_ref);
-	inet6_rt_notify(RTM_NEWROUTE, rt, nlh);
+	inet6_rt_notify(RTM_NEWROUTE, rt, nlh, req);
 	rt6_stats.fib_rt_entries++;
 
 	if ((fn->fn_flags & RTN_RTINFO) == 0) {
@@ -479,7 +479,8 @@ void fib6_force_start_gc(void)
  *	with source addr info in sub-trees
  */
 
-int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int fib6_add(struct fib6_node *root, struct rt6_info *rt, 
+		struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	struct fib6_node *fn;
 	int err = -ENOMEM;
@@ -552,7 +553,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh,
 	}
 #endif
 
-	err = fib6_add_rt2node(fn, rt, nlh);
+	err = fib6_add_rt2node(fn, rt, nlh, req);
 
 	if (err == 0) {
 		fib6_start_gc(rt);
@@ -859,7 +860,7 @@ static struct fib6_node * fib6_repair_tree(struct fib6_node *fn)
 }
 
 static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
-    struct nlmsghdr *nlh, void *_rtattr)
+    struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	struct fib6_walker_t *w;
 	struct rt6_info *rt = *rtp;
@@ -915,11 +916,11 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
 		if (atomic_read(&rt->rt6i_ref) != 1) BUG();
 	}
 
-	inet6_rt_notify(RTM_DELROUTE, rt, nlh);
+	inet6_rt_notify(RTM_DELROUTE, rt, nlh, req);
 	rt6_release(rt);
 }
 
-int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	struct fib6_node *fn = rt->rt6i_node;
 	struct rt6_info **rtp;
@@ -944,7 +945,7 @@ int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
 
 	for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) {
 		if (*rtp == rt) {
-			fib6_del_route(fn, rtp, nlh, _rtattr);
+			fib6_del_route(fn, rtp, nlh, _rtattr, req);
 			return 0;
 		}
 	}
@@ -1073,7 +1074,7 @@ static int fib6_clean_node(struct fib6_walker_t *w)
 		res = c->func(rt, c->arg);
 		if (res < 0) {
 			w->leaf = rt;
-			res = fib6_del(rt, NULL, NULL);
+			res = fib6_del(rt, NULL, NULL, NULL);
 			if (res) {
 #if RT6_DEBUG >= 2
 				printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7c291f4e9ed..7ae72d4c9bd 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -955,7 +955,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
 			struct rt6_info *rt;
 			rt = rt6_get_dflt_router(saddr, dev);
 			if (rt)
-				ip6_del_rt(rt, NULL, NULL);
+				ip6_del_rt(rt, NULL, NULL, NULL);
 		}
 
 out:
@@ -1096,7 +1096,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
 
 	if (rt && lifetime == 0) {
 		neigh_clone(neigh);
-		ip6_del_rt(rt, NULL, NULL);
+		ip6_del_rt(rt, NULL, NULL, NULL);
 		rt = NULL;
 	}
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1f5b226c357..878789b3122 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -384,12 +384,13 @@ struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
    be destroyed.
  */
 
-int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
+		void *_rtattr, struct netlink_skb_parms *req)
 {
 	int err;
 
 	write_lock_bh(&rt6_lock);
-	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr);
+	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
 	write_unlock_bh(&rt6_lock);
 
 	return err;
@@ -400,7 +401,7 @@ int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
  */
 
 static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
-				struct in6_addr *saddr)
+				struct in6_addr *saddr, struct netlink_skb_parms *req)
 {
 	int err;
 	struct rt6_info *rt;
@@ -432,7 +433,7 @@ static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr,
 
 		dst_hold(&rt->u.dst);
 
-		err = ip6_ins_rt(rt, NULL, NULL);
+		err = ip6_ins_rt(rt, NULL, NULL, req);
 		if (err == 0)
 			return rt;
 
@@ -491,7 +492,8 @@ restart:
 		read_unlock_bh(&rt6_lock);
 
 		nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr,
-			      &skb->nh.ipv6h->saddr);
+			      &skb->nh.ipv6h->saddr,
+			      &NETLINK_CB(skb));
 
 		dst_release(&rt->u.dst);
 		rt = nrt;
@@ -551,7 +553,7 @@ restart:
 		dst_hold(&rt->u.dst);
 		read_unlock_bh(&rt6_lock);
 
-		nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src);
+		nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src, NULL);
 
 		dst_release(&rt->u.dst);
 		rt = nrt;
@@ -598,7 +600,7 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 
 	if (rt) {
 		if (rt->rt6i_flags & RTF_CACHE)
-			ip6_del_rt(rt, NULL, NULL);
+			ip6_del_rt(rt, NULL, NULL, NULL);
 		else
 			dst_release(dst);
 	}
@@ -787,7 +789,8 @@ int ipv6_get_hoplimit(struct net_device *dev)
  *
  */
 
-int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
+int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
+		void *_rtattr, struct netlink_skb_parms *req)
 {
 	int err;
 	struct rtmsg *r;
@@ -974,7 +977,7 @@ install_route:
 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
 	rt->u.dst.dev = dev;
 	rt->rt6i_idev = idev;
-	return ip6_ins_rt(rt, nlh, _rtattr);
+	return ip6_ins_rt(rt, nlh, _rtattr, req);
 
 out:
 	if (dev)
@@ -986,7 +989,7 @@ out:
 	return err;
 }
 
-int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
+int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	int err;
 
@@ -994,7 +997,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
 
 	rt6_reset_dflt_pointer(NULL);
 
-	err = fib6_del(rt, nlh, _rtattr);
+	err = fib6_del(rt, nlh, _rtattr, req);
 	dst_release(&rt->u.dst);
 
 	write_unlock_bh(&rt6_lock);
@@ -1002,7 +1005,7 @@ int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr)
 	return err;
 }
 
-static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr)
+static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
 {
 	struct fib6_node *fn;
 	struct rt6_info *rt;
@@ -1029,7 +1032,7 @@ static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_r
 			dst_hold(&rt->u.dst);
 			read_unlock_bh(&rt6_lock);
 
-			return ip6_del_rt(rt, nlh, _rtattr);
+			return ip6_del_rt(rt, nlh, _rtattr, req);
 		}
 	}
 	read_unlock_bh(&rt6_lock);
@@ -1136,11 +1139,11 @@ source_ok:
 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
 
-	if (ip6_ins_rt(nrt, NULL, NULL))
+	if (ip6_ins_rt(nrt, NULL, NULL, NULL))
 		goto out;
 
 	if (rt->rt6i_flags&RTF_CACHE) {
-		ip6_del_rt(rt, NULL, NULL);
+		ip6_del_rt(rt, NULL, NULL, NULL);
 		return;
 	}
 
@@ -1204,7 +1207,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
 	 */
 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) {
-		nrt = rt6_cow(rt, daddr, saddr);
+		nrt = rt6_cow(rt, daddr, saddr, NULL);
 		if (!nrt->u.dst.error) {
 			nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
 			if (allfrag)
@@ -1232,7 +1235,7 @@ void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
 		if (allfrag)
 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
-		ip6_ins_rt(nrt, NULL, NULL);
+		ip6_ins_rt(nrt, NULL, NULL, NULL);
 	}
 
 out:
@@ -1305,7 +1308,7 @@ struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
 
 	rtmsg.rtmsg_ifindex = dev->ifindex;
 
-	ip6_route_add(&rtmsg, NULL, NULL);
+	ip6_route_add(&rtmsg, NULL, NULL, NULL);
 	return rt6_get_dflt_router(gwaddr, dev);
 }
 
@@ -1323,7 +1326,7 @@ restart:
 
 			read_unlock_bh(&rt6_lock);
 
-			ip6_del_rt(rt, NULL, NULL);
+			ip6_del_rt(rt, NULL, NULL, NULL);
 
 			goto restart;
 		}
@@ -1349,10 +1352,10 @@ int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
 		rtnl_lock();
 		switch (cmd) {
 		case SIOCADDRT:
-			err = ip6_route_add(&rtmsg, NULL, NULL);
+			err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
 			break;
 		case SIOCDELRT:
-			err = ip6_route_del(&rtmsg, NULL, NULL);
+			err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
 			break;
 		default:
 			err = -EINVAL;
@@ -1546,7 +1549,7 @@ int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
 		return -EINVAL;
-	return ip6_route_del(&rtmsg, nlh, arg);
+	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
 }
 
 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
@@ -1556,7 +1559,7 @@ int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 
 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
 		return -EINVAL;
-	return ip6_route_add(&rtmsg, nlh, arg);
+	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
 }
 
 struct rt6_rtnl_dump_arg
@@ -1566,12 +1569,9 @@ struct rt6_rtnl_dump_arg
 };
 
 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
-			 struct in6_addr *dst,
-			 struct in6_addr *src,
-			 int iif,
-			 int type, u32 pid, u32 seq,
-			 struct nlmsghdr *in_nlh, int prefix,
-			 unsigned int flags)
+			 struct in6_addr *dst, struct in6_addr *src,
+			 int iif, int type, u32 pid, u32 seq,
+			 int prefix, unsigned int flags)
 {
 	struct rtmsg *rtm;
 	struct nlmsghdr  *nlh;
@@ -1585,10 +1585,6 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
 		}
 	}
 
-	if (!pid && in_nlh) {
-		pid = in_nlh->nlmsg_pid;
-	}
-
 	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
 	rtm = NLMSG_DATA(nlh);
 	rtm->rtm_family = AF_INET6;
@@ -1675,7 +1671,7 @@ static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
 
 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
-		     NULL, prefix, NLM_F_MULTI);
+		     prefix, NLM_F_MULTI);
 }
 
 static int fib6_dump_node(struct fib6_walker_t *w)
@@ -1823,7 +1819,7 @@ int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 			    &fl.fl6_dst, &fl.fl6_src,
 			    iif,
 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
-			    nlh->nlmsg_seq, nlh, 0, 0);
+			    nlh->nlmsg_seq, 0, 0);
 	if (err < 0) {
 		err = -EMSGSIZE;
 		goto out_free;
@@ -1839,17 +1835,25 @@ out_free:
 	goto out;	
 }
 
-void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh)
+void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
+			struct netlink_skb_parms *req)
 {
 	struct sk_buff *skb;
 	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
+	u32 pid = current->pid;
+	u32 seq = 0;
 
+	if (req)
+		pid = req->pid;
+	if (nlh)
+		seq = nlh->nlmsg_seq;
+	
 	skb = alloc_skb(size, gfp_any());
 	if (!skb) {
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS);
 		return;
 	}
-	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0, 0) < 0) {
+	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
 		kfree_skb(skb);
 		netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL);
 		return;
-- 
cgit v1.2.3


From c9e3e8b6958e02230079e6817862ea2968509866 Mon Sep 17 00:00:00 2001
From: David L Stevens <dlstevens@us.ibm.com>
Date: Tue, 21 Jun 2005 13:58:25 -0700
Subject: [IPV6]: multicast join and misc

Here is a simplified version of the patch to fix a bug in IPv6
multicasting. It:

1) adds existence check & EADDRINUSE error for regular joins
2) adds an exception for EADDRINUSE in the source-specific multicast
        join (where a prior join is ok)
3) adds a missing/needed read_lock on sock_mc_list; would've raced
        with destroying the socket on interface down without
4) adds a "leave group" in the (INCLUDE, empty) source filter case.
        This frees unneeded socket buffer memory, but also prevents
        an inappropriate interaction among the 8 socket options that
        mess with this. Some would fail as if in the group when you
        aren't really.

Item #4 had a locking bug in the last version of this patch; rather than
removing the idev->lock read lock only, I've simplified it to remove
all lock state in the path and treat it as a direct "leave group" call for
the (INCLUDE,empty) case it covers. Tested on an MP machine. :-)

Much thanks to HoerdtMickael <hoerdt@clarinet.u-strasbg.fr> who
reported the original bug.

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ipv6_sockglue.c |  5 +++--
 net/ipv6/mcast.c         | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 279ab86be66..f3ef4c38d31 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -423,11 +423,12 @@ done:
 			psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
 			retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
 				&psin6->sin6_addr);
-			if (retv)
+			/* prior join w/ different source is ok */
+			if (retv && retv != -EADDRINUSE)
 				break;
 			omode = MCAST_INCLUDE;
 			add = 1;
-		} else /*IP_DROP_SOURCE_MEMBERSHIP */ {
+		} else /* MCAST_LEAVE_SOURCE_GROUP */ {
 			omode = MCAST_INCLUDE;
 			add = 0;
 		}
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 393b6e6f50a..c0ca92e8230 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -188,6 +188,16 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr)
 	if (!ipv6_addr_is_multicast(addr))
 		return -EINVAL;
 
+	read_lock_bh(&ipv6_sk_mc_lock);
+	for (mc_lst=np->ipv6_mc_list; mc_lst; mc_lst=mc_lst->next) {
+		if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
+		    ipv6_addr_equal(&mc_lst->addr, addr)) {
+			read_unlock_bh(&ipv6_sk_mc_lock);
+			return -EADDRINUSE;
+		}
+	}
+	read_unlock_bh(&ipv6_sk_mc_lock);
+
 	mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
 
 	if (mc_lst == NULL)
@@ -349,6 +359,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 	struct ipv6_pinfo *inet6 = inet6_sk(sk);
 	struct ip6_sf_socklist *psl;
 	int i, j, rv;
+	int leavegroup = 0;
 	int err;
 
 	if (pgsr->gsr_group.ss_family != AF_INET6 ||
@@ -368,6 +379,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 
 	err = -EADDRNOTAVAIL;
 
+	read_lock_bh(&ipv6_sk_mc_lock);
 	for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
 		if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
 			continue;
@@ -401,6 +413,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 		if (rv)		/* source not found */
 			goto done;
 
+		/* special case - (INCLUDE, empty) == LEAVE_GROUP */
+		if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
+			leavegroup = 1;
+			goto done;
+		}
+
 		/* update the interface filter */
 		ip6_mc_del_src(idev, group, omode, 1, source, 1);
 
@@ -453,9 +471,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
 	/* update the interface list */
 	ip6_mc_add_src(idev, group, omode, 1, source, 1);
 done:
+	read_unlock_bh(&ipv6_sk_mc_lock);
 	read_unlock_bh(&idev->lock);
 	in6_dev_put(idev);
 	dev_put(dev);
+	if (leavegroup)
+		return ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
 	return err;
 }
 
-- 
cgit v1.2.3


From e45b1be8bcb3643808975a426fa3e201a2588e87 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 21 Jun 2005 14:01:30 -0700
Subject: [NETFILTER]: Kill lockhelp.h

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/arp_tables.c              |  1 -
 net/ipv4/netfilter/ip_conntrack_amanda.c     |  7 +--
 net/ipv4/netfilter/ip_conntrack_core.c       | 84 ++++++++++++++--------------
 net/ipv4/netfilter/ip_conntrack_ftp.c        |  7 +--
 net/ipv4/netfilter/ip_conntrack_irc.c        |  7 +--
 net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 23 ++++----
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c  | 27 +++++----
 net/ipv4/netfilter/ip_conntrack_standalone.c | 22 ++++----
 net/ipv4/netfilter/ip_nat_core.c             | 32 +++++------
 net/ipv4/netfilter/ip_nat_helper.c           | 10 ++--
 net/ipv4/netfilter/ip_nat_rule.c             |  4 +-
 net/ipv4/netfilter/ip_nat_standalone.c       |  5 +-
 net/ipv4/netfilter/ip_tables.c               |  1 -
 net/ipv4/netfilter/ipt_CLUSTERIP.c           | 49 ++++++++--------
 net/ipv4/netfilter/ipt_MASQUERADE.c          | 10 ++--
 net/ipv4/netfilter/ipt_ULOG.c                | 15 +++--
 net/ipv4/netfilter/ipt_hashlimit.c           | 17 +++---
 net/ipv4/netfilter/ipt_helper.c              |  4 +-
 net/ipv6/netfilter/ip6_tables.c              |  1 -
 19 files changed, 158 insertions(+), 168 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index df79f5ed6a0..fa163425668 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -60,7 +60,6 @@ static DECLARE_MUTEX(arpt_mutex);
 
 #define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
 #define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <linux/netfilter_ipv4/listhelp.h>
 
 struct arpt_table_info {
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index 3dbddd06260..a78a320eee0 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -26,7 +26,6 @@
 #include <net/checksum.h>
 #include <net/udp.h>
 
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
 
@@ -42,7 +41,7 @@ static char *conns[] = { "DATA ", "MESG ", "INDEX " };
 
 /* This is slow, but it's simple. --RR */
 static char amanda_buffer[65536];
-static DECLARE_LOCK(amanda_buffer_lock);
+static DEFINE_SPINLOCK(amanda_buffer_lock);
 
 unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
 				   enum ip_conntrack_info ctinfo,
@@ -76,7 +75,7 @@ static int help(struct sk_buff **pskb,
 		return NF_ACCEPT;
 	}
 
-	LOCK_BH(&amanda_buffer_lock);
+	spin_lock_bh(&amanda_buffer_lock);
 	skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
 	data = amanda_buffer;
 	data_limit = amanda_buffer + (*pskb)->len - dataoff;
@@ -134,7 +133,7 @@ static int help(struct sk_buff **pskb,
 	}
 
 out:
-	UNLOCK_BH(&amanda_buffer_lock);
+	spin_unlock_bh(&amanda_buffer_lock);
 	return ret;
 }
 
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 09e82462297..a7377a331ad 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -38,10 +38,10 @@
 #include <linux/percpu.h>
 #include <linux/moduleparam.h>
 
-/* This rwlock protects the main hash table, protocol/helper/expected
+/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
    registrations, conntrack timers*/
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
 
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -57,7 +57,7 @@
 #define DEBUGP(format, args...)
 #endif
 
-DECLARE_RWLOCK(ip_conntrack_lock);
+DEFINE_RWLOCK(ip_conntrack_lock);
 
 /* ip_conntrack_standalone needs this */
 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
@@ -147,7 +147,7 @@ static void destroy_expect(struct ip_conntrack_expect *exp)
 
 static void unlink_expect(struct ip_conntrack_expect *exp)
 {
-	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
 	list_del(&exp->list);
 	/* Logically in destroy_expect, but we hold the lock here. */
 	exp->master->expecting--;
@@ -157,9 +157,9 @@ static void expectation_timed_out(unsigned long ul_expect)
 {
 	struct ip_conntrack_expect *exp = (void *)ul_expect;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	unlink_expect(exp);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 	destroy_expect(exp);
 }
 
@@ -209,7 +209,7 @@ clean_from_lists(struct ip_conntrack *ct)
 	unsigned int ho, hr;
 	
 	DEBUGP("clean_from_lists(%p)\n", ct);
-	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
 
 	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -240,7 +240,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	if (ip_conntrack_destroyed)
 		ip_conntrack_destroyed(ct);
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	/* Expectations will have been removed in clean_from_lists,
 	 * except TFTP can create an expectation on the first packet,
 	 * before connection is in the list, so we need to clean here,
@@ -254,7 +254,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	}
 
 	CONNTRACK_STAT_INC(delete);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	if (ct->master)
 		ip_conntrack_put(ct->master);
@@ -268,12 +268,12 @@ static void death_by_timeout(unsigned long ul_conntrack)
 {
 	struct ip_conntrack *ct = (void *)ul_conntrack;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	/* Inside lock so preempt is disabled on module removal path.
 	 * Otherwise we can get spurious warnings. */
 	CONNTRACK_STAT_INC(delete_list);
 	clean_from_lists(ct);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 	ip_conntrack_put(ct);
 }
 
@@ -282,7 +282,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
 		    const struct ip_conntrack_tuple *tuple,
 		    const struct ip_conntrack *ignored_conntrack)
 {
-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	ASSERT_READ_LOCK(&ip_conntrack_lock);
 	return tuplehash_to_ctrack(i) != ignored_conntrack
 		&& ip_ct_tuple_equal(tuple, &i->tuple);
 }
@@ -294,7 +294,7 @@ __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
 	struct ip_conntrack_tuple_hash *h;
 	unsigned int hash = hash_conntrack(tuple);
 
-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	ASSERT_READ_LOCK(&ip_conntrack_lock);
 	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
 		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
 			CONNTRACK_STAT_INC(found);
@@ -313,11 +313,11 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
 {
 	struct ip_conntrack_tuple_hash *h;
 
-	READ_LOCK(&ip_conntrack_lock);
+	read_lock_bh(&ip_conntrack_lock);
 	h = __ip_conntrack_find(tuple, ignored_conntrack);
 	if (h)
 		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
-	READ_UNLOCK(&ip_conntrack_lock);
+	read_unlock_bh(&ip_conntrack_lock);
 
 	return h;
 }
@@ -352,7 +352,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 	IP_NF_ASSERT(!is_confirmed(ct));
 	DEBUGP("Confirming conntrack %p\n", ct);
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 
 	/* See if there's one in the list already, including reverse:
            NAT could have grabbed it without realizing, since we're
@@ -380,12 +380,12 @@ __ip_conntrack_confirm(struct sk_buff **pskb)
 		atomic_inc(&ct->ct_general.use);
 		set_bit(IPS_CONFIRMED_BIT, &ct->status);
 		CONNTRACK_STAT_INC(insert);
-		WRITE_UNLOCK(&ip_conntrack_lock);
+		write_unlock_bh(&ip_conntrack_lock);
 		return NF_ACCEPT;
 	}
 
 	CONNTRACK_STAT_INC(insert_failed);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	return NF_DROP;
 }
@@ -398,9 +398,9 @@ ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
 {
 	struct ip_conntrack_tuple_hash *h;
 
-	READ_LOCK(&ip_conntrack_lock);
+	read_lock_bh(&ip_conntrack_lock);
 	h = __ip_conntrack_find(tuple, ignored_conntrack);
-	READ_UNLOCK(&ip_conntrack_lock);
+	read_unlock_bh(&ip_conntrack_lock);
 
 	return h != NULL;
 }
@@ -419,13 +419,13 @@ static int early_drop(struct list_head *chain)
 	struct ip_conntrack *ct = NULL;
 	int dropped = 0;
 
-	READ_LOCK(&ip_conntrack_lock);
+	read_lock_bh(&ip_conntrack_lock);
 	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
 	if (h) {
 		ct = tuplehash_to_ctrack(h);
 		atomic_inc(&ct->ct_general.use);
 	}
-	READ_UNLOCK(&ip_conntrack_lock);
+	read_unlock_bh(&ip_conntrack_lock);
 
 	if (!ct)
 		return dropped;
@@ -508,7 +508,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	conntrack->timeout.data = (unsigned long)conntrack;
 	conntrack->timeout.function = death_by_timeout;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	exp = find_expectation(tuple);
 
 	if (exp) {
@@ -532,7 +532,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
 
 	atomic_inc(&ip_conntrack_count);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	if (exp) {
 		if (exp->expectfn)
@@ -723,17 +723,17 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
 {
 	struct ip_conntrack_expect *i;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	/* choose the the oldest expectation to evict */
 	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
 		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
 			unlink_expect(i);
-			WRITE_UNLOCK(&ip_conntrack_lock);
+			write_unlock_bh(&ip_conntrack_lock);
 			destroy_expect(i);
 			return;
 		}
 	}
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 }
 
 struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
@@ -808,7 +808,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
 	DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
 		if (expect_matches(i, expect)) {
 			/* Refresh timer: if it's dying, ignore.. */
@@ -832,7 +832,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 	ip_conntrack_expect_insert(expect);
 	ret = 0;
 out:
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
  	return ret;
 }
 
@@ -841,7 +841,7 @@ out:
 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
 			      const struct ip_conntrack_tuple *newreply)
 {
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	/* Should be unconfirmed, so not in hash table yet */
 	IP_NF_ASSERT(!is_confirmed(conntrack));
 
@@ -851,15 +851,15 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
 	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
 	if (!conntrack->master && conntrack->expecting == 0)
 		conntrack->helper = ip_ct_find_helper(newreply);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 }
 
 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
 {
 	BUG_ON(me->timeout == 0);
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	list_prepend(&helpers, me);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	return 0;
 }
@@ -878,7 +878,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 	struct ip_conntrack_expect *exp, *tmp;
 
 	/* Need write lock here, to delete helper. */
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	LIST_DELETE(&helpers, me);
 
 	/* Get rid of expectations */
@@ -893,7 +893,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 	for (i = 0; i < ip_conntrack_htable_size; i++)
 		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
 			    struct ip_conntrack_tuple_hash *, me);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	/* Someone could be still looking at the helper in a bh. */
 	synchronize_net();
@@ -925,14 +925,14 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
 		ct->timeout.expires = extra_jiffies;
 		ct_add_counters(ct, ctinfo, skb);
 	} else {
-		WRITE_LOCK(&ip_conntrack_lock);
+		write_lock_bh(&ip_conntrack_lock);
 		/* Need del_timer for race avoidance (may already be dying). */
 		if (del_timer(&ct->timeout)) {
 			ct->timeout.expires = jiffies + extra_jiffies;
 			add_timer(&ct->timeout);
 		}
 		ct_add_counters(ct, ctinfo, skb);
-		WRITE_UNLOCK(&ip_conntrack_lock);
+		write_unlock_bh(&ip_conntrack_lock);
 	}
 }
 
@@ -997,7 +997,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
 {
 	struct ip_conntrack_tuple_hash *h = NULL;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
 		h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
 				struct ip_conntrack_tuple_hash *, iter, data);
@@ -1009,7 +1009,7 @@ get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
 				struct ip_conntrack_tuple_hash *, iter, data);
 	if (h)
 		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	return h;
 }
@@ -1201,14 +1201,14 @@ int __init ip_conntrack_init(void)
 	}
 
 	/* Don't NEED lock here, but good form anyway. */
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	for (i = 0; i < MAX_IP_CT_PROTO; i++)
 		ip_ct_protos[i] = &ip_conntrack_generic_protocol;
 	/* Sew in builtin protocols. */
 	ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
 	ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
 	ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 
 	for (i = 0; i < ip_conntrack_htable_size; i++)
 		INIT_LIST_HEAD(&ip_conntrack_hash[i]);
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index dd86503aa78..fea6dd2a00b 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -16,7 +16,6 @@
 #include <net/checksum.h>
 #include <net/tcp.h>
 
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
 #include <linux/moduleparam.h>
@@ -28,7 +27,7 @@ MODULE_DESCRIPTION("ftp connection tracking helper");
 /* This is slow, but it's simple. --RR */
 static char ftp_buffer[65536];
 
-static DECLARE_LOCK(ip_ftp_lock);
+static DEFINE_SPINLOCK(ip_ftp_lock);
 
 #define MAX_PORTS 8
 static int ports[MAX_PORTS];
@@ -319,7 +318,7 @@ static int help(struct sk_buff **pskb,
 	}
 	datalen = (*pskb)->len - dataoff;
 
-	LOCK_BH(&ip_ftp_lock);
+	spin_lock_bh(&ip_ftp_lock);
 	fb_ptr = skb_header_pointer(*pskb, dataoff,
 				    (*pskb)->len - dataoff, ftp_buffer);
 	BUG_ON(fb_ptr == NULL);
@@ -442,7 +441,7 @@ out_update_nl:
 	if (ends_in_nl)
 		update_nl_seq(seq, ct_ftp_info,dir);
  out:
-	UNLOCK_BH(&ip_ftp_lock);
+	spin_unlock_bh(&ip_ftp_lock);
 	return ret;
 }
 
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index 33cc7348b6e..cd98772cc33 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -29,7 +29,6 @@
 #include <net/checksum.h>
 #include <net/tcp.h>
 
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_irc.h>
 #include <linux/moduleparam.h>
@@ -41,7 +40,7 @@ static int max_dcc_channels = 8;
 static unsigned int dcc_timeout = 300;
 /* This is slow, but it's simple. --RR */
 static char irc_buffer[65536];
-static DECLARE_LOCK(irc_buffer_lock);
+static DEFINE_SPINLOCK(irc_buffer_lock);
 
 unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
 				enum ip_conntrack_info ctinfo,
@@ -141,7 +140,7 @@ static int help(struct sk_buff **pskb,
 	if (dataoff >= (*pskb)->len)
 		return NF_ACCEPT;
 
-	LOCK_BH(&irc_buffer_lock);
+	spin_lock_bh(&irc_buffer_lock);
 	ib_ptr = skb_header_pointer(*pskb, dataoff,
 				    (*pskb)->len - dataoff, irc_buffer);
 	BUG_ON(ib_ptr == NULL);
@@ -237,7 +236,7 @@ static int help(struct sk_buff **pskb,
 	} /* while data < ... */
 
  out:
-	UNLOCK_BH(&irc_buffer_lock);
+	spin_unlock_bh(&irc_buffer_lock);
 	return ret;
 }
 
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index ff8c34a860f..31d75390bf1 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -26,7 +26,6 @@
 
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
 
 #if 0
 #define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
@@ -35,7 +34,7 @@
 #endif
 
 /* Protects conntrack->proto.sctp */
-static DECLARE_RWLOCK(sctp_lock);
+static DEFINE_RWLOCK(sctp_lock);
 
 /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
    closely.  They're more complex. --RR 
@@ -199,9 +198,9 @@ static int sctp_print_conntrack(struct seq_file *s,
 	DEBUGP(__FUNCTION__);
 	DEBUGP("\n");
 
-	READ_LOCK(&sctp_lock);
+	read_lock_bh(&sctp_lock);
 	state = conntrack->proto.sctp.state;
-	READ_UNLOCK(&sctp_lock);
+	read_unlock_bh(&sctp_lock);
 
 	return seq_printf(s, "%s ", sctp_conntrack_names[state]);
 }
@@ -343,13 +342,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
 
 	oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
 	for_each_sctp_chunk (skb, sch, _sch, offset, count) {
-		WRITE_LOCK(&sctp_lock);
+		write_lock_bh(&sctp_lock);
 
 		/* Special cases of Verification tag check (Sec 8.5.1) */
 		if (sch->type == SCTP_CID_INIT) {
 			/* Sec 8.5.1 (A) */
 			if (sh->vtag != 0) {
-				WRITE_UNLOCK(&sctp_lock);
+				write_unlock_bh(&sctp_lock);
 				return -1;
 			}
 		} else if (sch->type == SCTP_CID_ABORT) {
@@ -357,7 +356,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
 			if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
 				&& !(sh->vtag == conntrack->proto.sctp.vtag
 							[1 - CTINFO2DIR(ctinfo)])) {
-				WRITE_UNLOCK(&sctp_lock);
+				write_unlock_bh(&sctp_lock);
 				return -1;
 			}
 		} else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
@@ -366,13 +365,13 @@ static int sctp_packet(struct ip_conntrack *conntrack,
 				&& !(sh->vtag == conntrack->proto.sctp.vtag
 							[1 - CTINFO2DIR(ctinfo)] 
 					&& (sch->flags & 1))) {
-				WRITE_UNLOCK(&sctp_lock);
+				write_unlock_bh(&sctp_lock);
 				return -1;
 			}
 		} else if (sch->type == SCTP_CID_COOKIE_ECHO) {
 			/* Sec 8.5.1 (D) */
 			if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
-				WRITE_UNLOCK(&sctp_lock);
+				write_unlock_bh(&sctp_lock);
 				return -1;
 			}
 		}
@@ -384,7 +383,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
 		if (newconntrack == SCTP_CONNTRACK_MAX) {
 			DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
 			       CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
-			WRITE_UNLOCK(&sctp_lock);
+			write_unlock_bh(&sctp_lock);
 			return -1;
 		}
 
@@ -396,7 +395,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
 			ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
 			                        sizeof(_inithdr), &_inithdr);
 			if (ih == NULL) {
-					WRITE_UNLOCK(&sctp_lock);
+					write_unlock_bh(&sctp_lock);
 					return -1;
 			}
 			DEBUGP("Setting vtag %x for dir %d\n", 
@@ -405,7 +404,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
 		}
 
 		conntrack->proto.sctp.state = newconntrack;
-		WRITE_UNLOCK(&sctp_lock);
+		write_unlock_bh(&sctp_lock);
 	}
 
 	ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 721ddbf522b..809dfed766d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -36,7 +36,6 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
 
 #if 0
 #define DEBUGP printk
@@ -46,7 +45,7 @@
 #endif
 
 /* Protects conntrack->proto.tcp */
-static DECLARE_RWLOCK(tcp_lock);
+static DEFINE_RWLOCK(tcp_lock);
 
 /* "Be conservative in what you do, 
     be liberal in what you accept from others." 
@@ -330,9 +329,9 @@ static int tcp_print_conntrack(struct seq_file *s,
 {
 	enum tcp_conntrack state;
 
-	READ_LOCK(&tcp_lock);
+	read_lock_bh(&tcp_lock);
 	state = conntrack->proto.tcp.state;
-	READ_UNLOCK(&tcp_lock);
+	read_unlock_bh(&tcp_lock);
 
 	return seq_printf(s, "%s ", tcp_conntrack_names[state]);
 }
@@ -738,14 +737,14 @@ void ip_conntrack_tcp_update(struct sk_buff *skb,
 
 	end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
 	
-	WRITE_LOCK(&tcp_lock);
+	write_lock_bh(&tcp_lock);
 	/*
 	 * We have to worry for the ack in the reply packet only...
 	 */
 	if (after(end, conntrack->proto.tcp.seen[dir].td_end))
 		conntrack->proto.tcp.seen[dir].td_end = end;
 	conntrack->proto.tcp.last_end = end;
-	WRITE_UNLOCK(&tcp_lock);
+	write_unlock_bh(&tcp_lock);
 	DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
 	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
 		sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -857,7 +856,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 				sizeof(_tcph), &_tcph);
 	BUG_ON(th == NULL);
 	
-	WRITE_LOCK(&tcp_lock);
+	write_lock_bh(&tcp_lock);
 	old_state = conntrack->proto.tcp.state;
 	dir = CTINFO2DIR(ctinfo);
 	index = get_conntrack_index(th);
@@ -879,7 +878,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 			 * that the client cannot but retransmit its SYN and 
 			 * thus initiate a clean new session.
 			 */
-		    	WRITE_UNLOCK(&tcp_lock);
+		    	write_unlock_bh(&tcp_lock);
 			if (LOG_INVALID(IPPROTO_TCP))
 				nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
 					  "ip_ct_tcp: killing out of sync session ");
@@ -894,7 +893,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 		conntrack->proto.tcp.last_end = 
 		    segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
 		
-		WRITE_UNLOCK(&tcp_lock);
+		write_unlock_bh(&tcp_lock);
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
 				  "ip_ct_tcp: invalid packet ignored ");
@@ -904,7 +903,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 		DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
 		       dir, get_conntrack_index(th),
 		       old_state);
-		WRITE_UNLOCK(&tcp_lock);
+		write_unlock_bh(&tcp_lock);
 		if (LOG_INVALID(IPPROTO_TCP))
 			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
 				  "ip_ct_tcp: invalid state ");
@@ -918,13 +917,13 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 		    	     conntrack->proto.tcp.seen[dir].td_end)) {	
 		    	/* Attempt to reopen a closed connection.
 		    	* Delete this connection and look up again. */
-		    	WRITE_UNLOCK(&tcp_lock);
+		    	write_unlock_bh(&tcp_lock);
 		    	if (del_timer(&conntrack->timeout))
 		    		conntrack->timeout.function((unsigned long)
 		    					    conntrack);
 		    	return -NF_REPEAT;
 		} else {
-			WRITE_UNLOCK(&tcp_lock);
+			write_unlock_bh(&tcp_lock);
 			if (LOG_INVALID(IPPROTO_TCP))
 				nf_log_packet(PF_INET, 0, skb, NULL, NULL,
 				              "ip_ct_tcp: invalid SYN");
@@ -949,7 +948,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 
 	if (!tcp_in_window(&conntrack->proto.tcp, dir, index, 
 			   skb, iph, th)) {
-		WRITE_UNLOCK(&tcp_lock);
+		write_unlock_bh(&tcp_lock);
 		return -NF_ACCEPT;
 	}
     in_window:
@@ -972,7 +971,7 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 	timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
 		  && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
 		  ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
-	WRITE_UNLOCK(&tcp_lock);
+	write_unlock_bh(&tcp_lock);
 
 	if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
 		/* If only reply is a RST, we can consider ourselves not to
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index bc59f7b3980..42dc9510287 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -28,8 +28,8 @@
 #include <net/checksum.h>
 #include <net/ip.h>
 
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
 
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
@@ -119,7 +119,7 @@ static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
 
 static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	READ_LOCK(&ip_conntrack_lock);
+	read_lock_bh(&ip_conntrack_lock);
 	return ct_get_idx(seq, *pos);
 }
 
@@ -131,7 +131,7 @@ static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
   
 static void ct_seq_stop(struct seq_file *s, void *v)
 {
-	READ_UNLOCK(&ip_conntrack_lock);
+	read_unlock_bh(&ip_conntrack_lock);
 }
  
 static int ct_seq_show(struct seq_file *s, void *v)
@@ -140,7 +140,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
 	struct ip_conntrack_protocol *proto;
 
-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	ASSERT_READ_LOCK(&ip_conntrack_lock);
 	IP_NF_ASSERT(conntrack);
 
 	/* we only want to print DIR_ORIGINAL */
@@ -239,7 +239,7 @@ static void *exp_seq_start(struct seq_file *s, loff_t *pos)
 
 	/* strange seq_file api calls stop even if we fail,
 	 * thus we need to grab lock since stop unlocks */
-	READ_LOCK(&ip_conntrack_lock);
+	read_lock_bh(&ip_conntrack_lock);
 
 	if (list_empty(e))
 		return NULL;
@@ -267,7 +267,7 @@ static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
 
 static void exp_seq_stop(struct seq_file *s, void *v)
 {
-	READ_UNLOCK(&ip_conntrack_lock);
+	read_unlock_bh(&ip_conntrack_lock);
 }
 
 static int exp_seq_show(struct seq_file *s, void *v)
@@ -921,22 +921,22 @@ int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
 {
 	int ret = 0;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
 		ret = -EBUSY;
 		goto out;
 	}
 	ip_ct_protos[proto->proto] = proto;
  out:
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 	return ret;
 }
 
 void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
 {
-	WRITE_LOCK(&ip_conntrack_lock);
+	write_lock_bh(&ip_conntrack_lock);
 	ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	write_unlock_bh(&ip_conntrack_lock);
 	
 	/* Somebody could be still looking at the proto in bh. */
 	synchronize_net();
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index 9fc6f93af0d..739b6dde1c8 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -22,8 +22,8 @@
 #include <linux/udp.h>
 #include <linux/jhash.h>
 
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
 
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
@@ -41,7 +41,7 @@
 #define DEBUGP(format, args...)
 #endif
 
-DECLARE_RWLOCK(ip_nat_lock);
+DEFINE_RWLOCK(ip_nat_lock);
 
 /* Calculated at init based on memory size */
 static unsigned int ip_nat_htable_size;
@@ -65,9 +65,9 @@ static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
 	if (!(conn->status & IPS_NAT_DONE_MASK))
 		return;
 
-	WRITE_LOCK(&ip_nat_lock);
+	write_lock_bh(&ip_nat_lock);
 	list_del(&conn->nat.info.bysource);
-	WRITE_UNLOCK(&ip_nat_lock);
+	write_unlock_bh(&ip_nat_lock);
 }
 
 /* We do checksum mangling, so if they were wrong before they're still
@@ -142,7 +142,7 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
 	unsigned int h = hash_by_src(tuple);
 	struct ip_conntrack *ct;
 
-	READ_LOCK(&ip_nat_lock);
+	read_lock_bh(&ip_nat_lock);
 	list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
 		if (same_src(ct, tuple)) {
 			/* Copy source part from reply tuple. */
@@ -151,12 +151,12 @@ find_appropriate_src(const struct ip_conntrack_tuple *tuple,
 			result->dst = tuple->dst;
 
 			if (in_range(result, range)) {
-				READ_UNLOCK(&ip_nat_lock);
+				read_unlock_bh(&ip_nat_lock);
 				return 1;
 			}
 		}
 	}
-	READ_UNLOCK(&ip_nat_lock);
+	read_unlock_bh(&ip_nat_lock);
 	return 0;
 }
 
@@ -297,9 +297,9 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
 		unsigned int srchash
 			= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
 				      .tuple);
-		WRITE_LOCK(&ip_nat_lock);
+		write_lock_bh(&ip_nat_lock);
 		list_add(&info->bysource, &bysource[srchash]);
-		WRITE_UNLOCK(&ip_nat_lock);
+		write_unlock_bh(&ip_nat_lock);
 	}
 
 	/* It's done. */
@@ -474,23 +474,23 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto)
 {
 	int ret = 0;
 
-	WRITE_LOCK(&ip_nat_lock);
+	write_lock_bh(&ip_nat_lock);
 	if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
 		ret = -EBUSY;
 		goto out;
 	}
 	ip_nat_protos[proto->protonum] = proto;
  out:
-	WRITE_UNLOCK(&ip_nat_lock);
+	write_unlock_bh(&ip_nat_lock);
 	return ret;
 }
 
 /* Noone stores the protocol anywhere; simply delete it. */
 void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
 {
-	WRITE_LOCK(&ip_nat_lock);
+	write_lock_bh(&ip_nat_lock);
 	ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
-	WRITE_UNLOCK(&ip_nat_lock);
+	write_unlock_bh(&ip_nat_lock);
 
 	/* Someone could be still looking at the proto in a bh. */
 	synchronize_net();
@@ -509,13 +509,13 @@ int __init ip_nat_init(void)
 		return -ENOMEM;
 
 	/* Sew in builtin protocols. */
-	WRITE_LOCK(&ip_nat_lock);
+	write_lock_bh(&ip_nat_lock);
 	for (i = 0; i < MAX_IP_NAT_PROTO; i++)
 		ip_nat_protos[i] = &ip_nat_unknown_protocol;
 	ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
 	ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
 	ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
-	WRITE_UNLOCK(&ip_nat_lock);
+	write_unlock_bh(&ip_nat_lock);
 
 	for (i = 0; i < ip_nat_htable_size; i++) {
 		INIT_LIST_HEAD(&bysource[i]);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 1637b96d8c0..9cd51f180dc 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -28,8 +28,8 @@
 #include <net/tcp.h>
 #include <net/udp.h>
 
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
 
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
@@ -47,7 +47,7 @@
 #define DUMP_OFFSET(x)
 #endif
 
-static DECLARE_LOCK(ip_nat_seqofs_lock);
+static DEFINE_SPINLOCK(ip_nat_seqofs_lock);
 
 /* Setup TCP sequence correction given this change at this sequence */
 static inline void 
@@ -70,7 +70,7 @@ adjust_tcp_sequence(u32 seq,
 	DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
 	DUMP_OFFSET(this_way);
 
-	LOCK_BH(&ip_nat_seqofs_lock);
+	spin_lock_bh(&ip_nat_seqofs_lock);
 
 	/* SYN adjust. If it's uninitialized, or this is after last
 	 * correction, record it: we don't handle more than one
@@ -82,7 +82,7 @@ adjust_tcp_sequence(u32 seq,
 		    this_way->offset_before = this_way->offset_after;
 		    this_way->offset_after += sizediff;
 	}
-	UNLOCK_BH(&ip_nat_seqofs_lock);
+	spin_unlock_bh(&ip_nat_seqofs_lock);
 
 	DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
 	DUMP_OFFSET(this_way);
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
index 581f097f5a2..60d70fa41a1 100644
--- a/net/ipv4/netfilter/ip_nat_rule.c
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -19,8 +19,8 @@
 #include <net/route.h>
 #include <linux/bitops.h>
 
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
 
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ip_nat.h>
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 79f56f662b3..bc59d0d6e89 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -31,8 +31,8 @@
 #include <net/checksum.h>
 #include <linux/spinlock.h>
 
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+#define ASSERT_READ_LOCK(x)
+#define ASSERT_WRITE_LOCK(x)
 
 #include <linux/netfilter_ipv4/ip_nat.h>
 #include <linux/netfilter_ipv4/ip_nat_rule.h>
@@ -373,7 +373,6 @@ static int init_or_cleanup(int init)
  cleanup_rule_init:
 	ip_nat_rule_cleanup();
  cleanup_nothing:
-	MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
 	return ret;
 }
 
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 8a54f92b849..c88dfcd38c5 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -67,7 +67,6 @@ static DECLARE_MUTEX(ipt_mutex);
 /* Must have mutex */
 #define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
 #define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0f12e3a3dc7..dc4362b57cf 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,7 +29,6 @@
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
 #include <linux/netfilter_ipv4/ip_conntrack.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
 
 #define CLUSTERIP_VERSION "0.6"
 
@@ -41,6 +40,8 @@
 #define DEBUGP
 #endif
 
+#define ASSERT_READ_LOCK(x)
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("iptables target for CLUSTERIP");
@@ -67,7 +68,7 @@ static LIST_HEAD(clusterip_configs);
 
 /* clusterip_lock protects the clusterip_configs list _AND_ the configurable
  * data within all structurses (num_local_nodes, local_nodes[]) */
-static DECLARE_RWLOCK(clusterip_lock);
+static DEFINE_RWLOCK(clusterip_lock);
 
 #ifdef CONFIG_PROC_FS
 static struct file_operations clusterip_proc_fops;
@@ -82,9 +83,9 @@ clusterip_config_get(struct clusterip_config *c) {
 static inline void
 clusterip_config_put(struct clusterip_config *c) {
 	if (atomic_dec_and_test(&c->refcount)) {
-		WRITE_LOCK(&clusterip_lock);
+		write_lock_bh(&clusterip_lock);
 		list_del(&c->list);
-		WRITE_UNLOCK(&clusterip_lock);
+		write_unlock_bh(&clusterip_lock);
 		dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
 		dev_put(c->dev);
 		kfree(c);
@@ -97,7 +98,7 @@ __clusterip_config_find(u_int32_t clusterip)
 {
 	struct list_head *pos;
 
-	MUST_BE_READ_LOCKED(&clusterip_lock);
+	ASSERT_READ_LOCK(&clusterip_lock);
 	list_for_each(pos, &clusterip_configs) {
 		struct clusterip_config *c = list_entry(pos, 
 					struct clusterip_config, list);
@@ -114,14 +115,14 @@ clusterip_config_find_get(u_int32_t clusterip)
 {
 	struct clusterip_config *c;
 
-	READ_LOCK(&clusterip_lock);
+	read_lock_bh(&clusterip_lock);
 	c = __clusterip_config_find(clusterip);
 	if (!c) {
-		READ_UNLOCK(&clusterip_lock);
+		read_unlock_bh(&clusterip_lock);
 		return NULL;
 	}
 	atomic_inc(&c->refcount);
-	READ_UNLOCK(&clusterip_lock);
+	read_unlock_bh(&clusterip_lock);
 
 	return c;
 }
@@ -160,9 +161,9 @@ clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
 	c->pde->data = c;
 #endif
 
-	WRITE_LOCK(&clusterip_lock);
+	write_lock_bh(&clusterip_lock);
 	list_add(&c->list, &clusterip_configs);
-	WRITE_UNLOCK(&clusterip_lock);
+	write_unlock_bh(&clusterip_lock);
 
 	return c;
 }
@@ -172,25 +173,25 @@ clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
 {
 	int i;
 
-	WRITE_LOCK(&clusterip_lock);
+	write_lock_bh(&clusterip_lock);
 
 	if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
 	    || nodenum > CLUSTERIP_MAX_NODES) {
-		WRITE_UNLOCK(&clusterip_lock);
+		write_unlock_bh(&clusterip_lock);
 		return 1;
 	}
 
 	/* check if we alrady have this number in our array */
 	for (i = 0; i < c->num_local_nodes; i++) {
 		if (c->local_nodes[i] == nodenum) {
-			WRITE_UNLOCK(&clusterip_lock);
+			write_unlock_bh(&clusterip_lock);
 			return 1;
 		}
 	}
 
 	c->local_nodes[c->num_local_nodes++] = nodenum;
 
-	WRITE_UNLOCK(&clusterip_lock);
+	write_unlock_bh(&clusterip_lock);
 	return 0;
 }
 
@@ -199,10 +200,10 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
 {
 	int i;
 
-	WRITE_LOCK(&clusterip_lock);
+	write_lock_bh(&clusterip_lock);
 
 	if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
-		WRITE_UNLOCK(&clusterip_lock);
+		write_unlock_bh(&clusterip_lock);
 		return 1;
 	}
 		
@@ -211,12 +212,12 @@ clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
 			int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
 			memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
 			c->num_local_nodes--;
-			WRITE_UNLOCK(&clusterip_lock);
+			write_unlock_bh(&clusterip_lock);
 			return 0;
 		}
 	}
 
-	WRITE_UNLOCK(&clusterip_lock);
+	write_unlock_bh(&clusterip_lock);
 	return 1;
 }
 
@@ -286,21 +287,21 @@ clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
 {
 	int i;
 
-	READ_LOCK(&clusterip_lock);
+	read_lock_bh(&clusterip_lock);
 
 	if (config->num_local_nodes == 0) {
-		READ_UNLOCK(&clusterip_lock);
+		read_unlock_bh(&clusterip_lock);
 		return 0;
 	}
 
 	for (i = 0; i < config->num_local_nodes; i++) {
 		if (config->local_nodes[i] == hash) {
-			READ_UNLOCK(&clusterip_lock);
+			read_unlock_bh(&clusterip_lock);
 			return 1;
 		}
 	}
 
-	READ_UNLOCK(&clusterip_lock);
+	read_unlock_bh(&clusterip_lock);
 
 	return 0;
 }
@@ -578,7 +579,7 @@ static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
 	struct clusterip_config *c = pde->data;
 	unsigned int *nodeidx;
 
-	READ_LOCK(&clusterip_lock);
+	read_lock_bh(&clusterip_lock);
 	if (*pos >= c->num_local_nodes)
 		return NULL;
 
@@ -608,7 +609,7 @@ static void clusterip_seq_stop(struct seq_file *s, void *v)
 {
 	kfree(v);
 
-	READ_UNLOCK(&clusterip_lock);
+	read_unlock_bh(&clusterip_lock);
 }
 
 static int clusterip_seq_show(struct seq_file *s, void *v)
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 57e9f6cf1c3..91e74502c3d 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -33,7 +33,7 @@ MODULE_DESCRIPTION("iptables MASQUERADE target module");
 #endif
 
 /* Lock protects masq region inside conntrack */
-static DECLARE_RWLOCK(masq_lock);
+static DEFINE_RWLOCK(masq_lock);
 
 /* FIXME: Multiple targets. --RR */
 static int
@@ -103,9 +103,9 @@ masquerade_target(struct sk_buff **pskb,
 		return NF_DROP;
 	}
 
-	WRITE_LOCK(&masq_lock);
+	write_lock_bh(&masq_lock);
 	ct->nat.masq_index = out->ifindex;
-	WRITE_UNLOCK(&masq_lock);
+	write_unlock_bh(&masq_lock);
 
 	/* Transfer from original range. */
 	newrange = ((struct ip_nat_range)
@@ -122,9 +122,9 @@ device_cmp(struct ip_conntrack *i, void *ifindex)
 {
 	int ret;
 
-	READ_LOCK(&masq_lock);
+	read_lock_bh(&masq_lock);
 	ret = (i->nat.masq_index == (int)(long)ifindex);
-	READ_UNLOCK(&masq_lock);
+	read_unlock_bh(&masq_lock);
 
 	return ret;
 }
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index 6f2cefbe16c..52a0076302a 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -56,7 +56,6 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_ULOG.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <net/sock.h>
 #include <linux/bitops.h>
 
@@ -99,8 +98,8 @@ typedef struct {
 
 static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];	/* array of buffers */
 
-static struct sock *nflognl;	/* our socket */
-static DECLARE_LOCK(ulog_lock);	/* spinlock */
+static struct sock *nflognl;		/* our socket */
+static DEFINE_SPINLOCK(ulog_lock);	/* spinlock */
 
 /* send one ulog_buff_t to userspace */
 static void ulog_send(unsigned int nlgroupnum)
@@ -135,9 +134,9 @@ static void ulog_timer(unsigned long data)
 
 	/* lock to protect against somebody modifying our structure
 	 * from ipt_ulog_target at the same time */
-	LOCK_BH(&ulog_lock);
+	spin_lock_bh(&ulog_lock);
 	ulog_send(data);
-	UNLOCK_BH(&ulog_lock);
+	spin_unlock_bh(&ulog_lock);
 }
 
 static struct sk_buff *ulog_alloc_skb(unsigned int size)
@@ -193,7 +192,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
 
 	ub = &ulog_buffers[groupnum];
 	
-	LOCK_BH(&ulog_lock);
+	spin_lock_bh(&ulog_lock);
 
 	if (!ub->skb) {
 		if (!(ub->skb = ulog_alloc_skb(size)))
@@ -278,7 +277,7 @@ static void ipt_ulog_packet(unsigned int hooknum,
 		ulog_send(groupnum);
 	}
 
-	UNLOCK_BH(&ulog_lock);
+	spin_unlock_bh(&ulog_lock);
 
 	return;
 
@@ -288,7 +287,7 @@ nlmsg_failure:
 alloc_failure:
 	PRINTR("ipt_ULOG: Error building netlink message\n");
 
-	UNLOCK_BH(&ulog_lock);
+	spin_unlock_bh(&ulog_lock);
 }
 
 static unsigned int ipt_ulog_target(struct sk_buff **pskb,
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
index f1937190cd7..564b49bfebc 100644
--- a/net/ipv4/netfilter/ipt_hashlimit.c
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -37,7 +37,6 @@
 
 #include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netfilter_ipv4/ipt_hashlimit.h>
-#include <linux/netfilter_ipv4/lockhelp.h>
 
 /* FIXME: this is just for IP_NF_ASSERRT */
 #include <linux/netfilter_ipv4/ip_conntrack.h>
@@ -92,7 +91,7 @@ struct ipt_hashlimit_htable {
 	struct hlist_head hash[0];	/* hashtable itself */
 };
 
-static DECLARE_LOCK(hashlimit_lock);	/* protects htables list */
+static DEFINE_SPINLOCK(hashlimit_lock);	/* protects htables list */
 static DECLARE_MUTEX(hlimit_mutex);	/* additional checkentry protection */
 static HLIST_HEAD(hashlimit_htables);
 static kmem_cache_t *hashlimit_cachep;
@@ -233,9 +232,9 @@ static int htable_create(struct ipt_hashlimit_info *minfo)
 	hinfo->timer.function = htable_gc;
 	add_timer(&hinfo->timer);
 
-	LOCK_BH(&hashlimit_lock);
+	spin_lock_bh(&hashlimit_lock);
 	hlist_add_head(&hinfo->node, &hashlimit_htables);
-	UNLOCK_BH(&hashlimit_lock);
+	spin_unlock_bh(&hashlimit_lock);
 
 	return 0;
 }
@@ -301,15 +300,15 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
 	struct ipt_hashlimit_htable *hinfo;
 	struct hlist_node *pos;
 
-	LOCK_BH(&hashlimit_lock);
+	spin_lock_bh(&hashlimit_lock);
 	hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
 		if (!strcmp(name, hinfo->pde->name)) {
 			atomic_inc(&hinfo->use);
-			UNLOCK_BH(&hashlimit_lock);
+			spin_unlock_bh(&hashlimit_lock);
 			return hinfo;
 		}
 	}
-	UNLOCK_BH(&hashlimit_lock);
+	spin_unlock_bh(&hashlimit_lock);
 
 	return NULL;
 }
@@ -317,9 +316,9 @@ static struct ipt_hashlimit_htable *htable_find_get(char *name)
 static void htable_put(struct ipt_hashlimit_htable *hinfo)
 {
 	if (atomic_dec_and_test(&hinfo->use)) {
-		LOCK_BH(&hashlimit_lock);
+		spin_lock_bh(&hashlimit_lock);
 		hlist_del(&hinfo->node);
-		UNLOCK_BH(&hashlimit_lock);
+		spin_unlock_bh(&hashlimit_lock);
 		htable_destroy(hinfo);
 	}
 }
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
index 33fdf364d3d..3e7dd014de4 100644
--- a/net/ipv4/netfilter/ipt_helper.c
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -53,7 +53,7 @@ match(const struct sk_buff *skb,
 		return ret;
 	}
 
-	READ_LOCK(&ip_conntrack_lock);
+	read_lock_bh(&ip_conntrack_lock);
 	if (!ct->master->helper) {
 		DEBUGP("ipt_helper: master ct %p has no helper\n", 
 			exp->expectant);
@@ -69,7 +69,7 @@ match(const struct sk_buff *skb,
 		ret ^= !strncmp(ct->master->helper->name, info->name, 
 		                strlen(ct->master->helper->name));
 out_unlock:
-	READ_UNLOCK(&ip_conntrack_lock);
+	read_unlock_bh(&ip_conntrack_lock);
 	return ret;
 }
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c735276fdd5..73034511c8d 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -71,7 +71,6 @@ static DECLARE_MUTEX(ip6t_mutex);
 /* Must have mutex */
 #define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
 #define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0)
-#include <linux/netfilter_ipv4/lockhelp.h>
 #include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
-- 
cgit v1.2.3


From 18b8afc771102b1b6af97962808291a7d27f52af Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 21 Jun 2005 14:01:57 -0700
Subject: [NETFILTER]: Kill nf_debug

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_forward.c                |   3 -
 net/bridge/br_input.c                  |   4 -
 net/bridge/br_netfilter.c              |  38 ---------
 net/core/netfilter.c                   | 138 ---------------------------------
 net/core/skbuff.c                      |   6 --
 net/ipv4/ip_input.c                    |   4 -
 net/ipv4/ip_output.c                   |  11 ---
 net/ipv4/ipvs/ip_vs_xmit.c             |   1 -
 net/ipv4/netfilter/ip_conntrack_core.c |   9 ---
 net/ipv4/netfilter/ip_nat_helper.c     |   3 -
 net/ipv6/ip6_output.c                  |   3 -
 11 files changed, 220 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index ef9f2095f96..069253f830c 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -57,9 +57,6 @@ int br_forward_finish(struct sk_buff *skb)
 static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
 {
 	skb->dev = to->dev;
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug = 0;
-#endif
 	NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
 			br_forward_finish);
 }
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 8f5f2e73099..9a45e6279c5 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -23,11 +23,7 @@ const unsigned char bridge_ula[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
 
 static int br_pass_frame_up_finish(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug = 0;
-#endif
 	netif_receive_skb(skb);
-
 	return 0;
 }
 
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index be03d3ad264..03ae4edddac 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -102,10 +102,6 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
-#endif
-
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -182,10 +178,6 @@ static void __br_dnat_complain(void)
  * --Bart, 20021007 (updated) */
 static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug |= (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_FORWARD);
-#endif
-
 	if (skb->pkt_type == PACKET_OTHERHOST) {
 		skb->pkt_type = PACKET_HOST;
 		skb->nf_bridge->mask |= BRNF_PKT_TYPE;
@@ -207,10 +199,6 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
 	struct iphdr *iph = skb->nh.iph;
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING);
-#endif
-
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
 		nf_bridge->mask ^= BRNF_PKT_TYPE;
@@ -382,9 +370,6 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
 	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
 			goto inhdr_error;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_IP6_PRE_ROUTING);
-#endif
 	if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
 		return NF_DROP;
 	setup_pre_routing(skb);
@@ -468,9 +453,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb,
 			skb->ip_summed = CHECKSUM_NONE;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_IP_PRE_ROUTING);
-#endif
 	if ((nf_bridge = nf_bridge_alloc(skb)) == NULL)
 		return NF_DROP;
 	setup_pre_routing(skb);
@@ -517,10 +499,6 @@ static int br_nf_forward_finish(struct sk_buff *skb)
 	struct net_device *in;
 	struct vlan_ethhdr *hdr = vlan_eth_hdr(skb);
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
-
 	if (skb->protocol != __constant_htons(ETH_P_ARP) && !IS_VLAN_ARP) {
 		in = nf_bridge->physindev;
 		if (nf_bridge->mask & BRNF_PKT_TYPE) {
@@ -566,9 +544,6 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb,
 		(*pskb)->nh.raw += VLAN_HLEN;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
 	nf_bridge = skb->nf_bridge;
 	if (skb->pkt_type == PACKET_OTHERHOST) {
 		skb->pkt_type = PACKET_HOST;
@@ -605,10 +580,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
 		(*pskb)->nh.raw += VLAN_HLEN;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug ^= (1 << NF_BR_FORWARD);
-#endif
-
 	if (skb->nh.arph->ar_pln != 4) {
 		if (IS_VLAN_ARP) {
 			skb_push(*pskb, VLAN_HLEN);
@@ -627,9 +598,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb,
 /* PF_BRIDGE/LOCAL_OUT ***********************************************/
 static int br_nf_local_out_finish(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	skb->nf_debug &= ~(1 << NF_BR_LOCAL_OUT);
-#endif
 	if (skb->protocol == __constant_htons(ETH_P_8021Q)) {
 		skb_push(skb, VLAN_HLEN);
 		skb->nh.raw -= VLAN_HLEN;
@@ -731,10 +699,6 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb,
 			       realoutdev, br_nf_local_out_finish,
 			       NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1);
 	} else {
-#ifdef CONFIG_NETFILTER_DEBUG
-		skb->nf_debug ^= (1 << NF_IP_LOCAL_OUT);
-#endif
-
 		NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev,
 			       realoutdev, br_nf_local_out_finish,
 			       NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1);
@@ -779,8 +743,6 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb,
 		printk(KERN_CRIT "br_netfilter: skb->dst == NULL.");
 		goto print_error;
 	}
-
-	skb->nf_debug ^= (1 << NF_IP_POST_ROUTING);
 #endif
 
 	/* We assume any code from br_dev_queue_push_xmit onwards doesn't care
diff --git a/net/core/netfilter.c b/net/core/netfilter.c
index 22a8f127c4a..076c156d5ed 100644
--- a/net/core/netfilter.c
+++ b/net/core/netfilter.c
@@ -141,136 +141,6 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
 	up(&nf_sockopt_mutex);
 }
 
-#ifdef CONFIG_NETFILTER_DEBUG
-#include <net/ip.h>
-#include <net/tcp.h>
-#include <linux/netfilter_ipv4.h>
-
-static void debug_print_hooks_ip(unsigned int nf_debug)
-{
-	if (nf_debug & (1 << NF_IP_PRE_ROUTING)) {
-		printk("PRE_ROUTING ");
-		nf_debug ^= (1 << NF_IP_PRE_ROUTING);
-	}
-	if (nf_debug & (1 << NF_IP_LOCAL_IN)) {
-		printk("LOCAL_IN ");
-		nf_debug ^= (1 << NF_IP_LOCAL_IN);
-	}
-	if (nf_debug & (1 << NF_IP_FORWARD)) {
-		printk("FORWARD ");
-		nf_debug ^= (1 << NF_IP_FORWARD);
-	}
-	if (nf_debug & (1 << NF_IP_LOCAL_OUT)) {
-		printk("LOCAL_OUT ");
-		nf_debug ^= (1 << NF_IP_LOCAL_OUT);
-	}
-	if (nf_debug & (1 << NF_IP_POST_ROUTING)) {
-		printk("POST_ROUTING ");
-		nf_debug ^= (1 << NF_IP_POST_ROUTING);
-	}
-	if (nf_debug)
-		printk("Crap bits: 0x%04X", nf_debug);
-	printk("\n");
-}
-
-static void nf_dump_skb(int pf, struct sk_buff *skb)
-{
-	printk("skb: pf=%i %s dev=%s len=%u\n", 
-	       pf,
-	       skb->sk ? "(owned)" : "(unowned)",
-	       skb->dev ? skb->dev->name : "(no dev)",
-	       skb->len);
-	switch (pf) {
-	case PF_INET: {
-		const struct iphdr *ip = skb->nh.iph;
-		__u32 *opt = (__u32 *) (ip + 1);
-		int opti;
-		__u16 src_port = 0, dst_port = 0;
-
-		if (ip->protocol == IPPROTO_TCP
-		    || ip->protocol == IPPROTO_UDP) {
-			struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl);
-			src_port = ntohs(tcp->source);
-			dst_port = ntohs(tcp->dest);
-		}
-	
-		printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu"
-		       " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu",
-		       ip->protocol, NIPQUAD(ip->saddr),
-		       src_port, NIPQUAD(ip->daddr),
-		       dst_port,
-		       ntohs(ip->tot_len), ip->tos, ntohs(ip->id),
-		       ntohs(ip->frag_off), ip->ttl);
-
-		for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++)
-			printk(" O=0x%8.8X", *opt++);
-		printk("\n");
-	}
-	}
-}
-
-void nf_debug_ip_local_deliver(struct sk_buff *skb)
-{
-	/* If it's a loopback packet, it must have come through
-	 * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and
-	 * NF_IP_LOCAL_IN.  Otherwise, must have gone through
-	 * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING.  */
-	if (!skb->dev) {
-		printk("ip_local_deliver: skb->dev is NULL.\n");
-	} else {
-		if (skb->nf_debug != ((1<<NF_IP_PRE_ROUTING)
-				      | (1<<NF_IP_LOCAL_IN))) {
-			printk("ip_local_deliver: bad skb: ");
-			debug_print_hooks_ip(skb->nf_debug);
-			nf_dump_skb(PF_INET, skb);
-		}
-	}
-}
-
-void nf_debug_ip_loopback_xmit(struct sk_buff *newskb)
-{
-	if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
-				 | (1 << NF_IP_POST_ROUTING))) {
-		printk("ip_dev_loopback_xmit: bad owned skb = %p: ", 
-		       newskb);
-		debug_print_hooks_ip(newskb->nf_debug);
-		nf_dump_skb(PF_INET, newskb);
-	}
-}
-
-void nf_debug_ip_finish_output2(struct sk_buff *skb)
-{
-	/* If it's owned, it must have gone through the
-	 * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING.
-	 * Otherwise, must have gone through
-	 * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING.
-	 */
-	if (skb->sk) {
-		if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
-				      | (1 << NF_IP_POST_ROUTING))) {
-			printk("ip_finish_output: bad owned skb = %p: ", skb);
-			debug_print_hooks_ip(skb->nf_debug);
-			nf_dump_skb(PF_INET, skb);
-		}
-	} else {
-		if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING)
-				      | (1 << NF_IP_FORWARD)
-				      | (1 << NF_IP_POST_ROUTING))) {
-			/* Fragments, entunnelled packets, TCP RSTs
-                           generated by ipt_REJECT will have no
-                           owners, but still may be local */
-			if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT)
-					      | (1 << NF_IP_POST_ROUTING))){
-				printk("ip_finish_output:"
-				       " bad unowned skb = %p: ",skb);
-				debug_print_hooks_ip(skb->nf_debug);
-				nf_dump_skb(PF_INET, skb);
-			}
-		}
-	}
-}
-#endif /*CONFIG_NETFILTER_DEBUG*/
-
 /* Call get/setsockopt() */
 static int nf_sockopt(struct sock *sk, int pf, int val, 
 		      char __user *opt, int *len, int get)
@@ -488,14 +358,6 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
 	/* We may already have this, but read-locks nest anyway */
 	rcu_read_lock();
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	if (unlikely((*pskb)->nf_debug & (1 << hook))) {
-		printk("nf_hook: hook %i already set.\n", hook);
-		nf_dump_skb(pf, *pskb);
-	}
-	(*pskb)->nf_debug |= (1 << hook);
-#endif
-
 	elem = &nf_hooks[pf][hook];
 next_hook:
 	verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f65b3de590a..6d68c03bc05 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -365,9 +365,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
 	C(nfct);
 	nf_conntrack_get(skb->nfct);
 	C(nfctinfo);
-#ifdef CONFIG_NETFILTER_DEBUG
-	C(nf_debug);
-#endif
 #ifdef CONFIG_BRIDGE_NETFILTER
 	C(nf_bridge);
 	nf_bridge_get(skb->nf_bridge);
@@ -432,9 +429,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->nfct	= old->nfct;
 	nf_conntrack_get(old->nfct);
 	new->nfctinfo	= old->nfctinfo;
-#ifdef CONFIG_NETFILTER_DEBUG
-	new->nf_debug	= old->nf_debug;
-#endif
 #ifdef CONFIG_BRIDGE_NETFILTER
 	new->nf_bridge	= old->nf_bridge;
 	nf_bridge_get(old->nf_bridge);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4e47a2658c7..2b7485e6504 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -200,10 +200,6 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb)
 {
 	int ihl = skb->nh.iph->ihl*4;
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	nf_debug_ip_local_deliver(skb);
-#endif /*CONFIG_NETFILTER_DEBUG*/
-
 	__skb_pull(skb, ihl);
 
 	/* Free reference early: we don't need it any more, and it may
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 760dc8238d6..ee07aec215a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,10 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
 	newskb->pkt_type = PACKET_LOOPBACK;
 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
 	BUG_TRAP(newskb->dst);
-
-#ifdef CONFIG_NETFILTER_DEBUG
-	nf_debug_ip_loopback_xmit(newskb);
-#endif
 	nf_reset(newskb);
 	netif_rx(newskb);
 	return 0;
@@ -192,10 +188,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 		skb = skb2;
 	}
 
-#ifdef CONFIG_NETFILTER_DEBUG
-	nf_debug_ip_finish_output2(skb);
-#endif /*CONFIG_NETFILTER_DEBUG*/
-
 	nf_reset(skb);
 
 	if (hh) {
@@ -415,9 +407,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->nf_bridge = from->nf_bridge;
 	nf_bridge_get(to->nf_bridge);
 #endif
-#ifdef CONFIG_NETFILTER_DEBUG
-	to->nf_debug = from->nf_debug;
-#endif
 #endif
 }
 
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
index de21da00057..a8512a3fd08 100644
--- a/net/ipv4/ipvs/ip_vs_xmit.c
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -127,7 +127,6 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
 
 #define IP_VS_XMIT(skb, rt)				\
 do {							\
-	nf_reset_debug(skb);				\
 	(skb)->nfcache |= NFC_IPVS_PROPERTY;		\
 	(skb)->ip_summed = CHECKSUM_NONE;		\
 	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,	\
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index a7377a331ad..ffba0ad3c88 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -940,10 +940,6 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct,
 struct sk_buff *
 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 {
-#ifdef CONFIG_NETFILTER_DEBUG
-	unsigned int olddebug = skb->nf_debug;
-#endif
-
 	skb_orphan(skb);
 
 	local_bh_disable(); 
@@ -953,12 +949,7 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
 	if (skb) {
 		ip_send_check(skb->nh.iph);
 		skb->nfcache |= NFC_ALTERED;
-#ifdef CONFIG_NETFILTER_DEBUG
-		/* Packet path as if nothing had happened. */
-		skb->nf_debug = olddebug;
-#endif
 	}
-
 	return skb;
 }
 
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index 9cd51f180dc..158f34f32c0 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -142,9 +142,6 @@ static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
 	/* Transfer socket to new skb. */
 	if ((*pskb)->sk)
 		skb_set_owner_w(nskb, (*pskb)->sk);
-#ifdef CONFIG_NETFILTER_DEBUG
-	nskb->nf_debug = (*pskb)->nf_debug;
-#endif
 	kfree_skb(*pskb);
 	*pskb = nskb;
 	return 1;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b78a5358680..06e7cdaeedc 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -484,9 +484,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->nf_bridge = from->nf_bridge;
 	nf_bridge_get(to->nf_bridge);
 #endif
-#ifdef CONFIG_NETFILTER_DEBUG
-	to->nf_debug = from->nf_debug;
-#endif
 #endif
 }
 
-- 
cgit v1.2.3


From e98231858bbfd2aca42f93d55133c2fca6df00f9 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 21 Jun 2005 14:02:15 -0700
Subject: [NETFILTER]: Restore netfilter assumptions in IPv6 multicast

Netfilter assumes that skb->data == skb->nh.ipv6h

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/mcast.c | 47 ++++++++++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index c0ca92e8230..562fcd14fde 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1301,15 +1301,6 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
 		return NULL;
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
-	if (dev->hard_header) {
-		unsigned char ha[MAX_ADDR_LEN];
-
-		ndisc_mc_map(&mld2_all_mcr, ha, dev, 1);
-		if (dev->hard_header(skb, dev, ETH_P_IPV6,ha,NULL,size) < 0) {
-			kfree_skb(skb);
-			return NULL;
-		}
-	}
 
 	if (ipv6_get_lladdr(dev, &addr_buf)) {
 		/* <draft-ietf-magma-mld-source-05.txt>:
@@ -1333,6 +1324,30 @@ static struct sk_buff *mld_newpack(struct net_device *dev, int size)
 	return skb;
 }
 
+static inline int mld_dev_queue_xmit2(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+
+	if (dev->hard_header) {
+		unsigned char ha[MAX_ADDR_LEN];
+		int err;
+
+		ndisc_mc_map(&skb->nh.ipv6h->daddr, ha, dev, 1);
+		err = dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, skb->len);
+		if (err < 0) {
+			kfree_skb(skb);
+			return err;
+		}
+	}
+	return dev_queue_xmit(skb);
+}
+
+static inline int mld_dev_queue_xmit(struct sk_buff *skb)
+{
+	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
+	               mld_dev_queue_xmit2);
+}
+
 static void mld_sendpack(struct sk_buff *skb)
 {
 	struct ipv6hdr *pip6 = skb->nh.ipv6h;
@@ -1350,7 +1365,7 @@ static void mld_sendpack(struct sk_buff *skb)
 	pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
 		IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0));
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
-		dev_queue_xmit);
+		mld_dev_queue_xmit);
 	if (!err) {
 		ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS);
 		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
@@ -1656,12 +1671,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 	}
 
 	skb_reserve(skb, LL_RESERVED_SPACE(dev));
-	if (dev->hard_header) {
-		unsigned char ha[MAX_ADDR_LEN];
-		ndisc_mc_map(snd_addr, ha, dev, 1);
-		if (dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len) < 0)
-			goto out;
-	}
 
 	if (ipv6_get_lladdr(dev, &addr_buf)) {
 		/* <draft-ietf-magma-mld-source-05.txt>:
@@ -1689,7 +1698,7 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 	idev = in6_dev_get(skb->dev);
 
 	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev,
-		dev_queue_xmit);
+		mld_dev_queue_xmit);
 	if (!err) {
 		if (type == ICMPV6_MGM_REDUCTION)
 			ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS);
@@ -1703,10 +1712,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 	if (likely(idev != NULL))
 		in6_dev_put(idev);
 	return;
-
-out:
-	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
-	kfree_skb(skb);
 }
 
 static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
-- 
cgit v1.2.3


From 1d3cdb41f52e299f70b66cbb569bff5216f3643a Mon Sep 17 00:00:00 2001
From: Phil Oester <kernel@linuxace.com>
Date: Tue, 21 Jun 2005 14:02:42 -0700
Subject: [NETFILTER]: expectation timeouts are compulsory

Since expectation timeouts were made compulsory [1], there is no need to
check for them in ip_conntrack_expect_insert.

[1] https://lists.netfilter.org/pipermail/netfilter-devel/2005-January/018143.html

Signed-off-by: Phil Oester <kernel@linuxace.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_conntrack_core.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index ffba0ad3c88..4b78ebeb663 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -760,15 +760,11 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
 	exp->master->expecting++;
 	list_add(&exp->list, &ip_conntrack_expect_list);
 
-	if (exp->master->helper->timeout) {
-		init_timer(&exp->timeout);
-		exp->timeout.data = (unsigned long)exp;
-		exp->timeout.function = expectation_timed_out;
-		exp->timeout.expires
-			= jiffies + exp->master->helper->timeout * HZ;
-		add_timer(&exp->timeout);
-	} else
-		exp->timeout.function = NULL;
+	init_timer(&exp->timeout);
+	exp->timeout.data = (unsigned long)exp;
+	exp->timeout.function = expectation_timed_out;
+	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
+	add_timer(&exp->timeout);
 
 	CONNTRACK_STAT_INC(expect_create);
 }
-- 
cgit v1.2.3


From 97216c799a6496163be8c81c9ceed297d92956c5 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 21 Jun 2005 14:03:01 -0700
Subject: [NETFILTER]: Missing owner-field initialization in ip6table_raw

I missed this one when fixing up iptable_raw.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/ip6table_raw.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 71407beaf79..c2982efd14a 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -129,13 +129,15 @@ static struct nf_hook_ops ip6t_ops[] = {
 	  .hook = ip6t_hook, 
 	  .pf = PF_INET6,
 	  .hooknum = NF_IP6_PRE_ROUTING,
-	  .priority = NF_IP6_PRI_FIRST
+	  .priority = NF_IP6_PRI_FIRST,
+	  .owner = THIS_MODULE,
 	},
 	{
 	  .hook = ip6t_hook, 
 	  .pf = PF_INET6, 
 	  .hooknum = NF_IP6_LOCAL_OUT,
-	  .priority = NF_IP6_PRI_FIRST
+	  .priority = NF_IP6_PRI_FIRST,
+	  .owner = THIS_MODULE,
 	},
 };
 
-- 
cgit v1.2.3


From e3be8ba79294df5de96692411e122506b40c5aa4 Mon Sep 17 00:00:00 2001
From: Keir Fraser <Keir.Fraser@xl.cam.ac.uk>
Date: Tue, 21 Jun 2005 14:03:23 -0700
Subject: [NETFILTER]: Avoid unncessary checksum validation in UDP connection
 tracking

Signed-off-by: Keir Fraser <Keir.Fraser@xl.cam.ac.uk>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_conntrack_proto_udp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 5bc28a22462..8c1eaba098d 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -120,6 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	 * and moreover root might send raw packets.
 	 * FIXME: Source route IP option packets --RR */
 	if (hooknum == NF_IP_PRE_ROUTING
+	    && skb->ip_summed != CHECKSUM_UNNECESSARY
 	    && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
 			         skb->ip_summed == CHECKSUM_HW ? skb->csum
 			      	 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
-- 
cgit v1.2.3


From 6150bacfec95c7042678667561664efcf10d4508 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 21 Jun 2005 14:03:46 -0700
Subject: [NETFILTER]: Check TCP checksum in ipt_REJECT

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ipt_REJECT.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index 266d6497928..91569644602 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -104,10 +104,12 @@ static inline struct rtable *route_reverse(struct sk_buff *skb,
 static void send_reset(struct sk_buff *oldskb, int hook)
 {
 	struct sk_buff *nskb;
+	struct iphdr *iph = oldskb->nh.iph;
 	struct tcphdr _otcph, *oth, *tcph;
 	struct rtable *rt;
 	u_int16_t tmp_port;
 	u_int32_t tmp_addr;
+	unsigned int tcplen;
 	int needs_ack;
 	int hh_len;
 
@@ -124,7 +126,16 @@ static void send_reset(struct sk_buff *oldskb, int hook)
 	if (oth->rst)
 		return;
 
-	/* FIXME: Check checksum --RR */
+	/* Check checksum */
+	tcplen = oldskb->len - iph->ihl * 4;
+	if (((hook != NF_IP_LOCAL_IN && oldskb->ip_summed != CHECKSUM_HW) ||
+	     (hook == NF_IP_LOCAL_IN &&
+	      oldskb->ip_summed != CHECKSUM_UNNECESSARY)) &&
+	    csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
+	                      oldskb->ip_summed == CHECKSUM_HW ? oldskb->csum :
+	                      skb_checksum(oldskb, iph->ihl * 4, tcplen, 0)))
+		return;
+
 	if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
 		return;
 
-- 
cgit v1.2.3


From 2715bcf9efc34063e05009f188eb896c462ae925 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 21 Jun 2005 14:06:24 -0700
Subject: [NETFILTER]: Drop conntrack reference in
 ip_call_ra_chain()/ip_mr_input()

Drop reference before handing the packets to raw_rcv()

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_input.c | 1 +
 net/ipv4/ipmr.c     | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2b7485e6504..af2ec88bbb2 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -184,6 +184,7 @@ int ip_call_ra_chain(struct sk_buff *skb)
 					raw_rcv(last, skb2);
 			}
 			last = sk;
+			nf_reset(skb);
 		}
 	}
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e21c049ec62..e4f809a93f4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1350,6 +1350,7 @@ int ip_mr_input(struct sk_buff *skb)
 			     */
 			    read_lock(&mrt_lock);
 			    if (mroute_socket) {
+				    nf_reset(skb);
 				    raw_rcv(mroute_socket, skb);
 				    read_unlock(&mrt_lock);
 				    return 0;
-- 
cgit v1.2.3


From 047601bf7c803724bbab9a4fdbad9481cecc12e0 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 21 Jun 2005 14:07:13 -0700
Subject: [NETFILTER]: Fix ip6t_LOG sit tunnel logging

Sit tunnel logging is currently broken:

MAC=01:23:45:67:89:ab->01:23:45:47:89:ac TUNNEL=123.123.  0.123-> 12.123.  6.123

Apart from the broken IP address, MAC addresses are printed differently
for sit tunnels than for everything else.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/ip6t_LOG.c | 54 +++++++++++++++----------------------------
 1 file changed, 19 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index bfc3d0185d1..c44685e391b 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -366,8 +366,6 @@ ip6t_log_packet(unsigned int hooknum,
 		const char *level_string,
 		const char *prefix)
 {
-	struct ipv6hdr *ipv6h = skb->nh.ipv6h;
-
 	spin_lock_bh(&log_lock);
 	printk(level_string);
 	printk("%sIN=%s OUT=%s ",
@@ -377,39 +375,25 @@ ip6t_log_packet(unsigned int hooknum,
 	if (in && !out) {
 		/* MAC logging for input chain only. */
 		printk("MAC=");
-		if (skb->dev && skb->dev->hard_header_len && skb->mac.raw != (void*)ipv6h) {
-			if (skb->dev->type != ARPHRD_SIT){
-			  int i;
-			  unsigned char *p = skb->mac.raw;
-			  for (i = 0; i < skb->dev->hard_header_len; i++,p++)
-				printk("%02x%c", *p,
-			       		i==skb->dev->hard_header_len - 1
-			       		? ' ':':');
-			} else {
-			  int i;
-			  unsigned char *p = skb->mac.raw;
-			  if ( p - (ETH_ALEN*2+2) > skb->head ){
-			    p -= (ETH_ALEN+2);
-			    for (i = 0; i < (ETH_ALEN); i++,p++)
-				printk("%02x%s", *p,
-					i == ETH_ALEN-1 ? "->" : ":");
-			    p -= (ETH_ALEN*2);
-			    for (i = 0; i < (ETH_ALEN); i++,p++)
-				printk("%02x%c", *p,
-					i == ETH_ALEN-1 ? ' ' : ':');
-			  }
-			  
-			  if ((skb->dev->addr_len == 4) &&
-			      skb->dev->hard_header_len > 20){
-			    printk("TUNNEL=");
-			    p = skb->mac.raw + 12;
-			    for (i = 0; i < 4; i++,p++)
-				printk("%3d%s", *p,
-					i == 3 ? "->" : ".");
-			    for (i = 0; i < 4; i++,p++)
-				printk("%3d%c", *p,
-					i == 3 ? ' ' : '.');
-			  }
+		if (skb->dev && skb->dev->hard_header_len &&
+		    skb->mac.raw != skb->nh.raw) {
+			unsigned char *p = skb->mac.raw;
+			int i;
+
+			if (skb->dev->type == ARPHRD_SIT &&
+			    (p -= ETH_HLEN) < skb->head)
+				p = NULL;
+
+			if (p != NULL)
+				for (i = 0; i < skb->dev->hard_header_len; i++)
+					printk("%02x", p[i]);
+			printk(" ");
+
+			if (skb->dev->type == ARPHRD_SIT) {
+				struct iphdr *iph = (struct iphdr *)skb->mac.raw;
+				printk("TUNNEL=%u.%u.%u.%u->%u.%u.%u.%u ",
+				       NIPQUAD(iph->saddr),
+				       NIPQUAD(iph->daddr));
 			}
 		} else
 			printk(" ");
-- 
cgit v1.2.3


From 90f66914c89b0be63548d4387d1211280aa7bc8e Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 21 Jun 2005 14:43:28 -0700
Subject: [IPV4]: Fix fib_trie.c's args to fib_dump_info().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index c0ece94fc63..0671569ee6f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1786,7 +1786,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi
 				  &xkey,
 				  plen,
 				  fa->fa_tos,
-				  fa->fa_info) < 0) {
+				  fa->fa_info, 0) < 0) {
 			cb->args[3] = i;
 			return -1;
 			}
-- 
cgit v1.2.3


From b53542073927878b18d642f6bf794adef6d45a18 Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@freescale.com>
Date: Wed, 22 Jun 2005 09:58:03 -0500
Subject: [PATCH] Fix extra double quote in IPV4 Kconfig

Kconfig option had an extra double quote at the end of the line
which was causing in warning when building.

Signed-off-by: Kumar Gala <kumar.gala@freescale.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/ipv4/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 05107e0dc14..567b03b1c34 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -2,7 +2,7 @@
 # IP configuration
 #
 choice 
-	prompt "Choose IP: FIB lookup""
+	prompt "Choose IP: FIB lookup"
 	depends on INET
 	default IP_FIB_HASH
 
-- 
cgit v1.2.3


From 5d927eb0101eb791fb2d4f72b49a2da5faf01941 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Wed, 22 Jun 2005 12:37:50 -0700
Subject: [NETFILTER]: Fix handling of ICMP packets (RELATED) in ipt_CLUSTERIP
 target.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index dc4362b57cf..9cde8c61f52 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -339,7 +339,7 @@ target(struct sk_buff **pskb,
 	 * error messages (RELATED) and information requests (see below) */
 	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
 	    && (ctinfo == IP_CT_RELATED 
-		|| ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY))
+		|| ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
 		return IPT_CONTINUE;
 
 	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 
-- 
cgit v1.2.3


From d05fdb0cec75415b2d9eb95748386e67414e49c3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:19 +0000
Subject: [PATCH] RPC: Fix a race with rpc_restart_call()

 If the task->tk_exit() wants to restart the RPC call after delaying
 then the current RPC code will clobber the timer by calling
 rpc_delete_timer() immediately after re-entering the loop in
 __rpc_execute().

 Problem noticed by Oleg Nesterov <oleg@tv-sign.ru>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/sched.c | 53 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index c06614d0e31..cc298fa4b81 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -554,6 +554,30 @@ __rpc_atrun(struct rpc_task *task)
 	rpc_wake_up_task(task);
 }
 
+/*
+ * Helper that calls task->tk_exit if it exists and then returns
+ * true if we should exit __rpc_execute.
+ */
+static inline int __rpc_do_exit(struct rpc_task *task)
+{
+	if (task->tk_exit != NULL) {
+		lock_kernel();
+		task->tk_exit(task);
+		unlock_kernel();
+		/* If tk_action is non-null, we should restart the call */
+		if (task->tk_action != NULL) {
+			if (!RPC_ASSASSINATED(task)) {
+				/* Release RPC slot and buffer memory */
+				xprt_release(task);
+				rpc_free(task);
+				return 0;
+			}
+			printk(KERN_ERR "RPC: dead task tried to walk away.\n");
+		}
+	}
+	return 1;
+}
+
 /*
  * This is the RPC `scheduler' (or rather, the finite state machine).
  */
@@ -566,8 +590,7 @@ static int __rpc_execute(struct rpc_task *task)
 
 	BUG_ON(RPC_IS_QUEUED(task));
 
- restarted:
-	while (1) {
+	for (;;) {
 		/*
 		 * Garbage collection of pending timers...
 		 */
@@ -600,11 +623,12 @@ static int __rpc_execute(struct rpc_task *task)
 		 * by someone else.
 		 */
 		if (!RPC_IS_QUEUED(task)) {
-			if (!task->tk_action)
+			if (task->tk_action != NULL) {
+				lock_kernel();
+				task->tk_action(task);
+				unlock_kernel();
+			} else if (__rpc_do_exit(task))
 				break;
-			lock_kernel();
-			task->tk_action(task);
-			unlock_kernel();
 		}
 
 		/*
@@ -645,23 +669,6 @@ static int __rpc_execute(struct rpc_task *task)
 		dprintk("RPC: %4d sync task resuming\n", task->tk_pid);
 	}
 
-	if (task->tk_exit) {
-		lock_kernel();
-		task->tk_exit(task);
-		unlock_kernel();
-		/* If tk_action is non-null, the user wants us to restart */
-		if (task->tk_action) {
-			if (!RPC_ASSASSINATED(task)) {
-				/* Release RPC slot and buffer memory */
-				if (task->tk_rqstp)
-					xprt_release(task);
-				rpc_free(task);
-				goto restarted;
-			}
-			printk(KERN_ERR "RPC: dead task tries to walk away.\n");
-		}
-	}
-
 	dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status);
 	status = task->tk_status;
 
-- 
cgit v1.2.3


From 334ccfd545bba9690515f2c5c167d5adb161989b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:19 +0000
Subject: [PATCH] RPC: Ensure XDR iovec length is initialized correctly in
 call_header

 Fix up call_header() so that it calls xdr_adjust_iovec().
 Fix calculation of the scratch buffer length in xdr_init_encode().

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c |  4 +++-
 net/sunrpc/svc.c  |  1 +
 net/sunrpc/xdr.c  | 18 +++++++++++++++---
 3 files changed, 19 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 02bc029d46f..209aaf59569 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -957,7 +957,9 @@ call_header(struct rpc_task *task)
 	*p++ = htonl(clnt->cl_prog);	/* program number */
 	*p++ = htonl(clnt->cl_vers);	/* program version */
 	*p++ = htonl(task->tk_msg.rpc_proc->p_proc);	/* procedure */
-	return rpcauth_marshcred(task, p);
+	p = rpcauth_marshcred(task, p);
+	req->rq_slen = xdr_adjust_iovec(&req->rq_svec[0], p);
+	return p;
 }
 
 /*
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index bb2d99f3331..a02d424a740 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -281,6 +281,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
 	rqstp->rq_res.len = 0;
 	rqstp->rq_res.page_base = 0;
 	rqstp->rq_res.page_len = 0;
+	rqstp->rq_res.buflen = PAGE_SIZE;
 	rqstp->rq_res.tail[0].iov_len = 0;
 	/* tcp needs a space for the record length... */
 	if (rqstp->rq_prot == IPPROTO_TCP)
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 67b9f035ba8..f86d1baa630 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -616,12 +616,24 @@ xdr_shift_buf(struct xdr_buf *buf, size_t len)
 void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p)
 {
 	struct kvec *iov = buf->head;
+	int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
 
+	BUG_ON(scratch_len < 0);
 	xdr->buf = buf;
 	xdr->iov = iov;
-	xdr->end = (uint32_t *)((char *)iov->iov_base + iov->iov_len);
-	buf->len = iov->iov_len = (char *)p - (char *)iov->iov_base;
-	xdr->p = p;
+	xdr->p = (uint32_t *)((char *)iov->iov_base + iov->iov_len);
+	xdr->end = (uint32_t *)((char *)iov->iov_base + scratch_len);
+	BUG_ON(iov->iov_len > scratch_len);
+
+	if (p != xdr->p && p != NULL) {
+		size_t len;
+
+		BUG_ON(p < xdr->p || p > xdr->end);
+		len = (char *)p - (char *)xdr->p;
+		xdr->p = p;
+		buf->len += len;
+		iov->iov_len += len;
+	}
 }
 EXPORT_SYMBOL(xdr_init_encode);
 
-- 
cgit v1.2.3


From 5b616f5d596c0b056129f8aeafbc08409b3cd050 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:20 +0000
Subject: [PATCH] RPC: Make rpc_create_client() destroy the transport on
 failure.

 This saves us a couple of lines of cleanup code for each call.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c        | 1 +
 net/sunrpc/pmap_clnt.c   | 4 +---
 net/sunrpc/sunrpc_syms.c | 1 -
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 209aaf59569..99515d7727a 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -178,6 +178,7 @@ out_no_path:
 		kfree(clnt->cl_server);
 	kfree(clnt);
 out_err:
+	xprt_destroy(xprt);
 	return ERR_PTR(err);
 }
 
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index d0b1d2c34a4..97c420ff1ee 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -210,9 +210,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto)
 	clnt = rpc_create_client(xprt, hostname,
 				&pmap_program, RPC_PMAP_VERSION,
 				RPC_AUTH_UNIX);
-	if (IS_ERR(clnt)) {
-		xprt_destroy(xprt);
-	} else {
+	if (!IS_ERR(clnt)) {
 		clnt->cl_softrtry = 1;
 		clnt->cl_chatty   = 1;
 		clnt->cl_oneshot  = 1;
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index d4f26bf9e73..1b0ff7e0e86 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -61,7 +61,6 @@ EXPORT_SYMBOL(rpc_mkpipe);
 
 /* Client transport */
 EXPORT_SYMBOL(xprt_create_proto);
-EXPORT_SYMBOL(xprt_destroy);
 EXPORT_SYMBOL(xprt_set_timeout);
 EXPORT_SYMBOL(xprt_udp_slot_table_entries);
 EXPORT_SYMBOL(xprt_tcp_slot_table_entries);
-- 
cgit v1.2.3


From 5ee0ed7d3ab620a764740fb018f469d45f561931 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:20 +0000
Subject: [PATCH] RPC: Make rpc_create_client() probe server for RPC
 program+version support

 Ensure that we don't create an RPC client without checking that the server
 does indeed support the RPC program + version that we are trying to set up.

 This enables us to immediately return an error to "mount" if it turns out
 that the server is only supporting NFSv2, when we requested NFSv3 or NFSv4.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c      | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-
 net/sunrpc/pmap_clnt.c |  2 +-
 2 files changed, 59 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 99515d7727a..b36797ad808 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -97,7 +97,7 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
  * made to sleep too long.
  */
 struct rpc_clnt *
-rpc_create_client(struct rpc_xprt *xprt, char *servname,
+rpc_new_client(struct rpc_xprt *xprt, char *servname,
 		  struct rpc_program *program, u32 vers,
 		  rpc_authflavor_t flavor)
 {
@@ -182,6 +182,36 @@ out_err:
 	return ERR_PTR(err);
 }
 
+/**
+ * Create an RPC client
+ * @xprt - pointer to xprt struct
+ * @servname - name of server
+ * @info - rpc_program
+ * @version - rpc_program version
+ * @authflavor - rpc_auth flavour to use
+ *
+ * Creates an RPC client structure, then pings the server in order to
+ * determine if it is up, and if it supports this program and version.
+ *
+ * This function should never be called by asynchronous tasks such as
+ * the portmapper.
+ */
+struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
+		struct rpc_program *info, u32 version, rpc_authflavor_t authflavor)
+{
+	struct rpc_clnt *clnt;
+	int err;
+	
+	clnt = rpc_new_client(xprt, servname, info, version, authflavor);
+	if (IS_ERR(clnt))
+		return clnt;
+	err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
+	if (err == 0)
+		return clnt;
+	rpc_shutdown_client(clnt);
+	return ERR_PTR(err);
+}
+
 /*
  * This function clones the RPC client structure. It allows us to share the
  * same transport while varying parameters such as the authentication
@@ -1086,3 +1116,30 @@ out_overflow:
 	printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__);
 	goto out_retry;
 }
+
+static int rpcproc_encode_null(void *rqstp, u32 *data, void *obj)
+{
+	return 0;
+}
+
+static int rpcproc_decode_null(void *rqstp, u32 *data, void *obj)
+{
+	return 0;
+}
+
+static struct rpc_procinfo rpcproc_null = {
+	.p_encode = rpcproc_encode_null,
+	.p_decode = rpcproc_decode_null,
+};
+
+int rpc_ping(struct rpc_clnt *clnt, int flags)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &rpcproc_null,
+	};
+	int err;
+	msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+	err = rpc_call_sync(clnt, &msg, flags);
+	put_rpccred(msg.rpc_cred);
+	return err;
+}
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 97c420ff1ee..df4d84c9020 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -207,7 +207,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto)
 	xprt->addr.sin_port = htons(RPC_PMAP_PORT);
 
 	/* printk("pmap: create clnt\n"); */
-	clnt = rpc_create_client(xprt, hostname,
+	clnt = rpc_new_client(xprt, hostname,
 				&pmap_program, RPC_PMAP_VERSION,
 				RPC_AUTH_UNIX);
 	if (!IS_ERR(clnt)) {
-- 
cgit v1.2.3


From 96651ab341cde0fee940ec837f323d711cbfa7d5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:21 +0000
Subject: [PATCH] RPC: Shrink struct rpc_task by switching to wait_on_bit()

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/sched.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index cc298fa4b81..2d9eb7fbd52 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -290,7 +290,7 @@ static void rpc_make_runnable(struct rpc_task *task)
 			return;
 		}
 	} else
-		wake_up(&task->u.tk_wait.waitq);
+		wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
 }
 
 /*
@@ -578,6 +578,14 @@ static inline int __rpc_do_exit(struct rpc_task *task)
 	return 1;
 }
 
+static int rpc_wait_bit_interruptible(void *word)
+{
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
 /*
  * This is the RPC `scheduler' (or rather, the finite state machine).
  */
@@ -648,22 +656,21 @@ static int __rpc_execute(struct rpc_task *task)
 
 		/* sync task: sleep here */
 		dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid);
-		if (RPC_TASK_UNINTERRUPTIBLE(task)) {
-			__wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task));
-		} else {
-			__wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status);
+		/* Note: Caller should be using rpc_clnt_sigmask() */
+		status = out_of_line_wait_on_bit(&task->tk_runstate,
+				RPC_TASK_QUEUED, rpc_wait_bit_interruptible,
+				TASK_INTERRUPTIBLE);
+		if (status == -ERESTARTSYS) {
 			/*
 			 * When a sync task receives a signal, it exits with
 			 * -ERESTARTSYS. In order to catch any callbacks that
 			 * clean up after sleeping on some queue, we don't
 			 * break the loop here, but go around once more.
 			 */
-			if (status == -ERESTARTSYS) {
-				dprintk("RPC: %4d got signal\n", task->tk_pid);
-				task->tk_flags |= RPC_TASK_KILLED;
-				rpc_exit(task, -ERESTARTSYS);
-				rpc_wake_up_task(task);
-			}
+			dprintk("RPC: %4d got signal\n", task->tk_pid);
+			task->tk_flags |= RPC_TASK_KILLED;
+			rpc_exit(task, -ERESTARTSYS);
+			rpc_wake_up_task(task);
 		}
 		rpc_set_running(task);
 		dprintk("RPC: %4d sync task resuming\n", task->tk_pid);
@@ -766,8 +773,6 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action call
 
 	/* Initialize workqueue for async tasks */
 	task->tk_workqueue = rpciod_workqueue;
-	if (!RPC_IS_ASYNC(task))
-		init_waitqueue_head(&task->u.tk_wait.waitq);
 
 	if (clnt) {
 		atomic_inc(&clnt->cl_users);
-- 
cgit v1.2.3


From 438b6fdebf2a2e8573e7290bc176feb4d4475f43 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 22 Jun 2005 17:16:23 +0000
Subject: [PATCH] RPC: Don't fall back from krb5p to krb5i

 We shouldn't be silently falling back from krb5p to krb5i.

 Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/auth_gss/auth_gss.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index a33b627cbef..7d88db83ab1 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -675,9 +675,8 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 		goto err_free;
 	}
 	gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
-	/* FIXME: Will go away once privacy support is merged in */
-	if (gss_auth->service == RPC_GSS_SVC_PRIVACY)
-		gss_auth->service = RPC_GSS_SVC_INTEGRITY;
+	if (gss_auth->service == 0)
+		goto err_put_mech;
 	INIT_LIST_HEAD(&gss_auth->upcalls);
 	spin_lock_init(&gss_auth->lock);
 	auth = &gss_auth->rpc_auth;
-- 
cgit v1.2.3


From 6a19275ada9137435da58990c8f8d3f58e170bf1 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 22 Jun 2005 17:16:23 +0000
Subject: [PATCH] RPC: [PATCH] improve rpcauthauth_create error returns

 Currently we return -ENOMEM for every single failure to create a new auth.
 This is actually accurate for auth_null and auth_unix, but for auth_gss it's a
 bit confusing.

 Allow rpcauth_create (and the ->create methods) to return errors.  With this
 patch, the user may sometimes see an EINVAL instead.  Whee.

 Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/auth.c              |  6 +++---
 net/sunrpc/auth_gss/auth_gss.c | 13 +++++++++----
 net/sunrpc/clnt.c              |  6 ++++--
 3 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 9bcec9b927b..505e2d4b3d6 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -66,10 +66,10 @@ rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
 	u32			flavor = pseudoflavor_to_flavor(pseudoflavor);
 
 	if (flavor >= RPC_AUTH_MAXFLAVOR || !(ops = auth_flavors[flavor]))
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	auth = ops->create(clnt, pseudoflavor);
-	if (!auth)
-		return NULL;
+	if (IS_ERR(auth))
+		return auth;
 	if (clnt->cl_auth)
 		rpcauth_destroy(clnt->cl_auth);
 	clnt->cl_auth = auth;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 7d88db83ab1..2f7b867161d 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -660,14 +660,16 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 {
 	struct gss_auth *gss_auth;
 	struct rpc_auth * auth;
+	int err = -ENOMEM; /* XXX? */
 
 	dprintk("RPC:      creating GSS authenticator for client %p\n",clnt);
 
 	if (!try_module_get(THIS_MODULE))
-		return NULL;
+		return ERR_PTR(err);
 	if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL)))
 		goto out_dec;
 	gss_auth->client = clnt;
+	err = -EINVAL;
 	gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
 	if (!gss_auth->mech) {
 		printk(KERN_WARNING "%s: Pseudoflavor %d not found!",
@@ -686,15 +688,18 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
 	auth->au_flavor = flavor;
 	atomic_set(&auth->au_count, 1);
 
-	if (rpcauth_init_credcache(auth, GSS_CRED_EXPIRE) < 0)
+	err = rpcauth_init_credcache(auth, GSS_CRED_EXPIRE);
+	if (err)
 		goto err_put_mech;
 
 	snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s",
 			clnt->cl_pathname,
 			gss_auth->mech->gm_name);
 	gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN);
-	if (IS_ERR(gss_auth->dentry))
+	if (IS_ERR(gss_auth->dentry)) {
+		err = PTR_ERR(gss_auth->dentry);
 		goto err_put_mech;
+	}
 
 	return auth;
 err_put_mech:
@@ -703,7 +708,7 @@ err_free:
 	kfree(gss_auth);
 out_dec:
 	module_put(THIS_MODULE);
-	return NULL;
+	return ERR_PTR(err);
 }
 
 static void
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b36797ad808..9da1deb482e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -103,6 +103,7 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname,
 {
 	struct rpc_version	*version;
 	struct rpc_clnt		*clnt = NULL;
+	struct rpc_auth		*auth;
 	int err;
 	int len;
 
@@ -157,10 +158,11 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname,
 	if (err < 0)
 		goto out_no_path;
 
-	err = -ENOMEM;
-	if (!rpcauth_create(flavor, clnt)) {
+	auth = rpcauth_create(flavor, clnt);
+	if (IS_ERR(auth)) {
 		printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n",
 				flavor);
+		err = PTR_ERR(auth);
 		goto out_no_auth;
 	}
 
-- 
cgit v1.2.3


From cdf477068e6db0c3e19df96f46abb85202de138c Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:23 +0000
Subject: [PATCH] RPC: Return -EPFNOSUPPORT for RPC programs that are
 unavailable

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Signed-off-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 9da1deb482e..33f12b84e26 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1021,10 +1021,11 @@ call_verify(struct rpc_task *task)
 			case RPC_AUTH_ERROR:
 				break;
 			case RPC_MISMATCH:
-				printk(KERN_WARNING "%s: RPC call version mismatch!\n", __FUNCTION__);
-				goto out_eio;
+				dprintk("%s: RPC call version mismatch!\n", __FUNCTION__);
+				error = -EPROTONOSUPPORT;
+				goto out_err;
 			default:
-				printk(KERN_WARNING "%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n);
+				dprintk("%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n);
 				goto out_eio;
 		}
 		if (--len < 0)
@@ -1075,23 +1076,26 @@ call_verify(struct rpc_task *task)
 	case RPC_SUCCESS:
 		return p;
 	case RPC_PROG_UNAVAIL:
-		printk(KERN_WARNING "RPC: call_verify: program %u is unsupported by server %s\n",
+		dprintk("RPC: call_verify: program %u is unsupported by server %s\n",
 				(unsigned int)task->tk_client->cl_prog,
 				task->tk_client->cl_server);
-		goto out_eio;
+		error = -EPFNOSUPPORT;
+		goto out_err;
 	case RPC_PROG_MISMATCH:
-		printk(KERN_WARNING "RPC: call_verify: program %u, version %u unsupported by server %s\n",
+		dprintk("RPC: call_verify: program %u, version %u unsupported by server %s\n",
 				(unsigned int)task->tk_client->cl_prog,
 				(unsigned int)task->tk_client->cl_vers,
 				task->tk_client->cl_server);
-		goto out_eio;
+		error = -EPROTONOSUPPORT;
+		goto out_err;
 	case RPC_PROC_UNAVAIL:
-		printk(KERN_WARNING "RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n",
+		dprintk("RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n",
 				task->tk_msg.rpc_proc,
 				task->tk_client->cl_prog,
 				task->tk_client->cl_vers,
 				task->tk_client->cl_server);
-		goto out_eio;
+		error = -EOPNOTSUPP;
+		goto out_err;
 	case RPC_GARBAGE_ARGS:
 		dprintk("RPC: %4d %s: server saw garbage\n", task->tk_pid, __FUNCTION__);
 		break;			/* retry */
@@ -1104,7 +1108,7 @@ out_retry:
 	task->tk_client->cl_stats->rpcgarbage++;
 	if (task->tk_garb_retry) {
 		task->tk_garb_retry--;
-		dprintk(KERN_WARNING "RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid);
+		dprintk("RPC %s: retrying %4d\n", __FUNCTION__, task->tk_pid);
 		task->tk_action = call_bind;
 		return NULL;
 	}
-- 
cgit v1.2.3


From 007e251f2b2760f738c92adc8c80cbae0bed3ce5 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:23 +0000
Subject: [PATCH] RPC: Allow multiple RPC client programs to share the same
 transport

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Acked-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c        | 40 ++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/pmap_clnt.c   |  3 +++
 net/sunrpc/sunrpc_syms.c |  1 +
 3 files changed, 44 insertions(+)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 33f12b84e26..c979fcf8879 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -241,6 +241,8 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
 	if (new->cl_auth)
 		atomic_inc(&new->cl_auth->au_count);
+	new->cl_pmap		= &new->cl_pmap_default;
+	rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait");
 	return new;
 out_no_clnt:
 	printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__);
@@ -329,6 +331,44 @@ rpc_release_client(struct rpc_clnt *clnt)
 		rpc_destroy_client(clnt);
 }
 
+/**
+ * rpc_bind_new_program - bind a new RPC program to an existing client
+ * @old - old rpc_client
+ * @program - rpc program to set
+ * @vers - rpc program version
+ *
+ * Clones the rpc client and sets up a new RPC program. This is mainly
+ * of use for enabling different RPC programs to share the same transport.
+ * The Sun NFSv2/v3 ACL protocol can do this.
+ */
+struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
+				      struct rpc_program *program,
+				      int vers)
+{
+	struct rpc_clnt *clnt;
+	struct rpc_version *version;
+	int err;
+
+	BUG_ON(vers >= program->nrvers || !program->version[vers]);
+	version = program->version[vers];
+	clnt = rpc_clone_client(old);
+	if (IS_ERR(clnt))
+		goto out;
+	clnt->cl_procinfo = version->procs;
+	clnt->cl_maxproc  = version->nrprocs;
+	clnt->cl_protname = program->name;
+	clnt->cl_prog     = program->number;
+	clnt->cl_vers     = version->number;
+	clnt->cl_stats    = program->stats;
+	err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
+	if (err != 0) {
+		rpc_shutdown_client(clnt);
+		clnt = ERR_PTR(err);
+	}
+out:	
+	return clnt;
+}
+
 /*
  * Default callback for async RPC calls
  */
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index df4d84c9020..4e81f276692 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -53,6 +53,9 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt)
 			task->tk_pid, clnt->cl_server,
 			map->pm_prog, map->pm_vers, map->pm_prot);
 
+	/* Autobind on cloned rpc clients is discouraged */
+	BUG_ON(clnt->cl_parent != clnt);
+
 	spin_lock(&pmap_lock);
 	if (map->pm_binding) {
 		rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL);
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 1b0ff7e0e86..d8673f66acc 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -42,6 +42,7 @@ EXPORT_SYMBOL(rpc_release_task);
 /* RPC client functions */
 EXPORT_SYMBOL(rpc_create_client);
 EXPORT_SYMBOL(rpc_clone_client);
+EXPORT_SYMBOL(rpc_bind_new_program);
 EXPORT_SYMBOL(rpc_destroy_client);
 EXPORT_SYMBOL(rpc_shutdown_client);
 EXPORT_SYMBOL(rpc_release_client);
-- 
cgit v1.2.3


From e053d1ab62c8ef0eff3dd4c95448cad3c6d2fbf4 Mon Sep 17 00:00:00 2001
From: Olaf Kirch <okir@suse.de>
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: Lazy RPC receive buffer allocation

 Signed-off-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xdr.c  | 16 +++++++++++++---
 net/sunrpc/xprt.c | 26 ++++++++++++++++++++++----
 2 files changed, 35 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index f86d1baa630..65b268d3978 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -176,7 +176,7 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
 	xdr->buflen += len;
 }
 
-void
+int
 xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 			  skb_reader_t *desc,
 			  skb_read_actor_t copy_actor)
@@ -190,7 +190,7 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		len -= base;
 		ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
 		if (ret != len || !desc->count)
-			return;
+			return 0;
 		base = 0;
 	} else
 		base -= len;
@@ -210,6 +210,14 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 	do {
 		char *kaddr;
 
+		/* ACL likes to be lazy in allocating pages - ACLs
+		 * are small by default but can get huge. */
+		if (unlikely(*ppage == NULL)) {
+			*ppage = alloc_page(GFP_ATOMIC);
+			if (unlikely(*ppage == NULL))
+				return -ENOMEM;
+		}
+
 		len = PAGE_CACHE_SIZE;
 		kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA);
 		if (base) {
@@ -226,13 +234,15 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		flush_dcache_page(*ppage);
 		kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
 		if (ret != len || !desc->count)
-			return;
+			return 0;
 		ppage++;
 	} while ((pglen -= len) != 0);
 copy_tail:
 	len = xdr->tail[0].iov_len;
 	if (base < len)
 		copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+
+	return 0;
 }
 
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c74a6bb9407..a180ed4952d 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -725,7 +725,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 		goto no_checksum;
 
 	desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
-	xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits);
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0)
+		return -1;
 	if (desc.offset != skb->len) {
 		unsigned int csum2;
 		csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
@@ -737,7 +738,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 		return -1;
 	return 0;
 no_checksum:
-	xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits);
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
+		return -1;
 	if (desc.count)
 		return -1;
 	return 0;
@@ -907,6 +909,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 	struct rpc_rqst *req;
 	struct xdr_buf *rcvbuf;
 	size_t len;
+	int r;
 
 	/* Find and lock the request corresponding to this xid */
 	spin_lock(&xprt->sock_lock);
@@ -927,16 +930,30 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		len = xprt->tcp_reclen - xprt->tcp_offset;
 		memcpy(&my_desc, desc, sizeof(my_desc));
 		my_desc.count = len;
-		xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  &my_desc, tcp_copy_data);
 		desc->count -= len;
 		desc->offset += len;
 	} else
-		xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  desc, tcp_copy_data);
 	xprt->tcp_copied += len;
 	xprt->tcp_offset += len;
 
+	if (r < 0) {
+		/* Error when copying to the receive buffer,
+		 * usually because we weren't able to allocate
+		 * additional buffer pages. All we can do now
+		 * is turn off XPRT_COPY_DATA, so the request
+		 * will not receive any additional updates,
+		 * and time out.
+		 * Any remaining data from this record will
+		 * be discarded.
+		 */
+		xprt->tcp_flags &= ~XPRT_COPY_DATA;
+		goto out;
+	}
+
 	if (xprt->tcp_copied == req->rq_private_buf.buflen)
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
 	else if (xprt->tcp_offset == xprt->tcp_reclen) {
@@ -949,6 +966,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 				req->rq_task->tk_pid);
 		xprt_complete_rqst(xprt, req, xprt->tcp_copied);
 	}
+out:
 	spin_unlock(&xprt->sock_lock);
 	tcp_check_recm(xprt);
 }
-- 
cgit v1.2.3


From 7e06b53d796a3740307b54aa2799077f8a0c84e7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: fix accounting bug in the case of a truncated RPC
 message

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xdr.c  | 22 ++++++++++++++--------
 net/sunrpc/xprt.c | 35 +++++++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 65b268d3978..b3ac3f72bf9 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -176,21 +176,23 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
 	xdr->buflen += len;
 }
 
-int
+ssize_t
 xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 			  skb_reader_t *desc,
 			  skb_read_actor_t copy_actor)
 {
 	struct page	**ppage = xdr->pages;
 	unsigned int	len, pglen = xdr->page_len;
+	ssize_t		copied = 0;
 	int		ret;
 
 	len = xdr->head[0].iov_len;
 	if (base < len) {
 		len -= base;
 		ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
+		copied += ret;
 		if (ret != len || !desc->count)
-			return 0;
+			goto out;
 		base = 0;
 	} else
 		base -= len;
@@ -214,8 +216,11 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		 * are small by default but can get huge. */
 		if (unlikely(*ppage == NULL)) {
 			*ppage = alloc_page(GFP_ATOMIC);
-			if (unlikely(*ppage == NULL))
-				return -ENOMEM;
+			if (unlikely(*ppage == NULL)) {
+				if (copied == 0)
+					copied = -ENOMEM;
+				goto out;
+			}
 		}
 
 		len = PAGE_CACHE_SIZE;
@@ -233,16 +238,17 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		}
 		flush_dcache_page(*ppage);
 		kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
+		copied += ret;
 		if (ret != len || !desc->count)
-			return 0;
+			goto out;
 		ppage++;
 	} while ((pglen -= len) != 0);
 copy_tail:
 	len = xdr->tail[0].iov_len;
 	if (base < len)
-		copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
-
-	return 0;
+		copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+out:
+	return copied;
 }
 
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a180ed4952d..ef941e7de8b 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -823,10 +823,15 @@ tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
 {
 	if (len > desc->count)
 		len = desc->count;
-	if (skb_copy_bits(desc->skb, desc->offset, p, len))
+	if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
+		dprintk("RPC:      failed to copy %zu bytes from skb. %zu bytes remain\n",
+				len, desc->count);
 		return 0;
+	}
 	desc->offset += len;
 	desc->count -= len;
+	dprintk("RPC:      copied %zu bytes from skb. %zu bytes remain\n",
+			len, desc->count);
 	return len;
 }
 
@@ -865,6 +870,8 @@ tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
 static void
 tcp_check_recm(struct rpc_xprt *xprt)
 {
+	dprintk("RPC:      xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
+			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
 	if (xprt->tcp_offset == xprt->tcp_reclen) {
 		xprt->tcp_flags |= XPRT_COPY_RECM;
 		xprt->tcp_offset = 0;
@@ -909,7 +916,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 	struct rpc_rqst *req;
 	struct xdr_buf *rcvbuf;
 	size_t len;
-	int r;
+	ssize_t r;
 
 	/* Find and lock the request corresponding to this xid */
 	spin_lock(&xprt->sock_lock);
@@ -932,15 +939,17 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		my_desc.count = len;
 		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  &my_desc, tcp_copy_data);
-		desc->count -= len;
-		desc->offset += len;
+		desc->count -= r;
+		desc->offset += r;
 	} else
 		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  desc, tcp_copy_data);
-	xprt->tcp_copied += len;
-	xprt->tcp_offset += len;
 
-	if (r < 0) {
+	if (r > 0) {
+		xprt->tcp_copied += r;
+		xprt->tcp_offset += r;
+	}
+	if (r != len) {
 		/* Error when copying to the receive buffer,
 		 * usually because we weren't able to allocate
 		 * additional buffer pages. All we can do now
@@ -951,9 +960,18 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		 * be discarded.
 		 */
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
+		dprintk("RPC:      XID %08x truncated request\n",
+				ntohl(xprt->tcp_xid));
+		dprintk("RPC:      xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+				xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
 		goto out;
 	}
 
+	dprintk("RPC:      XID %08x read %u bytes\n",
+			ntohl(xprt->tcp_xid), r);
+	dprintk("RPC:      xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
+
 	if (xprt->tcp_copied == req->rq_private_buf.buflen)
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
 	else if (xprt->tcp_offset == xprt->tcp_reclen) {
@@ -961,12 +979,12 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 			xprt->tcp_flags &= ~XPRT_COPY_DATA;
 	}
 
+out:
 	if (!(xprt->tcp_flags & XPRT_COPY_DATA)) {
 		dprintk("RPC: %4d received reply complete\n",
 				req->rq_task->tk_pid);
 		xprt_complete_rqst(xprt, req, xprt->tcp_copied);
 	}
-out:
 	spin_unlock(&xprt->sock_lock);
 	tcp_check_recm(xprt);
 }
@@ -985,6 +1003,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
 	desc->count -= len;
 	desc->offset += len;
 	xprt->tcp_offset += len;
+	dprintk("RPC:      discarded %u bytes\n", len);
 	tcp_check_recm(xprt);
 }
 
-- 
cgit v1.2.3


From bd8100e7eda87507649c6ba4cb32173b34e49986 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: Encode and decode arbitrary XDR arrays

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Acked-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/sunrpc_syms.c |   4 +
 net/sunrpc/xdr.c         | 256 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 257 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index d8673f66acc..32e8acbc60f 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -129,6 +129,10 @@ EXPORT_SYMBOL(xdr_encode_netobj);
 EXPORT_SYMBOL(xdr_encode_pages);
 EXPORT_SYMBOL(xdr_inline_pages);
 EXPORT_SYMBOL(xdr_shift_buf);
+EXPORT_SYMBOL(xdr_encode_word);
+EXPORT_SYMBOL(xdr_decode_word);
+EXPORT_SYMBOL(xdr_encode_array2);
+EXPORT_SYMBOL(xdr_decode_array2);
 EXPORT_SYMBOL(xdr_buf_from_iov);
 EXPORT_SYMBOL(xdr_buf_subsegment);
 EXPORT_SYMBOL(xdr_buf_read_netobj);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index b3ac3f72bf9..8a4d9c106af 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -887,8 +887,34 @@ out:
 	return status;
 }
 
-static int
-read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
+/* obj is assumed to point to allocated memory of size at least len: */
+int
+write_bytes_to_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len)
+{
+	struct xdr_buf subbuf;
+	int this_len;
+	int status;
+
+	status = xdr_buf_subsegment(buf, &subbuf, base, len);
+	if (status)
+		goto out;
+	this_len = min(len, (int)subbuf.head[0].iov_len);
+	memcpy(subbuf.head[0].iov_base, obj, this_len);
+	len -= this_len;
+	obj += this_len;
+	this_len = min(len, (int)subbuf.page_len);
+	if (this_len)
+		_copy_to_pages(subbuf.pages, subbuf.page_base, obj, this_len);
+	len -= this_len;
+	obj += this_len;
+	this_len = min(len, (int)subbuf.tail[0].iov_len);
+	memcpy(subbuf.tail[0].iov_base, obj, this_len);
+out:
+	return status;
+}
+
+int
+xdr_decode_word(struct xdr_buf *buf, int base, u32 *obj)
 {
 	u32	raw;
 	int	status;
@@ -900,6 +926,14 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
 	return 0;
 }
 
+int
+xdr_encode_word(struct xdr_buf *buf, int base, u32 obj)
+{
+	u32	raw = htonl(obj);
+
+	return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj));
+}
+
 /* If the netobj starting offset bytes from the start of xdr_buf is contained
  * entirely in the head or the tail, set object to point to it; otherwise
  * try to find space for it at the end of the tail, copy it there, and
@@ -910,7 +944,7 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
 	u32	tail_offset = buf->head[0].iov_len + buf->page_len;
 	u32	obj_end_offset;
 
-	if (read_u32_from_xdr_buf(buf, offset, &obj->len))
+	if (xdr_decode_word(buf, offset, &obj->len))
 		goto out;
 	obj_end_offset = offset + 4 + obj->len;
 
@@ -943,3 +977,219 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
 out:
 	return -1;
 }
+
+/* Returns 0 on success, or else a negative error code. */
+static int
+xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
+		 struct xdr_array2_desc *desc, int encode)
+{
+	char *elem = NULL, *c;
+	unsigned int copied = 0, todo, avail_here;
+	struct page **ppages = NULL;
+	int err;
+
+	if (encode) {
+		if (xdr_encode_word(buf, base, desc->array_len) != 0)
+			return -EINVAL;
+	} else {
+		if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
+		    (unsigned long) base + 4 + desc->array_len *
+				    desc->elem_size > buf->len)
+			return -EINVAL;
+	}
+	base += 4;
+
+	if (!desc->xcode)
+		return 0;
+
+	todo = desc->array_len * desc->elem_size;
+
+	/* process head */
+	if (todo && base < buf->head->iov_len) {
+		c = buf->head->iov_base + base;
+		avail_here = min_t(unsigned int, todo,
+				   buf->head->iov_len - base);
+		todo -= avail_here;
+
+		while (avail_here >= desc->elem_size) {
+			err = desc->xcode(desc, c);
+			if (err)
+				goto out;
+			c += desc->elem_size;
+			avail_here -= desc->elem_size;
+		}
+		if (avail_here) {
+			if (!elem) {
+				elem = kmalloc(desc->elem_size, GFP_KERNEL);
+				err = -ENOMEM;
+				if (!elem)
+					goto out;
+			}
+			if (encode) {
+				err = desc->xcode(desc, elem);
+				if (err)
+					goto out;
+				memcpy(c, elem, avail_here);
+			} else
+				memcpy(elem, c, avail_here);
+			copied = avail_here;
+		}
+		base = buf->head->iov_len;  /* align to start of pages */
+	}
+
+	/* process pages array */
+	base -= buf->head->iov_len;
+	if (todo && base < buf->page_len) {
+		unsigned int avail_page;
+
+		avail_here = min(todo, buf->page_len - base);
+		todo -= avail_here;
+
+		base += buf->page_base;
+		ppages = buf->pages + (base >> PAGE_CACHE_SHIFT);
+		base &= ~PAGE_CACHE_MASK;
+		avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base,
+					avail_here);
+		c = kmap(*ppages) + base;
+
+		while (avail_here) {
+			avail_here -= avail_page;
+			if (copied || avail_page < desc->elem_size) {
+				unsigned int l = min(avail_page,
+					desc->elem_size - copied);
+				if (!elem) {
+					elem = kmalloc(desc->elem_size,
+						       GFP_KERNEL);
+					err = -ENOMEM;
+					if (!elem)
+						goto out;
+				}
+				if (encode) {
+					if (!copied) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+					}
+					memcpy(c, elem + copied, l);
+					copied += l;
+					if (copied == desc->elem_size)
+						copied = 0;
+				} else {
+					memcpy(elem + copied, c, l);
+					copied += l;
+					if (copied == desc->elem_size) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+						copied = 0;
+					}
+				}
+				avail_page -= l;
+				c += l;
+			}
+			while (avail_page >= desc->elem_size) {
+				err = desc->xcode(desc, c);
+				if (err)
+					goto out;
+				c += desc->elem_size;
+				avail_page -= desc->elem_size;
+			}
+			if (avail_page) {
+				unsigned int l = min(avail_page,
+					    desc->elem_size - copied);
+				if (!elem) {
+					elem = kmalloc(desc->elem_size,
+						       GFP_KERNEL);
+					err = -ENOMEM;
+					if (!elem)
+						goto out;
+				}
+				if (encode) {
+					if (!copied) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+					}
+					memcpy(c, elem + copied, l);
+					copied += l;
+					if (copied == desc->elem_size)
+						copied = 0;
+				} else {
+					memcpy(elem + copied, c, l);
+					copied += l;
+					if (copied == desc->elem_size) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+						copied = 0;
+					}
+				}
+			}
+			if (avail_here) {
+				kunmap(*ppages);
+				ppages++;
+				c = kmap(*ppages);
+			}
+
+			avail_page = min(avail_here,
+				 (unsigned int) PAGE_CACHE_SIZE);
+		}
+		base = buf->page_len;  /* align to start of tail */
+	}
+
+	/* process tail */
+	base -= buf->page_len;
+	if (todo) {
+		c = buf->tail->iov_base + base;
+		if (copied) {
+			unsigned int l = desc->elem_size - copied;
+
+			if (encode)
+				memcpy(c, elem + copied, l);
+			else {
+				memcpy(elem + copied, c, l);
+				err = desc->xcode(desc, elem);
+				if (err)
+					goto out;
+			}
+			todo -= l;
+			c += l;
+		}
+		while (todo) {
+			err = desc->xcode(desc, c);
+			if (err)
+				goto out;
+			c += desc->elem_size;
+			todo -= desc->elem_size;
+		}
+	}
+	err = 0;
+
+out:
+	if (elem)
+		kfree(elem);
+	if (ppages)
+		kunmap(*ppages);
+	return err;
+}
+
+int
+xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
+		  struct xdr_array2_desc *desc)
+{
+	if (base >= buf->len)
+		return -EINVAL;
+
+	return xdr_xcode_array2(buf, base, desc, 0);
+}
+
+int
+xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
+		  struct xdr_array2_desc *desc)
+{
+	if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
+	    buf->head->iov_len + buf->page_len + buf->tail->iov_len)
+		return -EINVAL;
+
+	return xdr_xcode_array2(buf, base, desc, 1);
+}
-- 
cgit v1.2.3


From 9ba02638e4be28dd4ff724202a640264427c62d1 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: Allow the sunrpc server to multiplex serveral programs
 on a single port

 The NFS and NFSACL programs run on the same RPC transport.  This patch adds
 support for this by converting svc_program into a chained list of programs
 (server-side).

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Signed-off-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Andrew Morton <akpm@osdl.org>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/svc.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a02d424a740..e9bd91265f7 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -35,20 +35,24 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
 	if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL)))
 		return NULL;
 	memset(serv, 0, sizeof(*serv));
+	serv->sv_name      = prog->pg_name;
 	serv->sv_program   = prog;
 	serv->sv_nrthreads = 1;
 	serv->sv_stats     = prog->pg_stats;
 	serv->sv_bufsz	   = bufsize? bufsize : 4096;
-	prog->pg_lovers = prog->pg_nvers-1;
 	xdrsize = 0;
-	for (vers=0; vers<prog->pg_nvers ; vers++)
-		if (prog->pg_vers[vers]) {
-			prog->pg_hivers = vers;
-			if (prog->pg_lovers > vers)
-				prog->pg_lovers = vers;
-			if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
-				xdrsize = prog->pg_vers[vers]->vs_xdrsize;
-		}
+	while (prog) {
+		prog->pg_lovers = prog->pg_nvers-1;
+		for (vers=0; vers<prog->pg_nvers ; vers++)
+			if (prog->pg_vers[vers]) {
+				prog->pg_hivers = vers;
+				if (prog->pg_lovers > vers)
+					prog->pg_lovers = vers;
+				if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
+					xdrsize = prog->pg_vers[vers]->vs_xdrsize;
+			}
+		prog = prog->pg_next;
+	}
 	serv->sv_xdrsize   = xdrsize;
 	INIT_LIST_HEAD(&serv->sv_threads);
 	INIT_LIST_HEAD(&serv->sv_sockets);
@@ -56,8 +60,6 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
 	INIT_LIST_HEAD(&serv->sv_permsocks);
 	spin_lock_init(&serv->sv_lock);
 
-	serv->sv_name      = prog->pg_name;
-
 	/* Remove any stale portmap registrations */
 	svc_register(serv, 0, 0);
 
@@ -339,7 +341,10 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
 		goto sendit;
 	}
 		
-	if (prog != progp->pg_prog)
+	for (progp = serv->sv_program; progp; progp = progp->pg_next)
+		if (prog == progp->pg_prog)
+			break;
+	if (progp == NULL)
 		goto err_bad_prog;
 
 	if (vers >= progp->pg_nvers ||
@@ -452,11 +457,7 @@ err_bad_auth:
 	goto sendit;
 
 err_bad_prog:
-#ifdef RPC_PARANOIA
-	if (prog != 100227 || progp->pg_prog != 100003)
-		printk("svc: unknown program %d (me %d)\n", prog, progp->pg_prog);
-	/* else it is just a Solaris client seeing if ACLs are supported */
-#endif
+	dprintk("svc: unknown program %d\n", prog);
 	serv->sv_stats->rpcbadfmt++;
 	svc_putu32(resv, rpc_prog_unavail);
 	goto sendit;
-- 
cgit v1.2.3


From 14b218a8e4f110206c46e586a3da372f665631e7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:28 +0000
Subject: [PATCH] RPC: Ensure rpc calls respects the RPC_NOINTR flag

 For internal purposes, the rpc_clnt_sigmask() call is replaced by
 a call to rpc_task_sigmask(), which ensures that the current task
 sigmask respects both the client cl_intr flag and the per-task NOINTR flag.

 Problem noted by Jiaying Zhang.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/clnt.c | 71 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index c979fcf8879..f17e6153b68 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -378,38 +378,41 @@ rpc_default_callback(struct rpc_task *task)
 }
 
 /*
- *	Export the signal mask handling for aysnchronous code that
+ *	Export the signal mask handling for synchronous code that
  *	sleeps on RPC calls
  */
+#define RPC_INTR_SIGNALS (sigmask(SIGINT) | sigmask(SIGQUIT) | sigmask(SIGKILL))
  
+static void rpc_save_sigmask(sigset_t *oldset, int intr)
+{
+	unsigned long	sigallow = 0;
+	sigset_t sigmask;
+
+	/* Block all signals except those listed in sigallow */
+	if (intr)
+		sigallow |= RPC_INTR_SIGNALS;
+	siginitsetinv(&sigmask, sigallow);
+	sigprocmask(SIG_BLOCK, &sigmask, oldset);
+}
+
+static inline void rpc_task_sigmask(struct rpc_task *task, sigset_t *oldset)
+{
+	rpc_save_sigmask(oldset, !RPC_TASK_UNINTERRUPTIBLE(task));
+}
+
+static inline void rpc_restore_sigmask(sigset_t *oldset)
+{
+	sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
 void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset)
 {
-	unsigned long	sigallow = sigmask(SIGKILL);
-	unsigned long	irqflags;
-	
-	/* Turn off various signals */
-	if (clnt->cl_intr) {
-		struct k_sigaction *action = current->sighand->action;
-		if (action[SIGINT-1].sa.sa_handler == SIG_DFL)
-			sigallow |= sigmask(SIGINT);
-		if (action[SIGQUIT-1].sa.sa_handler == SIG_DFL)
-			sigallow |= sigmask(SIGQUIT);
-	}
-	spin_lock_irqsave(&current->sighand->siglock, irqflags);
-	*oldset = current->blocked;
-	siginitsetinv(&current->blocked, sigallow & ~oldset->sig[0]);
-	recalc_sigpending();
-	spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
+	rpc_save_sigmask(oldset, clnt->cl_intr);
 }
 
 void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset)
 {
-	unsigned long	irqflags;
-	
-	spin_lock_irqsave(&current->sighand->siglock, irqflags);
-	current->blocked = *oldset;
-	recalc_sigpending();
-	spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
+	rpc_restore_sigmask(oldset);
 }
 
 /*
@@ -427,26 +430,26 @@ int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 
 	BUG_ON(flags & RPC_TASK_ASYNC);
 
-	rpc_clnt_sigmask(clnt, &oldset);		
-
 	status = -ENOMEM;
 	task = rpc_new_task(clnt, NULL, flags);
 	if (task == NULL)
 		goto out;
 
+	/* Mask signals on RPC calls _and_ GSS_AUTH upcalls */
+	rpc_task_sigmask(task, &oldset);
+
 	rpc_call_setup(task, msg, 0);
 
 	/* Set up the call info struct and execute the task */
-	if (task->tk_status == 0)
+	if (task->tk_status == 0) {
 		status = rpc_execute(task);
-	else {
+	} else {
 		status = task->tk_status;
 		rpc_release_task(task);
 	}
 
+	rpc_restore_sigmask(&oldset);
 out:
-	rpc_clnt_sigunmask(clnt, &oldset);		
-
 	return status;
 }
 
@@ -467,8 +470,6 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
 
 	flags |= RPC_TASK_ASYNC;
 
-	rpc_clnt_sigmask(clnt, &oldset);		
-
 	/* Create/initialize a new RPC task */
 	if (!callback)
 		callback = rpc_default_callback;
@@ -477,6 +478,9 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
 		goto out;
 	task->tk_calldata = data;
 
+	/* Mask signals on GSS_AUTH upcalls */
+	rpc_task_sigmask(task, &oldset);		
+
 	rpc_call_setup(task, msg, 0);
 
 	/* Set up the call info struct and execute the task */
@@ -486,9 +490,8 @@ rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags,
 	else
 		rpc_release_task(task);
 
+	rpc_restore_sigmask(&oldset);		
 out:
-	rpc_clnt_sigunmask(clnt, &oldset);		
-
 	return status;
 }
 
@@ -666,7 +669,7 @@ call_allocate(struct rpc_task *task)
 		return;
 	printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task); 
 
-	if (RPC_IS_ASYNC(task) || !(task->tk_client->cl_intr && signalled())) {
+	if (RPC_IS_ASYNC(task) || !signalled()) {
 		xprt_release(task);
 		task->tk_action = call_reserve;
 		rpc_delay(task, HZ>>4);
-- 
cgit v1.2.3


From 0f9dc2b16884bb5957d010ed8e9114e771a05916 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:28 +0000
Subject: [PATCH] RPC: Clean up socket autodisconnect

 Cancel autodisconnect requests inside xprt_transmit() in order to avoid
 races.
 Use more efficient del_singleshot_timer_sync()

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index ef941e7de8b..a74a1289113 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1240,6 +1240,8 @@ xprt_transmit(struct rpc_task *task)
 			list_add_tail(&req->rq_list, &xprt->recv);
 			spin_unlock_bh(&xprt->sock_lock);
 			xprt_reset_majortimeo(req);
+			/* Turn off autodisconnect */
+			del_singleshot_timer_sync(&xprt->timer);
 		}
 	} else if (!req->rq_bytes_sent)
 		return;
@@ -1370,8 +1372,6 @@ xprt_reserve(struct rpc_task *task)
 		spin_lock(&xprt->xprt_lock);
 		do_xprt_reserve(task);
 		spin_unlock(&xprt->xprt_lock);
-		if (task->tk_rqstp)
-			del_timer_sync(&xprt->timer);
 	}
 }
 
-- 
cgit v1.2.3


From 20e5ac828dfd23b9080159c62a34f32d2dcd92fc Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Wed, 22 Jun 2005 17:16:28 +0000
Subject: [PATCH] RPC: TCP reconnects are too slow

 When the network layer reports a connection close, the RPC task
 waiting to reconnect should be notified so it can retry immediately
 instead of waiting for the normal connection establishment timeout.

 This reverts a change made in 2.6.6 as part of adding client support
 for RPC over TCP socket idle timeouts.

 Test-plan:
 Destructive testing with NFS over TCP mounts.

 Version: Fri, 29 Apr 2005 15:31:46 -0400

 Signed-off-by: Chuck Lever <cel@netapp.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a74a1289113..2b8789cf8db 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1101,8 +1101,7 @@ tcp_state_change(struct sock *sk)
 	case TCP_SYN_RECV:
 		break;
 	default:
-		if (xprt_test_and_clear_connected(xprt))
-			rpc_wake_up_status(&xprt->pending, -ENOTCONN);
+		xprt_disconnect(xprt);
 		break;
 	}
  out:
-- 
cgit v1.2.3


From ae3884621bf5b4caff7785b9a417f262202965b2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Wed, 22 Jun 2005 17:16:28 +0000
Subject: [PATCH] RPC: kick off socket connect operations faster

 Make the socket transport kick the event queue to start socket connects
 immediately.  This should improve responsiveness of applications that are
 sensitive to slow mount operations (like automounters).

 We are now also careful to cancel the connect worker before destroying
 the xprt.  This eliminates a race where xprt_destroy can finish before
 the connect worker is even allowed to run.

 Test-plan:
 Destructive testing (unplugging the network temporarily).  Connectathon
 with UDP and TCP.  Hard-code impossibly small connect timeout.

 Version: Fri, 29 Apr 2005 15:32:01 -0400

 Signed-off-by: Chuck Lever <cel@netapp.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 net/sunrpc/xprt.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2b8789cf8db..eca92405948 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -569,8 +569,11 @@ void xprt_connect(struct rpc_task *task)
 		if (xprt->sock != NULL)
 			schedule_delayed_work(&xprt->sock_connect,
 					RPC_REESTABLISH_TIMEOUT);
-		else
+		else {
 			schedule_work(&xprt->sock_connect);
+			if (!RPC_IS_ASYNC(task))
+				flush_scheduled_work();
+		}
 	}
 	return;
  out_write:
@@ -1685,6 +1688,10 @@ xprt_shutdown(struct rpc_xprt *xprt)
 	rpc_wake_up(&xprt->backlog);
 	wake_up(&xprt->cong_wait);
 	del_timer_sync(&xprt->timer);
+
+	/* synchronously wait for connect worker to finish */
+	cancel_delayed_work(&xprt->sock_connect);
+	flush_scheduled_work();
 }
 
 /*
-- 
cgit v1.2.3


From f31f5f051269746179b01017fc5e3dcf6b37c67e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 22 Jun 2005 14:32:51 -0700
Subject: [NET]: dont use strlen() but the result from a prior sprintf()

Small patch to save an unecessary call to strlen() : sprintf() gave us
the length, just trust it.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index 38729af0946..6f2a1788197 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -383,9 +383,8 @@ int sock_map_fd(struct socket *sock)
 			goto out;
 		}
 
-		sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
+		this.len = sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino);
 		this.name = name;
-		this.len = strlen(name);
 		this.hash = SOCK_INODE(sock)->i_ino;
 
 		file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);
-- 
cgit v1.2.3


From 115c1d6e61b70851d9a363328c3b8d4c2559a1d3 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 22 Jun 2005 22:05:31 -0700
Subject: [NETPOLL]: Introduce a netpoll_info struct

This patch introduces a netpoll_info structure, which the struct net_device
will now point to instead of pointing to a struct netpoll.  The reason for
this is two-fold: 1) fields such as the rx_flags, poll_owner, and poll_lock
should be maintained per net_device, not per netpoll;  and 2) this is a first
step in providing support for multiple netpoll clients to register against the
same net_device.

The struct netpoll is now pointed to by the netpoll_info structure.  As
such, the previous behaviour of the code is preserved.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netpoll.c | 57 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a119696d552..ab3c0c9713b 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -130,19 +130,20 @@ static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
  */
 static void poll_napi(struct netpoll *np)
 {
+	struct netpoll_info *npinfo = np->dev->npinfo;
 	int budget = 16;
 
 	if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
-	    np->poll_owner != smp_processor_id() &&
-	    spin_trylock(&np->poll_lock)) {
-		np->rx_flags |= NETPOLL_RX_DROP;
+	    npinfo->poll_owner != smp_processor_id() &&
+	    spin_trylock(&npinfo->poll_lock)) {
+		npinfo->rx_flags |= NETPOLL_RX_DROP;
 		atomic_inc(&trapped);
 
 		np->dev->poll(np->dev, &budget);
 
 		atomic_dec(&trapped);
-		np->rx_flags &= ~NETPOLL_RX_DROP;
-		spin_unlock(&np->poll_lock);
+		npinfo->rx_flags &= ~NETPOLL_RX_DROP;
+		spin_unlock(&npinfo->poll_lock);
 	}
 }
 
@@ -245,6 +246,7 @@ repeat:
 static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 {
 	int status;
+	struct netpoll_info *npinfo;
 
 repeat:
 	if(!np || !np->dev || !netif_running(np->dev)) {
@@ -253,8 +255,9 @@ repeat:
 	}
 
 	/* avoid recursion */
-	if(np->poll_owner == smp_processor_id() ||
-	   np->dev->xmit_lock_owner == smp_processor_id()) {
+	npinfo = np->dev->npinfo;
+	if (npinfo->poll_owner == smp_processor_id() ||
+	    np->dev->xmit_lock_owner == smp_processor_id()) {
 		if (np->drop)
 			np->drop(skb);
 		else
@@ -341,14 +344,18 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 
 static void arp_reply(struct sk_buff *skb)
 {
+	struct netpoll_info *npinfo = skb->dev->npinfo;
 	struct arphdr *arp;
 	unsigned char *arp_ptr;
 	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
 	u32 sip, tip;
 	struct sk_buff *send_skb;
-	struct netpoll *np = skb->dev->np;
+	struct netpoll *np = NULL;
 
-	if (!np) return;
+	if (npinfo)
+		np = npinfo->np;
+	if (!np)
+		return;
 
 	/* No arp on this interface */
 	if (skb->dev->flags & IFF_NOARP)
@@ -429,7 +436,7 @@ int __netpoll_rx(struct sk_buff *skb)
 	int proto, len, ulen;
 	struct iphdr *iph;
 	struct udphdr *uh;
-	struct netpoll *np = skb->dev->np;
+	struct netpoll *np = skb->dev->npinfo->np;
 
 	if (!np->rx_hook)
 		goto out;
@@ -611,9 +618,7 @@ int netpoll_setup(struct netpoll *np)
 {
 	struct net_device *ndev = NULL;
 	struct in_device *in_dev;
-
-	np->poll_lock = SPIN_LOCK_UNLOCKED;
-	np->poll_owner = -1;
+	struct netpoll_info *npinfo;
 
 	if (np->dev_name)
 		ndev = dev_get_by_name(np->dev_name);
@@ -624,7 +629,16 @@ int netpoll_setup(struct netpoll *np)
 	}
 
 	np->dev = ndev;
-	ndev->np = np;
+	if (!ndev->npinfo) {
+		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+		if (!npinfo)
+			goto release;
+
+		npinfo->np = NULL;
+		npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
+		npinfo->poll_owner = -1;
+	} else
+		npinfo = ndev->npinfo;
 
 	if (!ndev->poll_controller) {
 		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
@@ -693,12 +707,15 @@ int netpoll_setup(struct netpoll *np)
 	}
 
 	if(np->rx_hook)
-		np->rx_flags = NETPOLL_RX_ENABLED;
+		npinfo->rx_flags = NETPOLL_RX_ENABLED;
+	npinfo->np = np;
+	ndev->npinfo = npinfo;
 
 	return 0;
 
  release:
-	ndev->np = NULL;
+	if (!ndev->npinfo)
+		kfree(npinfo);
 	np->dev = NULL;
 	dev_put(ndev);
 	return -1;
@@ -706,9 +723,11 @@ int netpoll_setup(struct netpoll *np)
 
 void netpoll_cleanup(struct netpoll *np)
 {
-	if (np->dev)
-		np->dev->np = NULL;
-	dev_put(np->dev);
+	if (np->dev) {
+		if (np->dev->npinfo)
+			np->dev->npinfo->np = NULL;
+		dev_put(np->dev);
+	}
 	np->dev = NULL;
 }
 
-- 
cgit v1.2.3


From fbeec2e1552949002065435c9829dc244ad85407 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 22 Jun 2005 22:05:59 -0700
Subject: [NETPOLL]: allow multiple netpoll_clients to register against one
 interface

This patch provides support for registering multiple netpoll clients to the
same network device.  Only one of these clients may register an rx_hook,
however.  In practice, this restriction has not been problematic.  It is
worth mentioning, though, that the current design can be easily extended to
allow for the registration of multiple rx_hooks.

The basic idea of the patch is that the rx_np pointer in the netpoll_info
structure points to the struct netpoll that has rx_hook filled in.  Aside
from this one case, there is no need for a pointer from the struct
net_device to an individual struct netpoll.

A lock is introduced to protect the setting and clearing of the np_rx
pointer.  The pointer will only be cleared upon netpoll client module
removal, and the lock should be uncontested.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/netpoll.c | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index ab3c0c9713b..c327c9edadc 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -349,11 +349,15 @@ static void arp_reply(struct sk_buff *skb)
 	unsigned char *arp_ptr;
 	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
 	u32 sip, tip;
+	unsigned long flags;
 	struct sk_buff *send_skb;
 	struct netpoll *np = NULL;
 
-	if (npinfo)
-		np = npinfo->np;
+	spin_lock_irqsave(&npinfo->rx_lock, flags);
+	if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
+		np = npinfo->rx_np;
+	spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+
 	if (!np)
 		return;
 
@@ -436,9 +440,9 @@ int __netpoll_rx(struct sk_buff *skb)
 	int proto, len, ulen;
 	struct iphdr *iph;
 	struct udphdr *uh;
-	struct netpoll *np = skb->dev->npinfo->np;
+	struct netpoll *np = skb->dev->npinfo->rx_np;
 
-	if (!np->rx_hook)
+	if (!np)
 		goto out;
 	if (skb->dev->type != ARPHRD_ETHER)
 		goto out;
@@ -619,6 +623,7 @@ int netpoll_setup(struct netpoll *np)
 	struct net_device *ndev = NULL;
 	struct in_device *in_dev;
 	struct netpoll_info *npinfo;
+	unsigned long flags;
 
 	if (np->dev_name)
 		ndev = dev_get_by_name(np->dev_name);
@@ -634,9 +639,10 @@ int netpoll_setup(struct netpoll *np)
 		if (!npinfo)
 			goto release;
 
-		npinfo->np = NULL;
+		npinfo->rx_np = NULL;
 		npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
 		npinfo->poll_owner = -1;
+		npinfo->rx_lock = SPIN_LOCK_UNLOCKED;
 	} else
 		npinfo = ndev->npinfo;
 
@@ -706,9 +712,13 @@ int netpoll_setup(struct netpoll *np)
 		       np->name, HIPQUAD(np->local_ip));
 	}
 
-	if(np->rx_hook)
-		npinfo->rx_flags = NETPOLL_RX_ENABLED;
-	npinfo->np = np;
+	if (np->rx_hook) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+		npinfo->rx_np = np;
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+	}
+	/* last thing to do is link it to the net device structure */
 	ndev->npinfo = npinfo;
 
 	return 0;
@@ -723,11 +733,20 @@ int netpoll_setup(struct netpoll *np)
 
 void netpoll_cleanup(struct netpoll *np)
 {
+	struct netpoll_info *npinfo;
+	unsigned long flags;
+
 	if (np->dev) {
-		if (np->dev->npinfo)
-			np->dev->npinfo->np = NULL;
+		npinfo = np->dev->npinfo;
+		if (npinfo && npinfo->rx_np == np) {
+			spin_lock_irqsave(&npinfo->rx_lock, flags);
+			npinfo->rx_np = NULL;
+			npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+			spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+		}
 		dev_put(np->dev);
 	}
+
 	np->dev = NULL;
 }
 
-- 
cgit v1.2.3


From 7abaa27c1c54208bd76fa8bae55839c034aebfb2 Mon Sep 17 00:00:00 2001
From: Chuck Short <zulcss@gmail.com>
Date: Wed, 22 Jun 2005 22:10:23 -0700
Subject: [IPV4]: Fix route.c gcc4 warnings

Signed-off by: Chuck Short <zulcss@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f4d53c91986..80cf633d9f4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1767,7 +1767,7 @@ static inline int ip_mkroute_input_def(struct sk_buff *skb,
 				       struct in_device *in_dev,
 				       u32 daddr, u32 saddr, u32 tos)
 {
-	struct rtable* rth;
+	struct rtable* rth = NULL;
 	int err;
 	unsigned hash;
 
@@ -1794,7 +1794,7 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
 				   u32 daddr, u32 saddr, u32 tos)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	struct rtable* rth;
+	struct rtable* rth = NULL;
 	unsigned char hop, hopcount, lasthop;
 	int err = -EINVAL;
 	unsigned int hash;
@@ -2239,7 +2239,7 @@ static inline int ip_mkroute_output_def(struct rtable **rp,
 					struct net_device *dev_out,
 					unsigned flags)
 {
-	struct rtable *rth;
+	struct rtable *rth = NULL;
 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
 	unsigned hash;
 	if (err == 0) {
@@ -2267,7 +2267,7 @@ static inline int ip_mkroute_output(struct rtable** rp,
 	unsigned char hop;
 	unsigned hash;
 	int err = -EINVAL;
-	struct rtable *rth;
+	struct rtable *rth = NULL;
 
 	if (res->fi && res->fi->fib_nhs > 1) {
 		unsigned char hopcount = res->fi->fib_nhs;
-- 
cgit v1.2.3


From 285b3afefacff14bc98e5754b8b48a0a2b42f0df Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan <nacc@us.ibm.com>
Date: Wed, 22 Jun 2005 22:11:44 -0700
Subject: [ATALK] aarp: replace schedule_timeout() with msleep()

From: Nishanth Aravamudan <nacc@us.ibm.com>

Use msleep() instead of schedule_timeout() to guarantee the task
delays as expected. The current code is not wrong, but it does not account for
early return due to signals, so I think msleep() should be appropriate.

Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/appletalk/aarp.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 10d04046102..c34614ea5fc 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -35,6 +35,7 @@
 #include <net/datalink.h>
 #include <net/psnap.h>
 #include <linux/atalk.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -462,8 +463,7 @@ void aarp_probe_network(struct atalk_iface *atif)
 			aarp_send_probe(atif->dev, &atif->address);
 
 			/* Defer 1/10th */
-			current->state = TASK_INTERRUPTIBLE;
-			schedule_timeout(HZ / 10);
+			msleep(100);
 
 			if (atif->status & ATIF_PROBE_FAIL)
 				break;
@@ -510,9 +510,8 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
 		aarp_send_probe(atif->dev, sa);
 
 		/* Defer 1/10th */
-		current->state = TASK_INTERRUPTIBLE;
 		write_unlock_bh(&aarp_lock);
-		schedule_timeout(HZ / 10);
+		msleep(100);
 		write_lock_bh(&aarp_lock);
 
 		if (entry->status & ATIF_PROBE_FAIL)
-- 
cgit v1.2.3


From 68d318720052154bc6b2513b0f15d0d947cc53c9 Mon Sep 17 00:00:00 2001
From: James Lamanna <jlamanna@gmail.com>
Date: Wed, 22 Jun 2005 22:12:57 -0700
Subject: [EBTABLES]: vfree() checking cleanups

From: jlamanna@gmail.com

ebtables.c vfree() checking cleanups.

Signed-off by: James Lamanna <jlamanna@gmail.com>
Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/netfilter/ebtables.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 18ebc664769..c4540144f0f 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -859,8 +859,7 @@ static int translate_table(struct ebt_replace *repl,
 		if (repl->valid_hooks & (1 << i))
 			if (check_chainloops(newinfo->hook_entry[i],
 			   cl_s, udc_cnt, i, newinfo->entries)) {
-				if (cl_s)
-					vfree(cl_s);
+				vfree(cl_s);
 				return -EINVAL;
 			}
 
@@ -883,8 +882,7 @@ static int translate_table(struct ebt_replace *repl,
 		EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
 		   ebt_cleanup_entry, &i);
 	}
-	if (cl_s)
-		vfree(cl_s);
+	vfree(cl_s);
 	return ret;
 }
 
@@ -1030,8 +1028,7 @@ static int do_replace(void __user *user, unsigned int len)
 	}
 	vfree(table);
 
-	if (counterstmp)
-		vfree(counterstmp);
+	vfree(counterstmp);
 	return ret;
 
 free_unlock:
@@ -1040,8 +1037,7 @@ free_iterate:
 	EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
 	   ebt_cleanup_entry, NULL);
 free_counterstmp:
-	if (counterstmp)
-		vfree(counterstmp);
+	vfree(counterstmp);
 	/* can be initialized in translate_table() */
 	if (newinfo->chainstack) {
 		for (i = 0; i < num_possible_cpus(); i++)
@@ -1049,11 +1045,9 @@ free_counterstmp:
 		vfree(newinfo->chainstack);
 	}
 free_entries:
-	if (newinfo->entries)
-		vfree(newinfo->entries);
+	vfree(newinfo->entries);
 free_newinfo:
-	if (newinfo)
-		vfree(newinfo);
+	vfree(newinfo);
 	return ret;
 }
 
@@ -1213,8 +1207,7 @@ void ebt_unregister_table(struct ebt_table *table)
 	down(&ebt_mutex);
 	LIST_DELETE(&ebt_tables, table);
 	up(&ebt_mutex);
-	if (table->private->entries)
-		vfree(table->private->entries);
+	vfree(table->private->entries);
 	if (table->private->chainstack) {
 		for (i = 0; i < num_possible_cpus(); i++)
 			vfree(table->private->chainstack[i]);
-- 
cgit v1.2.3


From cb65d506c34c86df5bcef939ce5a8666a451bd8b Mon Sep 17 00:00:00 2001
From: Shaun Pereira <spereira@tusc.com.au>
Date: Wed, 22 Jun 2005 22:15:01 -0700
Subject: [X25]: Selective sub-address matching with call user data.

From: Shaun Pereira <spereira@tusc.com.au>

This is the first (independent of the second) patch of two that I am
working on with x25 on linux (tested with xot on a cisco router).  Details
are as follows.

Current state of module:

A server using the current implementation (2.6.11.7) of the x25 module will
accept a call request/ incoming call packet at the listening x.25 address,
from all callers to that address, as long as NO call user data is present
in the packet header.

If the server needs to choose to accept a particular call request/ incoming
call packet arriving at its listening x25 address, then the kernel has to
allow a match of call user data present in the call request packet with its
own.  This is required when multiple servers listen at the same x25 address
and device interface.  The kernel currently matches ALL call user data, if
present.

Current Changes:

This patch is a follow up to the patch submitted previously by Andrew
Hendry, and allows the user to selectively control the number of octets of
call user data in the call request packet, that the kernel will match.  By
default no call user data is matched, even if call user data is present.
To allow call user data matching, a cudmatchlength > 0 has to be passed
into the kernel after which the passed number of octets will be matched.
Otherwise the kernel behavior is exactly as the original implementation.

This patch also ensures that as is normally the case, no call user data
will be present in the Call accepted / call connected packet sent back to
the caller

Future Changes on next patch:

There are cases however when call user data may be present in the call
accepted packet.  According to the X.25 recommendation (ITU-T 10/96)
section 5.2.3.2 call user data may be present in the call accepted packet
provided the fast select facility is used.  My next patch will include this
fast select utility and the ability to send up to 128 octets call user data
in the call accepted packet provided the fast select facility is used.  I
am currently testing this, again with xot on linux and cisco.

Signed-off-by: Shaun Pereira <spereira@tusc.com.au>

(With a fix from Alexey Dobriyan <adobriyan@gmail.com>)
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/x25/af_x25.c   | 73 +++++++++++++++++++++++++++++++++++-------------------
 net/x25/x25_subr.c | 18 --------------
 2 files changed, 48 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 2a24b243b84..e17d84a55d5 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -29,6 +29,8 @@
  *	2000-11-14	Henner Eisen    Closing datalink from NETDEV_GOING_DOWN
  *	2002-10-06	Arnaldo C. Melo Get rid of cli/sti, move proc stuff to
  *					x25_proc.c, using seq_file
+ *	2005-04-02	Shaun Pereira	Selective sub address matching
+ *					with call user data
  */
 
 #include <linux/config.h>
@@ -219,7 +221,8 @@ static void x25_insert_socket(struct sock *sk)
  *	Note: if a listening socket has cud set it must only get calls
  *	with matching cud.
  */
-static struct sock *x25_find_listener(struct x25_address *addr, struct x25_calluserdata *calluserdata)
+static struct sock *x25_find_listener(struct x25_address *addr,
+					struct sk_buff *skb)
 {
 	struct sock *s;
 	struct sock *next_best;
@@ -230,22 +233,23 @@ static struct sock *x25_find_listener(struct x25_address *addr, struct x25_callu
 
 	sk_for_each(s, node, &x25_list)
 		if ((!strcmp(addr->x25_addr,
-			     x25_sk(s)->source_addr.x25_addr) ||
-		     !strcmp(addr->x25_addr,
-			     null_x25_address.x25_addr)) &&
-		     s->sk_state == TCP_LISTEN) {
-
+			x25_sk(s)->source_addr.x25_addr) ||
+				!strcmp(addr->x25_addr,
+					null_x25_address.x25_addr)) &&
+					s->sk_state == TCP_LISTEN) {
 			/*
 			 * Found a listening socket, now check the incoming
 			 * call user data vs this sockets call user data
 			 */
-			if (x25_check_calluserdata(&x25_sk(s)->calluserdata, calluserdata)) {
-				sock_hold(s);
-				goto found;
-			}
-			if (x25_sk(s)->calluserdata.cudlength == 0) {
+			if(skb->len > 0 && x25_sk(s)->cudmatchlength > 0) {
+			 	if((memcmp(x25_sk(s)->calluserdata.cuddata,
+			 		skb->data,
+					x25_sk(s)->cudmatchlength)) == 0) {
+					sock_hold(s);
+					goto found;
+				 }
+			} else
 				next_best = s;
-			}
 		}
 	if (next_best) {
 		s = next_best;
@@ -497,6 +501,7 @@ static int x25_create(struct socket *sock, int protocol)
 	x25->t23   = sysctl_x25_clear_request_timeout;
 	x25->t2    = sysctl_x25_ack_holdback_timeout;
 	x25->state = X25_STATE_0;
+	x25->cudmatchlength = 0;
 
 	x25->facilities.winsize_in  = X25_DEFAULT_WINDOW_SIZE;
 	x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
@@ -545,6 +550,7 @@ static struct sock *x25_make_new(struct sock *osk)
 	x25->t2         = ox25->t2;
 	x25->facilities = ox25->facilities;
 	x25->qbitincl   = ox25->qbitincl;
+	x25->cudmatchlength = ox25->cudmatchlength;
 
 	x25_init_timers(sk);
 out:
@@ -822,7 +828,6 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	struct x25_sock *makex25;
 	struct x25_address source_addr, dest_addr;
 	struct x25_facilities facilities;
-	struct x25_calluserdata calluserdata;
 	int len, rc;
 
 	/*
@@ -844,20 +849,11 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	len = skb->data[0] + 1;
 	skb_pull(skb,len);
 
-	/*
-	 *	Incoming Call User Data.
-	 */
-	if (skb->len >= 0) {
-		memcpy(calluserdata.cuddata, skb->data, skb->len);
-		calluserdata.cudlength = skb->len;
-	}
-
-	skb_push(skb,len);
-
 	/*
 	 *	Find a listener for the particular address/cud pair.
 	 */
-	sk = x25_find_listener(&source_addr,&calluserdata);
+	sk = x25_find_listener(&source_addr,skb);
+	skb_push(skb,len);
 
 	/*
 	 *	We can't accept the Call Request.
@@ -900,12 +896,22 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	makex25->neighbour     = nb;
 	makex25->facilities    = facilities;
 	makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask;
-	makex25->calluserdata  = calluserdata;
+	/* ensure no reverse facil on accept */
+	makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
+	makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
 
 	x25_write_internal(make, X25_CALL_ACCEPTED);
 
 	makex25->state = X25_STATE_3;
 
+	/*
+	 *	Incoming Call User Data.
+	 */
+	if (skb->len >= 0) {
+		memcpy(makex25->calluserdata.cuddata, skb->data, skb->len);
+		makex25->calluserdata.cudlength = skb->len;
+	}
+
 	sk->sk_ack_backlog++;
 
 	x25_insert_socket(make);
@@ -1325,6 +1331,23 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			break;
 		}
 
+		case SIOCX25SCUDMATCHLEN: {
+			struct x25_subaddr sub_addr;
+			rc = -EINVAL;
+			if(sk->sk_state != TCP_CLOSE)
+				break;
+			rc = -EFAULT;
+			if (copy_from_user(&sub_addr, argp,
+					sizeof(sub_addr)))
+				break;
+		 	rc = -EINVAL;
+			if(sub_addr.cudmatchlength > X25_MAX_CUD_LEN)
+				break;
+			x25->cudmatchlength = sub_addr.cudmatchlength;
+			rc = 0;
+			break;
+		}
+
  		default:
 			rc = dev_ioctl(cmd, argp);
 			break;
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 183fea3bba6..c349bbd6168 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -354,21 +354,3 @@ void x25_check_rbuf(struct sock *sk)
 	}
 }
 
-/*
- * Compare 2 calluserdata structures, used to find correct listening sockets
- * when call user data is used.
- */
-int x25_check_calluserdata(struct x25_calluserdata *ours, struct x25_calluserdata *theirs)
-{
-	int i;
-	if (ours->cudlength != theirs->cudlength)
-		return 0;
-
-	for (i=0;i<ours->cudlength;i++) {
-		if (ours->cuddata[i] != theirs->cuddata[i]) {
-			return 0;
-		}
-	}
-	return 1;
-}
-
-- 
cgit v1.2.3


From ebc3f64b864fc16a594c2e63bf55a55c7d42084b Mon Sep 17 00:00:00 2001
From: Shaun Pereira <spereira@tusc.com.au>
Date: Wed, 22 Jun 2005 22:16:17 -0700
Subject: [X25]: Fast select with no restriction on response

This patch is a follow up to patch 1 regarding "Selective Sub Address
matching with call user data".  It allows use of the Fast-Select-Acceptance
optional user facility for X.25.

This patch just implements fast select with no restriction on response
(NRR).  What this means (according to ITU-T Recomendation 10/96 section
6.16) is that if in an incoming call packet, the relevant facility bits are
set for fast-select-NRR, then the called DTE can issue a direct response to
the incoming packet using a call-accepted packet that contains
call-user-data.  This patch allows such a response.

The called DTE can also respond with a clear-request packet that contains
call-user-data.  However, this feature is currently not implemented by the
patch.

How is Fast Select Acceptance used?
By default, the system does not allow fast select acceptance (as before).
To enable a response to fast select acceptance,
After a listen socket in created and bound as follows
	socket(AF_X25, SOCK_SEQPACKET, 0);
	bind(call_soc, (struct sockaddr *)&locl_addr, sizeof(locl_addr));
but before a listen system call is made, the following ioctl should be used.
	ioctl(call_soc,SIOCX25CALLACCPTAPPRV);
Now the listen system call can be made
	listen(call_soc, 4);
After this, an incoming-call packet will be accepted, but no call-accepted
packet will be sent back until the following system call is made on the socket
that accepts the call
	ioctl(vc_soc,SIOCX25SENDCALLACCPT);
The network (or cisco xot router used for testing here) will allow the
application server's call-user-data in the call-accepted packet,
provided the call-request was made with Fast-select NRR.

Signed-off-by: Shaun Pereira <spereira@tusc.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/x25/af_x25.c         | 37 +++++++++++++++++++++++++++++++++----
 net/x25/x25_facilities.c | 34 +++++++++++++++++++++++++++++-----
 net/x25/x25_subr.c       | 23 ++++++++++++++++++-----
 3 files changed, 80 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index e17d84a55d5..04bec047fa9 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -31,6 +31,8 @@
  *					x25_proc.c, using seq_file
  *	2005-04-02	Shaun Pereira	Selective sub address matching
  *					with call user data
+ *	2005-04-15	Shaun Pereira	Fast select with no restriction on
+ *					response
  */
 
 #include <linux/config.h>
@@ -502,6 +504,8 @@ static int x25_create(struct socket *sock, int protocol)
 	x25->t2    = sysctl_x25_ack_holdback_timeout;
 	x25->state = X25_STATE_0;
 	x25->cudmatchlength = 0;
+	x25->accptapprv = X25_DENY_ACCPT_APPRV;		/* normally no cud  */
+							/* on call accept   */
 
 	x25->facilities.winsize_in  = X25_DEFAULT_WINDOW_SIZE;
 	x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
@@ -551,6 +555,7 @@ static struct sock *x25_make_new(struct sock *osk)
 	x25->facilities = ox25->facilities;
 	x25->qbitincl   = ox25->qbitincl;
 	x25->cudmatchlength = ox25->cudmatchlength;
+	x25->accptapprv = ox25->accptapprv;
 
 	x25_init_timers(sk);
 out:
@@ -900,9 +905,11 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
 	makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
 
-	x25_write_internal(make, X25_CALL_ACCEPTED);
-
-	makex25->state = X25_STATE_3;
+	/* Normally all calls are accepted immediatly */
+	if(makex25->accptapprv & X25_DENY_ACCPT_APPRV) {
+		x25_write_internal(make, X25_CALL_ACCEPTED);
+		makex25->state = X25_STATE_3;
+	}
 
 	/*
 	 *	Incoming Call User Data.
@@ -1294,7 +1301,8 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			if (facilities.throughput < 0x03 ||
 			    facilities.throughput > 0xDD)
 				break;
-			if (facilities.reverse && facilities.reverse != 1)
+			if (facilities.reverse &&
+				(facilities.reverse | 0x81)!= 0x81)
 				break;
 			x25->facilities = facilities;
 			rc = 0;
@@ -1348,6 +1356,27 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			break;
 		}
 
+		case SIOCX25CALLACCPTAPPRV: {
+			rc = -EINVAL;
+			if (sk->sk_state != TCP_CLOSE)
+				break;
+			x25->accptapprv = X25_ALLOW_ACCPT_APPRV;
+			rc = 0;
+			break;
+		}
+
+		case SIOCX25SENDCALLACCPT:  {
+			rc = -EINVAL;
+			if (sk->sk_state != TCP_ESTABLISHED)
+				break;
+			if (x25->accptapprv)	/* must call accptapprv above */
+				break;
+			x25_write_internal(sk, X25_CALL_ACCEPTED);
+			x25->state = X25_STATE_3;
+			rc = 0;
+			break;
+		}
+
  		default:
 			rc = dev_ioctl(cmd, argp);
 			break;
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index a21bdb95f9a..54278b962f4 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -17,6 +17,8 @@
  *	X.25 001	Split from x25_subr.c
  *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities 
  *					  negotiation.
+ *	apr/14/05	Shaun Pereira - Allow fast select with no restriction
+ *					on response.
  */
 
 #include <linux/kernel.h>
@@ -43,9 +45,31 @@ int x25_parse_facilities(struct sk_buff *skb,
 		case X25_FAC_CLASS_A:
 			switch (*p) {
 			case X25_FAC_REVERSE:
-				facilities->reverse = p[1] & 0x01;
-				*vc_fac_mask |= X25_MASK_REVERSE;
-				break;
+				if((p[1] & 0x81) == 0x81) {
+					facilities->reverse = p[1] & 0x81;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if((p[1] & 0x01) == 0x01) {
+					facilities->reverse = p[1] & 0x01;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if((p[1] & 0x80) == 0x80) {
+					facilities->reverse = p[1] & 0x80;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if(p[1] == 0x00) {
+					facilities->reverse
+						= X25_DEFAULT_REVERSE;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
 			case X25_FAC_THROUGHPUT:
 				facilities->throughput = p[1];
 				*vc_fac_mask |= X25_MASK_THROUGHPUT;
@@ -122,7 +146,7 @@ int x25_create_facilities(unsigned char *buffer,
 
 	if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) {
 		*p++ = X25_FAC_REVERSE;
-		*p++ = !!facilities->reverse;
+		*p++ = facilities->reverse;
 	}
 
 	if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) {
@@ -171,7 +195,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
 	/*
 	 *	They want reverse charging, we won't accept it.
 	 */
-	if (theirs.reverse && ours->reverse) {
+	if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) {
 		SOCK_DEBUG(sk, "X.25: rejecting reverse charging request");
 		return -1;
 	}
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index c349bbd6168..7fd872ad0c2 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -19,6 +19,8 @@
  *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities
  *					  negotiation.
  *	jun/24/01	Arnaldo C. Melo	  use skb_queue_purge, cleanups
+ *	apr/04/15	Shaun Pereira		Fast select with no
+ *						restriction on response.
  */
 
 #include <linux/kernel.h>
@@ -127,8 +129,12 @@ void x25_write_internal(struct sock *sk, int frametype)
 			len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN +
 			       X25_MAX_CUD_LEN;
 			break;
-		case X25_CALL_ACCEPTED:
-			len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+		case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */
+			if(x25->facilities.reverse & 0x80) {
+				len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+			} else {
+				len += 1 + X25_MAX_FAC_LEN;
+			}
 			break;
 		case X25_CLEAR_REQUEST:
 		case X25_RESET_REQUEST:
@@ -203,9 +209,16 @@ void x25_write_internal(struct sock *sk, int frametype)
 							x25->vc_facil_mask);
 			dptr    = skb_put(skb, len);
 			memcpy(dptr, facilities, len);
-			dptr = skb_put(skb, x25->calluserdata.cudlength);
-			memcpy(dptr, x25->calluserdata.cuddata,
-			       x25->calluserdata.cudlength);
+
+			/* fast select with no restriction on response
+				allows call user data. Userland must
+				ensure it is ours and not theirs */
+			if(x25->facilities.reverse & 0x80) {
+				dptr = skb_put(skb,
+					x25->calluserdata.cudlength);
+				memcpy(dptr, x25->calluserdata.cuddata,
+				       x25->calluserdata.cudlength);
+			}
 			x25->calluserdata.cudlength = 0;
 			break;
 
-- 
cgit v1.2.3


From 543537bd922692bc978e2e356fcd8bfc9c2ee7d5 Mon Sep 17 00:00:00 2001
From: Paulo Marques <pmarques@grupopie.com>
Date: Thu, 23 Jun 2005 00:09:02 -0700
Subject: [PATCH] create a kstrdup library function

This patch creates a new kstrdup library function and changes the "local"
implementations in several places to use this function.

Most of the changes come from the sound and net subsystems.  The sound part
had already been acknowledged by Takashi Iwai and the net part by David S.
Miller.

I left UML alone for now because I would need more time to read the code
carefully before making changes there.

Signed-off-by: Paulo Marques <pmarques@grupopie.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/core/neighbour.c       |  3 ++-
 net/core/sysctl_net_core.c | 15 ---------------
 net/ipv4/devinet.c         |  2 +-
 net/ipv6/addrconf.c        |  3 ++-
 net/sunrpc/svcauth_unix.c  | 11 ++---------
 5 files changed, 7 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f6bdcad47da..851eb927ed9 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -32,6 +32,7 @@
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
 #include <linux/random.h>
+#include <linux/string.h>
 
 #define NEIGH_DEBUG 1
 
@@ -2592,7 +2593,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 		t->neigh_vars[17].extra1 = dev;
 	}
 
-	dev_name = net_sysctl_strdup(dev_name_source);
+	dev_name = kstrdup(dev_name_source, GFP_KERNEL);
 	if (!dev_name) {
 		err = -ENOBUFS;
 		goto free;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c8be646cb19..880a8881521 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -35,19 +35,6 @@ extern int sysctl_somaxconn;
 extern char sysctl_divert_version[];
 #endif /* CONFIG_NET_DIVERT */
 
-/*
- * This strdup() is used for creating copies of network 
- * device names to be handed over to sysctl.
- */
- 
-char *net_sysctl_strdup(const char *s)
-{
-	char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
-	if (rv)
-		strcpy(rv, s);
-	return rv;
-}
-
 ctl_table core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -177,6 +164,4 @@ ctl_table core_table[] = {
 	{ .ctl_name = 0 }
 };
 
-EXPORT_SYMBOL(net_sysctl_strdup);
-
 #endif
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 650dcb12d9a..d8a10e3dd77 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1471,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev,
 	 * by sysctl and we wouldn't want anyone to change it under our feet
 	 * (see SIOCSIFNAME).
 	 */	
-	dev_name = net_sysctl_strdup(dev_name);
+	dev_name = kstrdup(dev_name, GFP_KERNEL);
 	if (!dev_name)
 	    goto free;
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 14f5c53235f..a54d4ef3fd3 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -57,6 +57,7 @@
 #endif
 #include <linux/delay.h>
 #include <linux/notifier.h>
+#include <linux/string.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -3437,7 +3438,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf
 	 * by sysctl and we wouldn't want anyone to change it under our feet
 	 * (see SIOCSIFNAME).
 	 */	
-	dev_name = net_sysctl_strdup(dev_name);
+	dev_name = kstrdup(dev_name, GFP_KERNEL);
 	if (!dev_name)
 	    goto free;
 
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2b99b4028d3..d6baf6fdf8a 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -8,6 +8,7 @@
 #include <linux/err.h>
 #include <linux/seq_file.h>
 #include <linux/hash.h>
+#include <linux/string.h>
 
 #define RPCDBG_FACILITY	RPCDBG_AUTH
 
@@ -20,14 +21,6 @@
  */
 
 
-static char *strdup(char *s)
-{
-	char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
-	if (rv)
-		strcpy(rv, s);
-	return rv;
-}
-
 struct unix_domain {
 	struct auth_domain	h;
 	int	addr_changes;
@@ -55,7 +48,7 @@ struct auth_domain *unix_domain_find(char *name)
 	if (new == NULL)
 		return NULL;
 	cache_init(&new->h.h);
-	new->h.name = strdup(name);
+	new->h.name = kstrdup(name, GFP_KERNEL);
 	new->h.flavour = RPC_AUTH_UNIX;
 	new->addr_changes = 0;
 	new->h.h.expiry_time = NEVER;
-- 
cgit v1.2.3


From 317a76f9a44b437d6301718f4e5d08bd93f98da7 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 12:19:55 -0700
Subject: [TCP]: Add pluggable congestion control algorithm infrastructure.

Allow TCP to have multiple pluggable congestion control algorithms.
Algorithms are defined by a set of operations and can be built in
or modules.  The legacy "new RENO" algorithm is used as a starting
point and fallback.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Makefile          |   3 +-
 net/ipv4/sysctl_net_ipv4.c | 114 +++----
 net/ipv4/tcp.c             |   2 +
 net/ipv4/tcp_cong.c        | 195 ++++++++++++
 net/ipv4/tcp_diag.c        |  20 +-
 net/ipv4/tcp_input.c       | 737 ++++-----------------------------------------
 net/ipv4/tcp_ipv4.c        |   3 +
 net/ipv4/tcp_minisocks.c   |   4 +-
 net/ipv4/tcp_output.c      |  23 +-
 net/ipv6/tcp_ipv6.c        |   2 +-
 10 files changed, 313 insertions(+), 790 deletions(-)
 create mode 100644 net/ipv4/tcp_cong.c

(limited to 'net')

diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 65d57d8e1ad..89c0b4cb470 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -5,7 +5,8 @@
 obj-y     := utils.o route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
 	     ip_output.o ip_sockglue.o \
-	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
+	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 23068bddbf0..e3289453241 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
 	return 1;
 }
 
+static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
+				       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char val[TCP_CA_NAME_MAX];
+	ctl_table tbl = {
+		.data = val,
+		.maxlen = TCP_CA_NAME_MAX,
+	};
+	int ret;
+
+	tcp_get_default_congestion_control(val);
+
+	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	if (write && ret == 0)
+		ret = tcp_set_default_congestion_control(val);
+	return ret;
+}
+
+int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
+				  void __user *oldval, size_t __user *oldlenp,
+				  void __user *newval, size_t newlen,
+				  void **context)
+{
+	char val[TCP_CA_NAME_MAX];
+	ctl_table tbl = {
+		.data = val,
+		.maxlen = TCP_CA_NAME_MAX,
+	};
+	int ret;
+
+	tcp_get_default_congestion_control(val);
+	ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
+			    context);
+	if (ret == 0 && newval && newlen)
+		ret = tcp_set_default_congestion_control(val);
+	return ret;
+}
+
+
 ctl_table ipv4_table[] = {
         {
 		.ctl_name	= NET_IPV4_TCP_TIMESTAMPS,
@@ -611,70 +650,6 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-	{
-		.ctl_name	= NET_TCP_WESTWOOD, 
-		.procname	= "tcp_westwood",
-		.data		= &sysctl_tcp_westwood,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_VEGAS,
-		.procname	= "tcp_vegas_cong_avoid",
-		.data		= &sysctl_tcp_vegas_cong_avoid,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_VEGAS_ALPHA,
-		.procname	= "tcp_vegas_alpha",
-		.data		= &sysctl_tcp_vegas_alpha,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_VEGAS_BETA,
-		.procname	= "tcp_vegas_beta",
-		.data		= &sysctl_tcp_vegas_beta,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_VEGAS_GAMMA,
-		.procname	= "tcp_vegas_gamma",
-		.data		= &sysctl_tcp_vegas_gamma,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_BIC,
-		.procname	= "tcp_bic",
-		.data		= &sysctl_tcp_bic,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_BIC_FAST_CONVERGENCE,
-		.procname	= "tcp_bic_fast_convergence",
-		.data		= &sysctl_tcp_bic_fast_convergence,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_BIC_LOW_WINDOW,
-		.procname	= "tcp_bic_low_window",
-		.data		= &sysctl_tcp_bic_low_window,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
 	{
 		.ctl_name	= NET_TCP_MODERATE_RCVBUF,
 		.procname	= "tcp_moderate_rcvbuf",
@@ -692,13 +667,14 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_TCP_BIC_BETA,
-		.procname	= "tcp_bic_beta",
-		.data		= &sysctl_tcp_bic_beta,
-		.maxlen		= sizeof(int),
+		.ctl_name	= NET_TCP_CONG_CONTROL,
+		.procname	= "tcp_congestion_control",
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.maxlen		= TCP_CA_NAME_MAX,
+		.proc_handler	= &proc_tcp_congestion_control,
+		.strategy	= &sysctl_tcp_congestion_control,
 	},
+
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 674bbd8cfd3..f3dbc8dc126 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2333,6 +2333,8 @@ void __init tcp_init(void)
 	printk(KERN_INFO "TCP: Hash tables configured "
 	       "(established %d bind %d)\n",
 	       tcp_ehash_size << 1, tcp_bhash_size);
+
+	tcp_register_congestion_control(&tcp_reno);
 }
 
 EXPORT_SYMBOL(tcp_accept);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 00000000000..665394a63ae
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,195 @@
+/*
+ * Plugable TCP congestion control support and newReno
+ * congestion control.
+ * Based on ideas from I/O scheduler suport and Web100.
+ *
+ * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <net/tcp.h>
+
+static DEFINE_SPINLOCK(tcp_cong_list_lock);
+static LIST_HEAD(tcp_cong_list);
+
+/* Simple linear search, don't expect many entries! */
+static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+{
+	struct tcp_congestion_ops *e;
+
+	list_for_each_entry(e, &tcp_cong_list, list) {
+		if (strcmp(e->name, name) == 0)
+			return e;
+	}
+
+	return NULL;
+}
+
+/*
+ * Attach new congestion control algorthim to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+	int ret = 0;
+
+	/* all algorithms must implement ssthresh and cong_avoid ops */
+	if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
+		printk(KERN_ERR "TCP %s does not implement required ops\n",
+		       ca->name);
+		return -EINVAL;
+	}
+
+	spin_lock(&tcp_cong_list_lock);
+	if (tcp_ca_find(ca->name)) {
+		printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
+		ret = -EEXIST;
+	} else {
+		list_add_rcu(&ca->list, &tcp_cong_list);
+		printk(KERN_INFO "TCP %s registered\n", ca->name);
+	}
+	spin_unlock(&tcp_cong_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
+
+/*
+ * Remove congestion control algorithm, called from
+ * the module's remove function.  Module ref counts are used
+ * to ensure that this can't be done till all sockets using
+ * that method are closed.
+ */
+void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
+{
+	spin_lock(&tcp_cong_list_lock);
+	list_del_rcu(&ca->list);
+	spin_unlock(&tcp_cong_list_lock);
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+
+/* Assign choice of congestion control. */
+void tcp_init_congestion_control(struct tcp_sock *tp)
+{
+	struct tcp_congestion_ops *ca;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		if (try_module_get(ca->owner)) {
+			tp->ca_ops = ca;
+			break;
+		}
+
+	}
+	rcu_read_unlock();
+
+	if (tp->ca_ops->init)
+		tp->ca_ops->init(tp);
+}
+
+/* Manage refcounts on socket close. */
+void tcp_cleanup_congestion_control(struct tcp_sock *tp)
+{
+	if (tp->ca_ops->release)
+		tp->ca_ops->release(tp);
+	module_put(tp->ca_ops->owner);
+}
+
+/* Used by sysctl to change default congestion control */
+int tcp_set_default_congestion_control(const char *name)
+{
+	struct tcp_congestion_ops *ca;
+	int ret = -ENOENT;
+
+	spin_lock(&tcp_cong_list_lock);
+	ca = tcp_ca_find(name);
+#ifdef CONFIG_KMOD
+	if (!ca) {
+		spin_unlock(&tcp_cong_list_lock);
+
+		request_module("tcp_%s", name);
+		spin_lock(&tcp_cong_list_lock);
+		ca = tcp_ca_find(name);
+	}
+#endif
+
+	if (ca) {
+		list_move(&ca->list, &tcp_cong_list);
+		ret = 0;
+	}
+	spin_unlock(&tcp_cong_list_lock);
+
+	return ret;
+}
+
+/* Get current default congestion control */
+void tcp_get_default_congestion_control(char *name)
+{
+	struct tcp_congestion_ops *ca;
+	/* We will always have reno... */
+	BUG_ON(list_empty(&tcp_cong_list));
+
+	rcu_read_lock();
+	ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
+	strncpy(name, ca->name, TCP_CA_NAME_MAX);
+	rcu_read_unlock();
+}
+
+/*
+ * TCP Reno congestion control
+ * This is special case used for fallback as well.
+ */
+/* This is Jacobson's slow start and congestion avoidance.
+ * SIGCOMM '88, p. 328.
+ */
+void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
+			 int flag)
+{
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+        if (tp->snd_cwnd <= tp->snd_ssthresh) {
+                /* In "safe" area, increase. */
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+	} else {
+                /* In dangerous area, increase slowly.
+		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+		 */
+		if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+		} else
+			tp->snd_cwnd_cnt++;
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
+
+/* Slow start threshold is half the congestion window (min 2) */
+u32 tcp_reno_ssthresh(struct tcp_sock *tp)
+{
+	return max(tp->snd_cwnd >> 1U, 2U);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+
+/* Lower bound on congestion window. */
+u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
+{
+	return tp->snd_ssthresh/2;
+}
+EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
+
+struct tcp_congestion_ops tcp_reno = {
+	.name		= "reno",
+	.owner		= THIS_MODULE,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+};
+
+EXPORT_SYMBOL_GPL(tcp_reno);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 634befc0792..867acc0f79d 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -42,7 +42,6 @@ struct tcpdiag_entry
 
 static struct sock *tcpnl;
 
-
 #define TCPDIAG_PUT(skb, attrtype, attrlen) \
 ({ int rtalen = RTA_LENGTH(attrlen);        \
    struct rtattr *rta;                      \
@@ -61,7 +60,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 	struct nlmsghdr  *nlh;
 	struct tcp_info  *info = NULL;
 	struct tcpdiag_meminfo  *minfo = NULL;
-	struct tcpvegas_info *vinfo = NULL;
 	unsigned char	 *b = skb->tail;
 
 	nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
@@ -73,9 +71,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 		if (ext & (1<<(TCPDIAG_INFO-1)))
 			info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
 		
-		if ((tcp_is_westwood(tp) || tcp_is_vegas(tp))
-		    && (ext & (1<<(TCPDIAG_VEGASINFO-1))))
-			vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo));
 	}
 	r->tcpdiag_family = sk->sk_family;
 	r->tcpdiag_state = sk->sk_state;
@@ -166,19 +161,8 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 	if (info) 
 		tcp_get_info(sk, info);
 
-	if (vinfo) {
-		if (tcp_is_vegas(tp)) {
-			vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
-			vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
-			vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
-			vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
-		} else {
-			vinfo->tcpv_enabled = 0;
-			vinfo->tcpv_rttcnt = 0;
-			vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
-			vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
-		}
-	}
+	if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
+		tp->ca_ops->get_info(tp, ext, skb);
 
 	nlh->nlmsg_len = skb->tail - b;
 	return skb->len;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630a..7bbbbc33eb4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
  *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
  *					engine. Lots of bugs are found.
  *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
- *		Angelo Dell'Aera:	TCP Westwood+ support
  */
 
 #include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
 int sysctl_tcp_max_orphans = NR_FILE;
 int sysctl_tcp_frto;
 int sysctl_tcp_nometrics_save;
-int sysctl_tcp_westwood;
-int sysctl_tcp_vegas_cong_avoid;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
 
-/* Default values of the Vegas variables, in fixed-point representation
- * with V_PARAM_SHIFT bits to the right of the binary point.
- */
-#define V_PARAM_SHIFT 1
-int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
-int sysctl_tcp_vegas_beta  = 3<<V_PARAM_SHIFT;
-int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
-int sysctl_tcp_bic = 1;
-int sysctl_tcp_bic_fast_convergence = 1;
-int sysctl_tcp_bic_low_window = 14;
-int sysctl_tcp_bic_beta = 819;		/* = 819/1024 (BICTCP_BETA_SCALE) */
-
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
 #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-static void init_bictcp(struct tcp_sock *tp)
-{
-	tp->bictcp.cnt = 0;
-
-	tp->bictcp.last_max_cwnd = 0;
-	tp->bictcp.last_cwnd = 0;
-	tp->bictcp.last_stamp = 0;
-}
-
 /* 5. Recalculate window clamp after socket hit its memory bounds. */
 static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 {
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
 		tcp_grow_window(sk, tp, skb);
 }
 
-/* When starting a new connection, pin down the current choice of 
- * congestion algorithm.
- */
-void tcp_ca_init(struct tcp_sock *tp)
-{
-	if (sysctl_tcp_westwood) 
-		tp->adv_cong = TCP_WESTWOOD;
-	else if (sysctl_tcp_bic)
-		tp->adv_cong = TCP_BIC;
-	else if (sysctl_tcp_vegas_cong_avoid) {
-		tp->adv_cong = TCP_VEGAS;
-		tp->vegas.baseRTT = 0x7fffffff;
-		tcp_vegas_enable(tp);
-	} 
-}
-
-/* Do RTT sampling needed for Vegas.
- * Basically we:
- *   o min-filter RTT samples from within an RTT to get the current
- *     propagation delay + queuing delay (we are min-filtering to try to
- *     avoid the effects of delayed ACKs)
- *   o min-filter RTT samples from a much longer window (forever for now)
- *     to find the propagation delay (baseRTT)
- */
-static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
-{
-	__u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
-
-	/* Filter to find propagation delay: */
-	if (vrtt < tp->vegas.baseRTT) 
-		tp->vegas.baseRTT = vrtt;
-
-	/* Find the min RTT during the last RTT to find
-	 * the current prop. delay + queuing delay:
-	 */
-	tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
-	tp->vegas.cntRTT++;
-}
-
 /* Called to compute a smoothed rtt estimate. The data fed to this
  * routine either comes from timestamps, or from segments that were
  * known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
  * To save cycles in the RFC 1323 implementation it was better to break
  * it up into three procedures. -- erics
  */
-static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
+static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
 {
 	long m = mrtt; /* RTT */
 
-	if (tcp_vegas_enabled(tp))
-		vegas_rtt_calc(tp, mrtt);
-
 	/*	The following amusing code comes from Jacobson's
 	 *	article in SIGCOMM '88.  Note that rtt and mdev
 	 *	are scaled versions of rtt and mean deviation.
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
 		tp->rtt_seq = tp->snd_nxt;
 	}
 
-	tcp_westwood_update_rtt(tp, tp->srtt >> 3);
+	if (tp->ca_ops->rtt_sample)
+		tp->ca_ops->rtt_sample(tp, *usrtt);
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk)
             tp->snd_una == tp->high_seq ||
             (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
 		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		if (!tcp_westwood_ssthresh(tp))
-			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+		tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+		tcp_ca_event(tp, CA_EVENT_FRTO);
 	}
 
 	/* Have to clear retransmission markers here to keep the bookkeeping
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk)
 	tcp_set_ca_state(tp, TCP_CA_Loss);
 	tp->high_seq = tp->frto_highmark;
 	TCP_ECN_queue_cwr(tp);
-
-	init_bictcp(tp);
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how)
 	if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
 	    (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
 		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+		tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+		tcp_ca_event(tp, CA_EVENT_LOSS);
 	}
 	tp->snd_cwnd	   = 1;
 	tp->snd_cwnd_cnt   = 0;
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 }
 
 /* Decrease cwnd each second ack. */
-
 static void tcp_cwnd_down(struct tcp_sock *tp)
 {
 	int decr = tp->snd_cwnd_cnt + 1;
-	__u32 limit;
-
-	/*
-	 * TCP Westwood
-	 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
-	 * in packets we use mss_cache). If sysctl_tcp_westwood is off
-	 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
-	 * still used as usual. It prevents other strange cases in which
-	 * BWE*RTTmin could assume value 0. It should not happen but...
-	 */
-
-	if (!(limit = tcp_westwood_bw_rttmin(tp)))
-		limit = tp->snd_ssthresh/2;
 
 	tp->snd_cwnd_cnt = decr&1;
 	decr >>= 1;
 
-	if (decr && tp->snd_cwnd > limit)
+	if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
 		tp->snd_cwnd -= decr;
 
 	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
 static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
 {
 	if (tp->prior_ssthresh) {
-		if (tcp_is_bic(tp))
-			tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd);
+		if (tp->ca_ops->undo_cwnd)
+			tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
 		else
 			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
 
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
 
 static inline void tcp_complete_cwr(struct tcp_sock *tp)
 {
-	if (tcp_westwood_cwnd(tp)) 
-		tp->snd_ssthresh = tp->snd_cwnd;
-	else
-		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
 }
 
 static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		if (tp->ca_state < TCP_CA_CWR) {
 			if (!(flag&FLAG_ECE))
 				tp->prior_ssthresh = tcp_current_ssthresh(tp);
-			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+			tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
 			TCP_ECN_queue_cwr(tp);
 		}
 
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 /* Read draft-ietf-tcplw-high-performance before mucking
  * with this code. (Superceeds RFC1323)
  */
-static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
+static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
 {
 	__u32 seq_rtt;
 
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
 	 * in window is lost... Voila.	 			--ANK (010210)
 	 */
 	seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-	tcp_rtt_estimator(tp, seq_rtt);
+	tcp_rtt_estimator(tp, seq_rtt, usrtt);
 	tcp_set_rto(tp);
 	tp->backoff = 0;
 	tcp_bound_rto(tp);
 }
 
-static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
+static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
 {
 	/* We don't have a timestamp. Can only use
 	 * packets that are not retransmitted to determine
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		return;
 
-	tcp_rtt_estimator(tp, seq_rtt);
+	tcp_rtt_estimator(tp, seq_rtt, usrtt);
 	tcp_set_rto(tp);
 	tp->backoff = 0;
 	tcp_bound_rto(tp);
 }
 
 static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
-				      int flag, s32 seq_rtt)
+				      int flag, s32 seq_rtt, u32 *usrtt)
 {
 	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
 	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(tp, flag);
+		tcp_ack_saw_tstamp(tp, usrtt, flag);
 	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(tp, seq_rtt, flag);
+		tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
 }
 
-/*
- * Compute congestion window to use.
- *
- * This is from the implementation of BICTCP in
- * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
- *  "Binary Increase Congestion Control for Fast, Long Distance
- *  Networks" in InfoComm 2004
- * Available from:
- *  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
- *
- * Unless BIC is enabled and congestion window is large
- * this behaves the same as the original Reno.
- */
-static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
-{
-	/* orignal Reno behaviour */
-	if (!tcp_is_bic(tp))
-		return tp->snd_cwnd;
-
-	if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
-	   (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
-		return tp->bictcp.cnt;
-
-	tp->bictcp.last_cwnd = tp->snd_cwnd;
-	tp->bictcp.last_stamp = tcp_time_stamp;
-      
-	/* start off normal */
-	if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
-		tp->bictcp.cnt = tp->snd_cwnd;
-
-	/* binary increase */
-	else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
-		__u32 	dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
-			/ BICTCP_B;
-
-		if (dist > BICTCP_MAX_INCREMENT)
-			/* linear increase */
-			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
-		else if (dist <= 1U)
-			/* binary search increase */
-			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
-				/ BICTCP_B;
-		else
-			/* binary search increase */
-			tp->bictcp.cnt = tp->snd_cwnd / dist;
-	} else {
-		/* slow start amd linear increase */
-		if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
-			/* slow start */
-			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
-				/ BICTCP_B;
-		else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
-			 		+ BICTCP_MAX_INCREMENT*(BICTCP_B-1))
-			/* slow start */
-			tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
-				/ (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
-		else
-			/* linear increase */
-			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
-	}
-	return tp->bictcp.cnt;
-}
-
-/* This is Jacobson's slow start and congestion avoidance. 
- * SIGCOMM '88, p. 328.
- */
-static inline void reno_cong_avoid(struct tcp_sock *tp)
+static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+				  u32 in_flight, int good)
 {
-        if (tp->snd_cwnd <= tp->snd_ssthresh) {
-                /* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
-                /* In dangerous area, increase slowly.
-		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
-		 */
-		if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
-			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-				tp->snd_cwnd++;
-			tp->snd_cwnd_cnt=0;
-		} else
-			tp->snd_cwnd_cnt++;
-        }
+	tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-/* This is based on the congestion detection/avoidance scheme described in
- *    Lawrence S. Brakmo and Larry L. Peterson.
- *    "TCP Vegas: End to end congestion avoidance on a global internet."
- *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
- *    October 1995. Available from:
- *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
- *
- * See http://www.cs.arizona.edu/xkernel/ for their implementation.
- * The main aspects that distinguish this implementation from the
- * Arizona Vegas implementation are:
- *   o We do not change the loss detection or recovery mechanisms of
- *     Linux in any way. Linux already recovers from losses quite well,
- *     using fine-grained timers, NewReno, and FACK.
- *   o To avoid the performance penalty imposed by increasing cwnd
- *     only every-other RTT during slow start, we increase during
- *     every RTT during slow start, just like Reno.
- *   o Largely to allow continuous cwnd growth during slow start,
- *     we use the rate at which ACKs come back as the "actual"
- *     rate, rather than the rate at which data is sent.
- *   o To speed convergence to the right rate, we set the cwnd
- *     to achieve the right ("actual") rate when we exit slow start.
- *   o To filter out the noise caused by delayed ACKs, we use the
- *     minimum RTT sample observed during the last RTT to calculate
- *     the actual rate.
- *   o When the sender re-starts from idle, it waits until it has
- *     received ACKs for an entire flight of new data before making
- *     a cwnd adjustment decision. The original Vegas implementation
- *     assumed senders never went idle.
- */
-static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
-{
-	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
-	 *
-	 * These are so named because they represent the approximate values
-	 * of snd_una and snd_nxt at the beginning of the current RTT. More
-	 * precisely, they represent the amount of data sent during the RTT.
-	 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
-	 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
-	 * bytes of data have been ACKed during the course of the RTT, giving
-	 * an "actual" rate of:
-	 *
-	 *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
-	 *
-	 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
-	 * because delayed ACKs can cover more than one segment, so they
-	 * don't line up nicely with the boundaries of RTTs.
-	 *
-	 * Another unfortunate fact of life is that delayed ACKs delay the
-	 * advance of the left edge of our send window, so that the number
-	 * of bytes we send in an RTT is often less than our cwnd will allow.
-	 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
-	 */
-
-	if (after(ack, tp->vegas.beg_snd_nxt)) {
-		/* Do the Vegas once-per-RTT cwnd adjustment. */
-		u32 old_wnd, old_snd_cwnd;
-
-		
-		/* Here old_wnd is essentially the window of data that was
-		 * sent during the previous RTT, and has all
-		 * been acknowledged in the course of the RTT that ended
-		 * with the ACK we just received. Likewise, old_snd_cwnd
-		 * is the cwnd during the previous RTT.
-		 */
-		old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
-			tp->mss_cache_std;
-		old_snd_cwnd = tp->vegas.beg_snd_cwnd;
-
-		/* Save the extent of the current window so we can use this
-		 * at the end of the next RTT.
-		 */
-		tp->vegas.beg_snd_una  = tp->vegas.beg_snd_nxt;
-		tp->vegas.beg_snd_nxt  = tp->snd_nxt;
-		tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
-
-		/* Take into account the current RTT sample too, to
-		 * decrease the impact of delayed acks. This double counts
-		 * this sample since we count it for the next window as well,
-		 * but that's not too awful, since we're taking the min,
-		 * rather than averaging.
-		 */
-		vegas_rtt_calc(tp, seq_rtt);
-
-		/* We do the Vegas calculations only if we got enough RTT
-		 * samples that we can be reasonably sure that we got
-		 * at least one RTT sample that wasn't from a delayed ACK.
-		 * If we only had 2 samples total,
-		 * then that means we're getting only 1 ACK per RTT, which
-		 * means they're almost certainly delayed ACKs.
-		 * If  we have 3 samples, we should be OK.
-		 */
-
-		if (tp->vegas.cntRTT <= 2) {
-			/* We don't have enough RTT samples to do the Vegas
-			 * calculation, so we'll behave like Reno.
-			 */
-			if (tp->snd_cwnd > tp->snd_ssthresh)
-				tp->snd_cwnd++;
-		} else {
-			u32 rtt, target_cwnd, diff;
-
-			/* We have enough RTT samples, so, using the Vegas
-			 * algorithm, we determine if we should increase or
-			 * decrease cwnd, and by how much.
-			 */
-
-			/* Pluck out the RTT we are using for the Vegas
-			 * calculations. This is the min RTT seen during the
-			 * last RTT. Taking the min filters out the effects
-			 * of delayed ACKs, at the cost of noticing congestion
-			 * a bit later.
-			 */
-			rtt = tp->vegas.minRTT;
-
-			/* Calculate the cwnd we should have, if we weren't
-			 * going too fast.
-			 *
-			 * This is:
-			 *     (actual rate in segments) * baseRTT
-			 * We keep it as a fixed point number with
-			 * V_PARAM_SHIFT bits to the right of the binary point.
-			 */
-			target_cwnd = ((old_wnd * tp->vegas.baseRTT)
-				       << V_PARAM_SHIFT) / rtt;
-
-			/* Calculate the difference between the window we had,
-			 * and the window we would like to have. This quantity
-			 * is the "Diff" from the Arizona Vegas papers.
-			 *
-			 * Again, this is a fixed point number with
-			 * V_PARAM_SHIFT bits to the right of the binary
-			 * point.
-			 */
-			diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
-
-			if (tp->snd_cwnd < tp->snd_ssthresh) {
-				/* Slow start.  */
-				if (diff > sysctl_tcp_vegas_gamma) {
-					/* Going too fast. Time to slow down
-					 * and switch to congestion avoidance.
-					 */
-					tp->snd_ssthresh = 2;
-
-					/* Set cwnd to match the actual rate
-					 * exactly:
-					 *   cwnd = (actual rate) * baseRTT
-					 * Then we add 1 because the integer
-					 * truncation robs us of full link
-					 * utilization.
-					 */
-					tp->snd_cwnd = min(tp->snd_cwnd,
-							   (target_cwnd >>
-							    V_PARAM_SHIFT)+1);
-
-				}
-			} else {
-				/* Congestion avoidance. */
-				u32 next_snd_cwnd;
-
-				/* Figure out where we would like cwnd
-				 * to be.
-				 */
-				if (diff > sysctl_tcp_vegas_beta) {
-					/* The old window was too fast, so
-					 * we slow down.
-					 */
-					next_snd_cwnd = old_snd_cwnd - 1;
-				} else if (diff < sysctl_tcp_vegas_alpha) {
-					/* We don't have enough extra packets
-					 * in the network, so speed up.
-					 */
-					next_snd_cwnd = old_snd_cwnd + 1;
-				} else {
-					/* Sending just as fast as we
-					 * should be.
-					 */
-					next_snd_cwnd = old_snd_cwnd;
-				}
-
-				/* Adjust cwnd upward or downward, toward the
-				 * desired value.
-				 */
-				if (next_snd_cwnd > tp->snd_cwnd)
-					tp->snd_cwnd++;
-				else if (next_snd_cwnd < tp->snd_cwnd)
-					tp->snd_cwnd--;
-			}
-		}
-
-		/* Wipe the slate clean for the next RTT. */
-		tp->vegas.cntRTT = 0;
-		tp->vegas.minRTT = 0x7fffffff;
-	}
-
-	/* The following code is executed for every ack we receive,
-	 * except for conditions checked in should_advance_cwnd()
-	 * before the call to tcp_cong_avoid(). Mainly this means that
-	 * we only execute this code if the ack actually acked some
-	 * data.
-	 */
-
-	/* If we are in slow start, increase our cwnd in response to this ACK.
-	 * (If we are not in slow start then we are in congestion avoidance,
-	 * and adjust our congestion window only once per RTT. See the code
-	 * above.)
-	 */
-	if (tp->snd_cwnd <= tp->snd_ssthresh) 
-		tp->snd_cwnd++;
-
-	/* to keep cwnd from growing without bound */
-	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
-
-	/* Make sure that we are never so timid as to reduce our cwnd below
-	 * 2 MSS.
-	 *
-	 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
-	 */
-	tp->snd_cwnd = max(tp->snd_cwnd, 2U);
-
-	tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
-static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
-{
-	if (tcp_vegas_enabled(tp))
-		vegas_cong_avoid(tp, ack, seq_rtt);
-	else
-		reno_cong_avoid(tp);
-}
-
 /* Restart timer after forward progress on connection.
  * RFC2988 recommends to restart timer to now+rto.
  */
@@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
 
 
 /* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	__u32 now = tcp_time_stamp;
 	int acked = 0;
 	__s32 seq_rtt = -1;
+	struct timeval usnow;
+	u32 pkts_acked = 0;
+
+	if (seq_usrtt)
+		do_gettimeofday(&usnow);
 
 	while ((skb = skb_peek(&sk->sk_write_queue)) &&
 	       skb != sk->sk_send_head) {
@@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 		 */
 		if (!(scb->flags & TCPCB_FLAG_SYN)) {
 			acked |= FLAG_DATA_ACKED;
+			++pkts_acked;
 		} else {
 			acked |= FLAG_SYN_ACKED;
 			tp->retrans_stamp = 0;
@@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 				seq_rtt = -1;
 			} else if (seq_rtt < 0)
 				seq_rtt = now - scb->when;
+			if (seq_usrtt)
+				*seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
+					+ (usnow.tv_usec - skb->stamp.tv_usec);
+
 			if (sacked & TCPCB_SACKED_ACKED)
 				tp->sacked_out -= tcp_skb_pcount(skb);
 			if (sacked & TCPCB_LOST)
@@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 	}
 
 	if (acked&FLAG_ACKED) {
-		tcp_ack_update_rtt(tp, acked, seq_rtt);
+		tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
 		tcp_ack_packets_out(sk, tp);
+
+		if (tp->ca_ops->pkts_acked)
+			tp->ca_ops->pkts_acked(tp, pkts_acked);
 	}
 
 #if FASTRETRANS_DEBUG > 0
@@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
 	tp->frto_counter = (tp->frto_counter + 1) % 3;
 }
 
-/*
- * TCP Westwood+
- */
-
-/*
- * @init_westwood
- * This function initializes fields used in TCP Westwood+. We can't
- * get no information about RTTmin at this time so we simply set it to
- * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
- * since in this way we're sure it will be updated in a consistent
- * way as soon as possible. It will reasonably happen within the first
- * RTT period of the connection lifetime.
- */
-
-static void init_westwood(struct sock *sk)
-{
-        struct tcp_sock *tp = tcp_sk(sk);
-
-        tp->westwood.bw_ns_est = 0;
-        tp->westwood.bw_est = 0;
-        tp->westwood.accounted = 0;
-        tp->westwood.cumul_ack = 0;
-        tp->westwood.rtt_win_sx = tcp_time_stamp;
-        tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
-        tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
-        tp->westwood.snd_una = tp->snd_una;
-}
-
-/*
- * @westwood_do_filter
- * Low-pass filter. Implemented using constant coeffients.
- */
-
-static inline __u32 westwood_do_filter(__u32 a, __u32 b)
-{
-	return (((7 * a) + b) >> 3);
-}
-
-static void westwood_filter(struct sock *sk, __u32 delta)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->westwood.bw_ns_est =
-		westwood_do_filter(tp->westwood.bw_ns_est, 
-				   tp->westwood.bk / delta);
-	tp->westwood.bw_est =
-		westwood_do_filter(tp->westwood.bw_est,
-				   tp->westwood.bw_ns_est);
-}
-
-/* 
- * @westwood_update_rttmin
- * It is used to update RTTmin. In this case we MUST NOT use
- * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
- */
-
-static inline __u32 westwood_update_rttmin(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	__u32 rttmin = tp->westwood.rtt_min;
-
-	if (tp->westwood.rtt != 0 &&
-	    (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
-		rttmin = tp->westwood.rtt;
-
-	return rttmin;
-}
-
-/*
- * @westwood_acked
- * Evaluate increases for dk. 
- */
-
-static inline __u32 westwood_acked(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-
-	return tp->snd_una - tp->westwood.snd_una;
-}
-
-/*
- * @westwood_new_window
- * It evaluates if we are receiving data inside the same RTT window as
- * when we started.
- * Return value:
- * It returns 0 if we are still evaluating samples in the same RTT
- * window, 1 if the sample has to be considered in the next window.
- */
-
-static int westwood_new_window(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	__u32 left_bound;
-	__u32 rtt;
-	int ret = 0;
-
-	left_bound = tp->westwood.rtt_win_sx;
-	rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
-
-	/*
-	 * A RTT-window has passed. Be careful since if RTT is less than
-	 * 50ms we don't filter but we continue 'building the sample'.
-	 * This minimum limit was choosen since an estimation on small
-	 * time intervals is better to avoid...
-	 * Obvioulsy on a LAN we reasonably will always have
-	 * right_bound = left_bound + WESTWOOD_RTT_MIN
-         */
-
-	if ((left_bound + rtt) < tcp_time_stamp)
-		ret = 1;
-
-	return ret;
-}
-
-/*
- * @westwood_update_window
- * It updates RTT evaluation window if it is the right moment to do
- * it. If so it calls filter for evaluating bandwidth. 
- */
-
-static void __westwood_update_window(struct sock *sk, __u32 now)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	__u32 delta = now - tp->westwood.rtt_win_sx;
-
-        if (delta) {
-		if (tp->westwood.rtt)
-			westwood_filter(sk, delta);
-
-		tp->westwood.bk = 0;
-		tp->westwood.rtt_win_sx = tcp_time_stamp;
-	}
-}
-
-
-static void westwood_update_window(struct sock *sk, __u32 now)
-{
-	if (westwood_new_window(sk)) 
-		__westwood_update_window(sk, now);
-}
-
-/*
- * @__tcp_westwood_fast_bw
- * It is called when we are in fast path. In particular it is called when
- * header prediction is successfull. In such case infact update is
- * straight forward and doesn't need any particular care.
- */
-
-static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	westwood_update_window(sk, tcp_time_stamp);
-
-	tp->westwood.bk += westwood_acked(sk);
-	tp->westwood.snd_una = tp->snd_una;
-	tp->westwood.rtt_min = westwood_update_rttmin(sk);
-}
-
-static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
-{
-        if (tcp_is_westwood(tcp_sk(sk)))
-                __tcp_westwood_fast_bw(sk, skb);
-}
-
-
-/*
- * @westwood_dupack_update
- * It updates accounted and cumul_ack when receiving a dupack.
- */
-
-static void westwood_dupack_update(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->westwood.accounted += tp->mss_cache_std;
-	tp->westwood.cumul_ack = tp->mss_cache_std;
-}
-
-static inline int westwood_may_change_cumul(struct tcp_sock *tp)
-{
-	return (tp->westwood.cumul_ack > tp->mss_cache_std);
-}
-
-static inline void westwood_partial_update(struct tcp_sock *tp)
-{
-	tp->westwood.accounted -= tp->westwood.cumul_ack;
-	tp->westwood.cumul_ack = tp->mss_cache_std;
-}
-
-static inline void westwood_complete_update(struct tcp_sock *tp)
-{
-	tp->westwood.cumul_ack -= tp->westwood.accounted;
-	tp->westwood.accounted = 0;
-}
-
-/*
- * @westwood_acked_count
- * This function evaluates cumul_ack for evaluating dk in case of
- * delayed or partial acks.
- */
-
-static inline __u32 westwood_acked_count(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->westwood.cumul_ack = westwood_acked(sk);
-
-        /* If cumul_ack is 0 this is a dupack since it's not moving
-         * tp->snd_una.
-         */
-        if (!(tp->westwood.cumul_ack))
-                westwood_dupack_update(sk);
-
-        if (westwood_may_change_cumul(tp)) {
-		/* Partial or delayed ack */
-		if (tp->westwood.accounted >= tp->westwood.cumul_ack)
-			westwood_partial_update(tp);
-		else
-			westwood_complete_update(tp);
-	}
-
-	tp->westwood.snd_una = tp->snd_una;
-
-	return tp->westwood.cumul_ack;
-}
-
-
-/*
- * @__tcp_westwood_slow_bw
- * It is called when something is going wrong..even if there could
- * be no problems! Infact a simple delayed packet may trigger a
- * dupack. But we need to be careful in such case.
- */
-
-static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	westwood_update_window(sk, tcp_time_stamp);
-
-	tp->westwood.bk += westwood_acked_count(sk);
-	tp->westwood.rtt_min = westwood_update_rttmin(sk);
-}
-
-static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
-{
-        if (tcp_is_westwood(tcp_sk(sk)))
-                __tcp_westwood_slow_bw(sk, skb);
-}
-
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 {
@@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	u32 prior_in_flight;
 	s32 seq_rtt;
+	s32 seq_usrtt = 0;
 	int prior_packets;
 
 	/* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		 */
 		tcp_update_wl(tp, ack, ack_seq);
 		tp->snd_una = ack;
-		tcp_westwood_fast_bw(sk, skb);
 		flag |= FLAG_WIN_UPDATE;
 
+		tcp_ca_event(tp, CA_EVENT_FAST_ACK);
+
 		NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
 	} else {
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
 			flag |= FLAG_ECE;
 
-		tcp_westwood_slow_bw(sk,skb);
+		tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
 	}
 
 	/* We passed data and got it acked, remove any soft error
@@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	prior_in_flight = tcp_packets_in_flight(tp);
 
 	/* See if we can take anything off of the retransmit queue. */
-	flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
+	flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
+				    tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
 
 	if (tp->frto_counter)
 		tcp_process_frto(sk, prior_snd_una);
 
 	if (tcp_ack_is_dubious(tp, flag)) {
 		/* Advanve CWND, if state allows this. */
-		if ((flag & FLAG_DATA_ACKED) &&
-		    (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) &&
-		    tcp_may_raise_cwnd(tp, flag))
-			tcp_cong_avoid(tp, ack, seq_rtt);
+		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
+			tcp_cong_avoid(tp, ack,  seq_rtt, prior_in_flight, 0);
 		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
 	} else {
-		if ((flag & FLAG_DATA_ACKED) && 
-		    (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd))
-			tcp_cong_avoid(tp, ack, seq_rtt);
+		if ((flag & FLAG_DATA_ACKED))
+			tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		tcp_init_metrics(sk);
 
+		tcp_init_congestion_control(tp);
+
 		/* Prevent spurious tcp_cwnd_restart() on first data
 		 * packet.
 		 */
@@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if(tp->af_specific->conn_request(sk, skb) < 0)
 				return 1;
 
-			init_westwood(sk);
-			init_bictcp(tp);
-
 			/* Now we have several options: In theory there is 
 			 * nothing else in the frame. KA9Q has an option to 
 			 * send data with the syn, BSD accepts data with the
@@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		goto discard;
 
 	case TCP_SYN_SENT:
-		init_westwood(sk);
-		init_bictcp(tp);
-
 		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
 		if (queued >= 0)
 			return queued;
@@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 */
 				if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
 				    !tp->srtt)
-					tcp_ack_saw_tstamp(tp, 0);
+					tcp_ack_saw_tstamp(tp, 0, 0);
 
 				if (tp->rx_opt.tstamp_ok)
 					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 				tcp_init_metrics(sk);
 
+				tcp_init_congestion_control(tp);
+
 				/* Prevent spurious tcp_cwnd_restart() on
 				 * first data packet.
 				 */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2d41d5d6ad1..9122814c13a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	tp->mss_cache_std = tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
+	tp->ca_ops = &tcp_reno;
 
 	sk->sk_state = TCP_CLOSE;
 
@@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
 
 	tcp_clear_xmit_timers(sk);
 
+	tcp_cleanup_congestion_control(tp);
+
 	/* Cleanup up the write buffer. */
   	sk_stream_writequeue_purge(sk);
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b3943e7562f..f42a284164b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->frto_counter = 0;
 		newtp->frto_highmark = 0;
 
+		newtp->ca_ops = &tcp_reno;
+
 		tcp_set_ca_state(newtp, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		skb_queue_head_init(&newtp->out_of_order_queue);
@@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		if (newtp->ecn_flags&TCP_ECN_OK)
 			sock_set_flag(newsk, SOCK_NO_LARGESEND);
 
-		tcp_ca_init(newtp);
-
 		TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
 	}
 	return newsk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f17c6577e33..0e17c244875 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
 	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
 	u32 cwnd = tp->snd_cwnd;
 
-	if (tcp_is_vegas(tp)) 
-		tcp_vegas_enable(tp);
+	tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
 
 	tp->snd_ssthresh = tcp_current_ssthresh(tp);
 	restart_cwnd = min(restart_cwnd, cwnd);
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 #define SYSCTL_FLAG_WSCALE	0x2
 #define SYSCTL_FLAG_SACK	0x4
 
+		/* If congestion control is doing timestamping */
+		if (tp->ca_ops->rtt_sample)
+			do_gettimeofday(&skb->stamp);
+
 		sysctl_flags = 0;
 		if (tcb->flags & TCPCB_FLAG_SYN) {
 			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 					    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 		}
 		
-		/*
-		 * If the connection is idle and we are restarting,
-		 * then we don't want to do any Vegas calculations
-		 * until we get fresh RTT samples.  So when we
-		 * restart, we reset our Vegas state to a clean
-		 * slate. After we get acks for this flight of
-		 * packets, _then_ we can make Vegas calculations
-		 * again.
-		 */
-		if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
-			tcp_vegas_enable(tp);
+		if (tcp_packets_in_flight(tp) == 0)
+			tcp_ca_event(tp, CA_EVENT_TX_START);
 
 		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 		skb->h.th = th;
@@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	 * skbs, which it never sent before. --ANK
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+	buff->stamp = skb->stamp;
 
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
 		tp->lost_out -= tcp_skb_pcount(skb);
@@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
 	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(sk);
-	tcp_ca_init(tp);
 
 	tcp_select_initial_window(tcp_full_space(sk),
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk)
 	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
 	tp->snd_nxt = tp->write_seq;
 	tp->pushed_seq = tp->write_seq;
-	tcp_ca_init(tp);
 
 	/* Send it off. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2414937f2a8..fce56039b0e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 	sk->sk_state = TCP_CLOSE;
 
 	tp->af_specific = &ipv6_specific;
-
+	tp->ca_ops = &tcp_reno;
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
-- 
cgit v1.2.3


From 7c99c909fa69a183c1b80bd64fb9f0d11459aff3 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 12:20:36 -0700
Subject: [TCP]: Change tcp_diag to use the existing __RTA_PUT() macro.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_diag.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 867acc0f79d..a4e512036d8 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -43,13 +43,7 @@ struct tcpdiag_entry
 static struct sock *tcpnl;
 
 #define TCPDIAG_PUT(skb, attrtype, attrlen) \
-({ int rtalen = RTA_LENGTH(attrlen);        \
-   struct rtattr *rta;                      \
-   if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
-   rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
-   rta->rta_type = attrtype;                \
-   rta->rta_len = rtalen;                   \
-   RTA_DATA(rta); })
+	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
 
 static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 			int ext, u32 pid, u32 seq, u16 nlmsg_flags)
@@ -167,6 +161,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 	nlh->nlmsg_len = skb->tail - b;
 	return skb->len;
 
+rtattr_failure:
 nlmsg_failure:
 	skb_trim(skb, b - skb->data);
 	return -1;
-- 
cgit v1.2.3


From 056ede6cface66b400cd3b8e60ed077cc5b85c18 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 12:21:28 -0700
Subject: [TCP]: Report congestion control algorithm in tcp_diag.

Enhancement to the tcp_diag interface used by the iproute2 ss command
to report the tcp congestion control being used by a socket.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_diag.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index a4e512036d8..f66945cb158 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -65,6 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 		if (ext & (1<<(TCPDIAG_INFO-1)))
 			info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
 		
+		if (ext & (1<<(TCPDIAG_CONG-1))) {
+			size_t len = strlen(tp->ca_ops->name);
+			strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
+			       tp->ca_ops->name);
+		}
 	}
 	r->tcpdiag_family = sk->sk_family;
 	r->tcpdiag_state = sk->sk_state;
-- 
cgit v1.2.3


From 83803034f4233d810c4adc52008921da060c55d1 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 12:23:25 -0700
Subject: [TCP]: Add TCP BIC congestion control module.

TCP BIC congestion control reworked to use the new congestion control
infrastructure. This version is more up to date than the BIC
code in 2.6.12; it incorporates enhancements from BICTCP 1.1,
to handle low latency links.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig   |  21 ++++
 net/ipv4/Makefile  |   1 +
 net/ipv4/tcp_bic.c | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 353 insertions(+)
 create mode 100644 net/ipv4/tcp_bic.c

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 567b03b1c34..712ebacacb6 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -433,5 +433,26 @@ config IP_TCPDIAG
 config IP_TCPDIAG_IPV6
 	def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
 
+# TCP Reno is builtin (required as fallback)
+menu "TCP congestion control"
+	depends on INET
+
+config TCP_CONG_BIC
+	tristate "Binary Increase Congestion (BIC) control"
+	depends on INET
+	default y
+	---help---
+	BIC-TCP is a sender-side only change that ensures a linear RTT
+	fairness under large windows while offering both scalability and
+	bounded TCP-friendliness. The protocol combines two schemes
+	called additive increase and binary search increase. When the
+	congestion window is large, additive increase with a large
+	increment ensures linear RTT fairness as well as good
+	scalability. Under small congestion windows, binary search
+	increase provides TCP friendliness.
+	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+
+endmenu
+
 source "net/ipv4/ipvs/Kconfig"
 
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 89c0b4cb470..1d1cac5ac06 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_NETFILTER)	+= netfilter/
 obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 00000000000..ec38d45d664
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,331 @@
+/*
+ * Binary Increase Congestion control for TCP
+ *
+ * This is from the implementation of BICTCP in
+ * Lison-Xu, Kahaled Harfoush, and Injong Rhee.
+ *  "Binary Increase Congestion Control for Fast, Long Distance
+ *  Networks" in InfoComm 2004
+ * Available from:
+ *  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
+ *
+ * Unless BIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+
+#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
+					 * max_cwnd = snd_cwnd * beta
+					 */
+#define BICTCP_B		4	 /*
+					  * In binary search,
+					  * go to point (max+min)/N
+					  */
+
+static int fast_convergence = 1;
+static int max_increment = 32;
+static int low_window = 14;
+static int beta = 819;		/* = 819/1024 (BICTCP_BETA_SCALE) */
+static int low_utilization_threshold = 153;
+static int low_utilization_period = 2;
+static int initial_ssthresh = 100;
+static int smooth_part = 20;
+
+module_param(fast_convergence, int, 0644);
+MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
+module_param(max_increment, int, 0644);
+MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
+module_param(low_window, int, 0644);
+MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "beta for multiplicative increase");
+module_param(low_utilization_threshold, int, 0644);
+MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
+module_param(low_utilization_period, int, 0644);
+MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)");
+module_param(initial_ssthresh, int, 0644);
+MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
+module_param(smooth_part, int, 0644);
+MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
+
+
+/* BIC TCP Parameters */
+struct bictcp {
+	u32	cnt;		/* increase cwnd by 1 after ACKs */
+	u32 	last_max_cwnd;	/* last maximum snd_cwnd */
+	u32	loss_cwnd;	/* congestion window at last loss */
+	u32	last_cwnd;	/* the last snd_cwnd */
+	u32	last_time;	/* time when updated last_cwnd */
+	u32	delay_min;	/* min delay */
+	u32	delay_max;	/* max delay */
+	u32	last_delay;
+	u8	low_utilization;/* 0: high; 1: low */
+	u32	low_utilization_start;	/* starting time of low utilization detection*/
+	u32	epoch_start;	/* beginning of an epoch */
+#define ACK_RATIO_SHIFT	4
+	u32	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
+};
+
+static inline void bictcp_reset(struct bictcp *ca)
+{
+	ca->cnt = 0;
+	ca->last_max_cwnd = 0;
+	ca->loss_cwnd = 0;
+	ca->last_cwnd = 0;
+	ca->last_time = 0;
+	ca->delay_min = 0;
+	ca->delay_max = 0;
+	ca->last_delay = 0;
+	ca->low_utilization = 0;
+	ca->low_utilization_start = 0;
+	ca->epoch_start = 0;
+	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
+}
+
+static void bictcp_init(struct tcp_sock *tp)
+{
+	bictcp_reset(tcp_ca(tp));
+	if (initial_ssthresh)
+		tp->snd_ssthresh = initial_ssthresh;
+}
+
+/*
+ * Compute congestion window to use.
+ */
+static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
+{
+	if (ca->last_cwnd == cwnd &&
+	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
+		return;
+
+	ca->last_cwnd = cwnd;
+	ca->last_time = tcp_time_stamp;
+
+	if (ca->epoch_start == 0) /* record the beginning of an epoch */
+		ca->epoch_start = tcp_time_stamp;
+
+	/* start off normal */
+	if (cwnd <= low_window) {
+		ca->cnt = cwnd;
+		return;
+	}
+
+	/* binary increase */
+	if (cwnd < ca->last_max_cwnd) {
+		__u32 	dist = (ca->last_max_cwnd - cwnd)
+			/ BICTCP_B;
+
+		if (dist > max_increment)
+			/* linear increase */
+			ca->cnt = cwnd / max_increment;
+		else if (dist <= 1U)
+			/* binary search increase */
+			ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+		else
+			/* binary search increase */
+			ca->cnt = cwnd / dist;
+	} else {
+		/* slow start AMD linear increase */
+		if (cwnd < ca->last_max_cwnd + BICTCP_B)
+			/* slow start */
+			ca->cnt = (cwnd * smooth_part) / BICTCP_B;
+		else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
+			/* slow start */
+			ca->cnt = (cwnd * (BICTCP_B-1))
+				/ cwnd-ca->last_max_cwnd;
+		else
+			/* linear increase */
+			ca->cnt = cwnd / max_increment;
+	}
+
+	/* if in slow start or link utilization is very low */
+	if ( ca->loss_cwnd == 0 ||
+	     (cwnd > ca->loss_cwnd && ca->low_utilization)) {
+		if (ca->cnt > 20) /* increase cwnd 5% per RTT */
+			ca->cnt = 20;
+	}
+
+	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
+	if (ca->cnt == 0)			/* cannot be zero */
+		ca->cnt = 1;
+}
+
+
+/* Detect low utilization in congestion avoidance */
+static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
+{
+	struct bictcp *ca = tcp_ca(tp);
+	u32 dist, delay;
+
+	/* No time stamp */
+	if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
+	     /* Discard delay samples right after fast recovery */
+	     tcp_time_stamp < ca->epoch_start + HZ ||
+	     /* this delay samples may not be accurate */
+	     flag == 0) {
+		ca->last_delay = 0;
+		goto notlow;
+	}
+
+	delay = ca->last_delay<<3;	/* use the same scale as tp->srtt*/
+	ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+	if (delay == 0) 		/* no previous delay sample */
+		goto notlow;
+
+	/* first time call or link delay decreases */
+	if (ca->delay_min == 0 || ca->delay_min > delay) {
+		ca->delay_min = ca->delay_max = delay;
+		goto notlow;
+	}
+
+	if (ca->delay_max < delay)
+		ca->delay_max = delay;
+
+	/* utilization is low, if avg delay < dist*threshold
+	   for checking_period time */
+	dist = ca->delay_max - ca->delay_min;
+	if (dist <= ca->delay_min>>6 ||
+	    tp->srtt - ca->delay_min >=  (dist*low_utilization_threshold)>>10)
+		goto notlow;
+
+	if (ca->low_utilization_start == 0) {
+		ca->low_utilization = 0;
+		ca->low_utilization_start = tcp_time_stamp;
+	} else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
+			> low_utilization_period*HZ) {
+		ca->low_utilization = 1;
+	}
+
+	return;
+
+ notlow:
+	ca->low_utilization = 0;
+	ca->low_utilization_start = 0;
+
+}
+
+static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
+			      u32 seq_rtt, u32 in_flight, int data_acked)
+{
+	struct bictcp *ca = tcp_ca(tp);
+
+	bictcp_low_utilization(tp, data_acked);
+
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		/* In "safe" area, increase. */
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+	} else {
+		bictcp_update(ca, tp->snd_cwnd);
+
+                /* In dangerous area, increase slowly.
+		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+		 */
+		if (tp->snd_cwnd_cnt >= ca->cnt) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+		} else
+			tp->snd_cwnd_cnt++;
+	}
+
+}
+
+/*
+ *	behave like Reno until low_window is reached,
+ *	then increase congestion window slowly
+ */
+static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
+{
+	struct bictcp *ca = tcp_ca(tp);
+
+	ca->epoch_start = 0;	/* end of epoch */
+
+	/* in case of wrong delay_max*/
+	if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
+		ca->delay_max = ca->delay_min
+			+ ((ca->delay_max - ca->delay_min)* 90) / 100;
+
+	/* Wmax and fast convergence */
+	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
+		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+			/ (2 * BICTCP_BETA_SCALE);
+	else
+		ca->last_max_cwnd = tp->snd_cwnd;
+
+	ca->loss_cwnd = tp->snd_cwnd;
+
+
+	if (tp->snd_cwnd <= low_window)
+		return max(tp->snd_cwnd >> 1U, 2U);
+	else
+		return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
+{
+	struct bictcp *ca = tcp_ca(tp);
+
+	return max(tp->snd_cwnd, ca->last_max_cwnd);
+}
+
+static u32 bictcp_min_cwnd(struct tcp_sock *tp)
+{
+	return tp->snd_ssthresh;
+}
+
+static void bictcp_state(struct tcp_sock *tp, u8 new_state)
+{
+	if (new_state == TCP_CA_Loss)
+		bictcp_reset(tcp_ca(tp));
+}
+
+/* Track delayed acknowledgement ratio using sliding window
+ * ratio = (15*ratio + sample) / 16
+ */
+static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
+{
+	if (cnt > 0 && 	tp->ca_state == TCP_CA_Open) {
+		struct bictcp *ca = tcp_ca(tp);
+		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
+		ca->delayed_ack += cnt;
+	}
+}
+
+
+static struct tcp_congestion_ops bictcp = {
+	.init		= bictcp_init,
+	.ssthresh	= bictcp_recalc_ssthresh,
+	.cong_avoid	= bictcp_cong_avoid,
+	.set_state	= bictcp_state,
+	.undo_cwnd	= bictcp_undo_cwnd,
+	.min_cwnd	= bictcp_min_cwnd,
+	.pkts_acked     = bictcp_acked,
+	.owner		= THIS_MODULE,
+	.name		= "bic",
+};
+
+static int __init bictcp_register(void)
+{
+	BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&bictcp);
+}
+
+static void __exit bictcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&bictcp);
+}
+
+module_init(bictcp_register);
+module_exit(bictcp_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("BIC TCP");
-- 
cgit v1.2.3


From 8727076289ec55298a05cabddf02b374d13c1624 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 12:24:09 -0700
Subject: [TCP]: Add TCP Westwood congestion control module.

This is the existing 2.6.12 Westwood code moved from tcp_input
to the new congestion framework. A lot of the inline functions
have been eliminated to try and make it clearer.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig        |  15 +++
 net/ipv4/Makefile       |   1 +
 net/ipv4/tcp_westwood.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 275 insertions(+)
 create mode 100644 net/ipv4/tcp_westwood.c

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 712ebacacb6..adbe855d931 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -452,6 +452,21 @@ config TCP_CONG_BIC
 	increase provides TCP friendliness.
 	See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
 
+config TCP_CONG_WESTWOOD
+	tristate "TCP Westwood+"
+	depends on INET
+	default m
+	---help---
+	TCP Westwood+ is a sender-side only modification of the TCP Reno
+	protocol stack that optimizes the performance of TCP congestion
+	control. It is based on end-to-end bandwidth estimation to set
+	congestion window and slow start threshold after a congestion
+	episode. Using this estimation, TCP Westwood+ adaptively sets a
+	slow start threshold and a congestion window which takes into
+	account the bandwidth used  at the time congestion is experienced.
+	TCP Westwood+ significantly increases fairness wrt TCP Reno in
+	wired networks and throughput over wireless links.
+
 endmenu
 
 source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 1d1cac5ac06..dedfbe62a10 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 00000000000..ef827242c94
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,259 @@
+/*
+ * TCP Westwood+
+ *
+ *	Angelo Dell'Aera:	TCP Westwood+ support
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp_diag.h>
+#include <net/tcp.h>
+
+/* TCP Westwood structure */
+struct westwood {
+	u32    bw_ns_est;        /* first bandwidth estimation..not too smoothed 8) */
+	u32    bw_est;           /* bandwidth estimate */
+	u32    rtt_win_sx;       /* here starts a new evaluation... */
+	u32    bk;
+	u32    snd_una;          /* used for evaluating the number of acked bytes */
+	u32    cumul_ack;
+	u32    accounted;
+	u32    rtt;
+	u32    rtt_min;          /* minimum observed RTT */
+};
+
+
+/* TCP Westwood functions and constants */
+#define TCP_WESTWOOD_RTT_MIN   (HZ/20)	/* 50ms */
+#define TCP_WESTWOOD_INIT_RTT  (20*HZ)	/* maybe too conservative?! */
+
+/*
+ * @tcp_westwood_create
+ * This function initializes fields used in TCP Westwood+,
+ * it is called after the initial SYN, so the sequence numbers
+ * are correct but new passive connections we have no
+ * information about RTTmin at this time so we simply set it to
+ * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
+ * since in this way we're sure it will be updated in a consistent
+ * way as soon as possible. It will reasonably happen within the first
+ * RTT period of the connection lifetime.
+ */
+static void tcp_westwood_init(struct tcp_sock *tp)
+{
+	struct westwood *w = tcp_ca(tp);
+
+	w->bk = 0;
+        w->bw_ns_est = 0;
+        w->bw_est = 0;
+        w->accounted = 0;
+        w->cumul_ack = 0;
+	w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
+	w->rtt_win_sx = tcp_time_stamp;
+	w->snd_una = tp->snd_una;
+}
+
+/*
+ * @westwood_do_filter
+ * Low-pass filter. Implemented using constant coefficients.
+ */
+static inline u32 westwood_do_filter(u32 a, u32 b)
+{
+	return (((7 * a) + b) >> 3);
+}
+
+static inline void westwood_filter(struct westwood *w, u32 delta)
+{
+	w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
+	w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
+}
+
+/*
+ * @westwood_pkts_acked
+ * Called after processing group of packets.
+ * but all westwood needs is the last sample of srtt.
+ */
+static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
+{
+	struct westwood *w = tcp_ca(tp);
+	if (cnt > 0)
+		w->rtt = tp->srtt >> 3;
+}
+
+/*
+ * @westwood_update_window
+ * It updates RTT evaluation window if it is the right moment to do
+ * it. If so it calls filter for evaluating bandwidth.
+ */
+static void westwood_update_window(struct tcp_sock *tp)
+{
+	struct westwood *w = tcp_ca(tp);
+	s32 delta = tcp_time_stamp - w->rtt_win_sx;
+
+	/*
+	 * See if a RTT-window has passed.
+	 * Be careful since if RTT is less than
+	 * 50ms we don't filter but we continue 'building the sample'.
+	 * This minimum limit was chosen since an estimation on small
+	 * time intervals is better to avoid...
+	 * Obviously on a LAN we reasonably will always have
+	 * right_bound = left_bound + WESTWOOD_RTT_MIN
+	 */
+	if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
+		westwood_filter(w, delta);
+
+		w->bk = 0;
+		w->rtt_win_sx = tcp_time_stamp;
+	}
+}
+
+/*
+ * @westwood_fast_bw
+ * It is called when we are in fast path. In particular it is called when
+ * header prediction is successful. In such case in fact update is
+ * straight forward and doesn't need any particular care.
+ */
+static inline void westwood_fast_bw(struct tcp_sock *tp)
+{
+	struct westwood *w = tcp_ca(tp);
+
+	westwood_update_window(tp);
+
+	w->bk += tp->snd_una - w->snd_una;
+	w->snd_una = tp->snd_una;
+	w->rtt_min = min(w->rtt, w->rtt_min);
+}
+
+/*
+ * @westwood_acked_count
+ * This function evaluates cumul_ack for evaluating bk in case of
+ * delayed or partial acks.
+ */
+static inline u32 westwood_acked_count(struct tcp_sock *tp)
+{
+	struct westwood *w = tcp_ca(tp);
+
+	w->cumul_ack = tp->snd_una - w->snd_una;
+
+        /* If cumul_ack is 0 this is a dupack since it's not moving
+         * tp->snd_una.
+         */
+        if (!w->cumul_ack) {
+		w->accounted += tp->mss_cache;
+		w->cumul_ack = tp->mss_cache;
+	}
+
+        if (w->cumul_ack > tp->mss_cache) {
+		/* Partial or delayed ack */
+		if (w->accounted >= w->cumul_ack) {
+			w->accounted -= w->cumul_ack;
+			w->cumul_ack = tp->mss_cache;
+		} else {
+			w->cumul_ack -= w->accounted;
+			w->accounted = 0;
+		}
+	}
+
+	w->snd_una = tp->snd_una;
+
+	return w->cumul_ack;
+}
+
+static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
+{
+	struct westwood *w = tcp_ca(tp);
+	return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
+}
+
+/*
+ * TCP Westwood
+ * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it
+ * in packets we use mss_cache). Rttmin is guaranteed to be >= 2
+ * so avoids ever returning 0.
+ */
+static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
+{
+	return westwood_bw_rttmin(tp);
+}
+
+static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
+{
+	struct westwood *w = tcp_ca(tp);
+
+	switch(event) {
+	case CA_EVENT_FAST_ACK:
+		westwood_fast_bw(tp);
+		break;
+
+	case CA_EVENT_COMPLETE_CWR:
+		tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
+		break;
+
+	case CA_EVENT_FRTO:
+		tp->snd_ssthresh = westwood_bw_rttmin(tp);
+		break;
+
+	case CA_EVENT_SLOW_ACK:
+		westwood_update_window(tp);
+		w->bk += westwood_acked_count(tp);
+		w->rtt_min = min(w->rtt, w->rtt_min);
+		break;
+
+	default:
+		/* don't care */
+		break;
+	}
+}
+
+
+/* Extract info for Tcp socket info provided via netlink. */
+static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
+			      struct sk_buff *skb)
+{
+	const struct westwood *ca = tcp_ca(tp);
+	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+		struct rtattr *rta;
+		struct tcpvegas_info *info;
+
+		rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
+		info = RTA_DATA(rta);
+		info->tcpv_enabled = 1;
+		info->tcpv_rttcnt = 0;
+		info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
+		info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
+	rtattr_failure:	;
+	}
+}
+
+
+static struct tcp_congestion_ops tcp_westwood = {
+	.init		= tcp_westwood_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.min_cwnd	= tcp_westwood_cwnd_min,
+	.cwnd_event	= tcp_westwood_event,
+	.get_info	= tcp_westwood_info,
+	.pkts_acked	= tcp_westwood_pkts_acked,
+
+	.owner		= THIS_MODULE,
+	.name		= "westwood"
+};
+
+static int __init tcp_westwood_register(void)
+{
+	BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_westwood);
+}
+
+static void __exit tcp_westwood_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_westwood);
+}
+
+module_init(tcp_westwood_register);
+module_exit(tcp_westwood_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Westwood+");
-- 
cgit v1.2.3


From a628d29b56d3f420bf3ff1d7543a9caf3ce3b994 Mon Sep 17 00:00:00 2001
From: John Heffner <jheffner@psc.edu>
Date: Thu, 23 Jun 2005 12:24:58 -0700
Subject: [TCP]: Add High Speed TCP congestion control module.

Sally Floyd's high speed TCP congestion control.
This is useful for comparison and research.

Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig         |  11 +++
 net/ipv4/Makefile        |   1 +
 net/ipv4/tcp_highspeed.c | 181 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 193 insertions(+)
 create mode 100644 net/ipv4/tcp_highspeed.c

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index adbe855d931..910e25b6575 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -467,6 +467,17 @@ config TCP_CONG_WESTWOOD
 	TCP Westwood+ significantly increases fairness wrt TCP Reno in
 	wired networks and throughput over wireless links.
 
+config TCP_CONG_HSTCP
+	tristate "High Speed TCP"
+	depends on INET && EXPERIMENTAL
+	default n
+	---help---
+	Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+	A modification to TCP's congestion control mechanism for use
+	with large congestion windows. A table indicates how much to
+	increase the congestion window by when an ACK is received.
+ 	For more detail	see http://www.icir.org/floyd/hstcp.html
+
 endmenu
 
 source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index dedfbe62a10..ddf5d7785bf 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 00000000000..36c51f8136b
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,181 @@
+/*
+ * Sally Floyd's High Speed TCP (RFC 3649) congestion control
+ *
+ * See http://www.icir.org/floyd/hstcp.html
+ *
+ * John Heffner <jheffner@psc.edu>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+
+/* From AIMD tables from RFC 3649 appendix B,
+ * with fixed-point MD scaled <<8.
+ */
+static const struct hstcp_aimd_val {
+        unsigned int cwnd;
+        unsigned int md;
+} hstcp_aimd_vals[] = {
+ {     38,  128, /*  0.50 */ },
+ {    118,  112, /*  0.44 */ },
+ {    221,  104, /*  0.41 */ },
+ {    347,   98, /*  0.38 */ },
+ {    495,   93, /*  0.37 */ },
+ {    663,   89, /*  0.35 */ },
+ {    851,   86, /*  0.34 */ },
+ {   1058,   83, /*  0.33 */ },
+ {   1284,   81, /*  0.32 */ },
+ {   1529,   78, /*  0.31 */ },
+ {   1793,   76, /*  0.30 */ },
+ {   2076,   74, /*  0.29 */ },
+ {   2378,   72, /*  0.28 */ },
+ {   2699,   71, /*  0.28 */ },
+ {   3039,   69, /*  0.27 */ },
+ {   3399,   68, /*  0.27 */ },
+ {   3778,   66, /*  0.26 */ },
+ {   4177,   65, /*  0.26 */ },
+ {   4596,   64, /*  0.25 */ },
+ {   5036,   62, /*  0.25 */ },
+ {   5497,   61, /*  0.24 */ },
+ {   5979,   60, /*  0.24 */ },
+ {   6483,   59, /*  0.23 */ },
+ {   7009,   58, /*  0.23 */ },
+ {   7558,   57, /*  0.22 */ },
+ {   8130,   56, /*  0.22 */ },
+ {   8726,   55, /*  0.22 */ },
+ {   9346,   54, /*  0.21 */ },
+ {   9991,   53, /*  0.21 */ },
+ {  10661,   52, /*  0.21 */ },
+ {  11358,   52, /*  0.20 */ },
+ {  12082,   51, /*  0.20 */ },
+ {  12834,   50, /*  0.20 */ },
+ {  13614,   49, /*  0.19 */ },
+ {  14424,   48, /*  0.19 */ },
+ {  15265,   48, /*  0.19 */ },
+ {  16137,   47, /*  0.19 */ },
+ {  17042,   46, /*  0.18 */ },
+ {  17981,   45, /*  0.18 */ },
+ {  18955,   45, /*  0.18 */ },
+ {  19965,   44, /*  0.17 */ },
+ {  21013,   43, /*  0.17 */ },
+ {  22101,   43, /*  0.17 */ },
+ {  23230,   42, /*  0.17 */ },
+ {  24402,   41, /*  0.16 */ },
+ {  25618,   41, /*  0.16 */ },
+ {  26881,   40, /*  0.16 */ },
+ {  28193,   39, /*  0.16 */ },
+ {  29557,   39, /*  0.15 */ },
+ {  30975,   38, /*  0.15 */ },
+ {  32450,   38, /*  0.15 */ },
+ {  33986,   37, /*  0.15 */ },
+ {  35586,   36, /*  0.14 */ },
+ {  37253,   36, /*  0.14 */ },
+ {  38992,   35, /*  0.14 */ },
+ {  40808,   35, /*  0.14 */ },
+ {  42707,   34, /*  0.13 */ },
+ {  44694,   33, /*  0.13 */ },
+ {  46776,   33, /*  0.13 */ },
+ {  48961,   32, /*  0.13 */ },
+ {  51258,   32, /*  0.13 */ },
+ {  53677,   31, /*  0.12 */ },
+ {  56230,   30, /*  0.12 */ },
+ {  58932,   30, /*  0.12 */ },
+ {  61799,   29, /*  0.12 */ },
+ {  64851,   28, /*  0.11 */ },
+ {  68113,   28, /*  0.11 */ },
+ {  71617,   27, /*  0.11 */ },
+ {  75401,   26, /*  0.10 */ },
+ {  79517,   26, /*  0.10 */ },
+ {  84035,   25, /*  0.10 */ },
+ {  89053,   24, /*  0.10 */ },
+};
+
+#define HSTCP_AIMD_MAX	ARRAY_SIZE(hstcp_aimd_vals)
+
+struct hstcp {
+	u32	ai;
+};
+
+static void hstcp_init(struct tcp_sock *tp)
+{
+	struct hstcp *ca = tcp_ca(tp);
+
+	ca->ai = 0;
+
+	/* Ensure the MD arithmetic works.  This is somewhat pedantic,
+	 * since I don't think we will see a cwnd this large. :) */
+	tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
+}
+
+static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt,
+			     u32 in_flight, int good)
+{
+	struct hstcp *ca = tcp_ca(tp);
+
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+	} else {
+		/* Update AIMD parameters */
+		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
+			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+			       ca->ai < HSTCP_AIMD_MAX)
+				ca->ai++;
+		} else if (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd) {
+			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+			       ca->ai > 0)
+				ca->ai--;
+		}
+
+		/* Do additive increase */
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+			tp->snd_cwnd_cnt += ca->ai;
+			if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+				tp->snd_cwnd++;
+				tp->snd_cwnd_cnt -= tp->snd_cwnd;
+			}
+		}
+	}
+}
+
+static u32 hstcp_ssthresh(struct tcp_sock *tp)
+{
+	struct hstcp *ca = tcp_ca(tp);
+
+	/* Do multiplicative decrease */
+	return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
+}
+
+
+static struct tcp_congestion_ops tcp_highspeed = {
+	.init		= hstcp_init,
+	.ssthresh	= hstcp_ssthresh,
+	.cong_avoid	= hstcp_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+
+	.owner		= THIS_MODULE,
+	.name		= "highspeed"
+};
+
+static int __init hstcp_register(void)
+{
+	BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_highspeed);
+}
+
+static void __exit hstcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_highspeed);
+}
+
+module_init(hstcp_register);
+module_exit(hstcp_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("High Speed TCP");
-- 
cgit v1.2.3


From 835b3f0c0d7e1f716c45ec576662eac7a68b8548 Mon Sep 17 00:00:00 2001
From: Daniele Lacamera <(root at danielinux.net)net>
Date: Thu, 23 Jun 2005 12:26:34 -0700
Subject: [TCP]: Add TCP Hybla congestion control module.

TCP Hybla congestion avoidance.

- "In heterogeneous networks, TCP connections that incorporate a
terrestrial or satellite radio link are greatly disadvantaged with
respect to entirely wired connections, because of their longer round
trip times (RTTs). To cope with this problem, a new TCP proposal, the
TCP Hybla, is presented and discussed in the paper[1]. It stems from an
analytical evaluation of the congestion window dynamics in the TCP
standard versions (Tahoe, Reno, NewReno), which suggests the necessary
modifications to remove the performance dependence on RTT.[...]"[1]

[1]: Carlo Caini, Rosario Firrincieli, "TCP Hybla: a TCP enhancement for
heterogeneous networks",
International Journal of Satellite Communications and Networking
Volume 22, Issue 5 , Pages 547 - 566. September 2004.

Signed-off-by: Daniele Lacamera (root at danielinux.net)net
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig     |  10 +++
 net/ipv4/Makefile    |   1 +
 net/ipv4/tcp_hybla.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 198 insertions(+)
 create mode 100644 net/ipv4/tcp_hybla.c

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 910e25b6575..516ffe84281 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -478,6 +478,16 @@ config TCP_CONG_HSTCP
 	increase the congestion window by when an ACK is received.
  	For more detail	see http://www.icir.org/floyd/hstcp.html
 
+config TCP_CONG_HYBLA
+	tristate "TCP-Hybla congestion control algorithm"
+	depends on INET && EXPERIMENTAL
+	default n
+	---help---
+	TCP-Hybla is a sender-side only change that eliminates penalization of
+	long-RTT, large-bandwidth connections, like when satellite legs are
+	involved, expecially when sharing a common bottleneck with normal
+	terrestrial connections.
+
 endmenu
 
 source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ddf5d7785bf..ede7a5d617f 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 00000000000..13a66342c30
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,187 @@
+/*
+ * TCP HYBLA
+ *
+ * TCP-HYBLA Congestion control algorithm, based on:
+ *   C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
+ *   for Heterogeneous Networks",
+ *   International Journal on satellite Communications,
+ *				       September 2004
+ *    Daniele Lacamera
+ *    root at danielinux.net
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* Tcp Hybla structure. */
+struct hybla {
+	u8    hybla_en;
+	u32   snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
+	u32   rho;	      /* Rho parameter, integer part  */
+	u32   rho2;	      /* Rho * Rho, integer part */
+	u32   rho_3ls;	      /* Rho parameter, <<3 */
+	u32   rho2_7ls;	      /* Rho^2, <<7	*/
+	u32   minrtt;	      /* Minimum smoothed round trip time value seen */
+};
+
+/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
+   expressed in jiffies */
+static int rtt0 = 25;
+module_param(rtt0, int, 0644);
+MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
+
+
+/* This is called to refresh values for hybla parameters */
+static inline void hybla_recalc_param (struct tcp_sock *tp)
+{
+	struct hybla *ca = tcp_ca(tp);
+
+	ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
+	ca->rho = ca->rho_3ls >> 3;
+	ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
+	ca->rho2 = ca->rho2_7ls >>7;
+}
+
+static void hybla_init(struct tcp_sock *tp)
+{
+	struct hybla *ca = tcp_ca(tp);
+
+	ca->rho = 0;
+	ca->rho2 = 0;
+	ca->rho_3ls = 0;
+	ca->rho2_7ls = 0;
+	ca->snd_cwnd_cents = 0;
+	ca->hybla_en = 1;
+	tp->snd_cwnd = 2;
+	tp->snd_cwnd_clamp = 65535;
+
+	/* 1st Rho measurement based on initial srtt */
+	hybla_recalc_param(tp);
+
+	/* set minimum rtt as this is the 1st ever seen */
+	ca->minrtt = tp->srtt;
+	tp->snd_cwnd = ca->rho;
+}
+
+static void hybla_state(struct tcp_sock *tp, u8 ca_state)
+{
+	struct hybla *ca = tcp_ca(tp);
+
+	ca->hybla_en = (ca_state == TCP_CA_Open);
+}
+
+static inline u32 hybla_fraction(u32 odds)
+{
+	static const u32 fractions[] = {
+		128, 139, 152, 165, 181, 197, 215, 234,
+	};
+
+	return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
+}
+
+/* TCP Hybla main routine.
+ * This is the algorithm behavior:
+ *     o Recalc Hybla parameters if min_rtt has changed
+ *     o Give cwnd a new value based on the model proposed
+ *     o remember increments <1
+ */
+static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+			    u32 in_flight, int flag)
+{
+	struct hybla *ca = tcp_ca(tp);
+	u32 increment, odd, rho_fractions;
+	int is_slowstart = 0;
+
+	/*  Recalculate rho only if this srtt is the lowest */
+	if (tp->srtt < ca->minrtt){
+		hybla_recalc_param(tp);
+		ca->minrtt = tp->srtt;
+	}
+
+	if (!ca->hybla_en)
+		return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
+
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+	if (ca->rho == 0)
+		hybla_recalc_param(tp);
+
+	rho_fractions = ca->rho_3ls - (ca->rho << 3);
+
+	if (tp->snd_cwnd < tp->snd_ssthresh) {
+		/*
+		 * slow start
+		 *      INC = 2^RHO - 1
+		 * This is done by splitting the rho parameter
+		 * into 2 parts: an integer part and a fraction part.
+		 * Inrement<<7 is estimated by doing:
+		 *	       [2^(int+fract)]<<7
+		 * that is equal to:
+		 *	       (2^int)	*  [(2^fract) <<7]
+		 * 2^int is straightly computed as 1<<int,
+		 * while we will use hybla_slowstart_fraction_increment() to
+		 * calculate 2^fract in a <<7 value.
+		 */
+		is_slowstart = 1;
+		increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
+			- 128;
+	} else {
+		/*
+		 * congestion avoidance
+		 * INC = RHO^2 / W
+		 * as long as increment is estimated as (rho<<7)/window
+		 * it already is <<7 and we can easily count its fractions.
+		 */
+		increment = ca->rho2_7ls / tp->snd_cwnd;
+		if (increment < 128)
+			tp->snd_cwnd_cnt++;
+	}
+
+	odd = increment % 128;
+	tp->snd_cwnd += increment >> 7;
+	ca->snd_cwnd_cents += odd;
+
+	/* check when fractions goes >=128 and increase cwnd by 1. */
+	while(ca->snd_cwnd_cents >= 128) {
+		tp->snd_cwnd++;
+		ca->snd_cwnd_cents -= 128;
+		tp->snd_cwnd_cnt = 0;
+	}
+
+	/* clamp down slowstart cwnd to ssthresh value. */
+	if (is_slowstart)
+		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+
+static struct tcp_congestion_ops tcp_hybla = {
+	.init		= hybla_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.min_cwnd	= tcp_reno_min_cwnd,
+	.cong_avoid	= hybla_cong_avoid,
+	.set_state	= hybla_state,
+
+	.owner		= THIS_MODULE,
+	.name		= "hybla"
+};
+
+static int __init hybla_register(void)
+{
+	BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
+	return tcp_register_congestion_control(&tcp_hybla);
+}
+
+static void __exit hybla_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_hybla);
+}
+
+module_init(hybla_register);
+module_exit(hybla_unregister);
+
+MODULE_AUTHOR("Daniele Lacamera");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Hybla");
-- 
cgit v1.2.3


From b87d8561d8667d221b728ccdcb18eb95b16a687b Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 12:27:19 -0700
Subject: [TCP]: Add TCP Vegas congestion control module.

TCP Vegas code modified for the new TCP infrastructure.
Vegas now uses microsecond resolution timestamps for
better estimation of performance over higher speed links.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig     |  11 ++
 net/ipv4/Makefile    |   1 +
 net/ipv4/tcp_vegas.c | 411 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 423 insertions(+)
 create mode 100644 net/ipv4/tcp_vegas.c

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 516ffe84281..6c105b60cc0 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -488,6 +488,17 @@ config TCP_CONG_HYBLA
 	involved, expecially when sharing a common bottleneck with normal
 	terrestrial connections.
 
+config TCP_CONG_VEGAS
+	tristate "TCP Vegas"
+	depends on INET && EXPERIMENTAL
+	default n
+	---help---
+	TCP Vegas is a sender-side only change to TCP that anticipates
+	the onset of congestion by estimating the bandwidth. TCP Vegas
+	adjusts the sending rate by modifying the congestion
+	window. TCP Vegas should provide less packet loss, but it is
+	not as aggressive as TCP Reno.
+
 endmenu
 
 source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ede7a5d617f..a801a97bd01 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 00000000000..9bd443db519
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
+/*
+ * TCP Vegas congestion control
+ *
+ * This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp_diag.h>
+
+#include <net/tcp.h>
+
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+static int alpha = 1<<V_PARAM_SHIFT;
+static int beta  = 3<<V_PARAM_SHIFT;
+static int gamma = 1<<V_PARAM_SHIFT;
+
+module_param(alpha, int, 0644);
+MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
+module_param(beta, int, 0644);
+MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
+module_param(gamma, int, 0644);
+MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
+
+
+/* Vegas variables */
+struct vegas {
+	u32	beg_snd_nxt;	/* right edge during last RTT */
+	u32	beg_snd_una;	/* left edge  during last RTT */
+	u32	beg_snd_cwnd;	/* saves the size of the cwnd */
+	u8	doing_vegas_now;/* if true, do vegas for this RTT */
+	u16	cntRTT;		/* # of RTTs measured within last RTT */
+	u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
+	u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
+};
+
+/* There are several situations when we must "re-start" Vegas:
+ *
+ *  o when a connection is established
+ *  o after an RTO
+ *  o after fast recovery
+ *  o when we send a packet and there is no outstanding
+ *    unacknowledged data (restarting an idle connection)
+ *
+ * In these circumstances we cannot do a Vegas calculation at the
+ * end of the first RTT, because any calculation we do is using
+ * stale info -- both the saved cwnd and congestion feedback are
+ * stale.
+ *
+ * Instead we must wait until the completion of an RTT during
+ * which we actually receive ACKs.
+ */
+static inline void vegas_enable(struct tcp_sock *tp)
+{
+	struct vegas *vegas = tcp_ca(tp);
+
+	/* Begin taking Vegas samples next time we send something. */
+	vegas->doing_vegas_now = 1;
+
+	/* Set the beginning of the next send window. */
+	vegas->beg_snd_nxt = tp->snd_nxt;
+
+	vegas->cntRTT = 0;
+	vegas->minRTT = 0x7fffffff;
+}
+
+/* Stop taking Vegas samples for now. */
+static inline void vegas_disable(struct tcp_sock *tp)
+{
+	struct vegas *vegas = tcp_ca(tp);
+
+	vegas->doing_vegas_now = 0;
+}
+
+static void tcp_vegas_init(struct tcp_sock *tp)
+{
+	struct vegas *vegas = tcp_ca(tp);
+
+	vegas->baseRTT = 0x7fffffff;
+	vegas_enable(tp);
+}
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
+{
+	struct vegas *vegas = tcp_ca(tp);
+	u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
+
+	/* Filter to find propagation delay: */
+	if (vrtt < vegas->baseRTT)
+		vegas->baseRTT = vrtt;
+
+	/* Find the min RTT during the last RTT to find
+	 * the current prop. delay + queuing delay:
+	 */
+	vegas->minRTT = min(vegas->minRTT, vrtt);
+	vegas->cntRTT++;
+}
+
+static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
+{
+
+	if (ca_state == TCP_CA_Open)
+		vegas_enable(tp);
+	else
+		vegas_disable(tp);
+}
+
+/*
+ * If the connection is idle and we are restarting,
+ * then we don't want to do any Vegas calculations
+ * until we get fresh RTT samples.  So when we
+ * restart, we reset our Vegas state to a clean
+ * slate. After we get acks for this flight of
+ * packets, _then_ we can make Vegas calculations
+ * again.
+ */
+static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
+{
+	if (event == CA_EVENT_CWND_RESTART ||
+	    event == CA_EVENT_TX_START)
+		tcp_vegas_init(tp);
+}
+
+static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
+				 u32 seq_rtt, u32 in_flight, int flag)
+{
+	struct vegas *vegas = tcp_ca(tp);
+
+	if (!vegas->doing_vegas_now)
+		return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
+
+	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
+	 *
+	 * These are so named because they represent the approximate values
+	 * of snd_una and snd_nxt at the beginning of the current RTT. More
+	 * precisely, they represent the amount of data sent during the RTT.
+	 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+	 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
+	 * bytes of data have been ACKed during the course of the RTT, giving
+	 * an "actual" rate of:
+	 *
+	 *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
+	 *
+	 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
+	 * because delayed ACKs can cover more than one segment, so they
+	 * don't line up nicely with the boundaries of RTTs.
+	 *
+	 * Another unfortunate fact of life is that delayed ACKs delay the
+	 * advance of the left edge of our send window, so that the number
+	 * of bytes we send in an RTT is often less than our cwnd will allow.
+	 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+	 */
+
+	if (after(ack, vegas->beg_snd_nxt)) {
+		/* Do the Vegas once-per-RTT cwnd adjustment. */
+		u32 old_wnd, old_snd_cwnd;
+
+
+		/* Here old_wnd is essentially the window of data that was
+		 * sent during the previous RTT, and has all
+		 * been acknowledged in the course of the RTT that ended
+		 * with the ACK we just received. Likewise, old_snd_cwnd
+		 * is the cwnd during the previous RTT.
+		 */
+		old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
+			tp->mss_cache;
+		old_snd_cwnd = vegas->beg_snd_cwnd;
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		vegas->beg_snd_una  = vegas->beg_snd_nxt;
+		vegas->beg_snd_nxt  = tp->snd_nxt;
+		vegas->beg_snd_cwnd = tp->snd_cwnd;
+
+		/* Take into account the current RTT sample too, to
+		 * decrease the impact of delayed acks. This double counts
+		 * this sample since we count it for the next window as well,
+		 * but that's not too awful, since we're taking the min,
+		 * rather than averaging.
+		 */
+		tcp_vegas_rtt_calc(tp, seq_rtt*1000);
+
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If  we have 3 samples, we should be OK.
+		 */
+
+		if (vegas->cntRTT <= 2) {
+			/* We don't have enough RTT samples to do the Vegas
+			 * calculation, so we'll behave like Reno.
+			 */
+			if (tp->snd_cwnd > tp->snd_ssthresh)
+				tp->snd_cwnd++;
+		} else {
+			u32 rtt, target_cwnd, diff;
+
+			/* We have enough RTT samples, so, using the Vegas
+			 * algorithm, we determine if we should increase or
+			 * decrease cwnd, and by how much.
+			 */
+
+			/* Pluck out the RTT we are using for the Vegas
+			 * calculations. This is the min RTT seen during the
+			 * last RTT. Taking the min filters out the effects
+			 * of delayed ACKs, at the cost of noticing congestion
+			 * a bit later.
+			 */
+			rtt = vegas->minRTT;
+
+			/* Calculate the cwnd we should have, if we weren't
+			 * going too fast.
+			 *
+			 * This is:
+			 *     (actual rate in segments) * baseRTT
+			 * We keep it as a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary point.
+			 */
+			target_cwnd = ((old_wnd * vegas->baseRTT)
+				       << V_PARAM_SHIFT) / rtt;
+
+			/* Calculate the difference between the window we had,
+			 * and the window we would like to have. This quantity
+			 * is the "Diff" from the Arizona Vegas papers.
+			 *
+			 * Again, this is a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary
+			 * point.
+			 */
+			diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+
+			if (tp->snd_cwnd < tp->snd_ssthresh) {
+				/* Slow start.  */
+				if (diff > gamma) {
+					/* Going too fast. Time to slow down
+					 * and switch to congestion avoidance.
+					 */
+					tp->snd_ssthresh = 2;
+
+					/* Set cwnd to match the actual rate
+					 * exactly:
+					 *   cwnd = (actual rate) * baseRTT
+					 * Then we add 1 because the integer
+					 * truncation robs us of full link
+					 * utilization.
+					 */
+					tp->snd_cwnd = min(tp->snd_cwnd,
+							   (target_cwnd >>
+							    V_PARAM_SHIFT)+1);
+
+				}
+			} else {
+				/* Congestion avoidance. */
+				u32 next_snd_cwnd;
+
+				/* Figure out where we would like cwnd
+				 * to be.
+				 */
+				if (diff > beta) {
+					/* The old window was too fast, so
+					 * we slow down.
+					 */
+					next_snd_cwnd = old_snd_cwnd - 1;
+				} else if (diff < alpha) {
+					/* We don't have enough extra packets
+					 * in the network, so speed up.
+					 */
+					next_snd_cwnd = old_snd_cwnd + 1;
+				} else {
+					/* Sending just as fast as we
+					 * should be.
+					 */
+					next_snd_cwnd = old_snd_cwnd;
+				}
+
+				/* Adjust cwnd upward or downward, toward the
+				 * desired value.
+				 */
+				if (next_snd_cwnd > tp->snd_cwnd)
+					tp->snd_cwnd++;
+				else if (next_snd_cwnd < tp->snd_cwnd)
+					tp->snd_cwnd--;
+			}
+		}
+
+		/* Wipe the slate clean for the next RTT. */
+		vegas->cntRTT = 0;
+		vegas->minRTT = 0x7fffffff;
+	}
+
+	/* The following code is executed for every ack we receive,
+	 * except for conditions checked in should_advance_cwnd()
+	 * before the call to tcp_cong_avoid(). Mainly this means that
+	 * we only execute this code if the ack actually acked some
+	 * data.
+	 */
+
+	/* If we are in slow start, increase our cwnd in response to this ACK.
+	 * (If we are not in slow start then we are in congestion avoidance,
+	 * and adjust our congestion window only once per RTT. See the code
+	 * above.)
+	 */
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tp->snd_cwnd++;
+
+	/* to keep cwnd from growing without bound */
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+
+	/* Make sure that we are never so timid as to reduce our cwnd below
+	 * 2 MSS.
+	 *
+	 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
+	 */
+	tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+}
+
+/* Extract info for Tcp socket info provided via netlink. */
+static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
+			       struct sk_buff *skb)
+{
+	const struct vegas *ca = tcp_ca(tp);
+	if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
+		struct tcpvegas_info *info;
+
+		info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
+					  sizeof(*info)));
+
+		info->tcpv_enabled = ca->doing_vegas_now;
+		info->tcpv_rttcnt = ca->cntRTT;
+		info->tcpv_rtt = ca->baseRTT;
+		info->tcpv_minrtt = ca->minRTT;
+	rtattr_failure:	;
+	}
+}
+
+static struct tcp_congestion_ops tcp_vegas = {
+	.init		= tcp_vegas_init,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_vegas_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+	.rtt_sample	= tcp_vegas_rtt_calc,
+	.set_state	= tcp_vegas_state,
+	.cwnd_event	= tcp_vegas_cwnd_event,
+	.get_info	= tcp_vegas_get_info,
+
+	.owner		= THIS_MODULE,
+	.name		= "vegas",
+};
+
+static int __init tcp_vegas_register(void)
+{
+	BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
+	tcp_register_congestion_control(&tcp_vegas);
+	return 0;
+}
+
+static void __exit tcp_vegas_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_vegas);
+}
+
+module_init(tcp_vegas_register);
+module_exit(tcp_vegas_unregister);
+
+MODULE_AUTHOR("Stephen Hemminger");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP Vegas");
-- 
cgit v1.2.3


From a7868ea68d29eb2c037952aeb3b549cf05749a18 Mon Sep 17 00:00:00 2001
From: Baruch Even <baruch@ev-en.org>
Date: Thu, 23 Jun 2005 12:28:11 -0700
Subject: [TCP]: Add H-TCP congestion control module.

H-TCP is a congestion control algorithm developed at the Hamilton Institute, by
Douglas Leith and Robert Shorten. It is extending the standard Reno algorithm
with mode switching is thus a relatively simple modification.

H-TCP is defined in a layered manner as it is still a research platform. The
basic form includes the modification of beta according to the ratio of maxRTT
to min RTT and the alpha=2*factor*(1-beta) relation, where factor is dependant
on the time since last congestion.

The other layers improve convergence by adding appropriate factors to alpha.

The following patch implements the H-TCP algorithm in it's basic form.

Signed-Off-By: Baruch Even <baruch@ev-en.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig    |  13 +++
 net/ipv4/Makefile   |   1 +
 net/ipv4/tcp_htcp.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 303 insertions(+)
 create mode 100644 net/ipv4/tcp_htcp.c

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6c105b60cc0..73a25b52bf7 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -467,6 +467,18 @@ config TCP_CONG_WESTWOOD
 	TCP Westwood+ significantly increases fairness wrt TCP Reno in
 	wired networks and throughput over wireless links.
 
+config TCP_CONG_HTCP
+        tristate "H-TCP"
+	depends on INET
+        default m
+	---help---
+	H-TCP is a send-side only modifications of the TCP Reno
+	protocol stack that optimizes the performance of TCP
+	congestion control for high speed network links. It uses a
+	modeswitch to change the alpha and beta parameters of TCP Reno
+	based on network conditions and in a way so as to be fair with
+	other Reno and H-TCP flows.
+
 config TCP_CONG_HSTCP
 	tristate "High Speed TCP"
 	depends on INET && EXPERIMENTAL
@@ -499,6 +511,7 @@ config TCP_CONG_VEGAS
 	window. TCP Vegas should provide less packet loss, but it is
 	not as aggressive as TCP Reno.
 
+
 endmenu
 
 source "net/ipv4/ipvs/Kconfig"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index a801a97bd01..e96ed17deb9 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
 obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 00000000000..40168275acf
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,289 @@
+/*
+ * H-TCP congestion control. The algorithm is detailed in:
+ * R.N.Shorten, D.J.Leith:
+ *   "H-TCP: TCP for high-speed and long-distance networks"
+ *   Proc. PFLDnet, Argonne, 2004.
+ * http://www.hamilton.ie/net/htcp3.pdf
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+#define ALPHA_BASE	(1<<7)  /* 1.0 with shift << 7 */
+#define BETA_MIN	(1<<6)  /* 0.5 with shift << 7 */
+#define BETA_MAX	102	/* 0.8 with shift << 7 */
+
+static int use_rtt_scaling = 1;
+module_param(use_rtt_scaling, int, 0644);
+MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
+
+static int use_bandwidth_switch = 1;
+module_param(use_bandwidth_switch, int, 0644);
+MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
+
+struct htcp {
+	u16	alpha;		/* Fixed point arith, << 7 */
+	u8	beta;           /* Fixed point arith, << 7 */
+	u8	modeswitch;     /* Delay modeswitch until we had at least one congestion event */
+	u8	ccount;		/* Number of RTTs since last congestion event */
+	u8	undo_ccount;
+	u16	packetcount;
+	u32	minRTT;
+	u32	maxRTT;
+	u32	snd_cwnd_cnt2;
+
+	u32	undo_maxRTT;
+	u32	undo_old_maxB;
+
+	/* Bandwidth estimation */
+	u32	minB;
+	u32	maxB;
+	u32	old_maxB;
+	u32	Bi;
+	u32	lasttime;
+};
+
+static inline void htcp_reset(struct htcp *ca)
+{
+	ca->undo_ccount = ca->ccount;
+	ca->undo_maxRTT = ca->maxRTT;
+	ca->undo_old_maxB = ca->old_maxB;
+
+	ca->ccount = 0;
+	ca->snd_cwnd_cnt2 = 0;
+}
+
+static u32 htcp_cwnd_undo(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+	ca->ccount = ca->undo_ccount;
+	ca->maxRTT = ca->undo_maxRTT;
+	ca->old_maxB = ca->undo_old_maxB;
+	return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
+}
+
+static inline void measure_rtt(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+	u32 srtt = tp->srtt>>3;
+
+	/* keep track of minimum RTT seen so far, minRTT is zero at first */
+	if (ca->minRTT > srtt || !ca->minRTT)
+		ca->minRTT = srtt;
+
+	/* max RTT */
+	if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
+		if (ca->maxRTT < ca->minRTT)
+			ca->maxRTT = ca->minRTT;
+		if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
+			ca->maxRTT = srtt;
+	}
+}
+
+static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
+{
+	struct htcp *ca = tcp_ca(tp);
+	u32 now = tcp_time_stamp;
+
+	/* achieved throughput calculations */
+	if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
+		ca->packetcount = 0;
+		ca->lasttime = now;
+		return;
+	}
+
+	ca->packetcount += pkts_acked;
+
+	if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1)
+			&& now - ca->lasttime >= ca->minRTT
+			&& ca->minRTT > 0) {
+		__u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
+		if (ca->ccount <= 3) {
+			/* just after backoff */
+			ca->minB = ca->maxB = ca->Bi = cur_Bi;
+		} else {
+			ca->Bi = (3*ca->Bi + cur_Bi)/4;
+			if (ca->Bi > ca->maxB)
+				ca->maxB = ca->Bi;
+			if (ca->minB > ca->maxB)
+				ca->minB = ca->maxB;
+		}
+		ca->packetcount = 0;
+		ca->lasttime = now;
+	}
+}
+
+static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
+{
+	if (use_bandwidth_switch) {
+		u32 maxB = ca->maxB;
+		u32 old_maxB = ca->old_maxB;
+		ca->old_maxB = ca->maxB;
+
+		if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
+			ca->beta = BETA_MIN;
+			ca->modeswitch = 0;
+			return;
+		}
+	}
+
+	if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) {
+		ca->beta = (minRTT<<7)/maxRTT;
+		if (ca->beta < BETA_MIN)
+			ca->beta = BETA_MIN;
+		else if (ca->beta > BETA_MAX)
+			ca->beta = BETA_MAX;
+	} else {
+		ca->beta = BETA_MIN;
+		ca->modeswitch = 1;
+	}
+}
+
+static inline void htcp_alpha_update(struct htcp *ca)
+{
+	u32 minRTT = ca->minRTT;
+	u32 factor = 1;
+	u32 diff = ca->ccount * minRTT; /* time since last backoff */
+
+	if (diff > HZ) {
+		diff -= HZ;
+		factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ;
+	}
+
+	if (use_rtt_scaling && minRTT) {
+		u32 scale = (HZ<<3)/(10*minRTT);
+		scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
+		factor = (factor<<3)/scale;
+		if (!factor)
+			factor = 1;
+	}
+
+	ca->alpha = 2*factor*((1<<7)-ca->beta);
+	if (!ca->alpha)
+		ca->alpha = ALPHA_BASE;
+}
+
+/* After we have the rtt data to calculate beta, we'd still prefer to wait one
+ * rtt before we adjust our beta to ensure we are working from a consistent
+ * data.
+ *
+ * This function should be called when we hit a congestion event since only at
+ * that point do we really have a real sense of maxRTT (the queues en route
+ * were getting just too full now).
+ */
+static void htcp_param_update(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+	u32 minRTT = ca->minRTT;
+	u32 maxRTT = ca->maxRTT;
+
+	htcp_beta_update(ca, minRTT, maxRTT);
+	htcp_alpha_update(ca);
+
+	/* add slowly fading memory for maxRTT to accommodate routing changes etc */
+	if (minRTT > 0 && maxRTT > minRTT)
+		ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
+}
+
+static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+	htcp_param_update(tp);
+	return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
+}
+
+static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+			    u32 in_flight, int data_acked)
+{
+	struct htcp *ca = tcp_ca(tp);
+
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+        if (tp->snd_cwnd <= tp->snd_ssthresh) {
+                /* In "safe" area, increase. */
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+	} else {
+		measure_rtt(tp);
+
+		/* keep track of number of round-trip times since last backoff event */
+		if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
+			ca->ccount++;
+			ca->snd_cwnd_cnt2 = 0;
+			htcp_alpha_update(ca);
+		}
+
+                /* In dangerous area, increase slowly.
+		 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
+		 */
+		if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+			ca->ccount++;
+		}
+	}
+}
+
+/* Lower bound on congestion window. */
+static u32 htcp_min_cwnd(struct tcp_sock *tp)
+{
+	return tp->snd_ssthresh;
+}
+
+
+static void htcp_init(struct tcp_sock *tp)
+{
+	struct htcp *ca = tcp_ca(tp);
+
+	memset(ca, 0, sizeof(struct htcp));
+	ca->alpha = ALPHA_BASE;
+	ca->beta = BETA_MIN;
+}
+
+static void htcp_state(struct tcp_sock *tp, u8 new_state)
+{
+	switch (new_state) {
+	case TCP_CA_CWR:
+	case TCP_CA_Recovery:
+	case TCP_CA_Loss:
+		htcp_reset(tcp_ca(tp));
+		break;
+	}
+}
+
+static struct tcp_congestion_ops htcp = {
+	.init		= htcp_init,
+	.ssthresh	= htcp_recalc_ssthresh,
+	.min_cwnd	= htcp_min_cwnd,
+	.cong_avoid	= htcp_cong_avoid,
+	.set_state	= htcp_state,
+	.undo_cwnd	= htcp_cwnd_undo,
+	.pkts_acked	= measure_achieved_throughput,
+	.owner		= THIS_MODULE,
+	.name		= "htcp",
+};
+
+static int __init htcp_register(void)
+{
+	BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
+	BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
+	if (!use_bandwidth_switch)
+		htcp.pkts_acked = NULL;
+	return tcp_register_congestion_control(&htcp);
+}
+
+static void __exit htcp_unregister(void)
+{
+	tcp_unregister_congestion_control(&htcp);
+}
+
+module_init(htcp_register);
+module_exit(htcp_unregister);
+
+MODULE_AUTHOR("Baruch Even");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("H-TCP");
-- 
cgit v1.2.3


From 0e57976b6376f7fda6bef8b7dee2a3c8819ec9e9 Mon Sep 17 00:00:00 2001
From: John Heffner <jheffner@psc.edu>
Date: Thu, 23 Jun 2005 12:29:07 -0700
Subject: [TCP]: Add Scalable TCP congestion control module.

This patch implements Tom Kelly's Scalable TCP congestion control algorithm
for the modular framework.

The algorithm has some nice scaling properties, and has been used a fair bit
in research, though is known to have significant fairness issues, so it's not
really suitable for general purpose use.

Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig        |  9 +++++++
 net/ipv4/Makefile       |  1 +
 net/ipv4/tcp_scalable.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+)
 create mode 100644 net/ipv4/tcp_scalable.c

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 73a25b52bf7..690e88ba248 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -511,6 +511,15 @@ config TCP_CONG_VEGAS
 	window. TCP Vegas should provide less packet loss, but it is
 	not as aggressive as TCP Reno.
 
+config TCP_CONG_SCALABLE
+	tristate "Scalable TCP"
+	depends on INET && EXPERIMENTAL
+	default n
+	---help---
+	Scalable TCP is a sender-side only change to TCP which uses a
+	MIMD congestion control algorithm which has some nice scaling
+	properties, though is known to have fairness issues.
+	See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
 
 endmenu
 
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index e96ed17deb9..5718cdb3a61 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
 obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
 obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
 obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 00000000000..70e108e15c7
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,68 @@
+/* Tom Kelly's Scalable TCP
+ *
+ * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/
+ *
+ * John Heffner <jheffner@sc.edu>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/tcp.h>
+
+/* These factors derived from the recommended values in the aer:
+ * .01 and and 7/8. We use 50 instead of 100 to account for
+ * delayed ack.
+ */
+#define TCP_SCALABLE_AI_CNT	50U
+#define TCP_SCALABLE_MD_SCALE	3
+
+static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+				    u32 in_flight, int flag)
+{
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+	if (tp->snd_cwnd <= tp->snd_ssthresh) {
+		tp->snd_cwnd++;
+	} else {
+		tp->snd_cwnd_cnt++;
+		if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
+			tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+		}
+	}
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
+{
+	return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
+}
+
+
+static struct tcp_congestion_ops tcp_scalable = {
+	.ssthresh	= tcp_scalable_ssthresh,
+	.cong_avoid	= tcp_scalable_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+
+	.owner		= THIS_MODULE,
+	.name		= "scalable",
+};
+
+static int __init tcp_scalable_register(void)
+{
+	return tcp_register_congestion_control(&tcp_scalable);
+}
+
+static void __exit tcp_scalable_unregister(void)
+{
+	tcp_unregister_congestion_control(&tcp_scalable);
+}
+
+module_init(tcp_scalable_register);
+module_exit(tcp_scalable_unregister);
+
+MODULE_AUTHOR("John Heffner");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Scalable TCP");
-- 
cgit v1.2.3


From c1ebcdb8c422cd73f54bcd2b9953e443a47667e5 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 20:08:59 -0700
Subject: [NET]: Remove obsolete fastroute stats.

Remove last vestiages of fastroute code that is no longer used.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c             | 10 ++--------
 net/core/sysctl_net_core.c |  1 -
 2 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index ab935778ce8..4f1ae2efe87 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2056,14 +2056,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 
 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   s->total, s->dropped, s->time_squeeze, s->throttled,
-		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
-		   s->fastroute_deferred_out,
-#if 0
-		   s->fastroute_latency_reduction
-#else
-		   s->cpu_collision
-#endif
-		  );
+		   0, 0, 0, 0, /* was fastroute */
+		   s->cpu_collision );
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 880a8881521..76e9987474c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -18,7 +18,6 @@ extern int no_cong_thresh;
 extern int no_cong;
 extern int lo_cong;
 extern int mod_cong;
-extern int netdev_fastroute;
 extern int net_msg_cost;
 extern int net_msg_burst;
 
-- 
cgit v1.2.3


From 34008d8c631d067caffa136313260525f3ae48a2 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 20:10:00 -0700
Subject: [NET]: Remove obsolete netif_rx congestion sensing mechanism.

Remove the congestion sensing mechanism from netif_rx, and always
return either full or empty.  Almost no driver checks the return value
from netif_rx, and those that do only use it for debug messages.

The original design of netif_rx was to do flow control based on the
receive queue, but NAPI has supplanted this and no driver uses the
feedback.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c             | 88 +---------------------------------------------
 net/core/sysctl_net_core.c | 36 -------------------
 2 files changed, 1 insertion(+), 123 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 4f1ae2efe87..3156df699f0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,18 +115,6 @@
 #endif	/* CONFIG_NET_RADIO */
 #include <asm/current.h>
 
-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate.  It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
 /*
  *	The list of packet types we will receive (as opposed to discard)
  *	and the routines to invoke.
@@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;		/* Taps */
 
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -1365,69 +1348,10 @@ out:
 
 int netdev_max_backlog = 300;
 int weight_p = 64;            /* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
 
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
-	unsigned long rd;
-	int rq;
-#endif
-	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
-	int blog = sd->input_pkt_queue.qlen;
-	int avg_blog = sd->avg_blog;
-
-	avg_blog = (avg_blog >> 1) + (blog >> 1);
-
-	if (avg_blog > mod_cong) {
-		/* Above moderate congestion levels. */
-		sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_DROP;
-#endif
-	} else if (avg_blog > lo_cong) {
-		sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-			if (rq < avg_blog) /* unlucky bastard */
-				sd->cng_level = NET_RX_CN_HIGH;
-#endif
-	} else if (avg_blog > no_cong)
-		sd->cng_level = NET_RX_CN_LOW;
-	else  /* no congestion */
-		sd->cng_level = NET_RX_SUCCESS;
-
-	sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
-	int next_tick = 1;
-	int cpu = smp_processor_id();
-
-	get_sample_stats(cpu);
-	next_tick += jiffies;
-	mod_timer(&samp_timer, next_tick);
-}
-#endif
-
-
 /**
  *	netif_rx	-	post buffer to the network code
  *	@skb: buffer to post
@@ -1476,11 +1400,8 @@ int netif_rx(struct sk_buff *skb)
 enqueue:
 			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
-			get_sample_stats(this_cpu);
-#endif
 			local_irq_restore(flags);
-			return queue->cng_level;
+			return NET_RX_SUCCESS;
 		}
 
 		if (queue->throttle)
@@ -3300,8 +3221,6 @@ static int __init net_dev_init(void)
 		queue = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&queue->input_pkt_queue);
 		queue->throttle = 0;
-		queue->cng_level = 0;
-		queue->avg_blog = 10; /* arbitrary non-zero */
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
@@ -3310,11 +3229,6 @@ static int __init net_dev_init(void)
 		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}
 
-#ifdef OFFLINE_SAMPLE
-	samp_timer.expires = jiffies + (10 * HZ);
-	add_timer(&samp_timer);
-#endif
-
 	dev_boot_phase = 0;
 
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 76e9987474c..fff63643a35 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -14,10 +14,6 @@
 
 extern int netdev_max_backlog;
 extern int weight_p;
-extern int no_cong_thresh;
-extern int no_cong;
-extern int lo_cong;
-extern int mod_cong;
 extern int net_msg_cost;
 extern int net_msg_burst;
 
@@ -84,38 +80,6 @@ ctl_table core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
-	{
-		.ctl_name	= NET_CORE_NO_CONG_THRESH,
-		.procname	= "no_cong_thresh",
-		.data		= &no_cong_thresh,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_NO_CONG,
-		.procname	= "no_cong",
-		.data		= &no_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_LO_CONG,
-		.procname	= "lo_cong",
-		.data		= &lo_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_MOD_CONG,
-		.procname	= "mod_cong",
-		.data		= &mod_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
 	{
 		.ctl_name	= NET_CORE_MSG_COST,
 		.procname	= "message_cost",
-- 
cgit v1.2.3


From 31aa02c53c84658f6694f319f09e232ede27be5a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 20:12:48 -0700
Subject: [NET]: Eliminate netif_rx massive packet drops.

Eliminate the throttling behaviour when the netif receive queue fills
because it behaves badly when using high speed networks under load.
The throttling cause multiple packet drops that cause TCP to go into
slow start mode. The same effective patch has been part of BIC TCP and
H-TCP as well as part of Web100.

The existing code drops 100's of packets when the queue fills;
this changes it to individual packet drop-tail.

Signed-off-by: Stephen Hemmminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 3156df699f0..1a64508e527 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -198,7 +198,7 @@ static struct notifier_block *netdev_chain;
  *	Device drivers call our routines to queue packets here. We empty the
  *	queue in the local softnet handler.
  */
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
 
 #ifdef CONFIG_SYSFS
 extern int netdev_sysfs_init(void);
@@ -1372,7 +1372,6 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 int netif_rx(struct sk_buff *skb)
 {
-	int this_cpu;
 	struct softnet_data *queue;
 	unsigned long flags;
 
@@ -1388,15 +1387,11 @@ int netif_rx(struct sk_buff *skb)
 	 * short when CPU is congested, but is still operating.
 	 */
 	local_irq_save(flags);
-	this_cpu = smp_processor_id();
 	queue = &__get_cpu_var(softnet_data);
 
 	__get_cpu_var(netdev_rx_stat).total++;
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
-			if (queue->throttle)
-				goto drop;
-
 enqueue:
 			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
@@ -1404,19 +1399,10 @@ enqueue:
 			return NET_RX_SUCCESS;
 		}
 
-		if (queue->throttle)
-			queue->throttle = 0;
-
 		netif_rx_schedule(&queue->backlog_dev);
 		goto enqueue;
 	}
 
-	if (!queue->throttle) {
-		queue->throttle = 1;
-		__get_cpu_var(netdev_rx_stat).throttled++;
-	}
-
-drop:
 	__get_cpu_var(netdev_rx_stat).dropped++;
 	local_irq_restore(flags);
 
@@ -1701,8 +1687,6 @@ job_done:
 	smp_mb__before_clear_bit();
 	netif_poll_enable(backlog_dev);
 
-	if (queue->throttle)
-		queue->throttle = 0;
 	local_irq_enable();
 	return 0;
 }
@@ -1976,7 +1960,7 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 	struct netif_rx_stats *s = v;
 
 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
-		   s->total, s->dropped, s->time_squeeze, s->throttled,
+		   s->total, s->dropped, s->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
 		   s->cpu_collision );
 	return 0;
@@ -3220,7 +3204,6 @@ static int __init net_dev_init(void)
 
 		queue = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&queue->input_pkt_queue);
-		queue->throttle = 0;
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
-- 
cgit v1.2.3


From 51b0bdedb8e784d0d969a6b77151911130812400 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 20:14:40 -0700
Subject: [NET]: Separate two usages of netdev_max_backlog.

Separate out the two uses of netdev_max_backlog. One controls the
upper bound on packets processed per softirq, the new name for this is
netdev_budget; the other controls the limit on packets queued via
netif_rx.

Increase the max_backlog default to account for faster processors.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c             | 6 +++---
 net/core/sysctl_net_core.c | 9 +++++++++
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1a64508e527..7016e0c36b3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1346,7 +1346,8 @@ out:
 			Receiver routines
   =======================================================================*/
 
-int netdev_max_backlog = 300;
+int netdev_max_backlog = 1000;
+int netdev_budget = 300;
 int weight_p = 64;            /* old backlog weight */
 
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
@@ -1695,8 +1696,7 @@ static void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
-	int budget = netdev_max_backlog;
-
+	int budget = netdev_budget;
 	
 	local_irq_disable();
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index fff63643a35..8f817ad9f54 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -13,6 +13,7 @@
 #ifdef CONFIG_SYSCTL
 
 extern int netdev_max_backlog;
+extern int netdev_budget;
 extern int weight_p;
 extern int net_msg_cost;
 extern int net_msg_burst;
@@ -124,6 +125,14 @@ ctl_table core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= NET_CORE_BUDGET,
+		.procname	= "netdev_budget",
+		.data		= &netdev_budget,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.3


From 5f8ef48d240963093451bcf83df89f1a1364f51d Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 20:37:36 -0700
Subject: [TCP]: Allow choosing TCP congestion control via sockopt.

Allow using setsockopt to set TCP congestion control to use on a per
socket basis.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c      | 31 ++++++++++++++++++++++++++++++-
 net/ipv4/tcp_cong.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_ipv4.c |  2 +-
 net/ipv6/tcp_ipv6.c |  2 +-
 4 files changed, 76 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f3dbc8dc126..882436da9a3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1927,6 +1927,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		return tp->af_specific->setsockopt(sk, level, optname,
 						   optval, optlen);
 
+	/* This is a string value all the others are int's */
+	if (optname == TCP_CONGESTION) {
+		char name[TCP_CA_NAME_MAX];
+
+		if (optlen < 1)
+			return -EINVAL;
+
+		val = strncpy_from_user(name, optval,
+					min(TCP_CA_NAME_MAX-1, optlen));
+		if (val < 0)
+			return -EFAULT;
+		name[val] = 0;
+
+		lock_sock(sk);
+		err = tcp_set_congestion_control(tp, name);
+		release_sock(sk);
+		return err;
+	}
+
 	if (optlen < sizeof(int))
 		return -EINVAL;
 
@@ -2211,6 +2230,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 	case TCP_QUICKACK:
 		val = !tp->ack.pingpong;
 		break;
+
+	case TCP_CONGESTION:
+		if (get_user(len, optlen))
+			return -EFAULT;
+		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, tp->ca_ops->name, len))
+			return -EFAULT;
+		return 0;
 	default:
 		return -ENOPROTOOPT;
 	};
@@ -2224,7 +2253,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 
 extern void __skb_cb_too_small_for_tcp(int, int);
-extern void tcpdiag_init(void);
+extern struct tcp_congestion_ops tcp_reno;
 
 static __initdata unsigned long thash_entries;
 static int __init set_thash_entries(char *str)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 665394a63ae..4970d10a778 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -21,7 +21,7 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
 {
 	struct tcp_congestion_ops *e;
 
-	list_for_each_entry(e, &tcp_cong_list, list) {
+	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
 		if (strcmp(e->name, name) == 0)
 			return e;
 	}
@@ -77,6 +77,9 @@ void tcp_init_congestion_control(struct tcp_sock *tp)
 {
 	struct tcp_congestion_ops *ca;
 
+	if (tp->ca_ops != &tcp_init_congestion_ops)
+		return;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
 		if (try_module_get(ca->owner)) {
@@ -139,6 +142,34 @@ void tcp_get_default_congestion_control(char *name)
 	rcu_read_unlock();
 }
 
+/* Change congestion control for socket */
+int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
+{
+	struct tcp_congestion_ops *ca;
+	int err = 0;
+
+	rcu_read_lock();
+	ca = tcp_ca_find(name);
+	if (ca == tp->ca_ops)
+		goto out;
+
+	if (!ca)
+		err = -ENOENT;
+
+	else if (!try_module_get(ca->owner))
+		err = -EBUSY;
+
+	else {
+		tcp_cleanup_congestion_control(tp);
+		tp->ca_ops = ca;
+		if (tp->ca_ops->init)
+			tp->ca_ops->init(tp);
+	}
+ out:
+	rcu_read_unlock();
+	return err;
+}
+
 /*
  * TCP Reno congestion control
  * This is special case used for fallback as well.
@@ -192,4 +223,15 @@ struct tcp_congestion_ops tcp_reno = {
 	.min_cwnd	= tcp_reno_min_cwnd,
 };
 
-EXPORT_SYMBOL_GPL(tcp_reno);
+/* Initial congestion control used (until SYN)
+ * really reno under another name so we can tell difference
+ * during tcp_set_default_congestion_control
+ */
+struct tcp_congestion_ops tcp_init_congestion_ops  = {
+	.name		= "",
+	.owner		= THIS_MODULE,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+};
+EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 9122814c13a..ebf112347a9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2048,7 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	tp->mss_cache_std = tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
-	tp->ca_ops = &tcp_reno;
+	tp->ca_ops = &tcp_init_congestion_ops;
 
 	sk->sk_state = TCP_CLOSE;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index fce56039b0e..9dac7fdf472 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 	sk->sk_state = TCP_CLOSE;
 
 	tp->af_specific = &ipv6_specific;
-	tp->ca_ops = &tcp_reno;
+	tp->ca_ops = &tcp_init_congestion_ops;
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
-- 
cgit v1.2.3


From 677e90eda3bd8cfde0b748daaa46476162a03950 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 23 Jun 2005 20:59:51 -0700
Subject: [NET]: Zerocopy sequential reading of skb data

Implements sequential reading for both linear and non-linear
skb data at zerocopy cost. The data is returned in chunks of
arbitary length, therefore random access is not possible.

Usage:
	from	 := 0
	to	 := 128
	state	 := undef
	data	 := undef
	len	 := undef
	consumed := 0

	skb_prepare_seq_read(skb, from, to, &state)
	while (len = skb_seq_read(consumed, &data, &state)) != 0 do
		/* do something with 'data' of length 'len' */
		if abort then
			/* abort read if we don't wait for
			 * skb_seq_read() to return 0 */
			skb_abort_seq_read(&state)
			return
		endif
		/* not necessary to consume all of 'len' */
		consumed += len
	done

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6d68c03bc05..d285f2f7e81 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1500,6 +1500,120 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 		skb_split_no_header(skb, skb1, len, pos);
 }
 
+/**
+ * skb_prepare_seq_read - Prepare a sequential read of skb data
+ * @skb: the buffer to read
+ * @from: lower offset of data to be read
+ * @to: upper offset of data to be read
+ * @st: state variable
+ *
+ * Initializes the specified state variable. Must be called before
+ * invoking skb_seq_read() for the first time.
+ */
+void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
+			  unsigned int to, struct skb_seq_state *st)
+{
+	st->lower_offset = from;
+	st->upper_offset = to;
+	st->root_skb = st->cur_skb = skb;
+	st->frag_idx = st->stepped_offset = 0;
+	st->frag_data = NULL;
+}
+
+/**
+ * skb_seq_read - Sequentially read skb data
+ * @consumed: number of bytes consumed by the caller so far
+ * @data: destination pointer for data to be returned
+ * @st: state variable
+ *
+ * Reads a block of skb data at &consumed relative to the
+ * lower offset specified to skb_prepare_seq_read(). Assigns
+ * the head of the data block to &data and returns the length
+ * of the block or 0 if the end of the skb data or the upper
+ * offset has been reached.
+ *
+ * The caller is not required to consume all of the data
+ * returned, i.e. &consumed is typically set to the number
+ * of bytes already consumed and the next call to
+ * skb_seq_read() will return the remaining part of the block.
+ *
+ * Note: The size of each block of data returned can be arbitary,
+ *       this limitation is the cost for zerocopy seqeuental
+ *       reads of potentially non linear data.
+ *
+ * Note: Fragment lists within fragments are not implemented
+ *       at the moment, state->root_skb could be replaced with
+ *       a stack for this purpose.
+ */
+unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
+			  struct skb_seq_state *st)
+{
+	unsigned int block_limit, abs_offset = consumed + st->lower_offset;
+	skb_frag_t *frag;
+
+	if (unlikely(abs_offset >= st->upper_offset))
+		return 0;
+
+next_skb:
+	block_limit = skb_headlen(st->cur_skb);
+
+	if (abs_offset < block_limit) {
+		*data = st->cur_skb->data + abs_offset;
+		return block_limit - abs_offset;
+	}
+
+	if (st->frag_idx == 0 && !st->frag_data)
+		st->stepped_offset += skb_headlen(st->cur_skb);
+
+	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
+		block_limit = frag->size + st->stepped_offset;
+
+		if (abs_offset < block_limit) {
+			if (!st->frag_data)
+				st->frag_data = kmap_skb_frag(frag);
+
+			*data = (u8 *) st->frag_data + frag->page_offset +
+				(abs_offset - st->stepped_offset);
+
+			return block_limit - abs_offset;
+		}
+
+		if (st->frag_data) {
+			kunmap_skb_frag(st->frag_data);
+			st->frag_data = NULL;
+		}
+
+		st->frag_idx++;
+		st->stepped_offset += frag->size;
+	}
+
+	if (st->cur_skb->next) {
+		st->cur_skb = st->cur_skb->next;
+		st->frag_idx = 0;
+		goto next_skb;
+	} else if (st->root_skb == st->cur_skb &&
+		   skb_shinfo(st->root_skb)->frag_list) {
+		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
+		goto next_skb;
+	}
+
+	return 0;
+}
+
+/**
+ * skb_abort_seq_read - Abort a sequential read of skb data
+ * @st: state variable
+ *
+ * Must be called if skb_seq_read() was not called until it
+ * returned 0.
+ */
+void skb_abort_seq_read(struct skb_seq_state *st)
+{
+	if (st->frag_data)
+		kunmap_skb_frag(st->frag_data);
+}
+
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1538,3 +1652,6 @@ EXPORT_SYMBOL(skb_queue_tail);
 EXPORT_SYMBOL(skb_unlink);
 EXPORT_SYMBOL(skb_append);
 EXPORT_SYMBOL(skb_split);
+EXPORT_SYMBOL(skb_prepare_seq_read);
+EXPORT_SYMBOL(skb_seq_read);
+EXPORT_SYMBOL(skb_abort_seq_read);
-- 
cgit v1.2.3


From 3fc7e8a6d842f72d16d2623b1022814a635ab961 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 23 Jun 2005 21:00:17 -0700
Subject: [NET]: skb_find_text() - Find a text pattern in skb data

Finds a pattern in the skb data according to the specified
textsearch configuration. Use textsearch_next() to retrieve
subsequent occurrences of the pattern. Returns the offset
to the first occurrence or UINT_MAX if no match was found.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d285f2f7e81..bb73b2190ec 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1614,6 +1614,45 @@ void skb_abort_seq_read(struct skb_seq_state *st)
 		kunmap_skb_frag(st->frag_data);
 }
 
+#define TS_SKB_CB(state)	((struct skb_seq_state *) &((state)->cb))
+
+static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
+					  struct ts_config *conf,
+					  struct ts_state *state)
+{
+	return skb_seq_read(offset, text, TS_SKB_CB(state));
+}
+
+static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
+{
+	skb_abort_seq_read(TS_SKB_CB(state));
+}
+
+/**
+ * skb_find_text - Find a text pattern in skb data
+ * @skb: the buffer to look in
+ * @from: search offset
+ * @to: search limit
+ * @config: textsearch configuration
+ * @state: uninitialized textsearch state variable
+ *
+ * Finds a pattern in the skb data according to the specified
+ * textsearch configuration. Use textsearch_next() to retrieve
+ * subsequent occurrences of the pattern. Returns the offset
+ * to the first occurrence or UINT_MAX if no match was found.
+ */
+unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
+			   unsigned int to, struct ts_config *config,
+			   struct ts_state *state)
+{
+	config->get_next_block = skb_ts_get_next_block;
+	config->finish = skb_ts_finish;
+
+	skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
+
+	return textsearch_find(config, state);
+}
+
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1655,3 +1694,4 @@ EXPORT_SYMBOL(skb_split);
 EXPORT_SYMBOL(skb_prepare_seq_read);
 EXPORT_SYMBOL(skb_seq_read);
 EXPORT_SYMBOL(skb_abort_seq_read);
+EXPORT_SYMBOL(skb_find_text);
-- 
cgit v1.2.3


From d675c989ed2d4ba23dff615330b04371aea83534 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 23 Jun 2005 21:00:58 -0700
Subject: [PKT_SCHED]: Packet classification based on textsearch (ematch)

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig   |  11 ++++
 net/sched/Makefile  |   1 +
 net/sched/em_text.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 169 insertions(+)
 create mode 100644 net/sched/em_text.c

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b22c9beb604..95d9bc5d862 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -449,6 +449,17 @@ config NET_EMATCH_META
 	  To compile this code as a module, choose M here: the
 	  module will be called em_meta.
 
+config NET_EMATCH_TEXT
+	tristate "Textsearch"
+	depends on NET_EMATCH
+	---help---
+	  Say Y here if you want to be ablt to classify packets based on
+	  textsearch comparisons. Please select the appropriate textsearch
+	  algorithms in the Library section.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_text.
+
 config NET_CLS_ACT
 	bool "Packet ACTION"
 	depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index eb3fe583eba..8f58cecd626 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP)	+= em_cmp.o
 obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o
 obj-$(CONFIG_NET_EMATCH_U32)	+= em_u32.o
 obj-$(CONFIG_NET_EMATCH_META)	+= em_meta.o
+obj-$(CONFIG_NET_EMATCH_TEXT)	+= em_text.o
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 00000000000..873840d8d07
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,157 @@
+/*
+ * net/sched/em_text.c	Textsearch ematch
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/textsearch.h>
+#include <linux/tc_ematch/tc_em_text.h>
+#include <net/pkt_cls.h>
+
+struct text_match
+{
+	u16			from_offset;
+	u16			to_offset;
+	u8			from_layer;
+	u8			to_layer;
+	struct ts_config	*config;
+};
+
+#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
+
+static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
+			 struct tcf_pkt_info *info)
+{
+	struct text_match *tm = EM_TEXT_PRIV(m);
+	int from, to;
+	struct ts_state state;
+
+	from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
+	from += tm->from_offset;
+
+	to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
+	to += tm->to_offset;
+
+	return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
+}
+
+static int em_text_change(struct tcf_proto *tp, void *data, int len,
+			  struct tcf_ematch *m)
+{
+	struct text_match *tm;
+	struct tcf_em_text *conf = data;
+	struct ts_config *ts_conf;
+	int flags = 0;
+
+	printk("Configuring text: %s from %d:%d to %d:%d len %d\n", conf->algo, conf->from_offset,
+	    conf->from_layer, conf->to_offset, conf->to_layer, conf->pattern_len);
+
+	if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
+		return -EINVAL;
+
+	if (conf->from_layer > conf->to_layer)
+		return -EINVAL;
+
+	if (conf->from_layer == conf->to_layer &&
+	    conf->from_offset > conf->to_offset)
+		return -EINVAL;
+
+retry:
+	ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
+				     conf->pattern_len, GFP_KERNEL, flags);
+
+	if (flags & TS_AUTOLOAD)
+		rtnl_lock();
+
+	if (IS_ERR(ts_conf)) {
+		if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
+			rtnl_unlock();
+			flags |= TS_AUTOLOAD;
+			goto retry;
+		} else
+			return PTR_ERR(ts_conf);
+	} else if (flags & TS_AUTOLOAD) {
+		textsearch_destroy(ts_conf);
+		return -EAGAIN;
+	}
+
+	tm = kmalloc(sizeof(*tm), GFP_KERNEL);
+	if (tm == NULL) {
+		textsearch_destroy(ts_conf);
+		return -ENOBUFS;
+	}
+
+	tm->from_offset = conf->from_offset;
+	tm->to_offset   = conf->to_offset;
+	tm->from_layer  = conf->from_layer;
+	tm->to_layer    = conf->to_layer;
+	tm->config      = ts_conf;
+
+	m->datalen = sizeof(*tm);
+	m->data = (unsigned long) tm;
+
+	return 0;
+}
+
+static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+	textsearch_destroy(EM_TEXT_PRIV(m)->config);
+}
+
+static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+	struct text_match *tm = EM_TEXT_PRIV(m);
+	struct tcf_em_text conf;
+
+	strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
+	conf.from_offset = tm->from_offset;
+	conf.to_offset = tm->to_offset;
+	conf.from_layer = tm->from_layer;
+	conf.to_layer = tm->to_layer;
+	conf.pattern_len = textsearch_get_pattern_len(tm->config);
+	conf.pad = 0;
+
+	RTA_PUT_NOHDR(skb, sizeof(conf), &conf);
+	RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config));
+	return 0;
+
+rtattr_failure:
+	return -1;
+}		
+
+static struct tcf_ematch_ops em_text_ops = {
+	.kind	  = TCF_EM_TEXT,
+	.change	  = em_text_change,
+	.match	  = em_text_match,
+	.destroy  = em_text_destroy,
+	.dump	  = em_text_dump,
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_text_ops.link)
+};
+
+static int __init init_em_text(void)
+{
+	return tcf_em_register(&em_text_ops);
+}
+
+static void __exit exit_em_text(void) 
+{
+	tcf_em_unregister(&em_text_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_text);
+module_exit(exit_em_text);
-- 
cgit v1.2.3


From f2d368fa3ef90f2159d9e542303901ebf68144dd Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 23 Jun 2005 23:55:41 -0700
Subject: [PKT_SCHED]: Make NET_EMATCH_TEXT select TEXTSEARCh

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 95d9bc5d862..447b89e556b 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -452,6 +452,7 @@ config NET_EMATCH_META
 config NET_EMATCH_TEXT
 	tristate "Textsearch"
 	depends on NET_EMATCH
+	select TEXTSEARCH
 	---help---
 	  Say Y here if you want to be ablt to classify packets based on
 	  textsearch comparisons. Please select the appropriate textsearch
-- 
cgit v1.2.3


From 5ba266d6323e957588712f6a7d31252cd6b797bb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:03:15 -0700
Subject: [PATCH] knfsd: nfsd4: fix probe_callback

rpc_create_client was modified recently to do its own (synchronous) NULL ping
of the server.  We'd rather do that on our own, asynchronously, so that we
don't have to block the nfsd thread doing the probe, and so that setclientid
handling (hence, client mounts) can proceed normally whether the callback is
succesful or not.  (We can still function fine without the callback
channel--we just won't be able to give out delegations till it's verified to
work.)

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/sunrpc/sunrpc_syms.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 32e8acbc60f..62a07349527 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -41,6 +41,7 @@ EXPORT_SYMBOL(rpc_release_task);
 
 /* RPC client functions */
 EXPORT_SYMBOL(rpc_create_client);
+EXPORT_SYMBOL(rpc_new_client);
 EXPORT_SYMBOL(rpc_clone_client);
 EXPORT_SYMBOL(rpc_bind_new_program);
 EXPORT_SYMBOL(rpc_destroy_client);
-- 
cgit v1.2.3


From 52c1da39534fb382c061de58b65f678ad74b59f5 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 23 Jun 2005 22:05:33 -0700
Subject: [PATCH] make various thing static

Another rollup of patches which give various symbols static scope

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/sctp/sm_statefuns.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 058189684c7..86073df418f 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
 					     sctp_cmd_seq_t *commands);
 static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
 
+static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+					   __u16 error,
+					   const struct sctp_association *asoc,
+					   struct sctp_transport *transport);
+
+static sctp_disposition_t sctp_sf_violation_chunklen(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands);
 
 /* Small helper function that checks if the chunk length
  * is of the appropriate length.  The 'required_length' argument
@@ -2328,7 +2339,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
  *
  * This is common code called by several sctp_sf_*_abort() functions above.
  */
-sctp_disposition_t  sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
 					   __u16 error,
 					   const struct sctp_association *asoc,
 					   struct sctp_transport *transport)
@@ -3687,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
  *
  * Generate an  ABORT chunk and terminate the association.
  */
-sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep,
+static sctp_disposition_t sctp_sf_violation_chunklen(
+				     const struct sctp_endpoint *ep,
 				     const struct sctp_association *asoc,
 				     const sctp_subtype_t type,
 				     void *arg,
-- 
cgit v1.2.3


From f7704347a74fceaf79c89f8b8dbdd0111013e4d6 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 24 Jun 2005 17:39:03 -0700
Subject: [PKT_SCHED]: Make TEXTSEARCH* options only selected.

Do not present these confusing new options to the user
unless he picked some facility that makes use of it,
such as NET_EMATCH_TEXT.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 447b89e556b..7bac249258e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -453,10 +453,11 @@ config NET_EMATCH_TEXT
 	tristate "Textsearch"
 	depends on NET_EMATCH
 	select TEXTSEARCH
+	select TEXTSEARCH_KMP
+	select TEXTSEARCH_FSM
 	---help---
 	  Say Y here if you want to be ablt to classify packets based on
-	  textsearch comparisons. Please select the appropriate textsearch
-	  algorithms in the Library section.
+	  textsearch comparisons.
 
 	  To compile this code as a module, choose M here: the
 	  module will be called em_text.
-- 
cgit v1.2.3


From bb298ca3ce92574d57c4e49b329421425ea7d279 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 24 Jun 2005 17:50:53 -0700
Subject: [IPV4]: Move FIB lookup algorithm choice under IP_ADVANCED_ROUTING

Most users need not be concerned with a complex choice of what
FIB lookup algorithm to use.  So give them the safe default of
IP_FIB_HASH if IP_ADVANCED_ROUTING is disabled.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig | 64 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 690e88ba248..7bcfb84126d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,32 +1,6 @@
 #
 # IP configuration
 #
-choice 
-	prompt "Choose IP: FIB lookup"
-	depends on INET
-	default IP_FIB_HASH
-
-config IP_FIB_HASH
-	bool "FIB_HASH"
-	---help---
-	Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
-	bool "FIB_TRIE"
-	---help---
-	Use new experimental LC-trie as FIB lookup algoritm. 
-        This improves lookup performance
-	
-	LC-trie is described in:
-	
- 	IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- 	IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
-	An experimental study of compression methods for dynamic tries
- 	Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- 	http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-       
-endchoice
-
 config IP_MULTICAST
 	bool "IP: multicasting"
 	depends on INET
@@ -79,6 +53,44 @@ config IP_ADVANCED_ROUTER
 
 	  If unsure, say N here.
 
+choice 
+	prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
+	depends on IP_ADVANCED_ROUTER
+	default IP_FIB_HASH
+
+config IP_FIB_HASH
+	bool "FIB_HASH"
+	---help---
+	Current FIB is very proven and good enough for most users.
+
+config IP_FIB_TRIE
+	bool "FIB_TRIE"
+	---help---
+	Use new experimental LC-trie as FIB lookup algoritm. 
+        This improves lookup performance if you have a large
+	number of routes.
+
+	LC-trie is a longest matching prefix lookup algorithm which
+	performs better than FIB_HASH for large routing tables.
+	But, it consumes more memory and is more complex.
+	
+	LC-trie is described in:
+	
+ 	IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
+ 	IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
+	An experimental study of compression methods for dynamic tries
+ 	Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
+ 	http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
+       
+endchoice
+
+# If the user does not enable advanced routing, he gets the safe
+# default of the fib-hash algorithm.
+config IP_FIB_HASH
+	bool
+	depends on !IP_ADVANCED_ROUTER
+	default y
+
 config IP_MULTIPLE_TABLES
 	bool "IP: policy routing"
 	depends on IP_ADVANCED_ROUTER
-- 
cgit v1.2.3


From a6484045fdd4154f8c8ee8c1dda4e32854c047e0 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 24 Jun 2005 18:07:51 -0700
Subject: [TCP]: Do not present confusing congestion control options by
 default.

Create TCP_CONG_ADVANCED option, akin to IP_ADVANCED_ROUTER, which
when disabled will bypass all of the congestion control Kconfig
options and leave the user with a safe default.

That safe default is currently BIC-TCP with new Reno as a fallback.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7bcfb84126d..34708343312 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -445,9 +445,22 @@ config IP_TCPDIAG
 config IP_TCPDIAG_IPV6
 	def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
 
+config TCP_CONG_ADVANCED
+	bool "TCP: advanced congestion control"
+	depends on INET
+	default y
+	---help---
+	  Support for selection of various TCP congestion control
+	  modules.
+
+	  Nearly all users can safely say no here, and a safe default
+	  selection will be made (BIC-TCP with new Reno as a fallback).
+
+	  If unsure, say N.
+
 # TCP Reno is builtin (required as fallback)
 menu "TCP congestion control"
-	depends on INET
+	depends on TCP_CONG_ADVANCED
 
 config TCP_CONG_BIC
 	tristate "Binary Increase Congestion (BIC) control"
@@ -535,5 +548,10 @@ config TCP_CONG_SCALABLE
 
 endmenu
 
+config TCP_CONG_BIC
+	boolean
+	depends on !TCP_CONG_ADVANCED
+	default y
+
 source "net/ipv4/ipvs/Kconfig"
 
-- 
cgit v1.2.3


From c54d7e03c3a21b38c587f671704c5a12aa3987fc Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 24 Jun 2005 19:57:07 -0700
Subject: [SUNRPC]: Fix {s,}size_t printf format strings in xprt.c

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sunrpc/xprt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index eca92405948..269f217918a 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -970,7 +970,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		goto out;
 	}
 
-	dprintk("RPC:      XID %08x read %u bytes\n",
+	dprintk("RPC:      XID %08x read %Zd bytes\n",
 			ntohl(xprt->tcp_xid), r);
 	dprintk("RPC:      xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
 			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
@@ -1006,7 +1006,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
 	desc->count -= len;
 	desc->offset += len;
 	xprt->tcp_offset += len;
-	dprintk("RPC:      discarded %u bytes\n", len);
+	dprintk("RPC:      discarded %Zu bytes\n", len);
 	tcp_check_recm(xprt);
 }
 
-- 
cgit v1.2.3


From 3e1d1d28d99dabe63c64f7f40f1ca1d646de1f73 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Fri, 24 Jun 2005 23:13:50 -0700
Subject: [PATCH] Cleanup patch for process freezing

1. Establish a simple API for process freezing defined in linux/include/sched.h:

   frozen(process)		Check for frozen process
   freezing(process)		Check if a process is being frozen
   freeze(process)		Tell a process to freeze (go to refrigerator)
   thaw_process(process)	Restart process
   frozen_process(process)	Process is frozen now

2. Remove all references to PF_FREEZE and PF_FROZEN from all
   kernel sources except sched.h

3. Fix numerous locations where try_to_freeze is manually done by a driver

4. Remove the argument that is no longer necessary from two function calls.

5. Some whitespace cleanup

6. Clear potential race in refrigerator (provides an open window of PF_FREEZE
   cleared before setting PF_FROZEN, recalc_sigpending does not check
   PF_FROZEN).

This patch does not address the problem of freeze_processes() violating the rule
that a task may only modify its own flags by setting PF_FREEZE. This is not clean
in an SMP environment. freeze(process) is therefore not SMP safe!

Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/rxrpc/krxiod.c   | 2 +-
 net/rxrpc/krxsecd.c  | 2 +-
 net/rxrpc/krxtimod.c | 2 +-
 net/sunrpc/svcsock.c | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index 2b537f425a1..dada34a77b2 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg)
 
 		_debug("### End Work");
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
                 /* discard pending signals */
 		rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index 6020c89d922..1aadd026d35 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg)
 
 		_debug("### End Inbound Calls");
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
                 /* discard pending signals */
 		rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 249c2b0290b..3ac81cdd121 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -90,7 +90,7 @@ static int krxtimod(void *arg)
 			complete_and_exit(&krxtimod_dead, 0);
 		}
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		/* discard pending signals */
 		rxrpc_discard_my_signals();
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 05907035bc9..56db8f13e6c 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
 	arg->page_len = (pages-2)*PAGE_SIZE;
 	arg->len = (pages-1)*PAGE_SIZE;
 	arg->tail[0].iov_len = 0;
-	
-	try_to_freeze(PF_FREEZE);
+
+	try_to_freeze();
 	if (signalled())
 		return -EINTR;
 
@@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
 
 		schedule_timeout(timeout);
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		spin_lock_bh(&serv->sv_lock);
 		remove_wait_queue(&rqstp->rq_wait, &wait);
-- 
cgit v1.2.3